wesm commented on code in PR #13521:
URL: https://github.com/apache/arrow/pull/13521#discussion_r914208594


##########
cpp/src/arrow/array/data.h:
##########
@@ -266,16 +266,19 @@ struct ARROW_EXPORT ArraySpan {
   int64_t offset = 0;
   BufferSpan buffers[3];
 
+  // 16 bytes of scratch space to enable this ArraySpan to be a view onto
+  // scalar values including binary scalars (where we need to create a buffer
+  // that looks like two 32-bit or 64-bit offsets)
+  uint8_t scratch_space[16];

Review Comment:
   It might make sense to put this "scratch space" only in the Scalar values that require it (types that have offsets); then we wouldn't have to carry around an extra 16 bytes with every ArraySpan.
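
   A minimal sketch of that alternative, assuming a hypothetical scalar-side member (names here are illustrative, not what the PR does):

   ```cpp
   // Hypothetical: keep the 16 bytes of offset scratch in the binary-like
   // scalar types instead of in every ArraySpan. An ArraySpan viewing such
   // a scalar would point its offsets buffer at this member, which holds
   // two 32-bit or two 64-bit offsets ({0, value_length}).
   struct BinaryScalarScratch {
     alignas(int64_t) uint8_t scratch_space[16];
   };
   ```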



##########
cpp/src/arrow/compute/exec.cc:
##########
@@ -385,13 +387,20 @@ int64_t ExecSpanIterator::GetNextChunkSpan(int64_t iteration_size, ExecSpan* spa
   return iteration_size;
 }
 
-bool ExecSpanIterator::Next(ExecSpan* span) {
-  if (position_ == length_) {
-    // This also protects from degenerate cases like ChunkedArrays
-    // without any chunks
-    return false;
+void PromoteExecSpanScalars(ExecSpan* span) {

Review Comment:
   Here is the Scalar -> ArraySpan promotion
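
   Roughly, the promotion turns each scalar input into a length-1 ArraySpan so downstream kernels only handle the array case. A hedged sketch of the idea (helper names are illustrative, not necessarily the exact code):

   ```cpp
   // Sketch: for every value in the span that is a scalar, fill its ArraySpan
   // view from the scalar and clear the scalar pointer, which (after this PR)
   // is what marks a value as an array.
   void PromoteExecSpanScalars(ExecSpan* span) {
     for (ExecValue& value : span->values) {
       if (value.scalar != NULLPTR) {
         value.array.FillFromScalar(*value.scalar);  // hypothetical helper
         value.scalar = NULLPTR;
       }
     }
   }
   ```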



##########
cpp/src/arrow/compute/exec/test_util.cc:
##########
@@ -143,16 +143,25 @@ ExecNode* MakeDummyNode(ExecPlan* plan, std::string label, std::vector<ExecNode*
   return node;
 }
 
-ExecBatch ExecBatchFromJSON(const std::vector<ValueDescr>& descrs,
+ExecBatch ExecBatchFromJSON(const std::vector<TypeHolder>& types,
                             util::string_view json) {
   auto fields = ::arrow::internal::MapVector(
-      [](const ValueDescr& descr) { return field("", descr.type); }, descrs);
+      [](const TypeHolder& th) { return field("", th.GetSharedPtr()); }, types);
 
   ExecBatch batch{*RecordBatchFromJSON(schema(std::move(fields)), json)};
 
+  return batch;
+}
+
+ExecBatch ExecBatchFromJSON(const std::vector<TypeHolder>& types,
+                            const std::vector<ArgShape>& shapes, util::string_view json) {

Review Comment:
   This is the only place that `ArgShape` is used now, just for the 
construction of test data
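
   For example, a test could build a batch whose first column is an array and whose second is a scalar from the same JSON (hypothetical values):

   ```cpp
   // Hypothetical usage of the shape-aware overload in a test.
   ExecBatch batch = ExecBatchFromJSON(
       {int32(), utf8()}, {ArgShape::ARRAY, ArgShape::SCALAR},
       R"([[1, "a"], [2, "a"]])");
   ```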



##########
cpp/src/arrow/compute/kernel.h:
##########
@@ -555,8 +505,7 @@ struct Kernel {
 /// endeavor to write into pre-allocated memory if they are able, though for
 /// some kernels (e.g. in cases when a builder like StringBuilder) must be
 /// employed this may not be possible.
-using ArrayKernelExec =
-    std::function<Status(KernelContext*, const ExecSpan&, ExecResult*)>;
+typedef Status (*ArrayKernelExec)(KernelContext*, const ExecSpan&, ExecResult*);

Review Comment:
   Using plain function pointers makes debugging this code much easier 
(`std::function` adds many levels to gdb stack traces); we should use function 
pointers for the other kernel APIs in this file, too
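
   With the typedef, a kernel exec is a free function whose address is stored directly, e.g. (sketch):

   ```cpp
   // Sketch: a kernel exec as a plain function; it appears as a single,
   // readable frame in gdb instead of several std::function layers.
   Status MyExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
     // ... kernel body ...
     return Status::OK();
   }
   ArrayKernelExec exec = &MyExec;
   ```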



##########
cpp/src/arrow/compute/kernel.h:
##########
@@ -143,10 +143,14 @@ ARROW_EXPORT std::shared_ptr<TypeMatcher> Primitive();
 
 }  // namespace match
 
-/// \brief An object used for type- and shape-checking arguments to be passed
-/// to a kernel and stored in a KernelSignature. Distinguishes between ARRAY
-/// and SCALAR arguments using ValueDescr::Shape. The type-checking rule can be
-/// supplied either with an exact DataType instance or a custom TypeMatcher.
+/// \brief Shape qualifier for value types. In certain instances
+/// (e.g. "map_lookup" kernel), an argument may only be a scalar, whereas in
+/// other kernels arguments can be arrays or scalars
+enum class ArgShape { ANY, ARRAY, SCALAR };

Review Comment:
   This is ONLY used in testing. I will update the docstring and move this to 
the place where it's used



##########
cpp/src/arrow/array/builder_nested.h:
##########
@@ -304,10 +304,12 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder {
       if (!validity || bit_util::GetBit(validity, array.offset + row)) {
         ARROW_RETURN_NOT_OK(Append());
         const int64_t slot_length = offsets[row + 1] - offsets[row];
+        // Add the inner StructArray offset to the Map/List offset
+        int64_t key_value_offset = array.child_data[0].offset + offsets[row];
         ARROW_RETURN_NOT_OK(key_builder_->AppendArraySlice(
-            array.child_data[0].child_data[0], offsets[row], slot_length));
+            array.child_data[0].child_data[0], key_value_offset, slot_length));
         ARROW_RETURN_NOT_OK(item_builder_->AppendArraySlice(
-            array.child_data[0].child_data[1], offsets[row], slot_length));
+            array.child_data[0].child_data[1], key_value_offset, slot_length));

Review Comment:
   This fixes a bug that I introduced in a previous PR
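
   Concretely (illustrative numbers, not from the PR):

   ```cpp
   // If the inner struct children were sliced so that
   //   array.child_data[0].offset == 3
   // and this row's entries logically start at
   //   offsets[row] == 2
   // then the correct physical start into the key/item children is 3 + 2 == 5;
   // the old code passed 2 and read the wrong entries.
   ```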



##########
cpp/src/arrow/compute/exec.h:
##########
@@ -254,19 +253,16 @@ inline bool operator==(const ExecBatch& l, const ExecBatch& r) { return l.Equals
 inline bool operator!=(const ExecBatch& l, const ExecBatch& r) { return !l.Equals(r); }
 
 struct ExecValue {
-  enum Kind { ARRAY, SCALAR };
-  Kind kind = ARRAY;
   ArraySpan array;
-  const Scalar* scalar;
+  const Scalar* scalar = NULLPTR;

Review Comment:
   I applied Antoine's suggested optimization
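
   In other words, the Kind enum is gone and a null scalar pointer now marks the value as an array; a sketch of the resulting convention (method names are illustrative):

   ```cpp
   // Sketch: the scalar pointer doubles as the discriminant.
   bool is_array() const { return scalar == NULLPTR; }
   bool is_scalar() const { return scalar != NULLPTR; }
   ```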



##########
cpp/src/arrow/compute/function_benchmark.cc:
##########
@@ -19,6 +19,7 @@
 
 #include "arrow/array/array_base.h"

Review Comment:
   Need to check this benchmark wasn't broken



##########
cpp/src/arrow/compute/exec.h:
##########
@@ -235,12 +235,11 @@ struct ARROW_EXPORT ExecBatch {
 
   ExecBatch Slice(int64_t offset, int64_t length) const;
 
-  /// \brief A convenience for returning the ValueDescr objects (types and
-  /// shapes) from the batch.
-  std::vector<ValueDescr> GetDescriptors() const {
-    std::vector<ValueDescr> result;
+  /// \brief A convenience for returning the types from the batch.
+  std::vector<TypeHolder> GetTypes() const {
+    std::vector<TypeHolder> result;
     for (const auto& value : this->values) {
-      result.emplace_back(value.descr());
+      result.emplace_back(value.type());

Review Comment:
   There are several places where we can probably drop down to constructing from `const DataType*`, which is less expensive; this is one of them (I need to check that it doesn't cause tests to fail)
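
   For instance (sketch, assuming `TypeHolder` accepts both forms):

   ```cpp
   // Constructing from a raw pointer avoids the atomic refcount bump that
   // copying a shared_ptr<DataType> pays on every construction.
   std::shared_ptr<DataType> owned = int32();
   TypeHolder cheap(owned.get());  // holds only the raw DataType*
   TypeHolder owning(owned);       // copies (refcounts) the shared_ptr
   ```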



##########
cpp/src/arrow/compute/exec.h:
##########
@@ -324,29 +310,21 @@ struct ExecValue {
 
 struct ARROW_EXPORT ExecResult {
   // The default value of the variant is ArraySpan
-  // TODO(wesm): remove Scalar output modality in ARROW-16577
-  util::Variant<ArraySpan, std::shared_ptr<ArrayData>, std::shared_ptr<Scalar>> value;
+  util::Variant<ArraySpan, std::shared_ptr<ArrayData>> value;

Review Comment:
   Kernels can no longer return `shared_ptr<Scalar>`



##########
cpp/src/arrow/compute/function_internal.h:
##########
@@ -430,6 +434,12 @@ static inline enable_if_same_result<T, std::shared_ptr<DataType>> GenericFromSca
   return value->type;
 }
 
+template <typename T>
+static inline enable_if_same_result<T, TypeHolder> GenericFromScalar(
+    const std::shared_ptr<Scalar>& value) {
+  return value->type;
+}
+

Review Comment:
   Aside: I struggled a bit with this reflection stuff



##########
cpp/src/arrow/compute/kernels/vector_cumulative_ops_test.cc:
##########
@@ -66,21 +66,45 @@ TEST(TestCumulativeSum, AllNulls) {
   }
 }
 
-using testing::HasSubstr;
-
-TEST(TestCumulativeSum, ScalarNotSupported) {
-  CumulativeSumOptions options;
-
-  EXPECT_RAISES_WITH_MESSAGE_THAT(
-      NotImplemented, HasSubstr("no kernel"),
-      CallFunction("cumulative_sum", {std::make_shared<Int64Scalar>(5)}, 
&options));
+TEST(TestCumulativeSum, ScalarInput) {
+  CumulativeSumOptions no_start_no_skip;
+  CumulativeSumOptions no_start_do_skip(0, true);
+  CumulativeSumOptions has_start_no_skip(10);
+  CumulativeSumOptions has_start_do_skip(10, true);
 
-  EXPECT_RAISES_WITH_MESSAGE_THAT(
-      NotImplemented, HasSubstr("no kernel"),
-      CallFunction("cumulative_sum_checked", 
{std::make_shared<Int64Scalar>(5)},
-                   &options));
+  for (auto ty : NumericTypes()) {
+    CheckVectorUnary("cumulative_sum", ScalarFromJSON(ty, "10"),
+                     ArrayFromJSON(ty, "[10]"), &no_start_no_skip);
+    CheckVectorUnary("cumulative_sum_checked", ScalarFromJSON(ty, "10"),
+                     ArrayFromJSON(ty, "[10]"), &no_start_no_skip);
+
+    CheckVectorUnary("cumulative_sum", ScalarFromJSON(ty, "10"),
+                     ArrayFromJSON(ty, "[20]"), &has_start_no_skip);
+    CheckVectorUnary("cumulative_sum_checked", ScalarFromJSON(ty, "10"),
+                     ArrayFromJSON(ty, "[20]"), &has_start_no_skip);
+
+    CheckVectorUnary("cumulative_sum", ScalarFromJSON(ty, "null"),
+                     ArrayFromJSON(ty, "[null]"), &no_start_no_skip);
+    CheckVectorUnary("cumulative_sum_checked", ScalarFromJSON(ty, "null"),
+                     ArrayFromJSON(ty, "[null]"), &no_start_no_skip);
+    CheckVectorUnary("cumulative_sum", ScalarFromJSON(ty, "null"),
+                     ArrayFromJSON(ty, "[null]"), &has_start_no_skip);
+    CheckVectorUnary("cumulative_sum_checked", ScalarFromJSON(ty, "null"),
+                     ArrayFromJSON(ty, "[null]"), &has_start_no_skip);
+
+    CheckVectorUnary("cumulative_sum", ScalarFromJSON(ty, "null"),
+                     ArrayFromJSON(ty, "[null]"), &no_start_do_skip);
+    CheckVectorUnary("cumulative_sum_checked", ScalarFromJSON(ty, "null"),
+                     ArrayFromJSON(ty, "[null]"), &no_start_do_skip);
+    CheckVectorUnary("cumulative_sum", ScalarFromJSON(ty, "null"),
+                     ArrayFromJSON(ty, "[null]"), &has_start_do_skip);
+    CheckVectorUnary("cumulative_sum_checked", ScalarFromJSON(ty, "null"),
+                     ArrayFromJSON(ty, "[null]"), &has_start_do_skip);
+  }

Review Comment:
   These tests had been deleted in one of my recent PRs and are restored here



##########
cpp/src/arrow/scalar.h:
##########
@@ -110,7 +108,7 @@ struct ARROW_EXPORT Scalar : public std::enable_shared_from_this<Scalar>,
   /// \brief EXPERIMENTAL Enable obtaining shared_ptr<Scalar> from a const
   /// Scalar& context. Implementation depends on enable_shared_from_this, but
   /// we may change this in the future
-  std::shared_ptr<Scalar> Copy() const {
+  std::shared_ptr<Scalar> GetSharedPtr() const {

Review Comment:
   Per previous comments



##########
cpp/src/arrow/type_fwd.h:
##########
@@ -416,43 +416,43 @@ struct Type {
 /// @{
 
 /// \brief Return a NullType instance
-std::shared_ptr<DataType> ARROW_EXPORT null();
+const std::shared_ptr<DataType>& ARROW_EXPORT null();

Review Comment:
   This way we can do `null().get()` or `int32().get()` to get a pointer 
without having to copy the shared_ptr every time we call these functions (they 
return static instances)
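
   For example:

   ```cpp
   // With the by-reference return, grabbing a raw pointer to the static
   // instance no longer copies (and atomically refcounts) a shared_ptr:
   const DataType* ty = int32().get();
   ```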



##########
cpp/src/arrow/scalar.h:
##########
@@ -479,37 +481,52 @@ struct ARROW_EXPORT StructScalar : public Scalar {
 
   Result<std::shared_ptr<Scalar>> field(FieldRef ref) const;
 
-  StructScalar(ValueType value, std::shared_ptr<DataType> type)
-      : Scalar(std::move(type), true), value(std::move(value)) {}
+  StructScalar(ValueType value, std::shared_ptr<DataType> type, bool is_valid = true)
+      : Scalar(std::move(type), is_valid), value(std::move(value)) {}
 
   static Result<std::shared_ptr<StructScalar>> Make(ValueType value,
                                                     std::vector<std::string> field_names);
-
-  explicit StructScalar(std::shared_ptr<DataType> type) : Scalar(std::move(type)) {}
 };
 
 struct ARROW_EXPORT UnionScalar : public Scalar {
-  using Scalar::Scalar;
-  using ValueType = std::shared_ptr<Scalar>;
-
-  ValueType value;
   int8_t type_code;
 
-  UnionScalar(int8_t type_code, std::shared_ptr<DataType> type)
-      : Scalar(std::move(type), false), type_code(type_code) {}
-
-  UnionScalar(ValueType value, int8_t type_code, std::shared_ptr<DataType> type)
-      : Scalar(std::move(type), true), value(std::move(value)), type_code(type_code) {}
+ protected:
+  UnionScalar(std::shared_ptr<DataType> type, int8_t type_code, bool is_valid)
+      : Scalar(std::move(type), is_valid), type_code(type_code) {}
 };
 
 struct ARROW_EXPORT SparseUnionScalar : public UnionScalar {
-  using UnionScalar::UnionScalar;
   using TypeClass = SparseUnionType;
+
+  // Even though only one of the union values is relevant for this scalar, we
+  // nonetheless construct a vector of scalars, one per union value, to have
+  // enough data to reconstruct a valid ArraySpan of length 1 from this scalar
+  using ValueType = std::vector<std::shared_ptr<Scalar>>;
+  ValueType value;

Review Comment:
   Note: this change is necessary to establish a 1-1 mapping between 
SparseUnionScalar and an ArraySpan referencing a single-element slice of a 
SparseUnionArray. It's annoying but I think it's the simplest thing
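
   Illustrative only (the constructor shape isn't shown in this hunk): a sparse union scalar over `union<i: int32, s: utf8>` would carry one scalar per child, with `type_code` selecting the active one:

   ```cpp
   // One entry per union child, even though only the child selected by
   // type_code is semantically active.
   SparseUnionScalar::ValueType values = {
       MakeScalar(int32_t(42)),  // the active child value
       MakeNullScalar(utf8()),   // placeholder for the inactive child
   };
   ```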


