This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new c42df5fcd3 GH-44167: [C++][Acero] Add more row segmenter tests (#44166)
c42df5fcd3 is described below

commit c42df5fcd3532df26cf2a7fa5404e16701f25bc6
Author: Rossi Sun <[email protected]>
AuthorDate: Tue Sep 24 00:17:31 2024 +0800

    GH-44167: [C++][Acero] Add more row segmenter tests (#44166)
    
    
    
    ### Rationale for this change
    
    See #44167.
    
    ### What changes are included in this PR?
    
    More tests covering other fixed-width types (fixed-size binary and dictionary) for the row segmenter.
    
    ### Are these changes tested?
    
    Yes.
    
    ### Are there any user-facing changes?
    
    None.
    
    * GitHub Issue: #44167
    
    Authored-by: Ruoxi Sun <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/arrow/acero/hash_aggregate_test.cc | 256 ++++++++++++++++++++++-------
 1 file changed, 197 insertions(+), 59 deletions(-)

diff --git a/cpp/src/arrow/acero/hash_aggregate_test.cc 
b/cpp/src/arrow/acero/hash_aggregate_test.cc
index f76e326cd7..1e2975afc9 100644
--- a/cpp/src/arrow/acero/hash_aggregate_test.cc
+++ b/cpp/src/arrow/acero/hash_aggregate_test.cc
@@ -667,19 +667,11 @@ TEST(RowSegmenter, Basics) {
 }
 
 TEST(RowSegmenter, NonOrdered) {
-  {
-    std::vector<TypeHolder> types = {int32()};
-    auto batch = ExecBatchFromJSON(types, "[[1], [1], [2], [1], [2]]");
-    ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types));
-    TestSegments(segmenter, ExecSpan(batch),
-                 {{0, 2, false, true},
-                  {2, 1, false, false},
-                  {3, 1, false, false},
-                  {4, 1, true, false}});
-  }
-  {
-    std::vector<TypeHolder> types = {int32(), int32()};
-    auto batch = ExecBatchFromJSON(types, "[[1, 1], [1, 1], [2, 2], [1, 2], 
[2, 2]]");
+  for (int num_keys = 1; num_keys <= 2; ++num_keys) {
+    SCOPED_TRACE("non-ordered " + ToChars(num_keys) + " int32(s)");
+    std::vector<TypeHolder> types(num_keys, int32());
+    std::vector<Datum> values(num_keys, ArrayFromJSON(int32(), "[1, 1, 2, 1, 
2]"));
+    ExecBatch batch(std::move(values), 5);
     ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types));
     TestSegments(segmenter, ExecSpan(batch),
                  {{0, 2, false, true},
@@ -691,6 +683,7 @@ TEST(RowSegmenter, NonOrdered) {
 
 TEST(RowSegmenter, EmptyBatches) {
   {
+    SCOPED_TRACE("empty batches {int32}");
     std::vector<TypeHolder> types = {int32()};
     std::vector<ExecBatch> batches = {
         ExecBatchFromJSON(types, "[]"),         ExecBatchFromJSON(types, "[]"),
@@ -709,6 +702,7 @@ TEST(RowSegmenter, EmptyBatches) {
     TestSegments(segmenter, ExecSpan(batches[7]), {});
   }
   {
+    SCOPED_TRACE("empty batches {int32, int32}");
     std::vector<TypeHolder> types = {int32(), int32()};
     std::vector<ExecBatch> batches = {
         ExecBatchFromJSON(types, "[]"),
@@ -733,24 +727,12 @@ TEST(RowSegmenter, EmptyBatches) {
 }
 
 TEST(RowSegmenter, MultipleSegments) {
-  {
-    std::vector<TypeHolder> types = {int32()};
-    auto batch =
-        ExecBatchFromJSON(types, "[[1], [1], [2], [5], [3], [3], [5], [5], 
[4]]");
-    ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types));
-    TestSegments(segmenter, ExecSpan(batch),
-                 {{0, 2, false, true},
-                  {2, 1, false, false},
-                  {3, 1, false, false},
-                  {4, 2, false, false},
-                  {6, 2, false, false},
-                  {8, 1, true, false}});
-  }
-  {
-    std::vector<TypeHolder> types = {int32(), int32()};
-    auto batch = ExecBatchFromJSON(
-        types,
-        "[[1, 1], [1, 1], [2, 2], [5, 5], [3, 3], [3, 3], [5, 5], [5, 5], [4, 
4]]");
+  auto test_with_keys = [](int num_keys, const std::shared_ptr<Array>& key) {
+    SCOPED_TRACE("multiple segments " + ToChars(num_keys) + " " +
+                 key->type()->ToString());
+    std::vector<TypeHolder> types(num_keys, key->type());
+    std::vector<Datum> values(num_keys, key);
+    ExecBatch batch(std::move(values), key->length());
     ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types));
     TestSegments(segmenter, ExecSpan(batch),
                  {{0, 2, false, true},
@@ -759,11 +741,22 @@ TEST(RowSegmenter, MultipleSegments) {
                   {4, 2, false, false},
                   {6, 2, false, false},
                   {8, 1, true, false}});
+  };
+  for (int num_keys = 1; num_keys <= 2; ++num_keys) {
+    test_with_keys(num_keys, ArrayFromJSON(int32(), "[1, 1, 2, 5, 3, 3, 5, 5, 
4]"));
+    test_with_keys(
+        num_keys,
+        ArrayFromJSON(fixed_size_binary(2),
+                      R"(["aa", "aa", "bb", "ee", "cc", "cc", "ee", "ee", 
"dd"])"));
+    test_with_keys(num_keys, DictArrayFromJSON(dictionary(int8(), utf8()),
+                                               "[0, 0, 1, 4, 2, 2, 4, 4, 3]",
+                                               R"(["a", "b", "c", "d", 
"e"])"));
   }
 }
 
 TEST(RowSegmenter, MultipleSegmentsMultipleBatches) {
   {
+    SCOPED_TRACE("multiple segments multiple batches {int32}");
     std::vector<TypeHolder> types = {int32()};
     std::vector<ExecBatch> batches = {
         ExecBatchFromJSON(types, "[[1]]"), ExecBatchFromJSON(types, "[[1], 
[2]]"),
@@ -781,6 +774,7 @@ TEST(RowSegmenter, MultipleSegmentsMultipleBatches) {
     TestSegments(segmenter, ExecSpan(batches[4]), {{0, 1, true, false}});
   }
   {
+    SCOPED_TRACE("multiple segments multiple batches {int32, int32}");
     std::vector<TypeHolder> types = {int32(), int32()};
     std::vector<ExecBatch> batches = {
         ExecBatchFromJSON(types, "[[1, 1]]"),
@@ -804,64 +798,208 @@ TEST(RowSegmenter, MultipleSegmentsMultipleBatches) {
 namespace {
 
 void TestRowSegmenterConstantBatch(
-    std::function<ArgShape(size_t i)> shape_func,
+    const std::shared_ptr<DataType>& type,
+    std::function<ArgShape(int64_t key)> shape_func,
+    std::function<Result<std::shared_ptr<Scalar>>(int64_t key)> value_func,
     std::function<Result<std::unique_ptr<RowSegmenter>>(const 
std::vector<TypeHolder>&)>
         make_segmenter) {
-  constexpr size_t n = 3, repetitions = 3;
-  std::vector<TypeHolder> types = {int32(), int32(), int32()};
-  std::vector<ArgShape> shapes(n);
-  for (size_t i = 0; i < n; i++) shapes[i] = shape_func(i);
-  auto full_batch = ExecBatchFromJSON(types, shapes, "[[1, 1, 1], [1, 1, 1], 
[1, 1, 1]]");
-  auto test_by_size = [&](size_t size) -> Status {
-    SCOPED_TRACE("constant-batch with " + ToChars(size) + " key(s)");
-    std::vector<Datum> values(full_batch.values.begin(),
-                              full_batch.values.begin() + size);
-    ExecBatch batch(values, full_batch.length);
-    std::vector<TypeHolder> key_types(types.begin(), types.begin() + size);
+  constexpr int64_t n_keys = 3, n_rows = 3, repetitions = 3;
+  std::vector<TypeHolder> types(n_keys, type);
+  std::vector<Datum> full_values(n_keys);
+  for (int64_t i = 0; i < n_keys; i++) {
+    auto shape = shape_func(i);
+    ASSERT_OK_AND_ASSIGN(auto scalar, value_func(i));
+    if (shape == ArgShape::SCALAR) {
+      full_values[i] = std::move(scalar);
+    } else {
+      ASSERT_OK_AND_ASSIGN(full_values[i], MakeArrayFromScalar(*scalar, 
n_rows));
+    }
+  }
+  auto test_with_keys = [&](int64_t keys) -> Status {
+    SCOPED_TRACE("constant-batch with " + ToChars(keys) + " key(s)");
+    std::vector<Datum> values(full_values.begin(), full_values.begin() + keys);
+    ExecBatch batch(values, n_rows);
+    std::vector<TypeHolder> key_types(types.begin(), types.begin() + keys);
     ARROW_ASSIGN_OR_RAISE(auto segmenter, make_segmenter(key_types));
-    for (size_t i = 0; i < repetitions; i++) {
-      TestSegments(segmenter, ExecSpan(batch), {{0, 3, true, true}});
+    for (int64_t i = 0; i < repetitions; i++) {
+      TestSegments(segmenter, ExecSpan(batch), {{0, n_rows, true, true}});
       ARROW_RETURN_NOT_OK(segmenter->Reset());
     }
     return Status::OK();
   };
-  for (size_t i = 0; i <= 3; i++) {
-    ASSERT_OK(test_by_size(i));
+  for (int64_t i = 0; i <= n_keys; i++) {
+    ASSERT_OK(test_with_keys(i));
   }
 }
 
 }  // namespace
 
 TEST(RowSegmenter, ConstantArrayBatch) {
-  TestRowSegmenterConstantBatch([](size_t i) { return ArgShape::ARRAY; },
-                                MakeRowSegmenter);
+  TestRowSegmenterConstantBatch(
+      int32(), [](int64_t key) { return ArgShape::ARRAY; },
+      [](int64_t key) { return MakeScalar(1); }, MakeRowSegmenter);
 }
 
 TEST(RowSegmenter, ConstantScalarBatch) {
-  TestRowSegmenterConstantBatch([](size_t i) { return ArgShape::SCALAR; },
-                                MakeRowSegmenter);
+  TestRowSegmenterConstantBatch(
+      int32(), [](int64_t key) { return ArgShape::SCALAR; },
+      [](int64_t key) { return MakeScalar(1); }, MakeRowSegmenter);
 }
 
 TEST(RowSegmenter, ConstantMixedBatch) {
   TestRowSegmenterConstantBatch(
-      [](size_t i) { return i % 2 == 0 ? ArgShape::SCALAR : ArgShape::ARRAY; },
-      MakeRowSegmenter);
+      int32(),
+      [](int64_t key) { return key % 2 == 0 ? ArgShape::SCALAR : 
ArgShape::ARRAY; },
+      [](int64_t key) { return MakeScalar(1); }, MakeRowSegmenter);
 }
 
 TEST(RowSegmenter, ConstantArrayBatchWithAnyKeysSegmenter) {
-  TestRowSegmenterConstantBatch([](size_t i) { return ArgShape::ARRAY; },
-                                MakeGenericSegmenter);
+  TestRowSegmenterConstantBatch(
+      int32(), [](int64_t key) { return ArgShape::ARRAY; },
+      [](int64_t key) { return MakeScalar(1); }, MakeGenericSegmenter);
 }
 
 TEST(RowSegmenter, ConstantScalarBatchWithAnyKeysSegmenter) {
-  TestRowSegmenterConstantBatch([](size_t i) { return ArgShape::SCALAR; },
-                                MakeGenericSegmenter);
+  TestRowSegmenterConstantBatch(
+      int32(), [](int64_t key) { return ArgShape::SCALAR; },
+      [](int64_t key) { return MakeScalar(1); }, MakeGenericSegmenter);
 }
 
 TEST(RowSegmenter, ConstantMixedBatchWithAnyKeysSegmenter) {
   TestRowSegmenterConstantBatch(
-      [](size_t i) { return i % 2 == 0 ? ArgShape::SCALAR : ArgShape::ARRAY; },
-      MakeGenericSegmenter);
+      int32(),
+      [](int64_t key) { return key % 2 == 0 ? ArgShape::SCALAR : 
ArgShape::ARRAY; },
+      [](int64_t key) { return MakeScalar(1); }, MakeGenericSegmenter);
+}
+
+TEST(RowSegmenter, ConstantFixedSizeBinaryArrayBatch) {
+  constexpr int fsb = 8;
+  auto type = fixed_size_binary(fsb);
+  ASSERT_OK_AND_ASSIGN(auto value, MakeScalar(type, std::string(fsb, 'X')));
+  TestRowSegmenterConstantBatch(
+      type, [](int64_t key) { return ArgShape::ARRAY; },
+      [&](int64_t key) { return value; }, MakeRowSegmenter);
+}
+
+TEST(RowSegmenter, ConstantFixedSizeBinaryScalarBatch) {
+  constexpr int fsb = 8;
+  auto type = fixed_size_binary(fsb);
+  ASSERT_OK_AND_ASSIGN(auto value, MakeScalar(type, std::string(fsb, 'X')));
+  TestRowSegmenterConstantBatch(
+      fixed_size_binary(8), [](int64_t key) { return ArgShape::SCALAR; },
+      [&](int64_t key) { return value; }, MakeRowSegmenter);
+}
+
+TEST(RowSegmenter, ConstantFixedSizeBinaryMixedBatch) {
+  constexpr int fsb = 8;
+  auto type = fixed_size_binary(fsb);
+  ASSERT_OK_AND_ASSIGN(auto value, MakeScalar(type, std::string(fsb, 'X')));
+  TestRowSegmenterConstantBatch(
+      fixed_size_binary(8),
+      [](int64_t key) { return key % 2 == 0 ? ArgShape::SCALAR : 
ArgShape::ARRAY; },
+      [&](int64_t key) { return value; }, MakeRowSegmenter);
+}
+
+TEST(RowSegmenter, ConstantFixedSizeBinaryArrayBatchWithAnyKeysSegmenter) {
+  constexpr int fsb = 8;
+  auto type = fixed_size_binary(fsb);
+  ASSERT_OK_AND_ASSIGN(auto value, MakeScalar(type, std::string(fsb, 'X')));
+  TestRowSegmenterConstantBatch(
+      type, [](int64_t key) { return ArgShape::ARRAY; },
+      [&](int64_t key) { return value; }, MakeGenericSegmenter);
+}
+
+TEST(RowSegmenter, ConstantFixedSizeBinaryScalarBatchWithAnyKeysSegmenter) {
+  constexpr int fsb = 8;
+  auto type = fixed_size_binary(fsb);
+  ASSERT_OK_AND_ASSIGN(auto value, MakeScalar(type, std::string(fsb, 'X')));
+  TestRowSegmenterConstantBatch(
+      fixed_size_binary(8), [](int64_t key) { return ArgShape::SCALAR; },
+      [&](int64_t key) { return value; }, MakeGenericSegmenter);
+}
+
+TEST(RowSegmenter, ConstantFixedSizeBinaryMixedBatchWithAnyKeysSegmenter) {
+  constexpr int fsb = 8;
+  auto type = fixed_size_binary(fsb);
+  ASSERT_OK_AND_ASSIGN(auto value, MakeScalar(type, std::string(fsb, 'X')));
+  TestRowSegmenterConstantBatch(
+      fixed_size_binary(8),
+      [](int64_t key) { return key % 2 == 0 ? ArgShape::SCALAR : 
ArgShape::ARRAY; },
+      [&](int64_t key) { return value; }, MakeGenericSegmenter);
+}
+
+TEST(RowSegmenter, ConstantDictionaryArrayBatch) {
+  auto index_type = int32();
+  auto value_type = utf8();
+  auto dict_type = dictionary(index_type, value_type);
+  auto dict = ArrayFromJSON(value_type, R"(["alpha", null, "gamma"])");
+  ASSERT_OK_AND_ASSIGN(auto index_value, MakeScalar(index_type, 0));
+  auto dict_value = DictionaryScalar::Make(std::move(index_value), dict);
+  TestRowSegmenterConstantBatch(
+      dict_type, [](int64_t key) { return ArgShape::ARRAY; },
+      [&](int64_t key) { return dict_value; }, MakeRowSegmenter);
+}
+
+TEST(RowSegmenter, ConstantDictionaryScalarBatch) {
+  auto index_type = int32();
+  auto value_type = utf8();
+  auto dict_type = dictionary(index_type, value_type);
+  auto dict = ArrayFromJSON(value_type, R"(["alpha", null, "gamma"])");
+  ASSERT_OK_AND_ASSIGN(auto index_value, MakeScalar(index_type, 0));
+  auto dict_value = DictionaryScalar::Make(std::move(index_value), dict);
+  TestRowSegmenterConstantBatch(
+      dict_type, [](int64_t key) { return ArgShape::SCALAR; },
+      [&](int64_t key) { return dict_value; }, MakeRowSegmenter);
+}
+
+TEST(RowSegmenter, ConstantDictionaryMixedBatch) {
+  auto index_type = int32();
+  auto value_type = utf8();
+  auto dict_type = dictionary(index_type, value_type);
+  auto dict = ArrayFromJSON(value_type, R"(["alpha", null, "gamma"])");
+  ASSERT_OK_AND_ASSIGN(auto index_value, MakeScalar(index_type, 0));
+  auto dict_value = DictionaryScalar::Make(std::move(index_value), dict);
+  TestRowSegmenterConstantBatch(
+      dict_type,
+      [](int64_t key) { return key % 2 == 0 ? ArgShape::SCALAR : 
ArgShape::ARRAY; },
+      [&](int64_t key) { return dict_value; }, MakeRowSegmenter);
+}
+
+TEST(RowSegmenter, ConstantDictionaryArrayBatchWithAnyKeysSegmenter) {
+  auto index_type = int32();
+  auto value_type = utf8();
+  auto dict_type = dictionary(index_type, value_type);
+  auto dict = ArrayFromJSON(value_type, R"(["alpha", null, "gamma"])");
+  ASSERT_OK_AND_ASSIGN(auto index_value, MakeScalar(index_type, 0));
+  auto dict_value = DictionaryScalar::Make(std::move(index_value), dict);
+  TestRowSegmenterConstantBatch(
+      dict_type, [](int64_t key) { return ArgShape::ARRAY; },
+      [&](int64_t key) { return dict_value; }, MakeGenericSegmenter);
+}
+
+TEST(RowSegmenter, ConstantDictionaryScalarBatchWithAnyKeysSegmenter) {
+  auto index_type = int32();
+  auto value_type = utf8();
+  auto dict_type = dictionary(index_type, value_type);
+  auto dict = ArrayFromJSON(value_type, R"(["alpha", null, "gamma"])");
+  ASSERT_OK_AND_ASSIGN(auto index_value, MakeScalar(index_type, 0));
+  auto dict_value = DictionaryScalar::Make(std::move(index_value), dict);
+  TestRowSegmenterConstantBatch(
+      dict_type, [](int64_t key) { return ArgShape::SCALAR; },
+      [&](int64_t key) { return dict_value; }, MakeGenericSegmenter);
+}
+
+TEST(RowSegmenter, ConstantDictionaryMixedBatchWithAnyKeysSegmenter) {
+  auto index_type = int32();
+  auto value_type = utf8();
+  auto dict_type = dictionary(index_type, value_type);
+  auto dict = ArrayFromJSON(value_type, R"(["alpha", null, "gamma"])");
+  ASSERT_OK_AND_ASSIGN(auto index_value, MakeScalar(index_type, 0));
+  auto dict_value = DictionaryScalar::Make(std::move(index_value), dict);
+  TestRowSegmenterConstantBatch(
+      dict_type,
+      [](int64_t key) { return key % 2 == 0 ? ArgShape::SCALAR : 
ArgShape::ARRAY; },
+      [&](int64_t key) { return dict_value; }, MakeGenericSegmenter);
 }
 
 TEST(RowSegmenter, RowConstantBatch) {

Reply via email to