andishgar commented on code in PR #46031: URL: https://github.com/apache/arrow/pull/46031#discussion_r2059591353
########## cpp/src/arrow/record_batch_test.cc: ########## @@ -1423,34 +1467,148 @@ TEST_F(TestRecordBatch, MakeStatisticsArrayMaxApproximate) { AssertArraysEqual(*expected_statistics_array, *statistics_array, true); } -TEST_F(TestRecordBatch, MakeStatisticsArrayString) { - auto schema = - ::arrow::schema({field("no-statistics", boolean()), field("string", utf8())}); - auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); - auto string_array_data = ArrayFromJSON(utf8(), "[\"a\", null, \"c\"]")->data()->Copy(); - string_array_data->statistics = std::make_shared<ArrayStatistics>(); - string_array_data->statistics->is_max_exact = true; - string_array_data->statistics->max = "c"; - auto string_array = MakeArray(std::move(string_array_data)); - auto batch = RecordBatch::Make(schema, string_array->length(), - {no_statistics_array, string_array}); +template <typename DataType> +class TestRecordBatchMakeStatisticsArrayBinary : public ::testing::Test { + public: + void TestMaxApproximation() { + auto max = ArrayStatistics::ValueType{"c"}; + auto schema = + ::arrow::schema({field("no-statistics", boolean()), field("string", type())}); + auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); + auto string_array = GenerateString(type()); + string_array->data()->statistics = std::make_shared<ArrayStatistics>(); + string_array->data()->statistics->max = max; + + auto batch = RecordBatch::Make(schema, string_array->length(), + {no_statistics_array, string_array}); + + ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray()); + + ASSERT_OK_AND_ASSIGN( + auto expected_statistics_array, + MakeStatisticsArray("[null, 1]", + {{ + ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, + }, + { + ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE, + }}, + {{ + ArrayStatistics::ValueType{int64_t{3}}, + }, + { + max, + }}, + {null(), type()})); + AssertArraysEqual(*expected_statistics_array, *statistics_array, true); + } + + std::shared_ptr<::arrow::DataType> type() { + if constexpr (std::is_same_v<DataType, FixedSizeBinaryType>) { + return fixed_size_binary(1); + } else { + return TypeTraits<DataType>::type_singleton(); + } + } +}; +TYPED_TEST_SUITE(TestRecordBatchMakeStatisticsArrayBinary, + AllBinaryOrBinrayViewLikeArrowTypes); + +TYPED_TEST(TestRecordBatchMakeStatisticsArrayBinary, MaxApproximation) { + this->TestMaxApproximation(); +} + +// Validates that the union array creates two distinct child arrays for two +// FixedSizeBinaryArrays with unequal byte widths. +TEST_F(TestRecordBatch, MakeStatisticsArrayDifferentSizeFixedSizeBinary) { + auto fixed_size_type1 = fixed_size_binary(1); + auto fixed_size_type2 = fixed_size_binary(2); + + auto fixed_size_array1 = GenerateString(fixed_size_type1); + fixed_size_array1->data()->statistics = std::make_shared<ArrayStatistics>(); + fixed_size_array1->data()->statistics->max = + std::string(fixed_size_type1->byte_width(), 'c'); + + auto fixed_size_array2 = GenerateString(fixed_size_type2); + fixed_size_array2->data()->statistics = std::make_shared<ArrayStatistics>(); + fixed_size_array2->data()->statistics->max = + std::string(fixed_size_type2->byte_width(), 'c'); + + auto schema = ::arrow::schema( + {field("fixed_size1", fixed_size_type1), field("fixed_size2", fixed_size_type2)}); + auto batch = RecordBatch::Make(schema, fixed_size_array1->length(), + {fixed_size_array1, fixed_size_array2}); ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray()); + ASSERT_OK_AND_ASSIGN( + auto expected_statistics_array, + MakeStatisticsArray("[null, 0, 1]", + {{ + ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, + }, + { + ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE, + }, + { + ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE, + }}, + {{ + ArrayStatistics::ValueType{int64_t{3}}, + }, + { + ArrayStatistics::ValueType{ + std::string(fixed_size_type1->byte_width(), 'c')}, + }, + { + ArrayStatistics::ValueType{ + std::string(fixed_size_type2->byte_width(), 'c')}, + }}, + {null(), fixed_size_type1, fixed_size_type2})); + + AssertArraysEqual(*expected_statistics_array, *statistics_array, true); +} + +// Validates that the union array creates a single child array for two +// FixedSizeBinaryArrays with equal byte widths. +TEST_F(TestRecordBatch, MakeStatisticsArraySameSizeFixedSizeBinary) { + auto fixed_size_type = fixed_size_binary(2); + auto max = ArrayStatistics::ValueType{std::string(fixed_size_type->byte_width(), 'c')}; Review Comment: Thanks for the detailed review! I learned a ton. Hopefully, my next pull requests will have cleaner code. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org