This is an automated email from the ASF dual-hosted git repository. raulcd pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push: new baf97fd12a GH-46157: [C++] Move test utility RunEndEncodeTableColumns that uses REE to test_util_internal on acero instead of common gtest_util (#46161) baf97fd12a is described below commit baf97fd12ac0254b3aef8e8329a3373050bcb8f1 Author: Raúl Cumplido <raulcumpl...@gmail.com> AuthorDate: Wed Apr 30 09:38:59 2025 +0200 GH-46157: [C++] Move test utility RunEndEncodeTableColumns that uses REE to test_util_internal on acero instead of common gtest_util (#46161) ### Rationale for this change When compiling without `ARROW_COMPUTE` we should not require the run end encode kernel to be available. The current test utility `RunEndEncodeTableColumns` should not be on gtest_util as this is going to be compiled whether `ARROW_COMPUTE` flag is enabled or not. ### What changes are included in this PR? Move the `RunEndEncodeTableColumns` utility that requires REE to be available to `arrow/acero/test_util_internal` as is only used there, move corresponding test to its own test module and remove the requirement for it to be exported with `ARROW_TESTING_EXPORT` ### Are these changes tested? Yes, on CI and tested locally that `ARROW_COMPUTE=ON` and `ARROW_COMPUTE=OFF` behave as expected. ### Are there any user-facing changes? If someone was using the `RunEndEncodeTableColumns` from `libarrow_testing` this is not being exported anymore. * GitHub Issue: #46157 Lead-authored-by: Raúl Cumplido <raulcumpl...@gmail.com> Co-authored-by: Antoine Pitrou <pit...@free.fr> Signed-off-by: Raúl Cumplido <raulcumpl...@gmail.com> --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/acero/CMakeLists.txt | 2 ++ cpp/src/arrow/acero/test_util_internal.cc | 26 +++++++++++++++++ cpp/src/arrow/acero/test_util_internal.h | 4 +++ cpp/src/arrow/acero/test_util_internal_test.cc | 39 ++++++++++++++++++++++++++ cpp/src/arrow/testing/gtest_util.cc | 26 ----------------- cpp/src/arrow/testing/gtest_util.h | 4 --- cpp/src/arrow/testing/gtest_util_test.cc | 13 --------- 8 files changed, 72 insertions(+), 43 deletions(-) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index a4f67408bf..0168e93626 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -785,6 +785,7 @@ if(ARROW_COMPUTE) append_runtime_avx2_src(ARROW_COMPUTE_SRCS compute/row/compare_internal_avx2.cc) append_runtime_avx2_src(ARROW_COMPUTE_SRCS compute/row/encode_internal_avx2.cc) append_runtime_avx2_bmi2_src(ARROW_COMPUTE_SRCS compute/util_avx2.cc) + endif() arrow_add_object_library(ARROW_COMPUTE ${ARROW_COMPUTE_SRCS}) diff --git a/cpp/src/arrow/acero/CMakeLists.txt b/cpp/src/arrow/acero/CMakeLists.txt index 8034ea2570..c3b08af84e 100644 --- a/cpp/src/arrow/acero/CMakeLists.txt +++ b/cpp/src/arrow/acero/CMakeLists.txt @@ -181,6 +181,8 @@ add_arrow_acero_test(aggregate_node_test SOURCES aggregate_node_test.cc) add_arrow_acero_test(util_test SOURCES util_test.cc task_util_test.cc) add_arrow_acero_test(hash_aggregate_test SOURCES hash_aggregate_test.cc) +add_arrow_acero_test(test_util_internal_test SOURCES test_util_internal_test.cc) + if(ARROW_BUILD_BENCHMARKS) function(add_arrow_acero_benchmark REL_BENCHMARK_NAME) set(options) diff --git a/cpp/src/arrow/acero/test_util_internal.cc b/cpp/src/arrow/acero/test_util_internal.cc index 2748d4107e..3e27917581 100644 --- a/cpp/src/arrow/acero/test_util_internal.cc +++ b/cpp/src/arrow/acero/test_util_internal.cc @@ -613,5 +613,31 @@ Result<std::shared_ptr<Table>> MakeRandomTimeSeriesTable( return Table::Make(schema, columns, num_rows); } +Result<std::shared_ptr<Table>> RunEndEncodeTableColumns( + const Table& table, const std::vector<int>& column_indices) { + const int num_columns = table.num_columns(); + std::vector<std::shared_ptr<ChunkedArray>> encoded_columns; + encoded_columns.reserve(num_columns); + std::vector<std::shared_ptr<Field>> encoded_fields; + encoded_fields.reserve(num_columns); + for (int i = 0; i < num_columns; i++) { + const auto& field = table.schema()->field(i); + if (std::find(column_indices.begin(), column_indices.end(), i) != + column_indices.end()) { + ARROW_ASSIGN_OR_RAISE(auto run_end_encoded, + arrow::compute::RunEndEncode(table.column(i))); + ARROW_DCHECK_EQ(run_end_encoded.kind(), Datum::CHUNKED_ARRAY); + encoded_columns.push_back(run_end_encoded.chunked_array()); + auto encoded_type = arrow::run_end_encoded(arrow::int32(), field->type()); + encoded_fields.push_back(field->WithType(encoded_type)); + } else { + encoded_columns.push_back(table.column(i)); + encoded_fields.push_back(field); + } + } + auto updated_schema = arrow::schema(std::move(encoded_fields)); + return Table::Make(std::move(updated_schema), std::move(encoded_columns)); +} + } // namespace acero } // namespace arrow diff --git a/cpp/src/arrow/acero/test_util_internal.h b/cpp/src/arrow/acero/test_util_internal.h index 2367524a56..5fab9bfcd4 100644 --- a/cpp/src/arrow/acero/test_util_internal.h +++ b/cpp/src/arrow/acero/test_util_internal.h @@ -30,6 +30,7 @@ #include "arrow/acero/exec_plan.h" #include "arrow/compute/exec.h" #include "arrow/compute/kernel.h" +#include "arrow/table.h" #include "arrow/testing/visibility.h" #include "arrow/util/async_generator.h" #include "arrow/util/pcg_random.h" @@ -195,4 +196,7 @@ struct TableGenerationProperties { Result<std::shared_ptr<Table>> MakeRandomTimeSeriesTable( const TableGenerationProperties& properties); +Result<std::shared_ptr<Table>> RunEndEncodeTableColumns( + const Table& table, const std::vector<int>& column_indices); + } // namespace arrow::acero diff --git a/cpp/src/arrow/acero/test_util_internal_test.cc b/cpp/src/arrow/acero/test_util_internal_test.cc new file mode 100644 index 0000000000..7a42b9d172 --- /dev/null +++ b/cpp/src/arrow/acero/test_util_internal_test.cc @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <gtest/gtest.h> + +#include "arrow/acero/test_util_internal.h" +#include "arrow/testing/gtest_util.h" + +namespace arrow::acero { + +TEST(RunEndEncodeTableColumnsTest, SchemaTypeIsModified) { + std::shared_ptr<Table> table = + arrow::TableFromJSON(arrow::schema({arrow::field("col", arrow::utf8())}), {R"([ + {"col": "a"}, + {"col": "b"}, + {"col": "c"}, + {"col": "d"} + ])"}); + ASSERT_OK_AND_ASSIGN(std::shared_ptr<Table> ree_table, + RunEndEncodeTableColumns(*table, {0})); + ASSERT_OK(ree_table->ValidateFull()); + ASSERT_TRUE(ree_table->schema()->field(0)->type()->Equals( + arrow::run_end_encoded(arrow::int32(), arrow::utf8()))); +} +} // namespace arrow::acero diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index 9eeca32e72..061956ab1a 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -47,7 +47,6 @@ #include "arrow/array.h" #include "arrow/buffer.h" -#include "arrow/compute/api_vector.h" #include "arrow/datum.h" #include "arrow/extension/json.h" #include "arrow/io/memory.h" @@ -473,31 +472,6 @@ std::shared_ptr<Tensor> TensorFromJSON(const std::shared_ptr<DataType>& type, return *Tensor::Make(type, array->data()->buffers[1], shape, strides, dim_names); } -Result<std::shared_ptr<Table>> RunEndEncodeTableColumns( - const Table& table, const std::vector<int>& column_indices) { - const int num_columns = table.num_columns(); - std::vector<std::shared_ptr<ChunkedArray>> encoded_columns; - encoded_columns.reserve(num_columns); - std::vector<std::shared_ptr<Field>> encoded_fields; - encoded_fields.reserve(num_columns); - for (int i = 0; i < num_columns; i++) { - const auto& field = table.schema()->field(i); - if (std::find(column_indices.begin(), column_indices.end(), i) != - column_indices.end()) { - ARROW_ASSIGN_OR_RAISE(auto run_end_encoded, compute::RunEndEncode(table.column(i))); - DCHECK_EQ(run_end_encoded.kind(), Datum::CHUNKED_ARRAY); - encoded_columns.push_back(run_end_encoded.chunked_array()); - auto encoded_type = arrow::run_end_encoded(arrow::int32(), field->type()); - encoded_fields.push_back(field->WithType(encoded_type)); - } else { - encoded_columns.push_back(table.column(i)); - encoded_fields.push_back(field); - } - } - auto updated_schema = arrow::schema(std::move(encoded_fields)); - return Table::Make(std::move(updated_schema), std::move(encoded_columns)); -} - Result<std::optional<std::string>> PrintArrayDiff(const ChunkedArray& expected, const ChunkedArray& actual) { if (actual.Equals(expected)) { diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index 2ff4b72af4..29bb52c216 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -373,10 +373,6 @@ std::shared_ptr<Tensor> TensorFromJSON(const std::shared_ptr<DataType>& type, const std::vector<int64_t>& strides = {}, const std::vector<std::string>& dim_names = {}); -ARROW_TESTING_EXPORT -Result<std::shared_ptr<Table>> RunEndEncodeTableColumns( - const Table& table, const std::vector<int>& column_indices); - // Given an array, return a new identical array except for one validity bit // set to a new value. // This is useful to force the underlying "value" of null entries to otherwise diff --git a/cpp/src/arrow/testing/gtest_util_test.cc b/cpp/src/arrow/testing/gtest_util_test.cc index 8dc496fa50..4ad5159a6b 100644 --- a/cpp/src/arrow/testing/gtest_util_test.cc +++ b/cpp/src/arrow/testing/gtest_util_test.cc @@ -292,17 +292,4 @@ TEST(AssertTestWithinUlp, Basics) { EXPECT_FATAL_FAILURE(AssertWithinUlp(123.456f, 123.456085f, 10), "not within 10 ulps"); } -TEST(RunEndEncodeGtestUtilTest, SchemaTypeIsModified) { - std::shared_ptr<Table> table = - arrow::TableFromJSON(arrow::schema({arrow::field("col", arrow::utf8())}), {R"([ - {"col": "a"}, - {"col": "b"}, - {"col": "c"}, - {"col": "d"} - ])"}); - ASSERT_OK_AND_ASSIGN(std::shared_ptr<Table> ree_table, - RunEndEncodeTableColumns(*table, {0})); - ASSERT_TRUE(ree_table->schema()->field(0)->type()->Equals( - arrow::run_end_encoded(arrow::int32(), arrow::utf8()))); -} } // namespace arrow