bkietz commented on a change in pull request #9638: URL: https://github.com/apache/arrow/pull/9638#discussion_r590395994
########## File path: cpp/src/arrow/dataset/expression_benchmark.cc ########## @@ -0,0 +1,105 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <iostream> + +#include "benchmark/benchmark.h" + +#include "arrow/compute/cast.h" +#include "arrow/dataset/expression.h" +#include "arrow/dataset/partition.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type.h" +#include "arrow/util/range.h" + +namespace arrow { +namespace dataset { + +static std::shared_ptr<Partitioning> MakePartitioning() { + auto schema = arrow::schema({ + arrow::field("a", arrow::dictionary(arrow::int32(), arrow::int64())), + arrow::field("b", arrow::dictionary(arrow::int32(), arrow::int64())), Review comment: nit: we don't need to be explicit about the arrow namespace here ```suggestion auto schema = schema({ field("a", dictionary(int32(), int64())), feld("b", dictionary(int32(), int64())), ``` ########## File path: cpp/src/arrow/dataset/expression_benchmark.cc ########## @@ -0,0 +1,105 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <iostream> + +#include "benchmark/benchmark.h" + +#include "arrow/compute/cast.h" +#include "arrow/dataset/expression.h" +#include "arrow/dataset/partition.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type.h" +#include "arrow/util/range.h" + +namespace arrow { +namespace dataset { + +static std::shared_ptr<Partitioning> MakePartitioning() { + auto schema = arrow::schema({ + arrow::field("a", arrow::dictionary(arrow::int32(), arrow::int64())), + arrow::field("b", arrow::dictionary(arrow::int32(), arrow::int64())), + }); + std::shared_ptr<Array> a_dict, b_dict; + auto values = internal::Iota<int64_t>(100); + ArrayFromVector<Int64Type>(arrow::int64(), values, &a_dict); + ArrayFromVector<Int64Type>(arrow::int64(), values, &b_dict); + ArrayVector dictionaries = {a_dict, b_dict}; + return std::make_shared<HivePartitioning>(schema, dictionaries); +} + +static Expression GetPartitionExpression(const std::string& path) { + static std::shared_ptr<Partitioning> partitioning = MakePartitioning(); + ASSIGN_OR_ABORT(auto expr, partitioning->Parse(path)); + return expr; +} + +// A benchmark of SimplifyWithGuarantee using expressions arising from partitioning. +static void SimplifyQueryWithGuarantee(benchmark::State& state, Expression lhs, + Expression guarantee) { + auto schema = arrow::schema( + {arrow::field("a", arrow::int64()), arrow::field("b", arrow::int64())}); + ASSIGN_OR_ABORT(lhs, lhs.Bind(*schema)); + + // std::cerr << "LHS: " << lhs.ToString() << std::endl; + // std::cerr << "Guarantee: " << guarantee.ToString() << std::endl; + + for (auto _ : state) { + ABORT_NOT_OK(SimplifyWithGuarantee(lhs, guarantee)); + } +} + +auto cast_options = compute::CastOptions::Safe(arrow::int64()); Review comment: For readability: ```suggestion auto to_int64 = compute::CastOptions::Safe(arrow::int64()); ``` alternatively, we could promote the [cast convenience factory](https://github.com/apache/arrow/blob/af8e312121fb9c2c4825d540c7e735329ed7f30c/cpp/src/arrow/dataset/expression_test.cc#L46-L49) to expression.h ########## File path: cpp/src/arrow/dataset/expression_benchmark.cc ########## @@ -0,0 +1,105 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <iostream> + +#include "benchmark/benchmark.h" + +#include "arrow/compute/cast.h" +#include "arrow/dataset/expression.h" +#include "arrow/dataset/partition.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type.h" +#include "arrow/util/range.h" + +namespace arrow { +namespace dataset { + +static std::shared_ptr<Partitioning> MakePartitioning() { + auto schema = arrow::schema({ + arrow::field("a", arrow::dictionary(arrow::int32(), arrow::int64())), + arrow::field("b", arrow::dictionary(arrow::int32(), arrow::int64())), + }); + std::shared_ptr<Array> a_dict, b_dict; + auto values = internal::Iota<int64_t>(100); + ArrayFromVector<Int64Type>(arrow::int64(), values, &a_dict); + ArrayFromVector<Int64Type>(arrow::int64(), values, &b_dict); + ArrayVector dictionaries = {a_dict, b_dict}; + return std::make_shared<HivePartitioning>(schema, dictionaries); +} + +static Expression GetPartitionExpression(const std::string& path) { + static std::shared_ptr<Partitioning> partitioning = MakePartitioning(); + ASSIGN_OR_ABORT(auto expr, partitioning->Parse(path)); + return expr; +} + +// A benchmark of SimplifyWithGuarantee using expressions arising from partitioning. +static void SimplifyQueryWithGuarantee(benchmark::State& state, Expression lhs, + Expression guarantee) { + auto schema = arrow::schema( + {arrow::field("a", arrow::int64()), arrow::field("b", arrow::int64())}); + ASSIGN_OR_ABORT(lhs, lhs.Bind(*schema)); + + // std::cerr << "LHS: " << lhs.ToString() << std::endl; + // std::cerr << "Guarantee: " << guarantee.ToString() << std::endl; + + for (auto _ : state) { + ABORT_NOT_OK(SimplifyWithGuarantee(lhs, guarantee)); + } +} + +auto cast_options = compute::CastOptions::Safe(arrow::int64()); +// A fully simplified filter. +auto lhs_simple_negative = and_(equal(field_ref(FieldRef("a")), literal(99)), + equal(field_ref(FieldRef("b")), literal(98))); Review comment: Nit: it's not necessary to explicitly construct a FieldRef here ```suggestion auto lhs_simple_negative = and_(equal(field_ref("a"), literal(99)), equal(field_ref("b"), literal(98))); ``` ########## File path: cpp/src/arrow/dataset/expression_benchmark.cc ########## @@ -0,0 +1,105 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <iostream> + +#include "benchmark/benchmark.h" + +#include "arrow/compute/cast.h" +#include "arrow/dataset/expression.h" +#include "arrow/dataset/partition.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type.h" +#include "arrow/util/range.h" + +namespace arrow { +namespace dataset { + +static std::shared_ptr<Partitioning> MakePartitioning() { + auto schema = arrow::schema({ + arrow::field("a", arrow::dictionary(arrow::int32(), arrow::int64())), + arrow::field("b", arrow::dictionary(arrow::int32(), arrow::int64())), + }); + std::shared_ptr<Array> a_dict, b_dict; + auto values = internal::Iota<int64_t>(100); + ArrayFromVector<Int64Type>(arrow::int64(), values, &a_dict); + ArrayFromVector<Int64Type>(arrow::int64(), values, &b_dict); + ArrayVector dictionaries = {a_dict, b_dict}; + return std::make_shared<HivePartitioning>(schema, dictionaries); +} + +static Expression GetPartitionExpression(const std::string& path) { + static std::shared_ptr<Partitioning> partitioning = MakePartitioning(); + ASSIGN_OR_ABORT(auto expr, partitioning->Parse(path)); + return expr; +} Review comment: To maintain parity with the python case which motivated this benchmark (and to keep the boilerplate of constructing a `Partitioning` minimal), let's use a `PartitioningFactory` here: ```suggestion static Expression GetPartitionExpression(const std::string& path, bool infer_dictionary) { auto options = HivePartitioningFactoryOptions(); options.infer_dictionary = infer_dictionary; auto factory = HivePartitioning::MakeFactory(options); ASSIGN_OR_ABORT(auto schema, factory->Inspect({path})); ASSIGN_OR_ABORT(auto partitioning, factory->Finish(schema)); ASSIGN_OR_ABORT(auto expr, partitioning->Parse(path)); return expr; } ``` This way the dict and non-dict partition expressions are derived equivalently ########## File path: cpp/src/arrow/dataset/expression_benchmark.cc ########## @@ -0,0 +1,105 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <iostream> + +#include "benchmark/benchmark.h" + +#include "arrow/compute/cast.h" +#include "arrow/dataset/expression.h" +#include "arrow/dataset/partition.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type.h" +#include "arrow/util/range.h" + +namespace arrow { +namespace dataset { + +static std::shared_ptr<Partitioning> MakePartitioning() { + auto schema = arrow::schema({ + arrow::field("a", arrow::dictionary(arrow::int32(), arrow::int64())), + arrow::field("b", arrow::dictionary(arrow::int32(), arrow::int64())), + }); + std::shared_ptr<Array> a_dict, b_dict; + auto values = internal::Iota<int64_t>(100); + ArrayFromVector<Int64Type>(arrow::int64(), values, &a_dict); + ArrayFromVector<Int64Type>(arrow::int64(), values, &b_dict); + ArrayVector dictionaries = {a_dict, b_dict}; + return std::make_shared<HivePartitioning>(schema, dictionaries); +} + +static Expression GetPartitionExpression(const std::string& path) { + static std::shared_ptr<Partitioning> partitioning = MakePartitioning(); + ASSIGN_OR_ABORT(auto expr, partitioning->Parse(path)); + return expr; +} + +// A benchmark of SimplifyWithGuarantee using expressions arising from partitioning. +static void SimplifyQueryWithGuarantee(benchmark::State& state, Expression lhs, + Expression guarantee) { + auto schema = arrow::schema( + {arrow::field("a", arrow::int64()), arrow::field("b", arrow::int64())}); + ASSIGN_OR_ABORT(lhs, lhs.Bind(*schema)); Review comment: ```suggestion auto dataset_schema = schema( {field("a", int64()), field("b", int64())}); ASSIGN_OR_ABORT(filter, filter.Bind(*dataset_schema)); ``` ########## File path: cpp/src/arrow/dataset/expression_benchmark.cc ########## @@ -0,0 +1,105 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <iostream> + +#include "benchmark/benchmark.h" + +#include "arrow/compute/cast.h" +#include "arrow/dataset/expression.h" +#include "arrow/dataset/partition.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type.h" +#include "arrow/util/range.h" + +namespace arrow { +namespace dataset { + +static std::shared_ptr<Partitioning> MakePartitioning() { + auto schema = arrow::schema({ + arrow::field("a", arrow::dictionary(arrow::int32(), arrow::int64())), + arrow::field("b", arrow::dictionary(arrow::int32(), arrow::int64())), + }); + std::shared_ptr<Array> a_dict, b_dict; + auto values = internal::Iota<int64_t>(100); + ArrayFromVector<Int64Type>(arrow::int64(), values, &a_dict); + ArrayFromVector<Int64Type>(arrow::int64(), values, &b_dict); + ArrayVector dictionaries = {a_dict, b_dict}; + return std::make_shared<HivePartitioning>(schema, dictionaries); +} + +static Expression GetPartitionExpression(const std::string& path) { + static std::shared_ptr<Partitioning> partitioning = MakePartitioning(); + ASSIGN_OR_ABORT(auto expr, partitioning->Parse(path)); + return expr; +} + +// A benchmark of SimplifyWithGuarantee using expressions arising from partitioning. +static void SimplifyQueryWithGuarantee(benchmark::State& state, Expression lhs, Review comment: ```suggestion static void SimplifyFilterWithGuarantee(benchmark::State& state, Expression filter, ``` ########## File path: cpp/src/arrow/dataset/expression_benchmark.cc ########## @@ -0,0 +1,105 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <iostream> + +#include "benchmark/benchmark.h" + +#include "arrow/compute/cast.h" +#include "arrow/dataset/expression.h" +#include "arrow/dataset/partition.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type.h" +#include "arrow/util/range.h" + +namespace arrow { +namespace dataset { + +static std::shared_ptr<Partitioning> MakePartitioning() { + auto schema = arrow::schema({ + arrow::field("a", arrow::dictionary(arrow::int32(), arrow::int64())), + arrow::field("b", arrow::dictionary(arrow::int32(), arrow::int64())), + }); + std::shared_ptr<Array> a_dict, b_dict; + auto values = internal::Iota<int64_t>(100); + ArrayFromVector<Int64Type>(arrow::int64(), values, &a_dict); + ArrayFromVector<Int64Type>(arrow::int64(), values, &b_dict); + ArrayVector dictionaries = {a_dict, b_dict}; + return std::make_shared<HivePartitioning>(schema, dictionaries); +} + +static Expression GetPartitionExpression(const std::string& path) { + static std::shared_ptr<Partitioning> partitioning = MakePartitioning(); + ASSIGN_OR_ABORT(auto expr, partitioning->Parse(path)); + return expr; +} + +// A benchmark of SimplifyWithGuarantee using expressions arising from partitioning. +static void SimplifyQueryWithGuarantee(benchmark::State& state, Expression lhs, + Expression guarantee) { + auto schema = arrow::schema( + {arrow::field("a", arrow::int64()), arrow::field("b", arrow::int64())}); + ASSIGN_OR_ABORT(lhs, lhs.Bind(*schema)); + + // std::cerr << "LHS: " << lhs.ToString() << std::endl; + // std::cerr << "Guarantee: " << guarantee.ToString() << std::endl; + + for (auto _ : state) { + ABORT_NOT_OK(SimplifyWithGuarantee(lhs, guarantee)); + } +} + +auto cast_options = compute::CastOptions::Safe(arrow::int64()); +// A fully simplified filter. +auto lhs_simple_negative = and_(equal(field_ref(FieldRef("a")), literal(99)), + equal(field_ref(FieldRef("b")), literal(98))); Review comment: Additionally, note that since your dataset_schema specifies that both fields are int64 your fully simplified filters should include correctly typed literals: `literal(int64_t(99))` ########## File path: cpp/src/arrow/dataset/expression_benchmark.cc ########## @@ -0,0 +1,105 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <iostream> + +#include "benchmark/benchmark.h" + +#include "arrow/compute/cast.h" +#include "arrow/dataset/expression.h" +#include "arrow/dataset/partition.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type.h" +#include "arrow/util/range.h" + +namespace arrow { +namespace dataset { + +static std::shared_ptr<Partitioning> MakePartitioning() { + auto schema = arrow::schema({ + arrow::field("a", arrow::dictionary(arrow::int32(), arrow::int64())), + arrow::field("b", arrow::dictionary(arrow::int32(), arrow::int64())), + }); + std::shared_ptr<Array> a_dict, b_dict; + auto values = internal::Iota<int64_t>(100); + ArrayFromVector<Int64Type>(arrow::int64(), values, &a_dict); + ArrayFromVector<Int64Type>(arrow::int64(), values, &b_dict); + ArrayVector dictionaries = {a_dict, b_dict}; + return std::make_shared<HivePartitioning>(schema, dictionaries); +} + +static Expression GetPartitionExpression(const std::string& path) { + static std::shared_ptr<Partitioning> partitioning = MakePartitioning(); + ASSIGN_OR_ABORT(auto expr, partitioning->Parse(path)); + return expr; +} + +// A benchmark of SimplifyWithGuarantee using expressions arising from partitioning. +static void SimplifyQueryWithGuarantee(benchmark::State& state, Expression lhs, + Expression guarantee) { + auto schema = arrow::schema( + {arrow::field("a", arrow::int64()), arrow::field("b", arrow::int64())}); + ASSIGN_OR_ABORT(lhs, lhs.Bind(*schema)); + + // std::cerr << "LHS: " << lhs.ToString() << std::endl; + // std::cerr << "Guarantee: " << guarantee.ToString() << std::endl; + Review comment: ```suggestion ``` ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [email protected]
