westonpace commented on code in PR #13426:
URL: https://github.com/apache/arrow/pull/13426#discussion_r923307184
##########
cpp/src/arrow/compute/exec/test_util.cc:
##########
@@ -459,5 +460,44 @@ void PrintTo(const Declaration& decl, std::ostream* os) {
*os << "}";
}
+std::shared_ptr<Table> MakeRandomTable(TableGenerationProperties properties) {
Review Comment:
Can you rename this function so it is obvious it has something to do with asof
join / time series data? We already have a couple of different random generation
utilities, so a more specific name will help to distinguish this one.
##########
cpp/src/arrow/compute/exec/test_util.cc:
##########
@@ -459,5 +460,44 @@ void PrintTo(const Declaration& decl, std::ostream* os) {
*os << "}";
}
+std::shared_ptr<Table> MakeRandomTable(TableGenerationProperties properties) {
+ int total_columns = properties.num_columns + 2;
+ std::vector<std::shared_ptr<Array>> columns;
+ columns.reserve(total_columns);
+ arrow::FieldVector field_vector = arrow::FieldVector();
+ field_vector.reserve(total_columns);
+
+ field_vector.push_back(std::make_shared<Field>("time", int64()));
+ field_vector.push_back(std::make_shared<Field>("id", int32()));
Review Comment:
```suggestion
field_vector.push_back(field("time", int64()));
field_vector.push_back(field("id", int32()));
```
What you have is fine; this is just a slightly more compact shortcut.
##########
cpp/src/arrow/compute/exec/test_util.cc:
##########
@@ -459,5 +460,44 @@ void PrintTo(const Declaration& decl, std::ostream* os) {
*os << "}";
}
+std::shared_ptr<Table> MakeRandomTable(TableGenerationProperties properties) {
+ int total_columns = properties.num_columns + 2;
+ std::vector<std::shared_ptr<Array>> columns;
+ columns.reserve(total_columns);
+ arrow::FieldVector field_vector = arrow::FieldVector();
Review Comment:
```suggestion
arrow::FieldVector field_vector;
```
##########
cpp/src/arrow/compute/exec/test_util.cc:
##########
@@ -459,5 +460,44 @@ void PrintTo(const Declaration& decl, std::ostream* os) {
*os << "}";
}
+std::shared_ptr<Table> MakeRandomTable(TableGenerationProperties properties) {
+ int total_columns = properties.num_columns + 2;
+ std::vector<std::shared_ptr<Array>> columns;
+ columns.reserve(total_columns);
+ arrow::FieldVector field_vector = arrow::FieldVector();
+ field_vector.reserve(total_columns);
+
+ field_vector.push_back(std::make_shared<Field>("time", int64()));
+ field_vector.push_back(std::make_shared<Field>("id", int32()));
+
+ int num_rows = 0;
+ std::vector<int64_t> time_column;
+ std::vector<int32_t> id_column;
+ for (int time = properties.start; time <= properties.end;
+ time += properties.time_frequency) {
+ for (int id = 0; id < properties.num_ids; id++) {
+ time_column.push_back(time);
+ id_column.push_back(id);
+ num_rows += 1;
+ }
+ }
+ std::shared_ptr<Array> time_array;
+ ArrayFromVector<Int64Type, int64_t>(int64(), time_column, &time_array);
+ columns.push_back(time_array);
+ std::shared_ptr<Array> id_array;
+ ArrayFromVector<Int32Type, int32_t>(int32(), id_column, &id_array);
+ columns.push_back(id_array);
+
+ for (int i = 0; i < properties.num_columns; i++) {
+ std::ostringstream string_stream;
+ string_stream << properties.column_prefix << i;
+ field_vector.push_back(std::make_shared<Field>(string_stream.str(),
float64()));
Review Comment:
```suggestion
field_vector.push_back(field(properties.column_prefix +
std::to_string(i), float64()));
```
##########
cpp/src/arrow/compute/exec/test_util.cc:
##########
@@ -459,5 +460,44 @@ void PrintTo(const Declaration& decl, std::ostream* os) {
*os << "}";
}
+std::shared_ptr<Table> MakeRandomTable(TableGenerationProperties properties) {
+ int total_columns = properties.num_columns + 2;
+ std::vector<std::shared_ptr<Array>> columns;
+ columns.reserve(total_columns);
+ arrow::FieldVector field_vector = arrow::FieldVector();
+ field_vector.reserve(total_columns);
+
+ field_vector.push_back(std::make_shared<Field>("time", int64()));
+ field_vector.push_back(std::make_shared<Field>("id", int32()));
+
+ int num_rows = 0;
+ std::vector<int64_t> time_column;
+ std::vector<int32_t> id_column;
+ for (int time = properties.start; time <= properties.end;
+ time += properties.time_frequency) {
+ for (int id = 0; id < properties.num_ids; id++) {
+ time_column.push_back(time);
+ id_column.push_back(id);
+ num_rows += 1;
Review Comment:
```suggestion
num_rows++;
```
##########
cpp/src/arrow/compute/exec/test_util.cc:
##########
@@ -459,5 +460,44 @@ void PrintTo(const Declaration& decl, std::ostream* os) {
*os << "}";
}
+std::shared_ptr<Table> MakeRandomTable(TableGenerationProperties properties) {
+ int total_columns = properties.num_columns + 2;
+ std::vector<std::shared_ptr<Array>> columns;
+ columns.reserve(total_columns);
+ arrow::FieldVector field_vector = arrow::FieldVector();
+ field_vector.reserve(total_columns);
+
+ field_vector.push_back(std::make_shared<Field>("time", int64()));
+ field_vector.push_back(std::make_shared<Field>("id", int32()));
+
+ int num_rows = 0;
Review Comment:
You could also just do `int num_rows = time_column.size()` after the
for-loop. Or you could precompute this (`((properties.end - properties.start)
/ properties.time_frequency + 1) * properties.num_ids`, since the loop includes
`properties.end`), but it works how you have it.
##########
cpp/src/arrow/compute/exec/asof_join_benchmark.cc:
##########
@@ -0,0 +1,159 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <boost/process.hpp>
+#include <string>
+
+#include "benchmark/benchmark.h"
+
+#include "arrow/compute/exec/test_util.h"
+#include "arrow/dataset/file_parquet.h"
+#include "arrow/table.h"
+#include "arrow/testing/future_util.h"
+
+namespace arrow {
+namespace compute {
+
+static const char* time_col = "time";
+static const char* key_col = "id";
Review Comment:
```suggestion
static const char* kTimeCol = "time";
static const char* kKeyCol = "id";
```
It's a bit weird but the Google style guide suggests kConstantCase for
constants.
##########
cpp/src/arrow/compute/exec/test_util.cc:
##########
@@ -459,5 +460,44 @@ void PrintTo(const Declaration& decl, std::ostream* os) {
*os << "}";
}
+std::shared_ptr<Table> MakeRandomTable(TableGenerationProperties properties) {
Review Comment:
```suggestion
std::shared_ptr<Table> MakeRandomTable(const TableGenerationProperties&
properties) {
```
##########
cpp/src/arrow/compute/exec/asof_join_benchmark.cc:
##########
@@ -0,0 +1,159 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <boost/process.hpp>
+#include <string>
+
+#include "benchmark/benchmark.h"
+
+#include "arrow/compute/exec/test_util.h"
+#include "arrow/dataset/file_parquet.h"
+#include "arrow/table.h"
+#include "arrow/testing/future_util.h"
+
+namespace arrow {
+namespace compute {
+
+static const char* time_col = "time";
+static const char* key_col = "id";
+const int default_start = 0;
+const int default_end = 500;
+
+struct TableSourceNodeStats {
+ ExecNode* execNode;
+ size_t total_rows;
+ size_t total_bytes;
+};
+
+static TableSourceNodeStats MakeTableSourceNode(
+ std::shared_ptr<arrow::compute::ExecPlan>& plan, TableGenerationProperties
properties,
+ int batch_size) {
+ std::shared_ptr<Table> table = MakeRandomTable(properties);
+ size_t row_size = sizeof(double) * (table.get()->schema()->num_fields() - 2)
+
+ sizeof(int64_t) + sizeof(int32_t);
+ size_t rows = table.get()->num_rows();
+ return {*arrow::compute::MakeExecNode(
+ "table_source", // registered type
+ plan.get(), // execution plan
+ {}, // inputs
Review Comment:
If a parameter name is not obvious, it should be commented as
`/*factory_name=*/"table_source"`. Also, we should use the same argument names
as the method declares (e.g. `factory_name` and not `registered type`).
##########
cpp/src/arrow/compute/exec/test_util.h:
##########
@@ -145,5 +145,38 @@ class Random64Bit {
std::uniform_int_distribution<uint64_t> dist_;
};
+/*
Review Comment:
Can you use the `/// \brief`... style of commenting found elsewhere in the
code base?
##########
cpp/src/arrow/compute/exec/test_util.cc:
##########
@@ -459,5 +460,44 @@ void PrintTo(const Declaration& decl, std::ostream* os) {
*os << "}";
}
+std::shared_ptr<Table> MakeRandomTable(TableGenerationProperties properties) {
+ int total_columns = properties.num_columns + 2;
+ std::vector<std::shared_ptr<Array>> columns;
+ columns.reserve(total_columns);
+ arrow::FieldVector field_vector = arrow::FieldVector();
+ field_vector.reserve(total_columns);
+
+ field_vector.push_back(std::make_shared<Field>("time", int64()));
+ field_vector.push_back(std::make_shared<Field>("id", int32()));
+
+ int num_rows = 0;
+ std::vector<int64_t> time_column;
+ std::vector<int32_t> id_column;
+ for (int time = properties.start; time <= properties.end;
+ time += properties.time_frequency) {
+ for (int id = 0; id < properties.num_ids; id++) {
+ time_column.push_back(time);
+ id_column.push_back(id);
+ num_rows += 1;
+ }
+ }
+ std::shared_ptr<Array> time_array;
+ ArrayFromVector<Int64Type, int64_t>(int64(), time_column, &time_array);
+ columns.push_back(time_array);
+ std::shared_ptr<Array> id_array;
+ ArrayFromVector<Int32Type, int32_t>(int32(), id_column, &id_array);
+ columns.push_back(id_array);
+
+ for (int i = 0; i < properties.num_columns; i++) {
+ std::ostringstream string_stream;
+ string_stream << properties.column_prefix << i;
+ field_vector.push_back(std::make_shared<Field>(string_stream.str(),
float64()));
+ random::RandomArrayGenerator rand =
random::RandomArrayGenerator(properties.seed + i);
+ columns.push_back(rand.Float64(num_rows, -1e5, 1e5));
Review Comment:
Is there a particular reason for bounding the range of values to `-1e5, 1e5`?
##########
cpp/src/arrow/compute/exec/test_util.cc:
##########
@@ -459,5 +460,44 @@ void PrintTo(const Declaration& decl, std::ostream* os) {
*os << "}";
}
+std::shared_ptr<Table> MakeRandomTable(TableGenerationProperties properties) {
+ int total_columns = properties.num_columns + 2;
+ std::vector<std::shared_ptr<Array>> columns;
+ columns.reserve(total_columns);
+ arrow::FieldVector field_vector = arrow::FieldVector();
+ field_vector.reserve(total_columns);
+
+ field_vector.push_back(std::make_shared<Field>("time", int64()));
+ field_vector.push_back(std::make_shared<Field>("id", int32()));
+
+ int num_rows = 0;
+ std::vector<int64_t> time_column;
+ std::vector<int32_t> id_column;
+ for (int time = properties.start; time <= properties.end;
+ time += properties.time_frequency) {
+ for (int id = 0; id < properties.num_ids; id++) {
+ time_column.push_back(time);
+ id_column.push_back(id);
+ num_rows += 1;
+ }
+ }
+ std::shared_ptr<Array> time_array;
+ ArrayFromVector<Int64Type, int64_t>(int64(), time_column, &time_array);
+ columns.push_back(time_array);
+ std::shared_ptr<Array> id_array;
+ ArrayFromVector<Int32Type, int32_t>(int32(), id_column, &id_array);
+ columns.push_back(id_array);
Review Comment:
It would be slightly more efficient to use `Int64Builder` and `Int32Builder`
instead of creating a vector first and then copying the vector into an array.
##########
cpp/src/arrow/compute/exec/test_util.cc:
##########
@@ -459,5 +460,44 @@ void PrintTo(const Declaration& decl, std::ostream* os) {
*os << "}";
}
+std::shared_ptr<Table> MakeRandomTable(TableGenerationProperties properties) {
+ int total_columns = properties.num_columns + 2;
+ std::vector<std::shared_ptr<Array>> columns;
+ columns.reserve(total_columns);
+ arrow::FieldVector field_vector = arrow::FieldVector();
+ field_vector.reserve(total_columns);
+
+ field_vector.push_back(std::make_shared<Field>("time", int64()));
+ field_vector.push_back(std::make_shared<Field>("id", int32()));
+
+ int num_rows = 0;
+ std::vector<int64_t> time_column;
+ std::vector<int32_t> id_column;
+ for (int time = properties.start; time <= properties.end;
+ time += properties.time_frequency) {
+ for (int id = 0; id < properties.num_ids; id++) {
+ time_column.push_back(time);
+ id_column.push_back(id);
+ num_rows += 1;
+ }
+ }
+ std::shared_ptr<Array> time_array;
+ ArrayFromVector<Int64Type, int64_t>(int64(), time_column, &time_array);
+ columns.push_back(time_array);
+ std::shared_ptr<Array> id_array;
+ ArrayFromVector<Int32Type, int32_t>(int32(), id_column, &id_array);
+ columns.push_back(id_array);
+
+ for (int i = 0; i < properties.num_columns; i++) {
+ std::ostringstream string_stream;
+ string_stream << properties.column_prefix << i;
+ field_vector.push_back(std::make_shared<Field>(string_stream.str(),
float64()));
+ random::RandomArrayGenerator rand =
random::RandomArrayGenerator(properties.seed + i);
+ columns.push_back(rand.Float64(num_rows, -1e5, 1e5));
+ }
+ std::shared_ptr<arrow::Schema> schema =
std::make_shared<arrow::Schema>(field_vector);
Review Comment:
```suggestion
std::shared_ptr<arrow::Schema> schema = schema(std::move(field_vector));
```
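Another shortcut method :) Note that the local variable is also named `schema`,
so the call may need to be qualified as `arrow::schema(...)` (or the variable
renamed) to avoid the local shadowing the factory function.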
##########
cpp/src/arrow/compute/exec/test_util.h:
##########
@@ -145,5 +145,38 @@ class Random64Bit {
std::uniform_int_distribution<uint64_t> dist_;
};
+/*
+ Specify properties of a table to be generated.
+ - num_ids is the number of unique keys in the table
+ - time_frequency indicates the amount of time between data points that lie
between
+ start and end (inclusive)
+ - num_columns indicates the amount of random columns in the table
+ - column_prefix specifies the prefix each randomly generated column should
have
+ - seed is the random seed the random array generator is given to generate
the random
+ columns
+ - start specifies the beginning of 'time' recorded in the table
+ - end specifies the end of 'time' recorded in the table
+*/
+struct TableGenerationProperties {
+ int time_frequency;
+ int num_columns;
+ int num_ids;
+ std::string column_prefix;
+ uint seed;
+ int start;
+ int end;
+};
+
+/*
Review Comment:
Same suggestion as above regarding commenting style.
##########
cpp/src/arrow/compute/exec/asof_join_benchmark.cc:
##########
@@ -0,0 +1,159 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <boost/process.hpp>
+#include <string>
+
+#include "benchmark/benchmark.h"
+
+#include "arrow/compute/exec/test_util.h"
+#include "arrow/dataset/file_parquet.h"
+#include "arrow/table.h"
+#include "arrow/testing/future_util.h"
+
+namespace arrow {
+namespace compute {
+
+static const char* time_col = "time";
+static const char* key_col = "id";
+const int default_start = 0;
+const int default_end = 500;
Review Comment:
```suggestion
constexpr int kDefaultStart = 0;
constexpr int kDefaultEnd = 500;
```
##########
cpp/src/arrow/compute/exec/test_util.h:
##########
@@ -145,5 +145,38 @@ class Random64Bit {
std::uniform_int_distribution<uint64_t> dist_;
};
+/*
+ Specify properties of a table to be generated.
+ - num_ids is the number of unique keys in the table
+ - time_frequency indicates the amount of time between data points that lie
between
+ start and end (inclusive)
+ - num_columns indicates the amount of random columns in the table
+ - column_prefix specifies the prefix each randomly generated column should
have
+ - seed is the random seed the random array generator is given to generate
the random
Review Comment:
These can be comments on the fields themselves. See `AsofJoinNodeOptions`
in `arrow/compute/exec/options.h` for an example.
##########
cpp/src/arrow/compute/exec/asof_join_benchmark.cc:
##########
@@ -0,0 +1,159 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <boost/process.hpp>
Review Comment:
```suggestion
```
##########
cpp/src/arrow/compute/exec/asof_join_benchmark.cc:
##########
@@ -0,0 +1,159 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <boost/process.hpp>
+#include <string>
+
+#include "benchmark/benchmark.h"
+
+#include "arrow/compute/exec/test_util.h"
+#include "arrow/dataset/file_parquet.h"
+#include "arrow/table.h"
+#include "arrow/testing/future_util.h"
+
+namespace arrow {
+namespace compute {
+
+static const char* time_col = "time";
+static const char* key_col = "id";
+const int default_start = 0;
+const int default_end = 500;
+
+struct TableSourceNodeStats {
+ ExecNode* execNode;
+ size_t total_rows;
+ size_t total_bytes;
+};
+
+static TableSourceNodeStats MakeTableSourceNode(
+ std::shared_ptr<arrow::compute::ExecPlan>& plan, TableGenerationProperties
properties,
+ int batch_size) {
+ std::shared_ptr<Table> table = MakeRandomTable(properties);
+ size_t row_size = sizeof(double) * (table.get()->schema()->num_fields() - 2)
+
+ sizeof(int64_t) + sizeof(int32_t);
+ size_t rows = table.get()->num_rows();
+ return {*arrow::compute::MakeExecNode(
+ "table_source", // registered type
+ plan.get(), // execution plan
+ {}, // inputs
+ arrow::compute::TableSourceNodeOptions(table, batch_size)),
+ rows, row_size * rows};
+}
+
+static void TableJoinOverhead(benchmark::State& state,
+ TableGenerationProperties left_table_properties,
+ int left_table_batch_size,
+ TableGenerationProperties right_table_properties,
+ int right_table_batch_size, int num_right_tables,
+ std::string factory_name, ExecNodeOptions&
options) {
+ ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool());
+ size_t rows = 0;
+ size_t bytes = 0;
+ for (auto _ : state) {
+ state.PauseTiming();
+
+ ASSERT_OK_AND_ASSIGN(std::shared_ptr<arrow::compute::ExecPlan> plan,
+ ExecPlan::Make(&ctx));
+ left_table_properties.column_prefix = "lt";
+ left_table_properties.seed = 0;
Review Comment:
Since the seed is constant, you should be able to generate your inputs once,
outside this loop, and then run many iterations on the same inputs.
This should reduce the overall runtime of this benchmark.
##########
cpp/src/arrow/compute/exec/test_util.h:
##########
@@ -145,5 +145,38 @@ class Random64Bit {
std::uniform_int_distribution<uint64_t> dist_;
};
+/*
+ Specify properties of a table to be generated.
+ - num_ids is the number of unique keys in the table
+ - time_frequency indicates the amount of time between data points that lie
between
+ start and end (inclusive)
+ - num_columns indicates the amount of random columns in the table
+ - column_prefix specifies the prefix each randomly generated column should
have
+ - seed is the random seed the random array generator is given to generate
the random
+ columns
+ - start specifies the beginning of 'time' recorded in the table
+ - end specifies the end of 'time' recorded in the table
+*/
+struct TableGenerationProperties {
+ int time_frequency;
+ int num_columns;
+ int num_ids;
+ std::string column_prefix;
+ uint seed;
+ int start;
+ int end;
+};
+
+/*
+ The table generated in accordance to the TableGenerationProperties has the
following
+ schema: time (int64) id (int32) [properties.column_prefix]0 (float64)
+ [properties.column_prefix]1 (float64)
+ ...
+ [properties.column_prefix][properties.num_columns] (float64)
+ Each id has rows corresponding to a singular data point in the time range
(start, end,
+ time_frequency). The table is sorted by time.
+*/
+std::shared_ptr<Table> MakeRandomTable(TableGenerationProperties properties);
Review Comment:
```suggestion
std::shared_ptr<Table> MakeRandomTable(const TableGenerationProperties&
properties);
```
##########
cpp/src/arrow/compute/exec/asof_join_benchmark.cc:
##########
@@ -0,0 +1,159 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <boost/process.hpp>
+#include <string>
+
+#include "benchmark/benchmark.h"
+
+#include "arrow/compute/exec/test_util.h"
+#include "arrow/dataset/file_parquet.h"
+#include "arrow/table.h"
+#include "arrow/testing/future_util.h"
+
+namespace arrow {
+namespace compute {
+
+static const char* time_col = "time";
+static const char* key_col = "id";
+const int default_start = 0;
+const int default_end = 500;
+
+struct TableSourceNodeStats {
+ ExecNode* execNode;
+ size_t total_rows;
+ size_t total_bytes;
+};
+
+static TableSourceNodeStats MakeTableSourceNode(
+ std::shared_ptr<arrow::compute::ExecPlan>& plan, TableGenerationProperties
properties,
+ int batch_size) {
+ std::shared_ptr<Table> table = MakeRandomTable(properties);
+ size_t row_size = sizeof(double) * (table.get()->schema()->num_fields() - 2)
+
+ sizeof(int64_t) + sizeof(int32_t);
+ size_t rows = table.get()->num_rows();
+ return {*arrow::compute::MakeExecNode(
+ "table_source", // registered type
+ plan.get(), // execution plan
+ {}, // inputs
+ arrow::compute::TableSourceNodeOptions(table, batch_size)),
+ rows, row_size * rows};
+}
+
+static void TableJoinOverhead(benchmark::State& state,
+ TableGenerationProperties left_table_properties,
+ int left_table_batch_size,
+ TableGenerationProperties right_table_properties,
+ int right_table_batch_size, int num_right_tables,
+ std::string factory_name, ExecNodeOptions&
options) {
+ ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool());
+ size_t rows = 0;
+ size_t bytes = 0;
+ for (auto _ : state) {
+ state.PauseTiming();
+
+ ASSERT_OK_AND_ASSIGN(std::shared_ptr<arrow::compute::ExecPlan> plan,
+ ExecPlan::Make(&ctx));
+ left_table_properties.column_prefix = "lt";
+ left_table_properties.seed = 0;
+ TableSourceNodeStats left_table_stats =
+ MakeTableSourceNode(plan, left_table_properties,
left_table_batch_size);
+ std::vector<ExecNode*> inputs = {left_table_stats.execNode};
+ int right_hand_rows = 0;
+ size_t right_hand_bytes = 0;
+ for (int i = 0; i < num_right_tables; i++) {
+ std::ostringstream string_stream;
+ string_stream << "rt" << i;
+ right_table_properties.column_prefix = string_stream.str();
Review Comment:
```suggestion
right_table_properties.column_prefix = "rt" + std::to_string(i);
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]