westonpace commented on code in PR #13426:
URL: https://github.com/apache/arrow/pull/13426#discussion_r923307184


##########
cpp/src/arrow/compute/exec/test_util.cc:
##########
@@ -459,5 +460,44 @@ void PrintTo(const Declaration& decl, std::ostream* os) {
   *os << "}";
 }
 
+std::shared_ptr<Table> MakeRandomTable(TableGenerationProperties properties) {

Review Comment:
   Can you rename this function so it is obvious it has something to do with asof 
join / time series data?  We already have a couple of different random generation 
utilities, so a more specific name will help to distinguish them.



##########
cpp/src/arrow/compute/exec/test_util.cc:
##########
@@ -459,5 +460,44 @@ void PrintTo(const Declaration& decl, std::ostream* os) {
   *os << "}";
 }
 
+std::shared_ptr<Table> MakeRandomTable(TableGenerationProperties properties) {
+  int total_columns = properties.num_columns + 2;
+  std::vector<std::shared_ptr<Array>> columns;
+  columns.reserve(total_columns);
+  arrow::FieldVector field_vector = arrow::FieldVector();
+  field_vector.reserve(total_columns);
+
+  field_vector.push_back(std::make_shared<Field>("time", int64()));
+  field_vector.push_back(std::make_shared<Field>("id", int32()));

Review Comment:
   ```suggestion
     field_vector.push_back(field("time", int64()));
     field_vector.push_back(field("id", int32()));
   ```
   
   What you have is fine, this is just a slightly more compact shortcut



##########
cpp/src/arrow/compute/exec/test_util.cc:
##########
@@ -459,5 +460,44 @@ void PrintTo(const Declaration& decl, std::ostream* os) {
   *os << "}";
 }
 
+std::shared_ptr<Table> MakeRandomTable(TableGenerationProperties properties) {
+  int total_columns = properties.num_columns + 2;
+  std::vector<std::shared_ptr<Array>> columns;
+  columns.reserve(total_columns);
+  arrow::FieldVector field_vector = arrow::FieldVector();

Review Comment:
   ```suggestion
     arrow::FieldVector field_vector;
   ```



##########
cpp/src/arrow/compute/exec/test_util.cc:
##########
@@ -459,5 +460,44 @@ void PrintTo(const Declaration& decl, std::ostream* os) {
   *os << "}";
 }
 
+std::shared_ptr<Table> MakeRandomTable(TableGenerationProperties properties) {
+  int total_columns = properties.num_columns + 2;
+  std::vector<std::shared_ptr<Array>> columns;
+  columns.reserve(total_columns);
+  arrow::FieldVector field_vector = arrow::FieldVector();
+  field_vector.reserve(total_columns);
+
+  field_vector.push_back(std::make_shared<Field>("time", int64()));
+  field_vector.push_back(std::make_shared<Field>("id", int32()));
+
+  int num_rows = 0;
+  std::vector<int64_t> time_column;
+  std::vector<int32_t> id_column;
+  for (int time = properties.start; time <= properties.end;
+         time += properties.time_frequency) {
+    for (int id = 0; id < properties.num_ids; id++) {
+      time_column.push_back(time);
+      id_column.push_back(id);
+      num_rows += 1;
+    }
+  }
+  std::shared_ptr<Array> time_array;
+  ArrayFromVector<Int64Type, int64_t>(int64(), time_column, &time_array);
+  columns.push_back(time_array);
+  std::shared_ptr<Array> id_array;
+  ArrayFromVector<Int32Type, int32_t>(int32(), id_column, &id_array);
+  columns.push_back(id_array);
+
+  for (int i = 0; i < properties.num_columns; i++) {
+    std::ostringstream string_stream;
+    string_stream << properties.column_prefix << i;
+    field_vector.push_back(std::make_shared<Field>(string_stream.str(), 
float64()));

Review Comment:
   ```suggestion
       field_vector.push_back(field(properties.column_prefix + 
std::to_string(i), float64()));
   ```



##########
cpp/src/arrow/compute/exec/test_util.cc:
##########
@@ -459,5 +460,44 @@ void PrintTo(const Declaration& decl, std::ostream* os) {
   *os << "}";
 }
 
+std::shared_ptr<Table> MakeRandomTable(TableGenerationProperties properties) {
+  int total_columns = properties.num_columns + 2;
+  std::vector<std::shared_ptr<Array>> columns;
+  columns.reserve(total_columns);
+  arrow::FieldVector field_vector = arrow::FieldVector();
+  field_vector.reserve(total_columns);
+
+  field_vector.push_back(std::make_shared<Field>("time", int64()));
+  field_vector.push_back(std::make_shared<Field>("id", int32()));
+
+  int num_rows = 0;
+  std::vector<int64_t> time_column;
+  std::vector<int32_t> id_column;
+  for (int time = properties.start; time <= properties.end;
+         time += properties.time_frequency) {
+    for (int id = 0; id < properties.num_ids; id++) {
+      time_column.push_back(time);
+      id_column.push_back(id);
+      num_rows += 1;

Review Comment:
   ```suggestion
         num_rows++;
   ```



##########
cpp/src/arrow/compute/exec/test_util.cc:
##########
@@ -459,5 +460,44 @@ void PrintTo(const Declaration& decl, std::ostream* os) {
   *os << "}";
 }
 
+std::shared_ptr<Table> MakeRandomTable(TableGenerationProperties properties) {
+  int total_columns = properties.num_columns + 2;
+  std::vector<std::shared_ptr<Array>> columns;
+  columns.reserve(total_columns);
+  arrow::FieldVector field_vector = arrow::FieldVector();
+  field_vector.reserve(total_columns);
+
+  field_vector.push_back(std::make_shared<Field>("time", int64()));
+  field_vector.push_back(std::make_shared<Field>("id", int32()));
+
+  int num_rows = 0;

Review Comment:
   You could also just do `int num_rows = time_column.size()` after the 
for-loop.  Or you could precompute this (`((properties.end - properties.start) 
/ properties.time_frequency + 1) * properties.num_ids`, since the loop includes 
`properties.end`), but it works how you have it.



##########
cpp/src/arrow/compute/exec/asof_join_benchmark.cc:
##########
@@ -0,0 +1,159 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <boost/process.hpp>
+#include <string>
+
+#include "benchmark/benchmark.h"
+
+#include "arrow/compute/exec/test_util.h"
+#include "arrow/dataset/file_parquet.h"
+#include "arrow/table.h"
+#include "arrow/testing/future_util.h"
+
+namespace arrow {
+namespace compute {
+
+static const char* time_col = "time";
+static const char* key_col = "id";

Review Comment:
   ```suggestion
   static const char* kTimeCol = "time";
   static const char* kKeyCol = "id";
   ```
   
   It's a bit weird but the Google style guide suggests kConstantCase for 
constants.



##########
cpp/src/arrow/compute/exec/test_util.cc:
##########
@@ -459,5 +460,44 @@ void PrintTo(const Declaration& decl, std::ostream* os) {
   *os << "}";
 }
 
+std::shared_ptr<Table> MakeRandomTable(TableGenerationProperties properties) {

Review Comment:
   ```suggestion
   std::shared_ptr<Table> MakeRandomTable(const TableGenerationProperties& 
properties) {
   ```



##########
cpp/src/arrow/compute/exec/asof_join_benchmark.cc:
##########
@@ -0,0 +1,159 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <boost/process.hpp>
+#include <string>
+
+#include "benchmark/benchmark.h"
+
+#include "arrow/compute/exec/test_util.h"
+#include "arrow/dataset/file_parquet.h"
+#include "arrow/table.h"
+#include "arrow/testing/future_util.h"
+
+namespace arrow {
+namespace compute {
+
+static const char* time_col = "time";
+static const char* key_col = "id";
+const int default_start = 0;
+const int default_end = 500;
+
+struct TableSourceNodeStats {
+  ExecNode* execNode;
+  size_t total_rows;
+  size_t total_bytes;
+};
+
+static TableSourceNodeStats MakeTableSourceNode(
+    std::shared_ptr<arrow::compute::ExecPlan>& plan, TableGenerationProperties 
properties,
+    int batch_size) {
+  std::shared_ptr<Table> table = MakeRandomTable(properties);
+  size_t row_size = sizeof(double) * (table.get()->schema()->num_fields() - 2) 
+
+                    sizeof(int64_t) + sizeof(int32_t);
+  size_t rows = table.get()->num_rows();
+  return {*arrow::compute::MakeExecNode(
+              "table_source",  // registered type
+              plan.get(),      // execution plan
+              {},              // inputs

Review Comment:
   If a parameter name is not obvious it should be commented as 
`/*factory_name=*/"table_source"`.  Also, we should use the same argument names 
as the method declares (e.g. `factory_name` and not `registered type`).



##########
cpp/src/arrow/compute/exec/test_util.h:
##########
@@ -145,5 +145,38 @@ class Random64Bit {
   std::uniform_int_distribution<uint64_t> dist_;
 };
 
+/*

Review Comment:
   Can you use the `/// \brief`... style of commenting found elsewhere in the 
code base? 



##########
cpp/src/arrow/compute/exec/test_util.cc:
##########
@@ -459,5 +460,44 @@ void PrintTo(const Declaration& decl, std::ostream* os) {
   *os << "}";
 }
 
+std::shared_ptr<Table> MakeRandomTable(TableGenerationProperties properties) {
+  int total_columns = properties.num_columns + 2;
+  std::vector<std::shared_ptr<Array>> columns;
+  columns.reserve(total_columns);
+  arrow::FieldVector field_vector = arrow::FieldVector();
+  field_vector.reserve(total_columns);
+
+  field_vector.push_back(std::make_shared<Field>("time", int64()));
+  field_vector.push_back(std::make_shared<Field>("id", int32()));
+
+  int num_rows = 0;
+  std::vector<int64_t> time_column;
+  std::vector<int32_t> id_column;
+  for (int time = properties.start; time <= properties.end;
+         time += properties.time_frequency) {
+    for (int id = 0; id < properties.num_ids; id++) {
+      time_column.push_back(time);
+      id_column.push_back(id);
+      num_rows += 1;
+    }
+  }
+  std::shared_ptr<Array> time_array;
+  ArrayFromVector<Int64Type, int64_t>(int64(), time_column, &time_array);
+  columns.push_back(time_array);
+  std::shared_ptr<Array> id_array;
+  ArrayFromVector<Int32Type, int32_t>(int32(), id_column, &id_array);
+  columns.push_back(id_array);
+
+  for (int i = 0; i < properties.num_columns; i++) {
+    std::ostringstream string_stream;
+    string_stream << properties.column_prefix << i;
+    field_vector.push_back(std::make_shared<Field>(string_stream.str(), 
float64()));
+    random::RandomArrayGenerator rand = 
random::RandomArrayGenerator(properties.seed + i);
+    columns.push_back(rand.Float64(num_rows, -1e5, 1e5));

Review Comment:
   Is there a particular reason for bounding the range of values to `-1e5, 1e5`?



##########
cpp/src/arrow/compute/exec/test_util.cc:
##########
@@ -459,5 +460,44 @@ void PrintTo(const Declaration& decl, std::ostream* os) {
   *os << "}";
 }
 
+std::shared_ptr<Table> MakeRandomTable(TableGenerationProperties properties) {
+  int total_columns = properties.num_columns + 2;
+  std::vector<std::shared_ptr<Array>> columns;
+  columns.reserve(total_columns);
+  arrow::FieldVector field_vector = arrow::FieldVector();
+  field_vector.reserve(total_columns);
+
+  field_vector.push_back(std::make_shared<Field>("time", int64()));
+  field_vector.push_back(std::make_shared<Field>("id", int32()));
+
+  int num_rows = 0;
+  std::vector<int64_t> time_column;
+  std::vector<int32_t> id_column;
+  for (int time = properties.start; time <= properties.end;
+         time += properties.time_frequency) {
+    for (int id = 0; id < properties.num_ids; id++) {
+      time_column.push_back(time);
+      id_column.push_back(id);
+      num_rows += 1;
+    }
+  }
+  std::shared_ptr<Array> time_array;
+  ArrayFromVector<Int64Type, int64_t>(int64(), time_column, &time_array);
+  columns.push_back(time_array);
+  std::shared_ptr<Array> id_array;
+  ArrayFromVector<Int32Type, int32_t>(int32(), id_column, &id_array);
+  columns.push_back(id_array);

Review Comment:
   It would be slightly more efficient to use `Int64Builder` and `Int32Builder` 
instead of creating a vector first and then copying the vector into an array.



##########
cpp/src/arrow/compute/exec/test_util.cc:
##########
@@ -459,5 +460,44 @@ void PrintTo(const Declaration& decl, std::ostream* os) {
   *os << "}";
 }
 
+std::shared_ptr<Table> MakeRandomTable(TableGenerationProperties properties) {
+  int total_columns = properties.num_columns + 2;
+  std::vector<std::shared_ptr<Array>> columns;
+  columns.reserve(total_columns);
+  arrow::FieldVector field_vector = arrow::FieldVector();
+  field_vector.reserve(total_columns);
+
+  field_vector.push_back(std::make_shared<Field>("time", int64()));
+  field_vector.push_back(std::make_shared<Field>("id", int32()));
+
+  int num_rows = 0;
+  std::vector<int64_t> time_column;
+  std::vector<int32_t> id_column;
+  for (int time = properties.start; time <= properties.end;
+         time += properties.time_frequency) {
+    for (int id = 0; id < properties.num_ids; id++) {
+      time_column.push_back(time);
+      id_column.push_back(id);
+      num_rows += 1;
+    }
+  }
+  std::shared_ptr<Array> time_array;
+  ArrayFromVector<Int64Type, int64_t>(int64(), time_column, &time_array);
+  columns.push_back(time_array);
+  std::shared_ptr<Array> id_array;
+  ArrayFromVector<Int32Type, int32_t>(int32(), id_column, &id_array);
+  columns.push_back(id_array);
+
+  for (int i = 0; i < properties.num_columns; i++) {
+    std::ostringstream string_stream;
+    string_stream << properties.column_prefix << i;
+    field_vector.push_back(std::make_shared<Field>(string_stream.str(), 
float64()));
+    random::RandomArrayGenerator rand = 
random::RandomArrayGenerator(properties.seed + i);
+    columns.push_back(rand.Float64(num_rows, -1e5, 1e5));
+  }
+  std::shared_ptr<arrow::Schema> schema = 
std::make_shared<arrow::Schema>(field_vector);

Review Comment:
   ```suggestion
     std::shared_ptr<arrow::Schema> schema = schema(std::move(field_vector));
   ```
   another shortcut method :)



##########
cpp/src/arrow/compute/exec/test_util.h:
##########
@@ -145,5 +145,38 @@ class Random64Bit {
   std::uniform_int_distribution<uint64_t> dist_;
 };
 
+/*
+  Specify properties of a table to be generated.
+    - num_ids is the number of unique keys in the table
+    - time_frequency indicates the amount of time between data points that lie 
between
+  start and end (inclusive)
+    - num_columns indicates the amount of random columns in the table
+    - column_prefix specifies the prefix each randomly generated column should 
have
+    - seed is the random seed the random array generator is given to generate 
the random
+  columns
+    - start specifies the beginning of 'time' recorded in the table
+    - end specifies the end of 'time' recorded in the table
+*/
+struct TableGenerationProperties {
+  int time_frequency;
+  int num_columns;
+  int num_ids;
+  std::string column_prefix;
+  uint seed;
+  int start;
+  int end;
+};
+
+/*

Review Comment:
   Same suggestion as above regarding commenting style.



##########
cpp/src/arrow/compute/exec/asof_join_benchmark.cc:
##########
@@ -0,0 +1,159 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <boost/process.hpp>
+#include <string>
+
+#include "benchmark/benchmark.h"
+
+#include "arrow/compute/exec/test_util.h"
+#include "arrow/dataset/file_parquet.h"
+#include "arrow/table.h"
+#include "arrow/testing/future_util.h"
+
+namespace arrow {
+namespace compute {
+
+static const char* time_col = "time";
+static const char* key_col = "id";
+const int default_start = 0;
+const int default_end = 500;

Review Comment:
   ```suggestion
   constexpr int kDefaultStart = 0;
   constexpr int kDefaultEnd = 500;
   ```



##########
cpp/src/arrow/compute/exec/test_util.h:
##########
@@ -145,5 +145,38 @@ class Random64Bit {
   std::uniform_int_distribution<uint64_t> dist_;
 };
 
+/*
+  Specify properties of a table to be generated.
+    - num_ids is the number of unique keys in the table
+    - time_frequency indicates the amount of time between data points that lie 
between
+  start and end (inclusive)
+    - num_columns indicates the amount of random columns in the table
+    - column_prefix specifies the prefix each randomly generated column should 
have
+    - seed is the random seed the random array generator is given to generate 
the random

Review Comment:
   These can be comments on the fields themselves.  See `AsofJoinNodeOptions` 
in `arrow/compute/exec/options.h`



##########
cpp/src/arrow/compute/exec/asof_join_benchmark.cc:
##########
@@ -0,0 +1,159 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <boost/process.hpp>

Review Comment:
   ```suggestion
   ```



##########
cpp/src/arrow/compute/exec/asof_join_benchmark.cc:
##########
@@ -0,0 +1,159 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <boost/process.hpp>
+#include <string>
+
+#include "benchmark/benchmark.h"
+
+#include "arrow/compute/exec/test_util.h"
+#include "arrow/dataset/file_parquet.h"
+#include "arrow/table.h"
+#include "arrow/testing/future_util.h"
+
+namespace arrow {
+namespace compute {
+
+static const char* time_col = "time";
+static const char* key_col = "id";
+const int default_start = 0;
+const int default_end = 500;
+
+struct TableSourceNodeStats {
+  ExecNode* execNode;
+  size_t total_rows;
+  size_t total_bytes;
+};
+
+static TableSourceNodeStats MakeTableSourceNode(
+    std::shared_ptr<arrow::compute::ExecPlan>& plan, TableGenerationProperties 
properties,
+    int batch_size) {
+  std::shared_ptr<Table> table = MakeRandomTable(properties);
+  size_t row_size = sizeof(double) * (table.get()->schema()->num_fields() - 2) 
+
+                    sizeof(int64_t) + sizeof(int32_t);
+  size_t rows = table.get()->num_rows();
+  return {*arrow::compute::MakeExecNode(
+              "table_source",  // registered type
+              plan.get(),      // execution plan
+              {},              // inputs
+              arrow::compute::TableSourceNodeOptions(table, batch_size)),
+          rows, row_size * rows};
+}
+
+static void TableJoinOverhead(benchmark::State& state,
+                              TableGenerationProperties left_table_properties,
+                              int left_table_batch_size,
+                              TableGenerationProperties right_table_properties,
+                              int right_table_batch_size, int num_right_tables,
+                              std::string factory_name, ExecNodeOptions& 
options) {
+  ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool());
+  size_t rows = 0;
+  size_t bytes = 0;
+  for (auto _ : state) {
+    state.PauseTiming();
+
+    ASSERT_OK_AND_ASSIGN(std::shared_ptr<arrow::compute::ExecPlan> plan,
+                         ExecPlan::Make(&ctx));
+    left_table_properties.column_prefix = "lt";
+    left_table_properties.seed = 0;

Review Comment:
   Since the seed is constant you should be able to generate your inputs 
outside this loop, once.  Then you can run many iterations on the same inputs.  
This should speed up the runtime of this benchmark.



##########
cpp/src/arrow/compute/exec/test_util.h:
##########
@@ -145,5 +145,38 @@ class Random64Bit {
   std::uniform_int_distribution<uint64_t> dist_;
 };
 
+/*
+  Specify properties of a table to be generated.
+    - num_ids is the number of unique keys in the table
+    - time_frequency indicates the amount of time between data points that lie 
between
+  start and end (inclusive)
+    - num_columns indicates the amount of random columns in the table
+    - column_prefix specifies the prefix each randomly generated column should 
have
+    - seed is the random seed the random array generator is given to generate 
the random
+  columns
+    - start specifies the beginning of 'time' recorded in the table
+    - end specifies the end of 'time' recorded in the table
+*/
+struct TableGenerationProperties {
+  int time_frequency;
+  int num_columns;
+  int num_ids;
+  std::string column_prefix;
+  uint seed;
+  int start;
+  int end;
+};
+
+/*
+  The table generated in accordance to the TableGenerationProperties has the 
following
+  schema: time (int64) id (int32) [properties.column_prefix]0 (float64)
+      [properties.column_prefix]1 (float64)
+      ...
+      [properties.column_prefix][properties.num_columns] (float64)
+  Each id has rows corresponding to a singular data point in the time range 
(start, end,
+  time_frequency). The table is sorted by time.
+*/
+std::shared_ptr<Table> MakeRandomTable(TableGenerationProperties properties);

Review Comment:
   ```suggestion
   std::shared_ptr<Table> MakeRandomTable(const TableGenerationProperties& 
properties);
   ```



##########
cpp/src/arrow/compute/exec/asof_join_benchmark.cc:
##########
@@ -0,0 +1,159 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <boost/process.hpp>
+#include <string>
+
+#include "benchmark/benchmark.h"
+
+#include "arrow/compute/exec/test_util.h"
+#include "arrow/dataset/file_parquet.h"
+#include "arrow/table.h"
+#include "arrow/testing/future_util.h"
+
+namespace arrow {
+namespace compute {
+
+static const char* time_col = "time";
+static const char* key_col = "id";
+const int default_start = 0;
+const int default_end = 500;
+
+struct TableSourceNodeStats {
+  ExecNode* execNode;
+  size_t total_rows;
+  size_t total_bytes;
+};
+
+static TableSourceNodeStats MakeTableSourceNode(
+    std::shared_ptr<arrow::compute::ExecPlan>& plan, TableGenerationProperties 
properties,
+    int batch_size) {
+  std::shared_ptr<Table> table = MakeRandomTable(properties);
+  size_t row_size = sizeof(double) * (table.get()->schema()->num_fields() - 2) 
+
+                    sizeof(int64_t) + sizeof(int32_t);
+  size_t rows = table.get()->num_rows();
+  return {*arrow::compute::MakeExecNode(
+              "table_source",  // registered type
+              plan.get(),      // execution plan
+              {},              // inputs
+              arrow::compute::TableSourceNodeOptions(table, batch_size)),
+          rows, row_size * rows};
+}
+
+static void TableJoinOverhead(benchmark::State& state,
+                              TableGenerationProperties left_table_properties,
+                              int left_table_batch_size,
+                              TableGenerationProperties right_table_properties,
+                              int right_table_batch_size, int num_right_tables,
+                              std::string factory_name, ExecNodeOptions& 
options) {
+  ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool());
+  size_t rows = 0;
+  size_t bytes = 0;
+  for (auto _ : state) {
+    state.PauseTiming();
+
+    ASSERT_OK_AND_ASSIGN(std::shared_ptr<arrow::compute::ExecPlan> plan,
+                         ExecPlan::Make(&ctx));
+    left_table_properties.column_prefix = "lt";
+    left_table_properties.seed = 0;
+    TableSourceNodeStats left_table_stats =
+        MakeTableSourceNode(plan, left_table_properties, 
left_table_batch_size);
+    std::vector<ExecNode*> inputs = {left_table_stats.execNode};
+    int right_hand_rows = 0;
+    size_t right_hand_bytes = 0;
+    for (int i = 0; i < num_right_tables; i++) {
+      std::ostringstream string_stream;
+      string_stream << "rt" << i;
+      right_table_properties.column_prefix = string_stream.str();

Review Comment:
   ```suggestion
         right_table_properties.column_prefix = "rt" + std::to_string(i);
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to