save-buffer commented on code in PR #12289:
URL: https://github.com/apache/arrow/pull/12289#discussion_r859078078
##########
cpp/src/arrow/compute/exec/hash_join_node_test.cc:
##########
@@ -1900,5 +1903,150 @@ TEST(HashJoin, TrivialResidualFilter) {
}
}
+HashJoinNodeOptions GenerateHashJoinNodeOptions(Random64Bit& rng, int
num_left_cols,
+ int num_right_cols) {
+ HashJoinNodeOptions opts;
+ opts.join_type = static_cast<JoinType>(rng.from_range(0, 7));
+ bool is_left_join = opts.join_type == JoinType::LEFT_SEMI ||
+ opts.join_type == JoinType::LEFT_ANTI ||
+ opts.join_type == JoinType::LEFT_OUTER;
+ bool is_right_join = opts.join_type == JoinType::RIGHT_SEMI ||
+ opts.join_type == JoinType::RIGHT_ANTI ||
+ opts.join_type == JoinType::RIGHT_OUTER;
+
+ int num_keys = rng.from_range(1, std::min(num_left_cols, num_right_cols));
+ for (int i = 0; i < num_left_cols; i++) {
+ bool is_out = rng.from_range(0, 2) != 2;
+ if (is_out && !is_right_join) opts.left_output.push_back(FieldRef(i));
+ }
+ for (int i = 0; i < num_right_cols; i++) {
+ bool is_out = rng.from_range(0, 2) == 2;
+ if (is_out && !is_left_join) opts.right_output.push_back(FieldRef(i));
+ }
+ // We need at least one output
+ if (opts.right_output.empty() && opts.left_output.empty()) {
+ if (is_left_join) {
+ int col = rng.from_range(0, num_left_cols - 1);
+ opts.left_output.push_back(FieldRef(col));
+ } else if (is_right_join) {
+ int col = rng.from_range(0, num_right_cols - 1);
+ opts.right_output.push_back(FieldRef(col));
+ } else {
+ if (rng.from_range(0, 1) == 0) {
+ int col = rng.from_range(0, num_left_cols - 1);
+ opts.left_output.push_back(FieldRef(col));
+ } else {
+ int col = rng.from_range(0, num_right_cols - 1);
+ opts.right_output.push_back(FieldRef(col));
+ }
+ }
+ }
+
+ for (int i = 0; i < num_keys; i++) {
+ int left = rng.from_range(0, num_left_cols - 1);
+ int right = rng.from_range(0, num_right_cols - 1);
+ bool is_or_eq = rng.from_range(0, 1) == 0;
+ opts.left_keys.push_back(FieldRef(left));
+ opts.right_keys.push_back(FieldRef(right));
+ opts.key_cmp.push_back(is_or_eq ? JoinKeyCmp::IS : JoinKeyCmp::EQ);
+ }
+ return opts;
+}
+
+void TestSingleChainOfHashJoins(Random64Bit& rng) {
+ int num_joins = rng.from_range(2, 5);
+ std::vector<HashJoinNodeOptions> opts;
+ int num_left_cols = rng.from_range(1, 8);
+ int num_right_cols = rng.from_range(1, 8);
+ HashJoinNodeOptions first_opt =
+ GenerateHashJoinNodeOptions(rng, num_left_cols, num_right_cols);
+ opts.push_back(std::move(first_opt));
+
+ std::unordered_map<std::string, std::string> metadata_map;
+ metadata_map["min"] = "0";
+ metadata_map["max"] = "10";
+ auto metadata = key_value_metadata(metadata_map);
+ std::vector<std::shared_ptr<Field>> left_fields;
+ for (int i = 0; i < num_left_cols; i++)
+ left_fields.push_back(field(std::string("l") + std::to_string(i), int32(),
metadata));
+ std::vector<std::shared_ptr<Field>> first_right_fields;
+ for (int i = 0; i < num_right_cols; i++)
+ first_right_fields.push_back(
+ field(std::string("r_0_") + std::to_string(i), int32(), metadata));
+
+ BatchesWithSchema input_left =
MakeRandomBatches(schema(std::move(left_fields)));
+ std::vector<BatchesWithSchema> input_right;
+
input_right.push_back(MakeRandomBatches(schema(std::move(first_right_fields))));
+
+ for (int i = 1; i < num_joins; i++) {
+ int num_right_cols = rng.from_range(1, 8);
+ HashJoinNodeOptions opt =
+ GenerateHashJoinNodeOptions(rng,
+ static_cast<int>(opts[i -
1].left_output.size() +
+ opts[i -
1].right_output.size()),
+ num_right_cols);
+ opts.push_back(std::move(opt));
+
+ std::vector<std::shared_ptr<Field>> right_fields;
+ for (int j = 0; j < num_right_cols; j++)
+ right_fields.push_back(
+ field(std::string("r_") + std::to_string(i) + "_" +
std::to_string(j), int32(),
+ metadata));
+ BatchesWithSchema input =
MakeRandomBatches(schema(std::move(right_fields)));
+ input_right.push_back(std::move(input));
+ }
+
+ std::vector<ExecBatch> reference;
+ for (bool bloom_filters : {false, true}) {
+ bool parallel = true;
Review Comment:
I don't really remember, but I've changed it to a constant. I might've just
had it there for debugging.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]