kou commented on code in PR #33654:
URL: https://github.com/apache/arrow/pull/33654#discussion_r1070523127
##########
ruby/red-arrow/lib/arrow/table.rb:
##########
@@ -472,18 +472,22 @@ def pack
#
# If both of `left_outputs` and `right_outputs` aren't
# specified, all columns in `self` and `right` are
- # outputted.
+ # output.
# @param right_outputs [::Array<String, Symbol>] Output columns in
# `right`.
#
# If both of `left_outputs` and `right_outputs` aren't
# specified, all columns in `self` and `right` are
- # outputted.
+ # output.
# @return [Arrow::Table]
# The joined `Arrow::Table`.
#
# @overload join(right, type: :inner, left_outputs: nil, right_outputs:
nil)
- # If key(s) are not supplied, common keys in self and right are used.
+ # If key(s) are not supplied, common keys in self and right are used
+ # (natural join).
+ #
+ # Column used as keys are merged and remain in left side
+ # when both of `left_outputs` and `right_outputs` are `nil`.
Review Comment:
```suggestion
# (natural join).
#
# Column used as keys are merged and remain in left side
# when both of `left_outputs` and `right_outputs` are `nil`.
```
##########
ruby/red-arrow/test/test-table.rb:
##########
@@ -1252,5 +1338,94 @@ def setup
left_outputs: ["key", "number"],
right_outputs: ["string"]))
end
+
+ test("preserve outputs, type: :inner") do
Review Comment:
```suggestion
test("left_outputs: & type: :inner") do
```
##########
ruby/red-arrow/test/test-table.rb:
##########
@@ -1252,5 +1338,94 @@ def setup
left_outputs: ["key", "number"],
right_outputs: ["string"]))
end
+
+ test("preserve outputs, type: :inner") do
+ table1 = Arrow::Table.new(key: [1, 2, 3],
+ number: [10, 20, 30])
+ table2 = Arrow::Table.new(key: [3, 1],
+ string: ["three", "one"])
+ assert_equal(Arrow::Table.new([
+ ["key", [1, 3]],
+ ["number", [10, 30]],
+ ["key", [1, 3]],
+ ["string", ["one", "three"]]
+ ]),
+ table1.join(table2,
+ type: :inner,
+ left_outputs: table1.column_names,
+ right_outputs: table2.column_names))
+ end
+
+ test("preserve outputs, type: :left_outer") do
Review Comment:
```suggestion
test("left_outputs: & type: :left_outer") do
```
##########
ruby/red-arrow/test/test-table.rb:
##########
@@ -1126,15 +1126,14 @@ def setup
end
sub_test_case("#join") do
- test("no keys") do
+ test("w/o keys (natural join)") do
Review Comment:
```suggestion
test("keys: nil (natural join)") do
```
##########
ruby/red-arrow/test/test-table.rb:
##########
@@ -1252,5 +1338,94 @@ def setup
left_outputs: ["key", "number"],
right_outputs: ["string"]))
end
+
+ test("preserve outputs, type: :inner") do
+ table1 = Arrow::Table.new(key: [1, 2, 3],
+ number: [10, 20, 30])
+ table2 = Arrow::Table.new(key: [3, 1],
+ string: ["three", "one"])
+ assert_equal(Arrow::Table.new([
+ ["key", [1, 3]],
+ ["number", [10, 30]],
+ ["key", [1, 3]],
+ ["string", ["one", "three"]]
+ ]),
+ table1.join(table2,
+ type: :inner,
+ left_outputs: table1.column_names,
+ right_outputs: table2.column_names))
+ end
+
+ test("preserve outputs, type: :left_outer") do
+ table1 = Arrow::Table.new(key: [1, 2, 3],
+ number: [10, 20, 30])
+ table2 = Arrow::Table.new(key: [3, 1],
+ string: ["three", "one"])
+ assert_equal(Arrow::Table.new([
+ ["key", [1, 3, 2]],
+ ["number", [10, 30, 20]],
+ ["key", [1, 3, nil]],
+ ["string", ["one", "three", nil]],
+ ]),
+ table1.join(table2,
+ type: :left_outer,
+ left_outputs: table1.column_names,
+ right_outputs: table2.column_names))
+ end
+
+ test("'preserve outputs, type: :right_outer") do
+ table1 = Arrow::Table.new(key: [1, 2, 3],
+ number: [10, 20, 30])
+ table2 = Arrow::Table.new(key: [3, 1],
+ string: ["three", "one"])
+ assert_equal(Arrow::Table.new([
+ ["key", [1, 3]],
+ ["number", [10, 30]],
+ ["key", [1, 3]],
+ ["string", ["one", "three"]],
+ ]),
+ table1.join(table2,
+ type: :right_outer,
+ left_outputs: table1.column_names,
+ right_outputs: table2.column_names))
+ end
+
+ test("preserve outputs, type: :full_outer") do
+ table1 = Arrow::Table.new(key: [1, 2, 3],
+ number: [10, 20, 30])
+ table2 = Arrow::Table.new(key: [3, 1],
+ string: ["three", "one"])
+ assert_equal(Arrow::Table.new([
+ ["key", [1, 3, 2]],
+ ["number", [10, 30, 20]],
+ ["key", [1, 3, nil]],
+ ["string", ["one", "three", nil]],
+ ]),
+ table1.join(table2,
+ type: :full_outer,
+ left_outputs: table1.column_names,
+ right_outputs: table2.column_names))
+ end
+
+ test(":left_suffix and :right_suffix, keys in Array") do
Review Comment:
```suggestion
test("left_suffix: & keys: [String]") do
```
##########
ruby/red-arrow/test/test-table.rb:
##########
@@ -1225,20 +1236,95 @@ def setup
type: :inner))
end
- test("type:") do
+ test("type: :left_outer") do
table1 = Arrow::Table.new(key: [1, 2, 3],
number: [10, 20, 30])
table2 = Arrow::Table.new(key: [3, 1],
string: ["three", "one"])
assert_equal(Arrow::Table.new([
["key", [1, 3, 2]],
["number", [10, 30, 20]],
- ["key", [1, 3, nil]],
["string", ["one", "three", nil]],
]),
table1.join(table2, "key", type: :left_outer))
end
+ test("type: :right_outer") do
+ table1 = Arrow::Table.new(key: [1, 2, 3],
+ number: [10, 20, 30])
+ table2 = Arrow::Table.new(key: [3, 1],
+ string: ["three", "one"])
+ assert_equal(Arrow::Table.new([
+ ["key", [1, 3]],
+ ["number", [10, 30]],
+ ["string", ["one", "three"]],
+ ]),
+ table1.join(table2, "key", type: :right_outer))
+ end
+
+ test("type: :full_outer") do
+ table1 = Arrow::Table.new(key: [1, 2, 3],
+ number: [10, 20, 30])
+ table2 = Arrow::Table.new(key: [3, 1],
+ string: ["three", "one"])
+ assert_equal(Arrow::Table.new([
+ ["key", [1, 3, 2]],
+ ["number", [10, 30, 20]],
+ ["string", ["one", "three", nil]],
+ ]),
+ table1.join(table2, "key", type: :full_outer))
+ end
+
+ test("type: :left_semi") do
+ table1 = Arrow::Table.new(key: [1, 2, 3],
+ number: [10, 20, 30])
+ table2 = Arrow::Table.new(key: [3, 1],
+ string: ["three", "one"])
+ assert_equal(Arrow::Table.new([
+ ["key", [1, 3]],
+ ["number", [10, 30]],
+ ]),
+ table1.join(table2, "key", type: :left_semi))
+ end
+
+ test("type: :right_semi") do
+ table1 = Arrow::Table.new(key: [1, 2, 3],
+ number: [10, 20, 30])
+ table2 = Arrow::Table.new(key: [3, 1],
+ string: ["three", "one"])
+ assert_equal(Arrow::Table.new([
+ ["key", [3, 1]],
+ ["string", ["three", "one"]],
+ ]),
+ table1.join(table2, "key", type: :right_semi))
+ end
+
+ test("type: :left_anti") do
+ table1 = Arrow::Table.new(key: [1, 2, 3],
+ number: [10, 20, 30])
+ table2 = Arrow::Table.new(key: [3, 1],
+ string: ["three", "one"])
+ assert_equal(Arrow::Table.new([
+ ["key", [2]],
+ ["number", [20]],
+ ]),
+ table1.join(table2, "key", type: :left_anti))
+ end
+
+ test("type: :right_anti") do
+ table1 = Arrow::Table.new(key: [1, 2, 3],
+ number: [10, 20, 30])
+ table2 = Arrow::Table.new(key: [3, 1],
+ string: ["three", "one"])
+ assert_equal(Arrow::Table.new([
+ ["key", []],
+ ["string", []],
+ ]).to_s,
+ table1.join(table2, "key", type: :right_anti).to_s)
+ # There is a bug: right_anti result has incorrect chunked array.
+ # Temporarily compare #to_s result.
Review Comment:
It's not a bug. Apache Arrow GLib should add support for creating a new
empty chunked array: #33671
##########
ruby/red-arrow/lib/arrow/table.rb:
##########
@@ -620,5 +657,24 @@ def ensure_raw_column(name, data)
raise ArgumentError, message
end
end
+
+ def merge_columns_in_table(table, keys)
+ fields = table.schema.fields
+ arrays = table.columns.map(&:data)
+ keys.each_with_index do |k, i|
+ l = table.column_names.index(k)
+ r = table.column_names.rindex(k)
+ arrays[l] = merge_arrays(table.columns[l].data, table.columns[r].data)
+ fields[l] = Arrow::Field.new(k, fields[l].data_type)
+ arrays.delete_at(r - i)
+ fields.delete_at(r - i)
+ end
+ Arrow::Table.new(Arrow::Schema.new(fields), arrays)
+ end
+
+ def merge_arrays(array1, array2)
+ booleans = Arrow::Function.find(:is_null).execute([array1])
+ Arrow::Function.find(:if_else).execute([booleans, array2, array1]).value
Review Comment:
We want to implement this by a project node. But it's not implemented yet:
#33670
##########
ruby/red-arrow/lib/arrow/table.rb:
##########
@@ -493,13 +497,19 @@ def pack
# @overload join(right, key, type: :inner, left_outputs: nil,
right_outputs: nil)
# Join right by a key.
#
+ # Column used as keys are merged and remain in left side
+ # when both of `left_outputs` and `right_outputs` are `nil`.
+ #
# @macro join_common_before
# @param key [String, Symbol] A join key.
# @macro join_common_after
#
- # @overload join(right, keys, type: :inner, left_outputs: nil,
right_outputs: nil)
+ # @overload join(right, keys, type: :inner, left_suffix: "", right_suffix:
"",
+ # left_outputs: nil, right_outputs: nil)
# Join right by keys.
#
+ # Column name can be renamed by appending `left_suffix` or
`right_suffix`.
Review Comment:
```suggestion
# Column name can be renamed by appending `left_suffix` or
`right_suffix`.
```
##########
ruby/red-arrow/lib/arrow/table.rb:
##########
@@ -493,13 +497,19 @@ def pack
# @overload join(right, key, type: :inner, left_outputs: nil,
right_outputs: nil)
# Join right by a key.
#
+ # Column used as keys are merged and remain in left side
+ # when both of `left_outputs` and `right_outputs` are `nil`.
Review Comment:
```suggestion
# Column used as keys are merged and remain in left side
# when both of `left_outputs` and `right_outputs` are `nil`.
```
##########
ruby/red-arrow/test/test-table.rb:
##########
@@ -1252,5 +1338,94 @@ def setup
left_outputs: ["key", "number"],
right_outputs: ["string"]))
end
+
+ test("preserve outputs, type: :inner") do
+ table1 = Arrow::Table.new(key: [1, 2, 3],
+ number: [10, 20, 30])
+ table2 = Arrow::Table.new(key: [3, 1],
+ string: ["three", "one"])
+ assert_equal(Arrow::Table.new([
+ ["key", [1, 3]],
+ ["number", [10, 30]],
+ ["key", [1, 3]],
+ ["string", ["one", "three"]]
+ ]),
+ table1.join(table2,
+ type: :inner,
+ left_outputs: table1.column_names,
+ right_outputs: table2.column_names))
+ end
+
+ test("preserve outputs, type: :left_outer") do
+ table1 = Arrow::Table.new(key: [1, 2, 3],
+ number: [10, 20, 30])
+ table2 = Arrow::Table.new(key: [3, 1],
+ string: ["three", "one"])
+ assert_equal(Arrow::Table.new([
+ ["key", [1, 3, 2]],
+ ["number", [10, 30, 20]],
+ ["key", [1, 3, nil]],
+ ["string", ["one", "three", nil]],
+ ]),
+ table1.join(table2,
+ type: :left_outer,
+ left_outputs: table1.column_names,
+ right_outputs: table2.column_names))
+ end
+
+ test("'preserve outputs, type: :right_outer") do
Review Comment:
```suggestion
test("left_outputs: & type: :right_outer") do
```
##########
ruby/red-arrow/lib/arrow/table.rb:
##########
@@ -549,6 +560,32 @@ def join(right, keys=nil, type: :inner, left_outputs: nil,
right_outputs: nil)
plan.wait
reader = sink_node_options.get_reader(hash_join_node.output_schema)
table = reader.read_all
+
+ table =
+ if left_outputs || right_outputs
+ table
+ elsif type.end_with?("semi") || type.end_with?("anti")
Review Comment:
For old Ruby:
```suggestion
elsif type.to_s.end_with?("semi") || type.to_s.end_with?("anti")
```
##########
ruby/red-arrow/test/test-table.rb:
##########
@@ -1252,5 +1338,94 @@ def setup
left_outputs: ["key", "number"],
right_outputs: ["string"]))
end
+
+ test("preserve outputs, type: :inner") do
+ table1 = Arrow::Table.new(key: [1, 2, 3],
+ number: [10, 20, 30])
+ table2 = Arrow::Table.new(key: [3, 1],
+ string: ["three", "one"])
+ assert_equal(Arrow::Table.new([
+ ["key", [1, 3]],
+ ["number", [10, 30]],
+ ["key", [1, 3]],
+ ["string", ["one", "three"]]
+ ]),
+ table1.join(table2,
+ type: :inner,
+ left_outputs: table1.column_names,
+ right_outputs: table2.column_names))
+ end
+
+ test("preserve outputs, type: :left_outer") do
+ table1 = Arrow::Table.new(key: [1, 2, 3],
+ number: [10, 20, 30])
+ table2 = Arrow::Table.new(key: [3, 1],
+ string: ["three", "one"])
+ assert_equal(Arrow::Table.new([
+ ["key", [1, 3, 2]],
+ ["number", [10, 30, 20]],
+ ["key", [1, 3, nil]],
+ ["string", ["one", "three", nil]],
+ ]),
+ table1.join(table2,
+ type: :left_outer,
+ left_outputs: table1.column_names,
+ right_outputs: table2.column_names))
+ end
+
+ test("'preserve outputs, type: :right_outer") do
+ table1 = Arrow::Table.new(key: [1, 2, 3],
+ number: [10, 20, 30])
+ table2 = Arrow::Table.new(key: [3, 1],
+ string: ["three", "one"])
+ assert_equal(Arrow::Table.new([
+ ["key", [1, 3]],
+ ["number", [10, 30]],
+ ["key", [1, 3]],
+ ["string", ["one", "three"]],
+ ]),
+ table1.join(table2,
+ type: :right_outer,
+ left_outputs: table1.column_names,
+ right_outputs: table2.column_names))
+ end
+
+ test("preserve outputs, type: :full_outer") do
Review Comment:
```suggestion
test("left_outputs: & type: :full_outer") do
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]