kou commented on code in PR #33654:
URL: https://github.com/apache/arrow/pull/33654#discussion_r1070523127


##########
ruby/red-arrow/lib/arrow/table.rb:
##########
@@ -472,18 +472,22 @@ def pack
     #
     #     If both of `left_outputs` and `right_outputs` aren't
     #     specified, all columns in `self` and `right` are
-    #     outputted.
+    #     output.
     #   @param right_outputs [::Array<String, Symbol>] Output columns in
     #     `right`.
     #
     #     If both of `left_outputs` and `right_outputs` aren't
     #     specified, all columns in `self` and `right` are
-    #     outputted.
+    #     output.
     #   @return [Arrow::Table]
     #     The joined `Arrow::Table`.
     #
     # @overload join(right, type: :inner, left_outputs: nil, right_outputs: 
nil)
-    #   If key(s) are not supplied, common keys in self and right are used.
+    #   If key(s) are not supplied, common keys in self and right are used
+    #     (natural join).
+    #
+    #     Column used as keys are merged and remain in left side
+    #     when both of `left_outputs` and `right_outputs` are `nil`.

Review Comment:
   ```suggestion
       #   (natural join).
       #
       #   Column used as keys are merged and remain in left side
       #   when both of `left_outputs` and `right_outputs` are `nil`.
   ```



##########
ruby/red-arrow/test/test-table.rb:
##########
@@ -1252,5 +1338,94 @@ def setup
                                left_outputs: ["key", "number"],
                                right_outputs: ["string"]))
     end
+
+    test("preserve outputs, type: :inner") do

Review Comment:
   ```suggestion
       test("left_outputs: & type: :inner") do
   ```



##########
ruby/red-arrow/test/test-table.rb:
##########
@@ -1252,5 +1338,94 @@ def setup
                                left_outputs: ["key", "number"],
                                right_outputs: ["string"]))
     end
+
+    test("preserve outputs, type: :inner") do
+      table1 = Arrow::Table.new(key: [1, 2, 3],
+                                number: [10, 20, 30])
+      table2 = Arrow::Table.new(key: [3, 1],
+                                string: ["three", "one"])
+      assert_equal(Arrow::Table.new([
+                                      ["key", [1, 3]],
+                                      ["number", [10, 30]],
+                                      ["key", [1, 3]],
+                                      ["string", ["one", "three"]]
+                                    ]),
+                   table1.join(table2,
+                               type: :inner,
+                               left_outputs: table1.column_names,
+                               right_outputs: table2.column_names))
+    end
+
+    test("preserve outputs, type: :left_outer") do

Review Comment:
   ```suggestion
       test("left_outputs: & type: :left_outer") do
   ```



##########
ruby/red-arrow/test/test-table.rb:
##########
@@ -1126,15 +1126,14 @@ def setup
   end
 
   sub_test_case("#join") do
-    test("no keys") do
+    test("w/o keys (natural join)") do

Review Comment:
   ```suggestion
       test("keys: nil (natural join)") do
   ```



##########
ruby/red-arrow/test/test-table.rb:
##########
@@ -1252,5 +1338,94 @@ def setup
                                left_outputs: ["key", "number"],
                                right_outputs: ["string"]))
     end
+
+    test("preserve outputs, type: :inner") do
+      table1 = Arrow::Table.new(key: [1, 2, 3],
+                                number: [10, 20, 30])
+      table2 = Arrow::Table.new(key: [3, 1],
+                                string: ["three", "one"])
+      assert_equal(Arrow::Table.new([
+                                      ["key", [1, 3]],
+                                      ["number", [10, 30]],
+                                      ["key", [1, 3]],
+                                      ["string", ["one", "three"]]
+                                    ]),
+                   table1.join(table2,
+                               type: :inner,
+                               left_outputs: table1.column_names,
+                               right_outputs: table2.column_names))
+    end
+
+    test("preserve outputs, type: :left_outer") do
+      table1 = Arrow::Table.new(key: [1, 2, 3],
+                                number: [10, 20, 30])
+      table2 = Arrow::Table.new(key: [3, 1],
+                                string: ["three", "one"])
+      assert_equal(Arrow::Table.new([
+                                      ["key", [1, 3, 2]],
+                                      ["number", [10, 30, 20]],
+                                      ["key", [1, 3, nil]],
+                                      ["string", ["one", "three", nil]],
+                                    ]),
+                   table1.join(table2,
+                               type: :left_outer,
+                               left_outputs: table1.column_names,
+                               right_outputs: table2.column_names))
+    end
+
+    test("'preserve outputs, type: :right_outer") do
+      table1 = Arrow::Table.new(key: [1, 2, 3],
+                                number: [10, 20, 30])
+      table2 = Arrow::Table.new(key: [3, 1],
+                                string: ["three", "one"])
+      assert_equal(Arrow::Table.new([
+                                      ["key", [1, 3]],
+                                      ["number", [10, 30]],
+                                      ["key", [1, 3]],
+                                      ["string", ["one", "three"]],
+                                    ]),
+                   table1.join(table2,
+                               type: :right_outer,
+                               left_outputs: table1.column_names,
+                               right_outputs: table2.column_names))
+    end
+
+    test("preserve outputs, type: :full_outer") do
+      table1 = Arrow::Table.new(key: [1, 2, 3],
+                                number: [10, 20, 30])
+      table2 = Arrow::Table.new(key: [3, 1],
+                                string: ["three", "one"])
+      assert_equal(Arrow::Table.new([
+                                      ["key", [1, 3, 2]],
+                                      ["number", [10, 30, 20]],
+                                      ["key", [1, 3, nil]],
+                                      ["string", ["one", "three", nil]],
+                                    ]),
+                   table1.join(table2,
+                               type: :full_outer,
+                               left_outputs: table1.column_names,
+                               right_outputs: table2.column_names))
+    end
+
+    test(":left_suffix and :right_suffix, keys in Array") do

Review Comment:
   ```suggestion
       test("left_suffix: & keys: [String]") do
   ```



##########
ruby/red-arrow/test/test-table.rb:
##########
@@ -1225,20 +1236,95 @@ def setup
                                type: :inner))
     end
 
-    test("type:") do
+    test("type: :left_outer") do
       table1 = Arrow::Table.new(key: [1, 2, 3],
                                 number: [10, 20, 30])
       table2 = Arrow::Table.new(key: [3, 1],
                                 string: ["three", "one"])
       assert_equal(Arrow::Table.new([
                                       ["key", [1, 3, 2]],
                                       ["number", [10, 30, 20]],
-                                      ["key", [1, 3, nil]],
                                       ["string", ["one", "three", nil]],
                                     ]),
                    table1.join(table2, "key", type: :left_outer))
     end
 
+    test("type: :right_outer") do
+      table1 = Arrow::Table.new(key: [1, 2, 3],
+                                number: [10, 20, 30])
+      table2 = Arrow::Table.new(key: [3, 1],
+                                string: ["three", "one"])
+      assert_equal(Arrow::Table.new([
+                                      ["key", [1, 3]],
+                                      ["number", [10, 30]],
+                                      ["string", ["one", "three"]],
+                                    ]),
+                   table1.join(table2, "key", type: :right_outer))
+    end
+
+    test("type: :full_outer") do
+      table1 = Arrow::Table.new(key: [1, 2, 3],
+                                number: [10, 20, 30])
+      table2 = Arrow::Table.new(key: [3, 1],
+                                string: ["three", "one"])
+      assert_equal(Arrow::Table.new([
+                                      ["key", [1, 3, 2]],
+                                      ["number", [10, 30, 20]],
+                                      ["string", ["one", "three", nil]],
+                                    ]),
+                   table1.join(table2, "key", type: :full_outer))
+    end
+
+    test("type: :left_semi") do
+      table1 = Arrow::Table.new(key: [1, 2, 3],
+                                number: [10, 20, 30])
+      table2 = Arrow::Table.new(key: [3, 1],
+                                string: ["three", "one"])
+      assert_equal(Arrow::Table.new([
+                                      ["key", [1, 3]],
+                                      ["number", [10, 30]],
+                                    ]),
+                   table1.join(table2, "key", type: :left_semi))
+    end
+
+    test("type: :right_semi") do
+      table1 = Arrow::Table.new(key: [1, 2, 3],
+                                number: [10, 20, 30])
+      table2 = Arrow::Table.new(key: [3, 1],
+                                string: ["three", "one"])
+      assert_equal(Arrow::Table.new([
+                                      ["key", [3, 1]],
+                                      ["string", ["three", "one"]],
+                                    ]),
+                   table1.join(table2, "key", type: :right_semi))
+    end
+
+    test("type: :left_anti") do
+      table1 = Arrow::Table.new(key: [1, 2, 3],
+                                number: [10, 20, 30])
+      table2 = Arrow::Table.new(key: [3, 1],
+                                string: ["three", "one"])
+      assert_equal(Arrow::Table.new([
+                                      ["key", [2]],
+                                      ["number", [20]],
+                                    ]),
+                   table1.join(table2, "key", type: :left_anti))
+    end
+
+    test("type: :right_anti") do
+      table1 = Arrow::Table.new(key: [1, 2, 3],
+                                number: [10, 20, 30])
+      table2 = Arrow::Table.new(key: [3, 1],
+                                string: ["three", "one"])
+      assert_equal(Arrow::Table.new([
+                                      ["key", []],
+                                      ["string", []],
+                                    ]).to_s,
+                   table1.join(table2, "key", type: :right_anti).to_s)
+      # There is a bug: right_anti result has incorrect chunked array.
+      # Temporarily compare #to_s result.

Review Comment:
   It's not a bug. Apache Arrow GLib should add support for creating a new 
empty chunked array: #33671



##########
ruby/red-arrow/lib/arrow/table.rb:
##########
@@ -620,5 +657,24 @@ def ensure_raw_column(name, data)
         raise ArgumentError, message
       end
     end
+
+    def merge_columns_in_table(table, keys)
+      fields = table.schema.fields
+      arrays = table.columns.map(&:data)
+      keys.each_with_index do |k, i|
+        l = table.column_names.index(k)
+        r = table.column_names.rindex(k)
+        arrays[l] = merge_arrays(table.columns[l].data, table.columns[r].data)
+        fields[l] = Arrow::Field.new(k, fields[l].data_type)
+        arrays.delete_at(r - i)
+        fields.delete_at(r - i)
+      end
+      Arrow::Table.new(Arrow::Schema.new(fields), arrays)
+    end
+
+    def merge_arrays(array1, array2)
+      booleans = Arrow::Function.find(:is_null).execute([array1])
+      Arrow::Function.find(:if_else).execute([booleans, array2, array1]).value

Review Comment:
   We want to implement this by a project node. But it's not implemented yet: 
#33670



##########
ruby/red-arrow/lib/arrow/table.rb:
##########
@@ -493,13 +497,19 @@ def pack
     # @overload join(right, key, type: :inner, left_outputs: nil, 
right_outputs: nil)
     #   Join right by a key.
     #
+    #     Column used as keys are merged and remain in left side
+    #     when both of `left_outputs` and `right_outputs` are `nil`.
+    #
     #   @macro join_common_before
     #   @param key [String, Symbol] A join key.
     #   @macro join_common_after
     #
-    # @overload join(right, keys, type: :inner, left_outputs: nil, 
right_outputs: nil)
+    # @overload join(right, keys, type: :inner, left_suffix: "", right_suffix: 
"",
+    #                left_outputs: nil, right_outputs: nil)
     #   Join right by keys.
     #
+    #     Column name can be renamed by appending `left_suffix` or 
`right_suffix`.

Review Comment:
   ```suggestion
       #   Column name can be renamed by appending `left_suffix` or 
`right_suffix`.
   ```



##########
ruby/red-arrow/lib/arrow/table.rb:
##########
@@ -493,13 +497,19 @@ def pack
     # @overload join(right, key, type: :inner, left_outputs: nil, 
right_outputs: nil)
     #   Join right by a key.
     #
+    #     Column used as keys are merged and remain in left side
+    #     when both of `left_outputs` and `right_outputs` are `nil`.

Review Comment:
   ```suggestion
       #   Column used as keys are merged and remain in left side
       #   when both of `left_outputs` and `right_outputs` are `nil`.
   ```



##########
ruby/red-arrow/test/test-table.rb:
##########
@@ -1252,5 +1338,94 @@ def setup
                                left_outputs: ["key", "number"],
                                right_outputs: ["string"]))
     end
+
+    test("preserve outputs, type: :inner") do
+      table1 = Arrow::Table.new(key: [1, 2, 3],
+                                number: [10, 20, 30])
+      table2 = Arrow::Table.new(key: [3, 1],
+                                string: ["three", "one"])
+      assert_equal(Arrow::Table.new([
+                                      ["key", [1, 3]],
+                                      ["number", [10, 30]],
+                                      ["key", [1, 3]],
+                                      ["string", ["one", "three"]]
+                                    ]),
+                   table1.join(table2,
+                               type: :inner,
+                               left_outputs: table1.column_names,
+                               right_outputs: table2.column_names))
+    end
+
+    test("preserve outputs, type: :left_outer") do
+      table1 = Arrow::Table.new(key: [1, 2, 3],
+                                number: [10, 20, 30])
+      table2 = Arrow::Table.new(key: [3, 1],
+                                string: ["three", "one"])
+      assert_equal(Arrow::Table.new([
+                                      ["key", [1, 3, 2]],
+                                      ["number", [10, 30, 20]],
+                                      ["key", [1, 3, nil]],
+                                      ["string", ["one", "three", nil]],
+                                    ]),
+                   table1.join(table2,
+                               type: :left_outer,
+                               left_outputs: table1.column_names,
+                               right_outputs: table2.column_names))
+    end
+
+    test("'preserve outputs, type: :right_outer") do

Review Comment:
   ```suggestion
       test("left_outputs: & type: :right_outer") do
   ```



##########
ruby/red-arrow/lib/arrow/table.rb:
##########
@@ -549,6 +560,32 @@ def join(right, keys=nil, type: :inner, left_outputs: nil, 
right_outputs: nil)
       plan.wait
       reader = sink_node_options.get_reader(hash_join_node.output_schema)
       table = reader.read_all
+
+      table =
+        if left_outputs || right_outputs
+          table
+        elsif type.end_with?("semi") || type.end_with?("anti")

Review Comment:
   For old Ruby:
   
   ```suggestion
           elsif type.to_s.end_with?("semi") || type.to_s.end_with?("anti")
   ```



##########
ruby/red-arrow/test/test-table.rb:
##########
@@ -1252,5 +1338,94 @@ def setup
                                left_outputs: ["key", "number"],
                                right_outputs: ["string"]))
     end
+
+    test("preserve outputs, type: :inner") do
+      table1 = Arrow::Table.new(key: [1, 2, 3],
+                                number: [10, 20, 30])
+      table2 = Arrow::Table.new(key: [3, 1],
+                                string: ["three", "one"])
+      assert_equal(Arrow::Table.new([
+                                      ["key", [1, 3]],
+                                      ["number", [10, 30]],
+                                      ["key", [1, 3]],
+                                      ["string", ["one", "three"]]
+                                    ]),
+                   table1.join(table2,
+                               type: :inner,
+                               left_outputs: table1.column_names,
+                               right_outputs: table2.column_names))
+    end
+
+    test("preserve outputs, type: :left_outer") do
+      table1 = Arrow::Table.new(key: [1, 2, 3],
+                                number: [10, 20, 30])
+      table2 = Arrow::Table.new(key: [3, 1],
+                                string: ["three", "one"])
+      assert_equal(Arrow::Table.new([
+                                      ["key", [1, 3, 2]],
+                                      ["number", [10, 30, 20]],
+                                      ["key", [1, 3, nil]],
+                                      ["string", ["one", "three", nil]],
+                                    ]),
+                   table1.join(table2,
+                               type: :left_outer,
+                               left_outputs: table1.column_names,
+                               right_outputs: table2.column_names))
+    end
+
+    test("'preserve outputs, type: :right_outer") do
+      table1 = Arrow::Table.new(key: [1, 2, 3],
+                                number: [10, 20, 30])
+      table2 = Arrow::Table.new(key: [3, 1],
+                                string: ["three", "one"])
+      assert_equal(Arrow::Table.new([
+                                      ["key", [1, 3]],
+                                      ["number", [10, 30]],
+                                      ["key", [1, 3]],
+                                      ["string", ["one", "three"]],
+                                    ]),
+                   table1.join(table2,
+                               type: :right_outer,
+                               left_outputs: table1.column_names,
+                               right_outputs: table2.column_names))
+    end
+
+    test("preserve outputs, type: :full_outer") do

Review Comment:
   ```suggestion
       test("left_outputs: & type: :full_outer") do
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to