This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 6d655698f3 GH-48360: [Ruby] Add support for reading large binary array (#48361)
6d655698f3 is described below

commit 6d655698f32a0f16db3ea3c3878158062dc2a10c
Author: Sutou Kouhei <[email protected]>
AuthorDate: Sun Dec 7 14:37:44 2025 +0900

    GH-48360: [Ruby] Add support for reading large binary array (#48361)
    
    ### Rationale for this change
    
    It's the 64-bit offset version of the binary array.
    
    ### What changes are included in this PR?
    
    * Add `ArrowFormat::LargeBinaryType`
    * Add `ArrowFormat::LargeBinaryArray`
    
    ### Are these changes tested?
    
    Yes.
    
    ### Are there any user-facing changes?
    
    Yes.
    * GitHub Issue: #48360
    
    Authored-by: Sutou Kouhei <[email protected]>
    Signed-off-by: Sutou Kouhei <[email protected]>
---
 ruby/red-arrow-format/lib/arrow-format/array.rb    | 21 ++++++++++++++++-
 .../lib/arrow-format/file-reader.rb                |  6 +++--
 ruby/red-arrow-format/lib/arrow-format/type.rb     | 27 ++++++++++++++++++++--
 ruby/red-arrow-format/test/test-file-reader.rb     | 11 +++++++++
 4 files changed, 60 insertions(+), 5 deletions(-)

diff --git a/ruby/red-arrow-format/lib/arrow-format/array.rb b/ruby/red-arrow-format/lib/arrow-format/array.rb
index f3c3c49233..d4995cda3e 100644
--- a/ruby/red-arrow-format/lib/arrow-format/array.rb
+++ b/ruby/red-arrow-format/lib/arrow-format/array.rb
@@ -113,7 +113,7 @@ module ArrowFormat
 
     def to_a
       values = @offsets_buffer.
-        each(:s32, 0, @size + 1). # TODO: big endian support
+        each(buffer_type, 0, @size + 1).
         each_cons(2).
         collect do |(_, offset), (_, next_offset)|
         length = next_offset - offset
@@ -125,6 +125,21 @@ module ArrowFormat
 
   class BinaryArray < VariableSizeBinaryLayoutArray
     private
+    def buffer_type
+      :s32 # TODO: big endian support
+    end
+
+    def encoding
+      Encoding::ASCII_8BIT
+    end
+  end
+
+  class LargeBinaryArray < VariableSizeBinaryLayoutArray
+    private
+    def buffer_type
+      :s64 # TODO: big endian support
+    end
+
     def encoding
       Encoding::ASCII_8BIT
     end
@@ -132,6 +147,10 @@ module ArrowFormat
 
   class UTF8Array < VariableSizeBinaryLayoutArray
     private
+    def buffer_type
+      :s32 # TODO: big endian support
+    end
+
     def encoding
       Encoding::UTF_8
     end
diff --git a/ruby/red-arrow-format/lib/arrow-format/file-reader.rb b/ruby/red-arrow-format/lib/arrow-format/file-reader.rb
index edc866b3f0..acd21b9764 100644
--- a/ruby/red-arrow-format/lib/arrow-format/file-reader.rb
+++ b/ruby/red-arrow-format/lib/arrow-format/file-reader.rb
@@ -27,6 +27,7 @@ require_relative "org/apache/arrow/flatbuf/bool"
 require_relative "org/apache/arrow/flatbuf/floating_point"
 require_relative "org/apache/arrow/flatbuf/footer"
 require_relative "org/apache/arrow/flatbuf/int"
+require_relative "org/apache/arrow/flatbuf/large_binary"
 require_relative "org/apache/arrow/flatbuf/list"
 require_relative "org/apache/arrow/flatbuf/message"
 require_relative "org/apache/arrow/flatbuf/null"
@@ -158,6 +159,8 @@ module ArrowFormat
         type = ListType.new(read_field(fb_field.children[0]))
       when Org::Apache::Arrow::Flatbuf::Binary
         type = BinaryType.singleton
+      when Org::Apache::Arrow::Flatbuf::LargeBinary
+        type = LargeBinaryType.singleton
       when Org::Apache::Arrow::Flatbuf::Utf8
         type = UTF8Type.singleton
       end
@@ -196,8 +199,7 @@ module ArrowFormat
         offsets = body.slice(offsets_buffer.offset, offsets_buffer.length)
         child = read_column(field.type.child, nodes, buffers, body)
         field.type.build_array(length, validity, offsets, child)
-      when BinaryType,
-           UTF8Type
+      when VariableSizeBinaryType
         offsets_buffer = buffers.shift
         values_buffer = buffers.shift
         offsets = body.slice(offsets_buffer.offset, offsets_buffer.length)
diff --git a/ruby/red-arrow-format/lib/arrow-format/type.rb b/ruby/red-arrow-format/lib/arrow-format/type.rb
index 75586c2f35..b656395634 100644
--- a/ruby/red-arrow-format/lib/arrow-format/type.rb
+++ b/ruby/red-arrow-format/lib/arrow-format/type.rb
@@ -120,7 +120,10 @@ module ArrowFormat
     end
   end
 
-  class BinaryType < Type
+  class VariableSizeBinaryType < Type
+  end
+
+  class BinaryType < VariableSizeBinaryType
     class << self
       def singleton
         @singleton ||= new
@@ -136,7 +139,27 @@ module ArrowFormat
     end
   end
 
-  class UTF8Type < Type
+  class LargeBinaryType < VariableSizeBinaryType
+    class << self
+      def singleton
+        @singleton ||= new
+      end
+    end
+
+    def initialize
+      super("LargeBinary")
+    end
+
+    def build_array(size, validity_buffer, offsets_buffer, values_buffer)
+      LargeBinaryArray.new(self,
+                           size,
+                           validity_buffer,
+                           offsets_buffer,
+                           values_buffer)
+    end
+  end
+
+  class UTF8Type < VariableSizeBinaryType
     class << self
       def singleton
         @singleton ||= new
diff --git a/ruby/red-arrow-format/test/test-file-reader.rb b/ruby/red-arrow-format/test/test-file-reader.rb
index 02685b1987..b39d7b1fff 100644
--- a/ruby/red-arrow-format/test/test-file-reader.rb
+++ b/ruby/red-arrow-format/test/test-file-reader.rb
@@ -106,6 +106,17 @@ class TestFileReader < Test::Unit::TestCase
     end
   end
 
+  sub_test_case("LargeBinary") do
+    def build_array
+      Arrow::LargeBinaryArray.new(["Hello".b, nil, "World".b])
+    end
+
+    def test_read
+      assert_equal([{"value" => ["Hello".b, nil, "World".b]}],
+                   read)
+    end
+  end
+
   sub_test_case("UTF8") do
     def build_array
       Arrow::StringArray.new(["Hello", nil, "World"])

Reply via email to