This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 252a6850bd GH-48132: [Ruby] Add support for writing dictionary array 
(#49175)
252a6850bd is described below

commit 252a6850bd4d6d11b2c97adf4b5907de177b3bf8
Author: Sutou Kouhei <[email protected]>
AuthorDate: Sun Feb 8 16:42:03 2026 +0900

    GH-48132: [Ruby] Add support for writing dictionary array (#49175)
    
    ### Rationale for this change
    
    Delta dictionary message support is out of scope.
    
    ### What changes are included in this PR?
    
    * Add `ArrowFormat::DictionaryArray#each_buffer`
    * Add `ArrowFormat::DictionaryType#build_fb_type`
    * Add support for dictionary message in `ArrowFormat::StreamingWriter`
    * Add support for writing dictionary message blocks in footer in 
`ArrowFormat::FileWriter`.
    
    ### Are these changes tested?
    
    Yes.
    
    ### Are there any user-facing changes?
    
    Yes.
    * GitHub Issue: #48132
    
    Authored-by: Sutou Kouhei <[email protected]>
    Signed-off-by: Sutou Kouhei <[email protected]>
---
 ruby/red-arrow-format/lib/arrow-format/array.rb    |  9 +++
 ruby/red-arrow-format/lib/arrow-format/bitmap.rb   |  2 +-
 ruby/red-arrow-format/lib/arrow-format/field.rb    | 14 +---
 .../lib/arrow-format/file-writer.rb                |  2 +-
 .../lib/arrow-format/streaming-writer.rb           | 79 +++++++++++++++------
 ruby/red-arrow-format/lib/arrow-format/type.rb     | 14 ++++
 ruby/red-arrow-format/test/test-writer.rb          | 81 ++++++++++++++--------
 7 files changed, 137 insertions(+), 64 deletions(-)

diff --git a/ruby/red-arrow-format/lib/arrow-format/array.rb 
b/ruby/red-arrow-format/lib/arrow-format/array.rb
index 4728d7ca70..73e87cf721 100644
--- a/ruby/red-arrow-format/lib/arrow-format/array.rb
+++ b/ruby/red-arrow-format/lib/arrow-format/array.rb
@@ -508,12 +508,21 @@ module ArrowFormat
   end
 
   class DictionaryArray < Array
+    attr_reader :indices_buffer
+    attr_reader :dictionary
     def initialize(type, size, validity_buffer, indices_buffer, dictionary)
       super(type, size, validity_buffer)
       @indices_buffer = indices_buffer
       @dictionary = dictionary
     end
 
+    def each_buffer
+      return to_enum(__method__) unless block_given?
+
+      yield(@validity_buffer)
+      yield(@indices_buffer)
+    end
+
     def to_a
       values = []
       @dictionary.each do |dictionary_chunk|
diff --git a/ruby/red-arrow-format/lib/arrow-format/bitmap.rb 
b/ruby/red-arrow-format/lib/arrow-format/bitmap.rb
index 0cd517a37f..88a1ab2ff4 100644
--- a/ruby/red-arrow-format/lib/arrow-format/bitmap.rb
+++ b/ruby/red-arrow-format/lib/arrow-format/bitmap.rb
@@ -24,7 +24,7 @@ module ArrowFormat
     end
 
     def [](i)
-      (@validity_buffer.get_value(:U8, i / 8) & (1 << (i % 8))) > 0
+      (@buffer.get_value(:U8, i / 8) & (1 << (i % 8))) > 0
     end
 
     def each
diff --git a/ruby/red-arrow-format/lib/arrow-format/field.rb 
b/ruby/red-arrow-format/lib/arrow-format/field.rb
index 3642c867c8..7736bbf5e7 100644
--- a/ruby/red-arrow-format/lib/arrow-format/field.rb
+++ b/ruby/red-arrow-format/lib/arrow-format/field.rb
@@ -34,18 +34,8 @@ module ArrowFormat
       fb_field = FB::Field::Data.new
       fb_field.name = @name
       fb_field.nullable = @nullable
-      if @type.is_a?(DictionaryType)
-        fb_field.type = @type.value_type.to_flatbuffers
-        dictionary_encoding = FB::DictionaryEncoding::Data.new
-        dictionary_encoding.id = @dictionary_id
-        int = FB::Int::Data.new
-        int.bit_width = @type.index_type.bit_width
-        int.signed = @type.index_type.signed?
-        dictionary_encoding.index_type = int
-        dictionary_encoding.ordered = @type.ordered?
-        dictionary_encoding.dictionary_kind =
-          FB::DictionaryKind::DENSE_ARRAY
-        fb_field.dictionary = dictionary
+      if @type.respond_to?(:build_fb_field)
+        @type.build_fb_field(fb_field, self)
       else
         fb_field.type = @type.to_flatbuffers
       end
diff --git a/ruby/red-arrow-format/lib/arrow-format/file-writer.rb 
b/ruby/red-arrow-format/lib/arrow-format/file-writer.rb
index 8509be59b6..27b6b55bbf 100644
--- a/ruby/red-arrow-format/lib/arrow-format/file-writer.rb
+++ b/ruby/red-arrow-format/lib/arrow-format/file-writer.rb
@@ -41,7 +41,7 @@ module ArrowFormat
       fb_footer = FB::Footer::Data.new
       fb_footer.version = FB::MetadataVersion::V5
       fb_footer.schema = @fb_schema
-      # fb_footer.dictionaries = ... # TODO
+      fb_footer.dictionaries = @fb_dictionary_blocks
       fb_footer.record_batches = @fb_record_batch_blocks
       # fb_footer.custom_metadata = ... # TODO
       FB::Footer.serialize(fb_footer)
diff --git a/ruby/red-arrow-format/lib/arrow-format/streaming-writer.rb 
b/ruby/red-arrow-format/lib/arrow-format/streaming-writer.rb
index 313c1b38ad..2f8f90b706 100644
--- a/ruby/red-arrow-format/lib/arrow-format/streaming-writer.rb
+++ b/ruby/red-arrow-format/lib/arrow-format/streaming-writer.rb
@@ -29,38 +29,26 @@ module ArrowFormat
     def initialize(output)
       @output = output
       @offset = 0
+      @fb_dictionary_blocks = []
       @fb_record_batch_blocks = []
+      @written_dictionary_offsets = {}
     end
 
     def start(schema)
       write_message(build_metadata(schema.to_flatbuffers))
-      # TODO: Write dictionaries
     end
 
     def write_record_batch(record_batch)
-      body_length = 0
-      record_batch.all_buffers_enumerator.each do |buffer|
-        body_length += aligned_buffer_size(buffer) if buffer
+      record_batch.schema.fields.each_with_index do |field, i|
+        next if field.dictionary_id.nil?
+        dictionary_array = record_batch.columns[i]
+        write_dictionary(field.dictionary_id, dictionary_array)
       end
-      metadata = build_metadata(record_batch.to_flatbuffers, body_length)
-      fb_block = FB::Block::Data.new
-      fb_block.offset = @offset
-      fb_block.meta_data_length =
-        CONTINUATION.bytesize +
-        MessagePullReader::METADATA_LENGTH_SIZE +
-        metadata.bytesize
-      fb_block.body_length = body_length
-      @fb_record_batch_blocks << fb_block
-      write_message(metadata) do
-        record_batch.all_buffers_enumerator.each do |buffer|
-          write_buffer(buffer) if buffer
-        end
-      end
-    end
 
-    # TODO
-    # def write_dictionary_delta(id, dictionary)
-    # end
+      write_record_batch_based_message(record_batch,
+                                       record_batch.to_flatbuffers,
+                                       @fb_record_batch_blocks)
+    end
 
     def finish
       write_data(EOS)
@@ -100,6 +88,53 @@ module ArrowFormat
       metadata
     end
 
+    def write_record_batch_based_message(record_batch, fb_header, fb_blocks)
+      body_length = 0
+      record_batch.all_buffers_enumerator.each do |buffer|
+        body_length += aligned_buffer_size(buffer) if buffer
+      end
+      metadata = build_metadata(fb_header, body_length)
+      fb_block = FB::Block::Data.new
+      fb_block.offset = @offset
+      fb_block.meta_data_length =
+        CONTINUATION.bytesize +
+        MessagePullReader::METADATA_LENGTH_SIZE +
+        metadata.bytesize
+      fb_block.body_length = body_length
+      fb_blocks << fb_block
+      write_message(metadata) do
+        record_batch.all_buffers_enumerator.each do |buffer|
+          write_buffer(buffer) if buffer
+        end
+      end
+    end
+
+    def write_dictionary(id, dictionary_array)
+      value_type = dictionary_array.type.value_type
+      dictionary = dictionary_array.dictionary
+
+      offset = @written_dictionary_offsets[id]
+      if offset.nil?
+        is_delta = false
+      else
+        is_delta = true
+        raise NotImplementedError,
+              "Delta dictionary message isn't implemented yet"
+      end
+
+      schema = Schema.new([Field.new("dummy", value_type, true, nil)])
+      size = dictionary.size
+      record_batch = RecordBatch.new(schema, size, [dictionary])
+      fb_dictionary_batch = FB::DictionaryBatch::Data.new
+      fb_dictionary_batch.id = id
+      fb_dictionary_batch.data = record_batch.to_flatbuffers
+      fb_dictionary_batch.delta = is_delta
+      write_record_batch_based_message(record_batch,
+                                       fb_dictionary_batch,
+                                       @fb_dictionary_blocks)
+      @written_dictionary_offsets[id] = dictionary_array.dictionary.size
+    end
+
     def write_message(metadata)
       write_data(CONTINUATION)
       metadata_size = metadata.bytesize
diff --git a/ruby/red-arrow-format/lib/arrow-format/type.rb 
b/ruby/red-arrow-format/lib/arrow-format/type.rb
index 808117740e..4ea41a2538 100644
--- a/ruby/red-arrow-format/lib/arrow-format/type.rb
+++ b/ruby/red-arrow-format/lib/arrow-format/type.rb
@@ -873,5 +873,19 @@ module ArrowFormat
                           indices_buffer,
                           dictionary)
     end
+
+    def build_fb_field(fb_field, field)
+      fb_dictionary_encoding = FB::DictionaryEncoding::Data.new
+      fb_dictionary_encoding.id = field.dictionary_id
+      fb_int = FB::Int::Data.new
+      fb_int.bit_width = @index_type.bit_width
+      fb_int.signed = @index_type.signed?
+      fb_dictionary_encoding.index_type = fb_int
+      fb_dictionary_encoding.ordered = @ordered
+      fb_dictionary_encoding.dictionary_kind =
+        FB::DictionaryKind::DENSE_ARRAY
+      fb_field.type = @value_type.to_flatbuffers
+      fb_field.dictionary = fb_dictionary_encoding
+    end
   end
 end
diff --git a/ruby/red-arrow-format/test/test-writer.rb 
b/ruby/red-arrow-format/test/test-writer.rb
index 183a5f29dd..3e4b5bedba 100644
--- a/ruby/red-arrow-format/test/test-writer.rb
+++ b/ruby/red-arrow-format/test/test-writer.rb
@@ -106,16 +106,30 @@ module WriterTests
         convert_field(field)
       end
       ArrowFormat::SparseUnionType.new(fields, red_arrow_type.type_codes)
+    when Arrow::DictionaryDataType
+      index_type = convert_type(red_arrow_type.index_data_type)
+      type = convert_type(red_arrow_type.value_data_type)
+      ArrowFormat::DictionaryType.new(index_type,
+                                      type,
+                                      red_arrow_type.ordered?)
     else
       raise "Unsupported type: #{red_arrow_type.inspect}"
     end
   end
 
   def convert_field(red_arrow_field)
+    type = convert_type(red_arrow_field.data_type)
+    if type.is_a?(ArrowFormat::DictionaryType)
+      @dictionary_id ||= 0
+      dictionary_id = @dictionary_id
+      @dictionary_id += 1
+    else
+      dictionary_id = nil
+    end
     ArrowFormat::Field.new(red_arrow_field.name,
-                           convert_type(red_arrow_field.data_type),
+                           type,
                            red_arrow_field.nullable?,
-                           nil)
+                           dictionary_id)
   end
 
   def convert_buffer(buffer)
@@ -171,11 +185,33 @@ module WriterTests
       type.build_array(red_arrow_array.size,
                        types_buffer,
                        children)
+    when ArrowFormat::DictionaryType
+      validity_buffer = convert_buffer(red_arrow_array.null_bitmap)
+      indices_buffer = convert_buffer(red_arrow_array.indices.data_buffer)
+      dictionary = convert_array(red_arrow_array.dictionary)
+      type.build_array(red_arrow_array.size,
+                       validity_buffer,
+                       indices_buffer,
+                       dictionary)
     else
       raise "Unsupported array #{red_arrow_array.inspect}"
     end
   end
 
+  def write(writer)
+    red_arrow_array = build_array
+    array = convert_array(red_arrow_array)
+    red_arrow_field = Arrow::Field.new("value",
+                                       red_arrow_array.value_data_type,
+                                       true)
+    fields = [convert_field(red_arrow_field)]
+    schema = ArrowFormat::Schema.new(fields)
+    record_batch = ArrowFormat::RecordBatch.new(schema, array.size, [array])
+    writer.start(schema)
+    writer.write_record_batch(record_batch)
+    writer.finish
+  end
+
   class << self
     def included(base)
       base.class_eval do
@@ -939,6 +975,19 @@ module WriterTests
                          @values)
           end
         end
+
+        sub_test_case("Dictionary") do
+          def build_array
+            values = ["a", "b", "c", nil, "a"]
+            string_array = Arrow::StringArray.new(values)
+            string_array.dictionary_encode
+          end
+
+          def test_write
+            assert_equal(["a", "b", "c", nil, "a"],
+                         @values)
+          end
+        end
       end
     end
   end
@@ -952,19 +1001,7 @@ class TestFileWriter < Test::Unit::TestCase
       path = File.join(tmp_dir, "data.arrow")
       File.open(path, "wb") do |output|
         writer = ArrowFormat::FileWriter.new(output)
-        red_arrow_array = build_array
-        array = convert_array(red_arrow_array)
-        fields = [
-          ArrowFormat::Field.new("value",
-                                 array.type,
-                                 true,
-                                 nil),
-        ]
-        schema = ArrowFormat::Schema.new(fields)
-        record_batch = ArrowFormat::RecordBatch.new(schema, array.size, 
[array])
-        writer.start(schema)
-        writer.write_record_batch(record_batch)
-        writer.finish
+        write(writer)
       end
       data = File.open(path, "rb", &:read).freeze
       table = Arrow::Table.load(Arrow::Buffer.new(data), format: :arrow)
@@ -982,19 +1019,7 @@ class TestStreamingWriter < Test::Unit::TestCase
       path = File.join(tmp_dir, "data.arrows")
       File.open(path, "wb") do |output|
         writer = ArrowFormat::StreamingWriter.new(output)
-        red_arrow_array = build_array
-        array = convert_array(red_arrow_array)
-        fields = [
-          ArrowFormat::Field.new("value",
-                                 array.type,
-                                 true,
-                                 nil),
-        ]
-        schema = ArrowFormat::Schema.new(fields)
-        record_batch = ArrowFormat::RecordBatch.new(schema, array.size, 
[array])
-        writer.start(schema)
-        writer.write_record_batch(record_batch)
-        writer.finish
+        write(writer)
       end
       data = File.open(path, "rb", &:read).freeze
       table = Arrow::Table.load(Arrow::Buffer.new(data), format: :arrows)

Reply via email to