This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new bc48921816 GH-49208: [Ruby] Add support for writing dictionary delta
message (#49209)
bc48921816 is described below
commit bc489218164218387c9728b89497c80d41eb1c00
Author: Sutou Kouhei <[email protected]>
AuthorDate: Wed Feb 11 09:07:38 2026 +0900
GH-49208: [Ruby] Add support for writing dictionary delta message (#49209)
### Rationale for this change
This focuses on implementing the base mechanism for dictionary delta message
support, so it only adds support for UTF-8 arrays as dictionaries. Other
array types will be supported in follow-up tasks.
### What changes are included in this PR?
* Add support for `ArrowFormat::Array#slice` (it is not complete yet; it
only works partially).
* If the second record batch includes an updated dictionary (new entries
are appended), only the appended entries are sliced out and written as a
delta.
### Are these changes tested?
Yes.
### Are there any user-facing changes?
Yes.
* GitHub Issue: #49208
Authored-by: Sutou Kouhei <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
---
ruby/red-arrow-format/lib/arrow-format/array.rb | 310 +++--
ruby/red-arrow-format/lib/arrow-format/bitmap.rb | 12 +-
.../lib/arrow-format/streaming-writer.rb | 3 +-
ruby/red-arrow-format/lib/arrow-format/type.rb | 88 ++
ruby/red-arrow-format/test/test-writer.rb | 1325 +++++++++-----------
5 files changed, 926 insertions(+), 812 deletions(-)
diff --git a/ruby/red-arrow-format/lib/arrow-format/array.rb
b/ruby/red-arrow-format/lib/arrow-format/array.rb
index 73e87cf721..87dbd0e0d6 100644
--- a/ruby/red-arrow-format/lib/arrow-format/array.rb
+++ b/ruby/red-arrow-format/lib/arrow-format/array.rb
@@ -23,11 +23,20 @@ module ArrowFormat
attr_reader :type
attr_reader :size
alias_method :length, :size
+ attr_reader :offset
attr_reader :validity_buffer
def initialize(type, size, validity_buffer)
@type = type
@size = size
+ @offset = 0
@validity_buffer = validity_buffer
+ @sliced_buffers = {}
+ end
+
+ def slice(offset, size=nil)
+ sliced = dup
+ sliced.slice!(@offset + offset, size || @size - offset)
+ sliced
end
def valid?(i)
@@ -43,16 +52,20 @@ module ArrowFormat
if @validity_buffer.nil?
0
else
- # TODO: popcount
- validity_bitmap.count do |is_valid|
- not is_valid
- end
+ @size - validity_bitmap.popcount
end
end
+ protected
+ def slice!(offset, size)
+ @offset = offset
+ @size = size
+ clear_cache
+ end
+
private
def validity_bitmap
- @validity_bitmap ||= Bitmap.new(@validity_buffer, @size)
+ @validity_bitmap ||= Bitmap.new(@validity_buffer, @offset, @size)
end
def apply_validity(array)
@@ -62,6 +75,63 @@ module ArrowFormat
end
array
end
+
+ def clear_cache
+ @validity_bitmap = nil
+ @sliced_buffers = {}
+ end
+
+ def slice_buffer(id, buffer)
+ return buffer if buffer.nil?
+ return buffer if @offset.zero?
+
+ @sliced_buffers[id] ||= yield(buffer)
+ end
+
+ def slice_bitmap_buffer(id, buffer)
+ slice_buffer(id, buffer) do
+ if (@offset % 8).zero?
+ buffer.slice(@offset / 8)
+ else
+ # We need to copy because we can't do bit level slice.
+ # TODO: Optimize.
+ valid_bytes = []
+ Bitmap.new(buffer, @offset, @size).each_slice(8) do |valids|
+ valid_byte = 0
+ valids.each_with_index do |valid, i|
+ valid_byte |= 1 << (i % 8) if valid
+ end
+ valid_bytes << valid_byte
+ end
+ IO::Buffer.for(valid_bytes.pack("C*"))
+ end
+ end
+ end
+
+ def slice_fixed_element_size_buffer(id, buffer, element_size)
+ slice_buffer(id, buffer) do
+ buffer.slice(element_size * @offset)
+ end
+ end
+
+ def slice_offsets_buffer(id, buffer, buffer_type)
+ slice_buffer(id, buffer) do
+ offset_size = IO::Buffer.size_of(buffer_type)
+ buffer_offset = offset_size * (@offset - 1)
+ first_offset = buffer.get_value(buffer_type, buffer_offset)
+ # TODO: Optimize
+ sliced_buffer = IO::Buffer.new(offset_size * (@size + 1))
+ buffer.each(buffer_type,
+ buffer_offset,
+ @size + 1).with_index do |(_, offset), i|
+ new_offset = offset - first_offset
+ sliced_buffer.set_value(buffer_type,
+ offset_size * i,
+ new_offset)
+ end
+ sliced_buffer
+ end
+ end
end
class NullArray < Array
@@ -84,26 +154,48 @@ module ArrowFormat
@values_buffer = values_buffer
end
+ def to_a
+ offset = element_size * @offset
+ apply_validity(@values_buffer.values(@type.buffer_type, offset, @size))
+ end
+
def each_buffer
return to_enum(__method__) unless block_given?
- yield(@validity_buffer)
- yield(@values_buffer)
+ yield(slice_bitmap_buffer(:validity, @validity_buffer))
+ yield(slice_fixed_element_size_buffer(:values,
+ @values_buffer,
+ element_size))
+ end
+
+ private
+ def element_size
+ IO::Buffer.size_of(@type.buffer_type)
end
end
class BooleanArray < PrimitiveArray
def to_a
- @values_bitmap ||= Bitmap.new(@values_buffer, @size)
+ @values_bitmap ||= Bitmap.new(@values_buffer, @offset, @size)
values = @values_bitmap.to_a
apply_validity(values)
end
+
+ def each_buffer
+ return to_enum(__method__) unless block_given?
+
+ yield(slice_bitmap_buffer(:validity, @validity_buffer))
+ yield(slice_bitmap_buffer(:values, @values_buffer))
+ end
+
+ private
+ def clear_cache
+ super
+ @values_bitmap = nil
+ end
end
class IntArray < PrimitiveArray
- def to_a
- apply_validity(@values_buffer.values(@type.buffer_type, 0, @size))
- end
end
class Int8Array < IntArray
@@ -134,15 +226,9 @@ module ArrowFormat
end
class Float32Array < FloatingPointArray
- def to_a
- apply_validity(@values_buffer.values(:f32, 0, @size))
- end
end
class Float64Array < FloatingPointArray
- def to_a
- apply_validity(@values_buffer.values(:f64, 0, @size))
- end
end
class TemporalArray < PrimitiveArray
@@ -152,51 +238,34 @@ module ArrowFormat
end
class Date32Array < DateArray
- def to_a
- apply_validity(@values_buffer.values(:s32, 0, @size))
- end
end
class Date64Array < DateArray
- def to_a
- apply_validity(@values_buffer.values(:s64, 0, @size))
- end
end
class TimeArray < TemporalArray
end
class Time32Array < TimeArray
- def to_a
- apply_validity(@values_buffer.values(:s32, 0, @size))
- end
end
class Time64Array < TimeArray
- def to_a
- apply_validity(@values_buffer.values(:s64, 0, @size))
- end
end
class TimestampArray < TemporalArray
- def to_a
- apply_validity(@values_buffer.values(:s64, 0, @size))
- end
end
class IntervalArray < TemporalArray
end
class YearMonthIntervalArray < IntervalArray
- def to_a
- apply_validity(@values_buffer.values(:s32, 0, @size))
- end
end
class DayTimeIntervalArray < IntervalArray
def to_a
+ offset = element_size * @offset
values = @values_buffer.
- each(:s32, 0, @size * 2).
+ each(@type.buffer_type, offset, @size * 2).
each_slice(2).
collect do |(_, day), (_, time)|
[day, time]
@@ -207,20 +276,23 @@ module ArrowFormat
class MonthDayNanoIntervalArray < IntervalArray
def to_a
- buffer_types = [:s32, :s32, :s64]
+ buffer_types = @type.buffer_types
value_size = IO::Buffer.size_of(buffer_types)
+ base_offset = value_size * @offset
values = @size.times.collect do |i|
- offset = value_size * i
+ offset = base_offset + value_size * i
@values_buffer.get_values(buffer_types, offset)
end
apply_validity(values)
end
+
+ private
+ def element_size
+ IO::Buffer.size_of(@type.buffer_types)
+ end
end
class DurationArray < TemporalArray
- def to_a
- apply_validity(@values_buffer.values(:s64, 0, @size))
- end
end
class VariableSizeBinaryLayoutArray < Array
@@ -233,65 +305,45 @@ module ArrowFormat
def each_buffer
return to_enum(__method__) unless block_given?
- yield(@validity_buffer)
- yield(@offsets_buffer)
- yield(@values_buffer)
+ yield(slice_bitmap_buffer(:validity, @validity_buffer))
+ yield(slice_offsets_buffer(:offsets,
+ @offsets_buffer,
+ @type.offset_buffer_type))
+ sliced_values_buffer = slice_buffer(:values, @values_buffer) do
+ first_offset = @offsets_buffer.get_value(@type.offset_buffer_type,
+ offset_size * @offset)
+ @values_buffer.slice(first_offset)
+ end
+ yield(sliced_values_buffer)
end
def to_a
values = @offsets_buffer.
- each(buffer_type, 0, @size + 1).
+ each(@type.offset_buffer_type, offset_size * @offset, @size + 1).
each_cons(2).
collect do |(_, offset), (_, next_offset)|
length = next_offset - offset
- @values_buffer.get_string(offset, length, encoding)
+ @values_buffer.get_string(offset, length, @type.encoding)
end
apply_validity(values)
end
- end
- class BinaryArray < VariableSizeBinaryLayoutArray
private
- def buffer_type
- :s32 # TODO: big endian support
+ def offset_size
+ IO::Buffer.size_of(@type.offset_buffer_type)
end
+ end
- def encoding
- Encoding::ASCII_8BIT
- end
+ class BinaryArray < VariableSizeBinaryLayoutArray
end
class LargeBinaryArray < VariableSizeBinaryLayoutArray
- private
- def buffer_type
- :s64 # TODO: big endian support
- end
-
- def encoding
- Encoding::ASCII_8BIT
- end
end
class UTF8Array < VariableSizeBinaryLayoutArray
- private
- def buffer_type
- :s32 # TODO: big endian support
- end
-
- def encoding
- Encoding::UTF_8
- end
end
class LargeUTF8Array < VariableSizeBinaryLayoutArray
- private
- def buffer_type
- :s64 # TODO: big endian support
- end
-
- def encoding
- Encoding::UTF_8
- end
end
class FixedSizeBinaryArray < Array
@@ -303,8 +355,10 @@ module ArrowFormat
def each_buffer
return to_enum(__method__) unless block_given?
- yield(@validity_buffer)
- yield(@values_buffer)
+ yield(slice_bitmap_buffer(:validity, @validity_buffer))
+ yield(slice_fixed_element_size_buffer(:values,
+ @values_buffer,
+ @type.byte_width))
end
def to_a
@@ -320,8 +374,9 @@ module ArrowFormat
def to_a
byte_width = @type.byte_width
buffer_types = [:u64] * (byte_width / 8 - 1) + [:s64]
+ base_offset = byte_width * @offset
values = 0.step(@size * byte_width - 1, byte_width).collect do |offset|
- @values_buffer.get_values(buffer_types, offset)
+ @values_buffer.get_values(buffer_types, base_offset + offset)
end
apply_validity(values).collect do |value|
if value.nil?
@@ -379,34 +434,44 @@ module ArrowFormat
def each_buffer(&block)
return to_enum(__method__) unless block_given?
- yield(@validity_buffer)
- yield(@offsets_buffer)
+ yield(slice_bitmap_buffer(:validity, @validity_buffer))
+ yield(slice_offsets_buffer(:offsets,
+ @offsets_buffer,
+ @type.offset_buffer_type))
end
def to_a
child_values = @child.to_a
values = @offsets_buffer.
- each(offset_type, 0, @size + 1).
+ each(@type.offset_buffer_type, offset_size * @offset, @size + 1).
each_cons(2).
collect do |(_, offset), (_, next_offset)|
child_values[offset...next_offset]
end
apply_validity(values)
end
- end
- class ListArray < VariableSizeListArray
private
- def offset_type
- :s32 # TODO: big endian support
+ def offset_size
+ IO::Buffer.size_of(@type.offset_buffer_type)
end
+
+ def slice!(offset, size)
+ super
+ first_offset =
+ @offsets_buffer.get_value(@type.offset_buffer_type,
+ offset_size * @offset)
+ last_offset =
+ @offsets_buffer.get_value(@type.offset_buffer_type,
+ offset_size * (@offset + @size + 1))
+ @child = @child.slice(first_offset, last_offset - first_offset)
+ end
+ end
+
+ class ListArray < VariableSizeListArray
end
class LargeListArray < VariableSizeListArray
- private
- def offset_type
- :s64 # TODO: big endian support
- end
end
class StructArray < Array
@@ -419,7 +484,7 @@ module ArrowFormat
def each_buffer(&block)
return to_enum(__method__) unless block_given?
- yield(@validity_buffer)
+ yield(slice_bitmap_buffer(:validity, @validity_buffer))
end
def to_a
@@ -431,6 +496,14 @@ module ArrowFormat
end
apply_validity(values)
end
+
+ private
+ def slice!(offset, size)
+ super
+ @children = @children.collect do |child|
+ child.slice(offset, size)
+ end
+ end
end
class MapArray < VariableSizeListArray
@@ -447,11 +520,6 @@ module ArrowFormat
end
end
end
-
- private
- def offset_type
- :s32 # TODO: big endian support
- end
end
class UnionArray < Array
@@ -461,6 +529,15 @@ module ArrowFormat
@types_buffer = types_buffer
@children = children
end
+
+ private
+ def type_buffer_type
+ :S8
+ end
+
+ def type_element_size
+ IO::Buffer.size_of(type_buffer_type)
+ end
end
class DenseUnionArray < UnionArray
@@ -476,35 +553,61 @@ module ArrowFormat
def each_buffer(&block)
return to_enum(__method__) unless block_given?
+ # TODO: Dictionary delta support (slice support)
yield(@types_buffer)
yield(@offsets_buffer)
end
def to_a
children_values = @children.collect(&:to_a)
- types = @types_buffer.each(:S8, 0, @size)
- offsets = @offsets_buffer.each(:s32, 0, @size)
+ types = @types_buffer.each(type_buffer_type,
+ type_element_size * @offset,
+ @size)
+ offsets = @offsets_buffer.each(:s32,
+ offset_element_size * @offset,
+ @size)
types.zip(offsets).collect do |(_, type), (_, offset)|
index = @type.resolve_type_index(type)
children_values[index][offset]
end
end
+
+ private
+ def offset_buffer_type
+ :s32
+ end
+
+ def offset_element_size
+ IO::Buffer.size_of(offset_buffer_type)
+ end
end
class SparseUnionArray < UnionArray
def each_buffer(&block)
return to_enum(__method__) unless block_given?
- yield(@types_buffer)
+ yield(slice_fixed_element_size_buffer(:types,
+ @types_buffer,
+ type_element_size))
end
def to_a
children_values = @children.collect(&:to_a)
- @types_buffer.each(:S8, 0, @size).with_index.collect do |(_, type), i|
+ @types_buffer.each(type_buffer_type,
+ type_element_size * @offset,
+ @size).with_index.collect do |(_, type), i|
index = @type.resolve_type_index(type)
children_values[index][i]
end
end
+
+ private
+ def slice!(offset, size)
+ super
+ @children = @children.collect do |child|
+ child.slice(offset, size)
+ end
+ end
end
class DictionaryArray < Array
@@ -516,6 +619,7 @@ module ArrowFormat
@dictionary = dictionary
end
+ # TODO: Slice support
def each_buffer
return to_enum(__method__) unless block_given?
@@ -529,7 +633,9 @@ module ArrowFormat
values.concat(dictionary_chunk.to_a)
end
buffer_type = @type.index_type.buffer_type
- indices = apply_validity(@indices_buffer.values(buffer_type, 0, @size))
+ offset = IO::Buffer.size_of(buffer_type) * @offset
+ indices =
+ apply_validity(@indices_buffer.values(buffer_type, offset, @size))
indices.collect do |index|
if index.nil?
nil
diff --git a/ruby/red-arrow-format/lib/arrow-format/bitmap.rb
b/ruby/red-arrow-format/lib/arrow-format/bitmap.rb
index 88a1ab2ff4..17d7db872e 100644
--- a/ruby/red-arrow-format/lib/arrow-format/bitmap.rb
+++ b/ruby/red-arrow-format/lib/arrow-format/bitmap.rb
@@ -18,15 +18,18 @@ module ArrowFormat
class Bitmap
include Enumerable
- def initialize(buffer, n_values)
+ def initialize(buffer, offset, n_values)
@buffer = buffer
+ @offset = offset
@n_values = n_values
end
def [](i)
+ i += @offset
(@buffer.get_value(:U8, i / 8) & (1 << (i % 8))) > 0
end
+ # TODO: offset support
def each
return to_enum(__method__) unless block_given?
@@ -44,5 +47,12 @@ module ArrowFormat
end
end
end
+
+ def popcount
+ # TODO: Optimize
+ count do |flaged|
+ flaged
+ end
+ end
end
end
diff --git a/ruby/red-arrow-format/lib/arrow-format/streaming-writer.rb
b/ruby/red-arrow-format/lib/arrow-format/streaming-writer.rb
index 2f8f90b706..d63016a25b 100644
--- a/ruby/red-arrow-format/lib/arrow-format/streaming-writer.rb
+++ b/ruby/red-arrow-format/lib/arrow-format/streaming-writer.rb
@@ -118,8 +118,7 @@ module ArrowFormat
is_delta = false
else
is_delta = true
- raise NotImplementedError,
- "Delta dictionary message isn't implemented yet"
+ dictionary = dictionary.slice(offset)
end
schema = Schema.new([Field.new("dummy", value_type, true, nil)])
diff --git a/ruby/red-arrow-format/lib/arrow-format/type.rb
b/ruby/red-arrow-format/lib/arrow-format/type.rb
index 4ea41a2538..8d49b3810b 100644
--- a/ruby/red-arrow-format/lib/arrow-format/type.rb
+++ b/ruby/red-arrow-format/lib/arrow-format/type.rb
@@ -305,6 +305,10 @@ module ArrowFormat
"Float32"
end
+ def buffer_type
+ :f32
+ end
+
def build_array(size, validity_buffer, values_buffer)
Float32Array.new(self, size, validity_buffer, values_buffer)
end
@@ -325,6 +329,10 @@ module ArrowFormat
"Float64"
end
+ def buffer_type
+ :f64
+ end
+
def build_array(size, validity_buffer, values_buffer)
Float64Array.new(self, size, validity_buffer, values_buffer)
end
@@ -362,6 +370,10 @@ module ArrowFormat
"Date32"
end
+ def buffer_type
+ :s32
+ end
+
def build_array(size, validity_buffer, values_buffer)
Date32Array.new(self, size, validity_buffer, values_buffer)
end
@@ -382,6 +394,10 @@ module ArrowFormat
"Date64"
end
+ def buffer_type
+ :s64
+ end
+
def build_array(size, validity_buffer, values_buffer)
Date64Array.new(self, size, validity_buffer, values_buffer)
end
@@ -413,6 +429,10 @@ module ArrowFormat
"Time32"
end
+ def buffer_type
+ :s32
+ end
+
def build_array(size, validity_buffer, values_buffer)
Time32Array.new(self, size, validity_buffer, values_buffer)
end
@@ -427,6 +447,10 @@ module ArrowFormat
"Time64"
end
+ def buffer_type
+ :s64
+ end
+
def build_array(size, validity_buffer, values_buffer)
Time64Array.new(self, size, validity_buffer, values_buffer)
end
@@ -445,6 +469,10 @@ module ArrowFormat
"Timestamp"
end
+ def buffer_type
+ :s64
+ end
+
def build_array(size, validity_buffer, values_buffer)
TimestampArray.new(self, size, validity_buffer, values_buffer)
end
@@ -486,6 +514,10 @@ module ArrowFormat
"YearMonthInterval"
end
+ def buffer_type
+ :s32
+ end
+
def build_array(size, validity_buffer, values_buffer)
YearMonthIntervalArray.new(self, size, validity_buffer, values_buffer)
end
@@ -500,6 +532,10 @@ module ArrowFormat
"DayTimeInterval"
end
+ def buffer_type
+ :s32
+ end
+
def build_array(size, validity_buffer, values_buffer)
DayTimeIntervalArray.new(self, size, validity_buffer, values_buffer)
end
@@ -514,6 +550,10 @@ module ArrowFormat
"MonthDayNanoInterval"
end
+ def buffer_types
+ @buffer_types ||= [:s32, :s32, :s64]
+ end
+
def build_array(size, validity_buffer, values_buffer)
MonthDayNanoIntervalArray.new(self,
size,
@@ -533,6 +573,10 @@ module ArrowFormat
"Duration"
end
+ def buffer_type
+ :s64
+ end
+
def build_array(size, validity_buffer, values_buffer)
DurationArray.new(self, size, validity_buffer, values_buffer)
end
@@ -558,6 +602,14 @@ module ArrowFormat
"Binary"
end
+ def offset_buffer_type
+ :s32 # TODO: big endian support
+ end
+
+ def encoding
+ Encoding::ASCII_8BIT
+ end
+
def build_array(size, validity_buffer, offsets_buffer, values_buffer)
BinaryArray.new(self,
size,
@@ -582,6 +634,14 @@ module ArrowFormat
"LargeBinary"
end
+ def offset_buffer_type
+ :s64 # TODO: big endian support
+ end
+
+ def encoding
+ Encoding::ASCII_8BIT
+ end
+
def build_array(size, validity_buffer, offsets_buffer, values_buffer)
LargeBinaryArray.new(self,
size,
@@ -606,6 +666,14 @@ module ArrowFormat
"UTF8"
end
+ def offset_buffer_type
+ :s32 # TODO: big endian support
+ end
+
+ def encoding
+ Encoding::UTF_8
+ end
+
def build_array(size, validity_buffer, offsets_buffer, values_buffer)
UTF8Array.new(self, size, validity_buffer, offsets_buffer, values_buffer)
end
@@ -626,6 +694,14 @@ module ArrowFormat
"LargeUTF8"
end
+ def offset_buffer_type
+ :s64 # TODO: big endian support
+ end
+
+ def encoding
+ Encoding::UTF_8
+ end
+
def build_array(size, validity_buffer, offsets_buffer, values_buffer)
LargeUTF8Array.new(self,
size,
@@ -720,6 +796,10 @@ module ArrowFormat
"List"
end
+ def offset_buffer_type
+ :s32 # TODO: big endian support
+ end
+
def build_array(size, validity_buffer, offsets_buffer, child)
ListArray.new(self, size, validity_buffer, offsets_buffer, child)
end
@@ -734,6 +814,10 @@ module ArrowFormat
"LargeList"
end
+ def offset_buffer_type
+ :s64 # TODO: big endian support
+ end
+
def build_array(size, validity_buffer, offsets_buffer, child)
LargeListArray.new(self, size, validity_buffer, offsets_buffer, child)
end
@@ -788,6 +872,10 @@ module ArrowFormat
"Map"
end
+ def offset_buffer_type
+ :s32 # TODO: big endian support
+ end
+
def build_array(size, validity_buffer, offsets_buffer, child)
MapArray.new(self, size, validity_buffer, offsets_buffer, child)
end
diff --git a/ruby/red-arrow-format/test/test-writer.rb
b/ruby/red-arrow-format/test/test-writer.rb
index 3e4b5bedba..33b3c2db22 100644
--- a/ruby/red-arrow-format/test/test-writer.rb
+++ b/ruby/red-arrow-format/test/test-writer.rb
@@ -198,465 +198,391 @@ module WriterTests
end
end
- def write(writer)
- red_arrow_array = build_array
- array = convert_array(red_arrow_array)
- red_arrow_field = Arrow::Field.new("value",
- red_arrow_array.value_data_type,
- true)
- fields = [convert_field(red_arrow_field)]
- schema = ArrowFormat::Schema.new(fields)
- record_batch = ArrowFormat::RecordBatch.new(schema, array.size, [array])
- writer.start(schema)
- writer.write_record_batch(record_batch)
+ def write(writer, *inputs)
+ inputs.each_with_index do |input, i|
+ case input
+ when ArrowFormat::RecordBatch
+ record_batch = input
+ else
+ red_arrow_array = input
+ array = convert_array(red_arrow_array)
+ red_arrow_field = Arrow::Field.new("value",
+ red_arrow_array.value_data_type,
+ true)
+ fields = [convert_field(red_arrow_field)]
+ schema = ArrowFormat::Schema.new(fields)
+ record_batch = ArrowFormat::RecordBatch.new(schema,
+ array.size,
+ [array])
+ end
+ writer.start(record_batch.schema) if i.zero?
+ writer.write_record_batch(record_batch)
+ end
writer.finish
end
+ def roundtrip(*inputs)
+ Dir.mktmpdir do |tmp_dir|
+ path = File.join(tmp_dir, "data.#{file_extension}")
+ File.open(path, "wb") do |output|
+ writer = writer_class.new(output)
+ write(writer, *inputs)
+ end
+ data = File.open(path, "rb", &:read).freeze
+ table = Arrow::Table.load(Arrow::Buffer.new(data), format: :arrow)
+ [table.value.data_type, table.value.values]
+ end
+ end
+
class << self
def included(base)
base.class_eval do
- sub_test_case("Null") do
- def build_array
- Arrow::NullArray.new(3)
- end
-
- def test_write
- assert_equal([nil, nil, nil],
- @values)
- end
+ def test_null
+ array = Arrow::NullArray.new(3)
+ type, values = roundtrip(array)
+ assert_equal(["null", [nil, nil, nil]],
+ [type.to_s, values])
end
- sub_test_case("Boolean") do
- def build_array
- Arrow::BooleanArray.new([true, nil, false])
- end
-
- def test_write
- assert_equal([true, nil, false],
- @values)
- end
+ def test_boolean
+ array = Arrow::BooleanArray.new([true, nil, false])
+ type, values = roundtrip(array)
+ assert_equal(["bool", [true, nil, false]],
+ [type.to_s, values])
end
- sub_test_case("Int8") do
- def build_array
- Arrow::Int8Array.new([-128, nil, 127])
- end
-
- def test_write
- assert_equal([-128, nil, 127],
- @values)
- end
+ def test_int8
+ array = Arrow::Int8Array.new([-128, nil, 127])
+ type, values = roundtrip(array)
+ assert_equal(["int8", [-128, nil, 127]],
+ [type.to_s, values])
end
- sub_test_case("UInt8") do
- def build_array
- Arrow::UInt8Array.new([0, nil, 255])
- end
-
- def test_write
- assert_equal([0, nil, 255],
- @values)
- end
+ def test_uint8
+ array = Arrow::UInt8Array.new([0, nil, 255])
+ type, values = roundtrip(array)
+ assert_equal(["uint8", [0, nil, 255]],
+ [type.to_s, values])
end
- sub_test_case("Int16") do
- def build_array
- Arrow::Int16Array.new([-32768, nil, 32767])
- end
-
- def test_write
- assert_equal([-32768, nil, 32767],
- @values)
- end
+ def test_int16
+ array = Arrow::Int16Array.new([-32768, nil, 32767])
+ type, values = roundtrip(array)
+ assert_equal(["int16", [-32768, nil, 32767]],
+ [type.to_s, values])
end
- sub_test_case("UInt16") do
- def build_array
- Arrow::UInt16Array.new([0, nil, 65535])
- end
-
- def test_write
- assert_equal([0, nil, 65535],
- @values)
- end
+ def test_uint16
+ array = Arrow::UInt16Array.new([0, nil, 65535])
+ type, values = roundtrip(array)
+ assert_equal(["uint16", [0, nil, 65535]],
+ [type.to_s, values])
end
- sub_test_case("Int32") do
- def build_array
- Arrow::Int32Array.new([-2147483648, nil, 2147483647])
- end
-
- def test_write
- assert_equal([-2147483648, nil, 2147483647],
- @values)
- end
+ def test_int32
+ array = Arrow::Int32Array.new([-2147483648, nil, 2147483647])
+ type, values = roundtrip(array)
+ assert_equal(["int32", [-2147483648, nil, 2147483647]],
+ [type.to_s, values])
end
- sub_test_case("UInt32") do
- def build_array
- Arrow::UInt32Array.new([0, nil, 4294967295])
- end
-
- def test_write
- assert_equal([0, nil, 4294967295],
- @values)
- end
+ def test_uint32
+ array = Arrow::UInt32Array.new([0, nil, 4294967295])
+ type, values = roundtrip(array)
+ assert_equal(["uint32", [0, nil, 4294967295]],
+ [type.to_s, values])
end
- sub_test_case("Int64") do
- def build_array
- Arrow::Int64Array.new([
- -9223372036854775808,
- nil,
- 9223372036854775807
- ])
- end
-
- def test_write
- assert_equal([
+ def test_int64
+ array = Arrow::Int64Array.new([
+ -9223372036854775808,
+ nil,
+ 9223372036854775807
+ ])
+ type, values = roundtrip(array)
+ assert_equal([
+ "int64",
+ [
-9223372036854775808,
nil,
9223372036854775807
],
- @values)
- end
- end
-
- sub_test_case("UInt64") do
- def build_array
- Arrow::UInt64Array.new([0, nil, 18446744073709551615])
- end
-
- def test_write
- assert_equal([0, nil, 18446744073709551615],
- @values)
- end
- end
-
- sub_test_case("Float32") do
- def build_array
- Arrow::FloatArray.new([-0.5, nil, 0.5])
- end
-
- def test_write
- assert_equal([-0.5, nil, 0.5],
- @values)
- end
- end
-
- sub_test_case("Float64") do
- def build_array
- Arrow::DoubleArray.new([-0.5, nil, 0.5])
- end
-
- def test_write
- assert_equal([-0.5, nil, 0.5],
- @values)
- end
- end
-
- sub_test_case("Date32") do
- def setup(&block)
- @date_2017_08_28 = 17406
- @date_2025_12_09 = 20431
- super(&block)
- end
-
- def build_array
- Arrow::Date32Array.new([@date_2017_08_28, nil, @date_2025_12_09])
- end
-
- def test_write
- assert_equal([Date.new(2017, 8, 28), nil, Date.new(2025, 12, 9)],
- @values)
- end
- end
-
- sub_test_case("Date64") do
- def setup(&block)
- @date_2017_08_28_00_00_00 = 1503878400000
- @date_2025_12_10_00_00_00 = 1765324800000
- super(&block)
- end
-
- def build_array
- Arrow::Date64Array.new([
- @date_2017_08_28_00_00_00,
- nil,
- @date_2025_12_10_00_00_00,
- ])
- end
-
- def test_write
- assert_equal([
+ ],
+ [type.to_s, values])
+ end
+
+ def test_uint64
+ array = Arrow::UInt64Array.new([0, nil, 18446744073709551615])
+ type, values = roundtrip(array)
+ assert_equal(["uint64", [0, nil, 18446744073709551615]],
+ [type.to_s, values])
+ end
+
+ def test_float32
+ array = Arrow::FloatArray.new([-0.5, nil, 0.5])
+ type, values = roundtrip(array)
+ assert_equal(["float", [-0.5, nil, 0.5]],
+ [type.to_s, values])
+ end
+
+ def test_float64
+ array = Arrow::DoubleArray.new([-0.5, nil, 0.5])
+ type, values = roundtrip(array)
+ assert_equal(["double", [-0.5, nil, 0.5]],
+ [type.to_s, values])
+ end
+
+ def test_date32
+ date_2017_08_28 = 17406
+ date_2025_12_09 = 20431
+ array = Arrow::Date32Array.new([
+ date_2017_08_28,
+ nil,
+ date_2025_12_09,
+ ])
+ type, values = roundtrip(array)
+ assert_equal([
+ "date32[day]",
+ [Date.new(2017, 8, 28), nil, Date.new(2025, 12, 9)],
+ ],
+ [type.to_s, values])
+ end
+
+ def test_date64
+ date_2017_08_28_00_00_00 = 1503878400000
+ date_2025_12_10_00_00_00 = 1765324800000
+ array = Arrow::Date64Array.new([
+ date_2017_08_28_00_00_00,
+ nil,
+ date_2025_12_10_00_00_00,
+ ])
+ type, values = roundtrip(array)
+ assert_equal([
+ "date64[ms]",
+ [
DateTime.new(2017, 8, 28, 0, 0, 0),
nil,
DateTime.new(2025, 12, 10, 0, 0, 0),
],
- @values)
- end
- end
-
- sub_test_case("Time32(:second)") do
- def setup(&block)
- @time_00_00_10 = 10
- @time_00_01_10 = 60 + 10
- super(&block)
- end
-
- def build_array
- Arrow::Time32Array.new(:second,
- [@time_00_00_10, nil, @time_00_01_10])
- end
-
- def test_write
- assert_equal([
- Arrow::Time.new(:second, @time_00_00_10),
+ ],
+ [type.to_s, values])
+ end
+
+ def test_time32_second
+ time_00_00_10 = 10
+ time_00_01_10 = 60 + 10
+ array = Arrow::Time32Array.new(:second,
+ [time_00_00_10, nil, time_00_01_10])
+ type, values = roundtrip(array)
+ assert_equal([
+ "time32[s]",
+ [
+ Arrow::Time.new(:second, time_00_00_10),
nil,
- Arrow::Time.new(:second, @time_00_01_10),
+ Arrow::Time.new(:second, time_00_01_10),
],
- @values)
- end
- end
-
- sub_test_case("Time32(:millisecond)") do
- def setup(&block)
- @time_00_00_10_000 = 10 * 1000
- @time_00_01_10_000 = (60 + 10) * 1000
- super(&block)
- end
-
- def build_array
- Arrow::Time32Array.new(:milli,
- [
- @time_00_00_10_000,
- nil,
- @time_00_01_10_000,
- ])
- end
-
- def test_write
- assert_equal([
- Arrow::Time.new(:milli, @time_00_00_10_000),
+ ],
+ [type.to_s, values])
+ end
+
+ def test_time32_millisecond
+ time_00_00_10_000 = 10 * 1000
+ time_00_01_10_000 = (60 + 10) * 1000
+ array = Arrow::Time32Array.new(:milli,
+ [
+ time_00_00_10_000,
+ nil,
+ time_00_01_10_000,
+ ])
+ type, values = roundtrip(array)
+ assert_equal([
+ "time32[ms]",
+ [
+ Arrow::Time.new(:milli, time_00_00_10_000),
nil,
- Arrow::Time.new(:milli, @time_00_01_10_000),
+ Arrow::Time.new(:milli, time_00_01_10_000),
],
- @values)
- end
- end
-
- sub_test_case("Time64(:microsecond)") do
- def setup(&block)
- @time_00_00_10_000_000 = 10 * 1_000_000
- @time_00_01_10_000_000 = (60 + 10) * 1_000_000
- super(&block)
- end
-
- def build_array
- Arrow::Time64Array.new(:micro,
- [
- @time_00_00_10_000_000,
- nil,
- @time_00_01_10_000_000,
- ])
- end
-
- def test_write
- assert_equal([
- Arrow::Time.new(:micro, @time_00_00_10_000_000),
+ ],
+ [type.to_s, values])
+ end
+
+ def test_time64_microsecond
+ time_00_00_10_000_000 = 10 * 1_000_000
+ time_00_01_10_000_000 = (60 + 10) * 1_000_000
+ array = Arrow::Time64Array.new(:micro,
+ [
+ time_00_00_10_000_000,
+ nil,
+ time_00_01_10_000_000,
+ ])
+ type, values = roundtrip(array)
+ assert_equal([
+ "time64[us]",
+ [
+ Arrow::Time.new(:micro, time_00_00_10_000_000),
nil,
- Arrow::Time.new(:micro, @time_00_01_10_000_000),
+ Arrow::Time.new(:micro, time_00_01_10_000_000),
],
- @values)
- end
- end
-
- sub_test_case("Time64(:nanosecond)") do
- def setup(&block)
- @time_00_00_10_000_000_000 = 10 * 1_000_000_000
- @time_00_01_10_000_000_000 = (60 + 10) * 1_000_000_000
- super(&block)
- end
-
- def build_array
- Arrow::Time64Array.new(:nano,
- [
- @time_00_00_10_000_000_000,
- nil,
- @time_00_01_10_000_000_000,
- ])
- end
-
- def test_write
- assert_equal([
- Arrow::Time.new(:nano, @time_00_00_10_000_000_000),
+ ],
+ [type.to_s, values])
+ end
+
+ def test_time64_nanosecond
+ time_00_00_10_000_000_000 = 10 * 1_000_000_000
+ time_00_01_10_000_000_000 = (60 + 10) * 1_000_000_000
+ array = Arrow::Time64Array.new(:nano,
+ [
+ time_00_00_10_000_000_000,
+ nil,
+ time_00_01_10_000_000_000,
+ ])
+ type, values = roundtrip(array)
+ assert_equal([
+ "time64[ns]",
+ [
+ Arrow::Time.new(:nano, time_00_00_10_000_000_000),
nil,
- Arrow::Time.new(:nano, @time_00_01_10_000_000_000),
+ Arrow::Time.new(:nano, time_00_01_10_000_000_000),
],
- @values)
- end
+ ],
+ [type.to_s, values])
end
- sub_test_case("Timestamp(:second)") do
- def setup(&block)
- @timestamp_2019_11_17_15_09_11 = 1574003351
- @timestamp_2025_12_16_05_33_58 = 1765863238
- super(&block)
- end
-
- def build_array
- Arrow::TimestampArray.new(:second,
- [
- @timestamp_2019_11_17_15_09_11,
- nil,
- @timestamp_2025_12_16_05_33_58,
- ])
- end
-
- def test_write
- assert_equal([
- Time.at(@timestamp_2019_11_17_15_09_11),
+ def test_timestamp_second
+ timestamp_2019_11_17_15_09_11 = 1574003351
+ timestamp_2025_12_16_05_33_58 = 1765863238
+ array = Arrow::TimestampArray.new(:second,
+ [
+ timestamp_2019_11_17_15_09_11,
+ nil,
+ timestamp_2025_12_16_05_33_58,
+ ])
+ type, values = roundtrip(array)
+ assert_equal([
+ "timestamp[s]",
+ [
+ Time.at(timestamp_2019_11_17_15_09_11),
nil,
- Time.at(@timestamp_2025_12_16_05_33_58),
+ Time.at(timestamp_2025_12_16_05_33_58),
],
- @values)
- end
+ ],
+ [type.to_s, values])
end
- sub_test_case("Timestamp(:millisecond)") do
- def setup(&block)
- @timestamp_2019_11_17_15_09_11 = 1574003351 * 1_000
- @timestamp_2025_12_16_05_33_58 = 1765863238 * 1_000
- super(&block)
- end
-
- def build_array
- Arrow::TimestampArray.new(:milli,
- [
- @timestamp_2019_11_17_15_09_11,
- nil,
- @timestamp_2025_12_16_05_33_58,
- ])
- end
-
- def test_write
- assert_equal([
- Time.at(@timestamp_2019_11_17_15_09_11 / 1_000),
+ def test_timestamp_millisecond
+ timestamp_2019_11_17_15_09_11 = 1574003351 * 1_000
+ timestamp_2025_12_16_05_33_58 = 1765863238 * 1_000
+ array = Arrow::TimestampArray.new(:milli,
+ [
+ timestamp_2019_11_17_15_09_11,
+ nil,
+ timestamp_2025_12_16_05_33_58,
+ ])
+ type, values = roundtrip(array)
+ assert_equal([
+ "timestamp[ms]",
+ [
+ Time.at(timestamp_2019_11_17_15_09_11 / 1_000),
nil,
- Time.at(@timestamp_2025_12_16_05_33_58 / 1_000),
+ Time.at(timestamp_2025_12_16_05_33_58 / 1_000),
],
- @values)
- end
+ ],
+ [type.to_s, values])
end
- sub_test_case("Timestamp(:microsecond)") do
- def setup(&block)
- @timestamp_2019_11_17_15_09_11 = 1574003351 * 1_000_000
- @timestamp_2025_12_16_05_33_58 = 1765863238 * 1_000_000
- super(&block)
- end
-
- def build_array
- Arrow::TimestampArray.new(:micro,
- [
- @timestamp_2019_11_17_15_09_11,
- nil,
- @timestamp_2025_12_16_05_33_58,
- ])
- end
-
- def test_write
- assert_equal([
- Time.at(@timestamp_2019_11_17_15_09_11 / 1_000_000),
+ def test_timestamp_microsecond
+ timestamp_2019_11_17_15_09_11 = 1574003351 * 1_000_000
+ timestamp_2025_12_16_05_33_58 = 1765863238 * 1_000_000
+ array = Arrow::TimestampArray.new(:micro,
+ [
+ timestamp_2019_11_17_15_09_11,
+ nil,
+ timestamp_2025_12_16_05_33_58,
+ ])
+ type, values = roundtrip(array)
+ assert_equal([
+ "timestamp[us]",
+ [
+ Time.at(timestamp_2019_11_17_15_09_11 / 1_000_000),
nil,
- Time.at(@timestamp_2025_12_16_05_33_58 / 1_000_000),
+ Time.at(timestamp_2025_12_16_05_33_58 / 1_000_000),
],
- @values)
- end
+ ],
+ [type.to_s, values])
end
- sub_test_case("Timestamp(:nanosecond)") do
- def setup(&block)
- @timestamp_2019_11_17_15_09_11 = 1574003351 * 1_000_000_000
- @timestamp_2025_12_16_05_33_58 = 1765863238 * 1_000_000_000
- super(&block)
- end
-
- def build_array
- Arrow::TimestampArray.new(:nano,
- [
- @timestamp_2019_11_17_15_09_11,
- nil,
- @timestamp_2025_12_16_05_33_58,
- ])
- end
-
- def test_write
- assert_equal([
- Time.at(@timestamp_2019_11_17_15_09_11 /
1_000_000_000),
+ def test_timestamp_nanosecond
+ timestamp_2019_11_17_15_09_11 = 1574003351 * 1_000_000_000
+ timestamp_2025_12_16_05_33_58 = 1765863238 * 1_000_000_000
+ array = Arrow::TimestampArray.new(:nano,
+ [
+ timestamp_2019_11_17_15_09_11,
+ nil,
+ timestamp_2025_12_16_05_33_58,
+ ])
+ type, values = roundtrip(array)
+ assert_equal([
+ "timestamp[ns]",
+ [
+ Time.at(timestamp_2019_11_17_15_09_11 /
1_000_000_000),
nil,
- Time.at(@timestamp_2025_12_16_05_33_58 /
1_000_000_000),
+ Time.at(timestamp_2025_12_16_05_33_58 /
1_000_000_000),
],
- @values)
- end
- end
-
- sub_test_case("Timestamp(time_zone)") do
- def setup(&block)
- @time_zone = "UTC"
- @timestamp_2019_11_17_15_09_11 = 1574003351
- @timestamp_2025_12_16_05_33_58 = 1765863238
- super(&block)
- end
-
- def build_array
- data_type = Arrow::TimestampDataType.new(:second, @time_zone)
- Arrow::TimestampArray.new(data_type,
- [
- @timestamp_2019_11_17_15_09_11,
- nil,
- @timestamp_2025_12_16_05_33_58,
- ])
- end
-
- def test_type
- assert_equal([Arrow::TimeUnit::SECOND, @time_zone],
- [@type.unit, @type.time_zone&.identifier])
- end
+ ],
+ [type.to_s, values])
+ end
+
+ def test_timestamp_time_zone
+ time_zone = "UTC"
+ timestamp_2019_11_17_15_09_11 = 1574003351
+ timestamp_2025_12_16_05_33_58 = 1765863238
+ data_type = Arrow::TimestampDataType.new(:second, time_zone)
+ array = Arrow::TimestampArray.new(data_type,
+ [
+ timestamp_2019_11_17_15_09_11,
+ nil,
+ timestamp_2025_12_16_05_33_58,
+ ])
+ type, values = roundtrip(array)
+ assert_equal([
+ "timestamp[s, tz=#{time_zone}]",
+ [
+ Time.at(timestamp_2019_11_17_15_09_11),
+ nil,
+ Time.at(timestamp_2025_12_16_05_33_58),
+ ],
+ ],
+ [type.to_s, values])
end
- sub_test_case("YearMonthInterval") do
- def build_array
- Arrow::MonthIntervalArray.new([0, nil, 100])
- end
-
- def test_write
- assert_equal([0, nil, 100],
- @values)
- end
+ def test_year_month_interval
+ array = Arrow::MonthIntervalArray.new([0, nil, 100])
+ type, values = roundtrip(array)
+ assert_equal(["month_interval", [0, nil, 100]],
+ [type.to_s, values])
end
- sub_test_case("DayTimeInterval") do
- def build_array
+ def test_day_time_interval
+ array =
Arrow::DayTimeIntervalArray.new([
{day: 1, millisecond: 100},
nil,
{day: 3, millisecond: 300},
])
- end
-
- def test_write
- assert_equal([
+ type, values = roundtrip(array)
+ assert_equal([
+ "day_time_interval",
+ [
{day: 1, millisecond: 100},
nil,
{day: 3, millisecond: 300},
],
- @values)
- end
+ ],
+ [type.to_s, values])
end
- sub_test_case("MonthDayNanoInterval") do
- def build_array
+ def test_month_day_nano_interval
+ array =
Arrow::MonthDayNanoIntervalArray.new([
{
month: 1,
@@ -670,10 +596,10 @@ module WriterTests
nanosecond: 300,
},
])
- end
-
- def test_write
- assert_equal([
+ type, values = roundtrip(array)
+ assert_equal([
+ "month_day_nano_interval",
+ [
{
month: 1,
day: 1,
@@ -686,307 +612,304 @@ module WriterTests
nanosecond: 300,
},
],
- @values)
- end
- end
-
- sub_test_case("Duration(:second)") do
- def build_array
- Arrow::DurationArray.new(:second, [0, nil, 100])
- end
-
- def test_write
- assert_equal([0, nil, 100],
- @values)
- end
-
- def test_type
- assert_equal(Arrow::TimeUnit::SECOND, @type.unit)
- end
- end
-
- sub_test_case("Duration(:millisecond)") do
- def build_array
- Arrow::DurationArray.new(:milli, [0, nil, 100])
- end
-
- def test_write
- assert_equal([0, nil, 100],
- @values)
- end
-
- def test_type
- assert_equal(Arrow::TimeUnit::MILLI, @type.unit)
- end
- end
-
- sub_test_case("Duration(:microsecond)") do
- def build_array
- Arrow::DurationArray.new(:micro, [0, nil, 100])
- end
-
- def test_write
- assert_equal([0, nil, 100],
- @values)
- end
-
- def test_type
- assert_equal(Arrow::TimeUnit::MICRO, @type.unit)
- end
- end
-
- sub_test_case("Duration(:nanosecond)") do
- def build_array
- Arrow::DurationArray.new(:nano, [0, nil, 100])
- end
-
- def test_write
- assert_equal([0, nil, 100],
- @values)
- end
-
- def test_type
- assert_equal(Arrow::TimeUnit::NANO, @type.unit)
- end
- end
-
- sub_test_case("Binary") do
- def build_array
- Arrow::BinaryArray.new(["Hello".b, nil, "World".b])
- end
-
- def test_write
- assert_equal(["Hello".b, nil, "World".b],
- @values)
- end
- end
-
- sub_test_case("LargeBinary") do
- def build_array
- Arrow::LargeBinaryArray.new(["Hello".b, nil, "World".b])
- end
-
- def test_write
- assert_equal(["Hello".b, nil, "World".b],
- @values)
- end
- end
-
- sub_test_case("String") do
- def build_array
- Arrow::StringArray.new(["Hello", nil, "World"])
- end
-
- def test_write
- assert_equal(["Hello", nil, "World"],
- @values)
- end
- end
-
- sub_test_case("LargeString") do
- def build_array
- Arrow::LargeStringArray.new(["Hello", nil, "World"])
- end
-
- def test_write
- assert_equal(["Hello", nil, "World"],
- @values)
- end
- end
-
- sub_test_case("FixedSizeBinary") do
- def build_array
- data_type = Arrow::FixedSizeBinaryDataType.new(4)
- Arrow::FixedSizeBinaryArray.new(data_type,
- ["0124".b, nil, "abcd".b])
- end
-
- def test_write
- assert_equal(["0124".b, nil, "abcd".b],
- @values)
- end
- end
-
- sub_test_case("Decimal128") do
- def build_array
- @positive_small = "1.200"
- @positive_large = ("1234567890" * 3) + "12345.678"
- @negative_small = "-1.200"
- @negative_large = "-" + ("1234567890" * 3) + "12345.678"
- Arrow::Decimal128Array.new({precision: 38, scale: 3},
- [
- @positive_large,
- @positive_small,
- nil,
- @negative_small,
- @negative_large,
- ])
- end
-
- def test_write
- assert_equal([
- BigDecimal(@positive_large),
- BigDecimal(@positive_small),
+ ],
+ [type.to_s, values])
+ end
+
+ def test_duration_second
+ array = Arrow::DurationArray.new(:second, [0, nil, 100])
+ type, values = roundtrip(array)
+ assert_equal(["duration[s]", [0, nil, 100]],
+ [type.to_s, values])
+ end
+
+ def test_duration_millisecond
+ array = Arrow::DurationArray.new(:milli, [0, nil, 100])
+ type, values = roundtrip(array)
+ assert_equal(["duration[ms]", [0, nil, 100]],
+ [type.to_s, values])
+ end
+
+ def test_duration_microsecond
+ array = Arrow::DurationArray.new(:micro, [0, nil, 100])
+ type, values = roundtrip(array)
+ assert_equal(["duration[us]", [0, nil, 100]],
+ [type.to_s, values])
+ end
+
+ def test_duration_nanosecond
+ array = Arrow::DurationArray.new(:nano, [0, nil, 100])
+ type, values = roundtrip(array)
+ assert_equal(["duration[ns]", [0, nil, 100]],
+ [type.to_s, values])
+ end
+
+ def test_binary
+ array = Arrow::BinaryArray.new(["Hello".b, nil, "World".b])
+ type, values = roundtrip(array)
+ assert_equal(["binary", ["Hello".b, nil, "World".b]],
+ [type.to_s, values])
+ end
+
+ def test_large_binary
+ array = Arrow::LargeBinaryArray.new(["Hello".b, nil, "World".b])
+ type, values = roundtrip(array)
+ assert_equal(["large_binary", ["Hello".b, nil, "World".b]],
+ [type.to_s, values])
+ end
+
+ def test_utf8
+ array = Arrow::StringArray.new(["Hello", nil, "World"])
+ type, values = roundtrip(array)
+ assert_equal(["string", ["Hello", nil, "World"]],
+ [type.to_s, values])
+ end
+
+ def test_large_utf8
+ array = Arrow::LargeStringArray.new(["Hello", nil, "World"])
+ type, values = roundtrip(array)
+ assert_equal(["large_string", ["Hello", nil, "World"]],
+ [type.to_s, values])
+ end
+
+ def test_fixed_size_binary
+ data_type = Arrow::FixedSizeBinaryDataType.new(4)
+ array = Arrow::FixedSizeBinaryArray.new(data_type,
+ ["0124".b, nil, "abcd".b])
+ type, values = roundtrip(array)
+ assert_equal(["fixed_size_binary[4]", ["0124".b, nil, "abcd".b]],
+ [type.to_s, values])
+ end
+
+ def test_decimal128
+ positive_small = "1.200"
+ positive_large = ("1234567890" * 3) + "12345.678"
+ negative_small = "-1.200"
+ negative_large = "-" + ("1234567890" * 3) + "12345.678"
+ array = Arrow::Decimal128Array.new({precision: 38, scale: 3},
+ [
+ positive_large,
+ positive_small,
+ nil,
+ negative_small,
+ negative_large,
+ ])
+ type, values = roundtrip(array)
+ assert_equal([
+ "decimal128(38, 3)",
+ [
+ BigDecimal(positive_large),
+ BigDecimal(positive_small),
nil,
- BigDecimal(@negative_small),
- BigDecimal(@negative_large),
+ BigDecimal(negative_small),
+ BigDecimal(negative_large),
],
- @values)
- end
- end
-
- sub_test_case("Decimal256") do
- def build_array
- @positive_small = "1.200"
- @positive_large = ("1234567890" * 7) + "123.456"
- @negative_small = "-1.200"
- @negative_large = "-" + ("1234567890" * 7) + "123.456"
- Arrow::Decimal256Array.new({precision: 76, scale: 3},
- [
- @positive_large,
- @positive_small,
- nil,
- @negative_small,
- @negative_large,
- ])
- end
-
- def test_write
- assert_equal([
- BigDecimal(@positive_large),
- BigDecimal(@positive_small),
+ ],
+ [type.to_s, values])
+ end
+
+ def test_decimal256
+ positive_small = "1.200"
+ positive_large = ("1234567890" * 7) + "123.456"
+ negative_small = "-1.200"
+ negative_large = "-" + ("1234567890" * 7) + "123.456"
+ array = Arrow::Decimal256Array.new({precision: 76, scale: 3},
+ [
+ positive_large,
+ positive_small,
+ nil,
+ negative_small,
+ negative_large,
+ ])
+ type, values = roundtrip(array)
+ assert_equal([
+ "decimal256(76, 3)",
+ [
+ BigDecimal(positive_large),
+ BigDecimal(positive_small),
nil,
- BigDecimal(@negative_small),
- BigDecimal(@negative_large),
+ BigDecimal(negative_small),
+ BigDecimal(negative_large),
],
- @values)
- end
- end
-
- sub_test_case("List") do
- def build_array
- data_type = Arrow::ListDataType.new(name: "count", type: :int8)
- Arrow::ListArray.new(data_type, [[-128, 127], nil, [-1, 0, 1]])
- end
-
- def test_write
- assert_equal([[-128, 127], nil, [-1, 0, 1]],
- @values)
- end
- end
-
- sub_test_case("LargeList") do
- def build_array
- data_type = Arrow::LargeListDataType.new(name: "count",
- type: :int8)
- Arrow::LargeListArray.new(data_type,
- [[-128, 127], nil, [-1, 0, 1]])
- end
-
- def test_write
- assert_equal([[-128, 127], nil, [-1, 0, 1]],
- @values)
- end
- end
-
- sub_test_case("Map") do
- def build_array
- data_type = Arrow::MapDataType.new(:string, :int8)
- Arrow::MapArray.new(data_type,
- [
- {"a" => -128, "b" => 127},
- nil,
- {"c" => nil},
- ])
- end
-
- def test_write
- assert_equal([
+ ],
+ [type.to_s, values])
+ end
+
+ def test_list
+ data_type = Arrow::ListDataType.new(name: "count", type: :int8)
+ array = Arrow::ListArray.new(data_type,
+ [[-128, 127], nil, [-1, 0, 1]])
+ type, values = roundtrip(array)
+ assert_equal([
+ "list<count: int8>",
+ [[-128, 127], nil, [-1, 0, 1]],
+ ],
+ [type.to_s, values])
+ end
+
+ def test_large_list
+ data_type = Arrow::LargeListDataType.new(name: "count",
+ type: :int8)
+ array = Arrow::LargeListArray.new(data_type,
+ [[-128, 127], nil, [-1, 0, 1]])
+ type, values = roundtrip(array)
+ assert_equal([
+ "large_list<count: int8>",
+ [[-128, 127], nil, [-1, 0, 1]],
+ ],
+ [type.to_s, values])
+ end
+
+ def test_map
+ data_type = Arrow::MapDataType.new(:string, :int8)
+ array = Arrow::MapArray.new(data_type,
+ [
+ {"a" => -128, "b" => 127},
+ nil,
+ {"c" => nil},
+ ])
+ type, values = roundtrip(array)
+ assert_equal([
+ "map<string, int8>",
+ [
{"a" => -128, "b" => 127},
nil,
{"c" => nil},
],
- @values)
- end
- end
-
- sub_test_case("Struct") do
- def build_array
- data_type = Arrow::StructDataType.new(count: :int8,
- visible: :boolean)
- Arrow::StructArray.new(data_type,
- [[-128, nil], nil, [nil, true]])
- end
-
- def test_write
- assert_equal([
+ ],
+ [type.to_s, values])
+ end
+
+ def test_struct
+ data_type = Arrow::StructDataType.new(count: :int8,
+ visible: :boolean)
+ array = Arrow::StructArray.new(data_type,
+ [[-128, nil], nil, [nil, true]])
+ type, values = roundtrip(array)
+ assert_equal([
+ "struct<count: int8, visible: bool>",
+ [
{"count" => -128, "visible" => nil},
nil,
{"count" => nil, "visible" => true},
],
- @values)
- end
- end
-
- sub_test_case("DenseUnion") do
- def build_array
- fields = [
- Arrow::Field.new("number", :int8),
- Arrow::Field.new("text", :string),
- ]
- type_ids = [11, 13]
- data_type = Arrow::DenseUnionDataType.new(fields, type_ids)
- types = Arrow::Int8Array.new([11, 13, 11, 13, 13])
- value_offsets = Arrow::Int32Array.new([0, 0, 1, 1, 2])
- children = [
- Arrow::Int8Array.new([1, nil]),
- Arrow::StringArray.new(["a", "b", "c"])
- ]
- Arrow::DenseUnionArray.new(data_type,
- types,
- value_offsets,
- children)
- end
-
- def test_write
- assert_equal([1, "a", nil, "b", "c"],
- @values)
- end
- end
-
- sub_test_case("SparseUnion") do
- def build_array
- fields = [
- Arrow::Field.new("number", :int8),
- Arrow::Field.new("text", :string),
- ]
- type_ids = [11, 13]
- data_type = Arrow::SparseUnionDataType.new(fields, type_ids)
- types = Arrow::Int8Array.new([11, 13, 11, 13, 11])
- children = [
- Arrow::Int8Array.new([1, nil, nil, nil, 5]),
- Arrow::StringArray.new([nil, "b", nil, "d", nil])
- ]
- Arrow::SparseUnionArray.new(data_type, types, children)
- end
-
- def test_write
- assert_equal([1, "b", nil, "d", 5],
- @values)
- end
- end
-
- sub_test_case("Dictionary") do
- def build_array
- values = ["a", "b", "c", nil, "a"]
- string_array = Arrow::StringArray.new(values)
- string_array.dictionary_encode
- end
-
- def test_write
- assert_equal(["a", "b", "c", nil, "a"],
- @values)
- end
+ ],
+ [type.to_s, values])
+ end
+
+ def test_dense_union
+ fields = [
+ Arrow::Field.new("number", :int8),
+ Arrow::Field.new("text", :string),
+ ]
+ type_ids = [11, 13]
+ data_type = Arrow::DenseUnionDataType.new(fields, type_ids)
+ types = Arrow::Int8Array.new([11, 13, 11, 13, 13])
+ value_offsets = Arrow::Int32Array.new([0, 0, 1, 1, 2])
+ children = [
+ Arrow::Int8Array.new([1, nil]),
+ Arrow::StringArray.new(["a", "b", "c"])
+ ]
+ array = Arrow::DenseUnionArray.new(data_type,
+ types,
+ value_offsets,
+ children)
+ type, values = roundtrip(array)
+ assert_equal([
+ "dense_union<number: int8=11, text: string=13>",
+ [1, "a", nil, "b", "c"],
+ ],
+ [type.to_s, values])
+ end
+
+ def test_sparse_union
+ fields = [
+ Arrow::Field.new("number", :int8),
+ Arrow::Field.new("text", :string),
+ ]
+ type_ids = [11, 13]
+ data_type = Arrow::SparseUnionDataType.new(fields, type_ids)
+ types = Arrow::Int8Array.new([11, 13, 11, 13, 11])
+ children = [
+ Arrow::Int8Array.new([1, nil, nil, nil, 5]),
+ Arrow::StringArray.new([nil, "b", nil, "d", nil])
+ ]
+ array = Arrow::SparseUnionArray.new(data_type, types, children)
+ type, values = roundtrip(array)
+ assert_equal([
+ "sparse_union<number: int8=11, text: string=13>",
+ [1, "b", nil, "d", 5],
+ ],
+ [type.to_s, values])
+ end
+
+ def test_dictionary
+ values = ["a", "b", "c", nil, "a"]
+ string_array = Arrow::StringArray.new(values)
+ array = string_array.dictionary_encode
+ type, values = roundtrip(array)
+ assert_equal([
+ "dictionary<values=string, " +
+ "indices=int32, " +
+ "ordered=0>",
+ ["a", "b", "c", nil, "a"],
+ ],
+ [type.to_s, values])
+ end
+
+ def build_dictionary_delta_schema(value_type)
+ index_type = ArrowFormat::Int32Type.singleton
+ ordered = false
+ type = ArrowFormat::DictionaryType.new(index_type,
+ value_type,
+ ordered)
+ nullable = true
+ dictionary_id = 1
+ field = ArrowFormat::Field.new("value",
+ type,
+ nullable,
+ dictionary_id)
+ ArrowFormat::Schema.new([field])
+ end
+
+ def build_dictionary_array(type, indices, dictionary)
+ indices_buffer = IO::Buffer.for(indices.pack("l<*"))
+ ArrowFormat::DictionaryArray.new(type,
+ indices.size,
+ nil,
+ indices_buffer,
+ dictionary)
+ end
+
+ def test_dictionary_delta_utf8
+ value_type = ArrowFormat::UTF8Type.singleton
+ schema = build_dictionary_delta_schema(value_type)
+ type = schema.fields[0].type
+
+ dictionary = convert_array(Arrow::StringArray.new(["a", "b", "c"]))
+ # ["c", "a", "b", "a", "a"]
+ indices = [2, 0, 1, 0, 0]
+ array = build_dictionary_array(type, indices, dictionary)
+ record_batch =
+ ArrowFormat::RecordBatch.new(schema, array.size, [array])
+
+ dictionary_more =
+ convert_array(Arrow::StringArray.new(["a", "b", "c", "d", "e"]))
+ # ["e", "a", "c", "d", "b", "d"]
+ indices = [4, 0, 2, 3, 1, 3]
+ array = build_dictionary_array(type, indices, dictionary_more)
+ record_batch_delta =
+ ArrowFormat::RecordBatch.new(schema, array.size, [array])
+
+ type, values = roundtrip(record_batch, record_batch_delta)
+ assert_equal([
+ "dictionary<values=string, " +
+ "indices=int32, " +
+ "ordered=0>",
+ ["c", "a", "b", "a", "a"] +
+ ["e", "a", "c", "d", "b", "d"],
+ ],
+ [type.to_s, values])
end
end
end
@@ -996,35 +919,23 @@ end
class TestFileWriter < Test::Unit::TestCase
include WriterTests
- def setup
- Dir.mktmpdir do |tmp_dir|
- path = File.join(tmp_dir, "data.arrow")
- File.open(path, "wb") do |output|
- writer = ArrowFormat::FileWriter.new(output)
- write(writer)
- end
- data = File.open(path, "rb", &:read).freeze
- table = Arrow::Table.load(Arrow::Buffer.new(data), format: :arrow)
- @type = table.value.data_type
- @values = table.value.values
- end
+ def file_extension
+ "arrow"
+ end
+
+ def writer_class
+ ArrowFormat::FileWriter
end
end
class TestStreamingWriter < Test::Unit::TestCase
include WriterTests
- def setup
- Dir.mktmpdir do |tmp_dir|
- path = File.join(tmp_dir, "data.arrows")
- File.open(path, "wb") do |output|
- writer = ArrowFormat::StreamingWriter.new(output)
- write(writer)
- end
- data = File.open(path, "rb", &:read).freeze
- table = Arrow::Table.load(Arrow::Buffer.new(data), format: :arrows)
- @type = table.value.data_type
- @values = table.value.values
- end
+ def file_extension
+ "arrows"
+ end
+
+ def writer_class
+ ArrowFormat::StreamingWriter
end
end