Re: [PR] feat: Add array data type support [fluss-rust]

via GitHub Sun, 08 Mar 2026 04:29:08 -0700


fresh-borzoni commented on code in PR #433:
URL: https://github.com/apache/fluss-rust/pull/433#discussion_r2901689746



##########
crates/fluss/src/row/datum.rs:
##########
@@ -458,6 +476,86 @@ impl AppendResult for std::result::Result<(), ArrowError> {
     }
 }
 
+fn append_fluss_array_to_list_builder(
+    arr: &FlussArray,
+    builder: &mut dyn ArrayBuilder,
+    data_type: &arrow_schema::DataType,
+) -> Result<()> {
+    use crate::record::from_arrow_type;
+
+    let list_builder = builder
+        .as_any_mut()
+        .downcast_mut::<ListBuilder<Box<dyn ArrayBuilder>>>()
+        .ok_or_else(|| RowConvertError {
+            message: "Builder type mismatch for Array: expected 
ListBuilder".to_string(),
+        })?;
+
+    let element_arrow_type = match data_type {
+        arrow_schema::DataType::List(field) => field.data_type().clone(),
+        _ => {
+            return Err(RowConvertError {
+                message: format!("Expected List Arrow type for Array datum, 
got: {data_type:?}"),
+            });
+        }
+    };
+
+    let element_fluss_type = from_arrow_type(&element_arrow_type)?;
+    let values_builder = list_builder.values();
+
+    for i in 0..arr.size() {
+        if arr.is_null_at(i) {
+            let null_datum = Datum::Null;
+            null_datum.append_to(values_builder, &element_arrow_type)?;

Review Comment:
   With `Datum::Null`, this goes through a series of downcast attempts in the 
`append_value_to_arrow` macro until one matches. We need to fix this, because 
for sparse lists it's a real waste.
   Let's add a TODO and follow up.



##########
crates/fluss/src/row/column.rs:
##########
@@ -407,17 +407,115 @@ impl InternalRow for ColumnarRow {
             })?
             .value(self.row_id))
     }
+
+    fn get_array(&self, pos: usize) -> Result<crate::row::FlussArray> {
+        use crate::record::from_arrow_type;
+        use crate::row::binary_array::FlussArrayWriter;
+        use arrow::array::ListArray;
+
+        let column = self.column(pos)?;
+        let list_array =
+            column
+                .as_any()
+                .downcast_ref::<ListArray>()
+                .ok_or_else(|| IllegalArgument {
+                    message: format!("expected List array at position {pos}"),
+                })?;
+
+        let values = list_array.value(self.row_id);
+        let num_elements = values.len();
+        let element_arrow_type = values.data_type();
+        let element_fluss_type = from_arrow_type(element_arrow_type)?;
+
+        let mut writer = FlussArrayWriter::new(num_elements, 
&element_fluss_type);
+        let element_row = ColumnarRow::new(std::sync::Arc::new(
+            arrow::array::RecordBatch::try_from_iter(vec![("v", 
values)]).map_err(|e| {
+                IllegalArgument {
+                    message: format!("Failed to create RecordBatch from list 
values: {e}"),
+                }
+            })?,
+        ));

Review Comment:
   +1, it's not good for performance



##########
crates/fluss/src/row/binary_array.rs:
##########
@@ -0,0 +1,736 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Binary array format matching Java's `BinaryArray.java` layout.
+//!
+//! Binary layout:
+//! ```text
+//! [size(4B)] + [null bits (4-byte word aligned)] + [fixed-length part] + 
[variable-length part]
+//! ```
+//!
+//! Java reference: `BinaryArray.java`, `BinaryArrayWriter.java`
+
+use crate::error::Error::IllegalArgument;
+use crate::error::Result;
+use crate::metadata::DataType;
+use crate::row::Decimal;
+use crate::row::datum::{Date, Time, TimestampLtz, TimestampNtz};
+use serde::Serialize;
+use std::fmt;
+use std::hash::{Hash, Hasher};
+
+const MAX_FIX_PART_DATA_SIZE: usize = 7;
+const HIGHEST_FIRST_BIT: u64 = 0x80_u64 << 56;
+const HIGHEST_SECOND_TO_EIGHTH_BIT: u64 = 0x7F_u64 << 56;
+
+/// Calculates the header size in bytes: 4 (for element count) + null bits 
(4-byte word aligned).
+/// Matches Java's `BinaryArray.calculateHeaderInBytes(numFields)`.
+pub fn calculate_header_in_bytes(num_elements: usize) -> usize {
+    4 + num_elements.div_ceil(32) * 4
+}
+
+/// Calculates the fixed-length part size per element for a given data type.
+/// Matches Java's `BinaryArray.calculateFixLengthPartSize(DataType)`.
+pub fn calculate_fix_length_part_size(element_type: &DataType) -> usize {
+    match element_type {
+        DataType::Boolean(_) | DataType::TinyInt(_) => 1,
+        DataType::SmallInt(_) => 2,
+        DataType::Int(_) | DataType::Float(_) | DataType::Date(_) | 
DataType::Time(_) => 4,
+        DataType::BigInt(_)
+        | DataType::Double(_)
+        | DataType::Char(_)
+        | DataType::String(_)
+        | DataType::Binary(_)
+        | DataType::Bytes(_)
+        | DataType::Decimal(_)
+        | DataType::Timestamp(_)
+        | DataType::TimestampLTz(_)
+        | DataType::Array(_)
+        | DataType::Map(_)
+        | DataType::Row(_) => 8,
+    }
+}
+
+/// Rounds a byte count up to the nearest 8-byte word boundary.
+/// Matches Java's `roundNumberOfBytesToNearestWord`.
+fn round_to_nearest_word(num_bytes: usize) -> usize {
+    (num_bytes + 7) & !7
+}
+
+/// A Fluss binary array, wire-compatible with Java's `BinaryArray`.
+///
+/// Stores elements in a flat byte buffer with a header (element count + null 
bitmap)
+/// followed by fixed-length slots and an optional variable-length section.
+#[derive(Clone)]
+pub struct FlussArray {
+    data: Vec<u8>,
+    size: usize,
+    element_offset: usize,
+}
+
+impl fmt::Debug for FlussArray {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("FlussArray")
+            .field("size", &self.size)
+            .field("data_len", &self.data.len())
+            .finish()
+    }
+}
+
+impl fmt::Display for FlussArray {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "FlussArray[size={}]", self.size)
+    }
+}
+
+impl PartialEq for FlussArray {
+    fn eq(&self, other: &Self) -> bool {
+        self.data == other.data
+    }
+}
+
+impl Eq for FlussArray {}
+
+impl PartialOrd for FlussArray {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for FlussArray {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        self.data.cmp(&other.data)
+    }
+}
+
+impl Hash for FlussArray {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.data.hash(state);
+    }
+}
+
+impl Serialize for FlussArray {
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, 
S::Error>
+    where
+        S: serde::Serializer,
+    {
+        serializer.serialize_bytes(&self.data)
+    }
+}
+
+impl FlussArray {
+    /// Creates a FlussArray by pointing to existing bytes.
+    pub fn from_bytes(data: &[u8]) -> Result<Self> {
+        if data.len() < 4 {
+            return Err(IllegalArgument {
+                message: format!(
+                    "FlussArray data too short: need at least 4 bytes, got {}",
+                    data.len()
+                ),
+            });
+        }
+        let raw_size = i32::from_ne_bytes(data[0..4].try_into().unwrap());
+        if raw_size < 0 {
+            return Err(IllegalArgument {
+                message: format!("FlussArray size must be non-negative, got 
{raw_size}"),
+            });
+        }
+        let size = raw_size as usize;
+        let element_offset = calculate_header_in_bytes(size);
+        if element_offset > data.len() {
+            return Err(IllegalArgument {
+                message: format!(
+                    "FlussArray header exceeds payload: header={}, payload={}",
+                    element_offset,
+                    data.len()
+                ),
+            });
+        }
+
+        Ok(FlussArray {
+            data: data.to_vec(),
+            size,
+            element_offset,
+        })
+    }
+
+    /// Returns the number of elements.
+    pub fn size(&self) -> usize {
+        self.size
+    }
+
+    /// Returns the raw bytes of this array (the complete binary 
representation).
+    pub fn as_bytes(&self) -> &[u8] {
+        &self.data
+    }
+
+    /// Returns true if the element at position `pos` is null.
+    pub fn is_null_at(&self, pos: usize) -> bool {
+        let byte_index = pos >> 3;
+        let bit = pos & 7;
+        (self.data[4 + byte_index] & (1u8 << bit)) != 0
+    }
+
+    fn element_offset(&self, ordinal: usize, element_size: usize) -> usize {
+        self.element_offset + ordinal * element_size
+    }
+
+    fn checked_slice(&self, start: usize, len: usize, context: &str) -> 
Result<&[u8]> {
+        let end = start.checked_add(len).ok_or_else(|| IllegalArgument {
+            message: format!("Overflow while reading {context}: start={start}, 
len={len}"),
+        })?;
+        if end > self.data.len() {
+            return Err(IllegalArgument {
+                message: format!(
+                    "Out-of-bounds while reading {context}: start={start}, 
len={len}, payload={}",
+                    self.data.len()
+                ),
+            });
+        }
+        Ok(&self.data[start..end])
+    }
+
+    fn read_var_len_bytes(&self, pos: usize) -> Result<&[u8]> {
+        let field_offset = self.element_offset(pos, 8);
+        let packed = self.get_long(pos) as u64;
+        let mark = packed & HIGHEST_FIRST_BIT;
+
+        if mark == 0 {
+            let offset = (packed >> 32) as usize;
+            let len = (packed & 0xFFFF_FFFF) as usize;
+            self.checked_slice(offset, len, "variable-length array element")
+        } else {
+            let len = ((packed & HIGHEST_SECOND_TO_EIGHTH_BIT) >> 56) as usize;
+            if len > MAX_FIX_PART_DATA_SIZE {
+                return Err(IllegalArgument {
+                    message: format!(
+                        "Inline array element length must be <= 
{MAX_FIX_PART_DATA_SIZE}, got {len}"
+                    ),
+                });
+            }
+            // Java stores inline bytes in the 8-byte slot itself.
+            // On little-endian, bytes start at field_offset; on big-endian 
they start at +1.
+            let start = if cfg!(target_endian = "little") {
+                field_offset
+            } else {
+                field_offset + 1
+            };
+            self.checked_slice(start, len, "inline array element")
+        }
+    }
+
+    pub fn get_boolean(&self, pos: usize) -> bool {
+        let offset = self.element_offset(pos, 1);
+        self.data[offset] != 0
+    }
+
+    pub fn get_byte(&self, pos: usize) -> i8 {
+        let offset = self.element_offset(pos, 1);
+        self.data[offset] as i8
+    }
+
+    pub fn get_short(&self, pos: usize) -> i16 {
+        let offset = self.element_offset(pos, 2);
+        i16::from_ne_bytes(self.data[offset..offset + 2].try_into().unwrap())
+    }
+
+    pub fn get_int(&self, pos: usize) -> i32 {
+        let offset = self.element_offset(pos, 4);
+        i32::from_ne_bytes(self.data[offset..offset + 4].try_into().unwrap())
+    }
+
+    pub fn get_long(&self, pos: usize) -> i64 {
+        let offset = self.element_offset(pos, 8);
+        i64::from_ne_bytes(self.data[offset..offset + 8].try_into().unwrap())
+    }
+
+    pub fn get_float(&self, pos: usize) -> f32 {
+        let offset = self.element_offset(pos, 4);
+        f32::from_ne_bytes(self.data[offset..offset + 4].try_into().unwrap())
+    }
+
+    pub fn get_double(&self, pos: usize) -> f64 {
+        let offset = self.element_offset(pos, 8);
+        f64::from_ne_bytes(self.data[offset..offset + 8].try_into().unwrap())
+    }
+
+    /// Reads the offset_and_size packed long for variable-length elements.
+    fn get_offset_and_size(&self, pos: usize) -> (usize, usize) {
+        let packed = self.get_long(pos) as u64;
+        let offset = (packed >> 32) as usize;
+        let size = (packed & 0xFFFF_FFFF) as usize;
+        (offset, size)
+    }
+
+    pub fn get_string(&self, pos: usize) -> Result<&str> {
+        let bytes = self.read_var_len_bytes(pos)?;
+        std::str::from_utf8(bytes).map_err(|e| IllegalArgument {
+            message: format!("Invalid UTF-8 in array element at position 
{pos}: {e}"),
+        })
+    }
+
+    pub fn get_binary(&self, pos: usize) -> Result<&[u8]> {
+        self.read_var_len_bytes(pos)
+    }
+
+    pub fn get_decimal(&self, pos: usize, precision: u32, scale: u32) -> 
Result<Decimal> {
+        if Decimal::is_compact_precision(precision) {
+            let unscaled = self.get_long(pos);
+            Decimal::from_unscaled_long(unscaled, precision, scale)
+        } else {
+            let (offset, size) = self.get_offset_and_size(pos);
+            let bytes = self.checked_slice(offset, size, "decimal bytes")?;
+            Decimal::from_unscaled_bytes(bytes, precision, scale)
+        }
+    }
+
+    pub fn get_date(&self, pos: usize) -> Date {
+        Date::new(self.get_int(pos))
+    }
+
+    pub fn get_time(&self, pos: usize) -> Time {
+        Time::new(self.get_int(pos))
+    }
+
+    pub fn get_timestamp_ntz(&self, pos: usize, precision: u32) -> 
Result<TimestampNtz> {
+        if TimestampNtz::is_compact(precision) {
+            Ok(TimestampNtz::new(self.get_long(pos)))
+        } else {
+            let (offset, _size) = self.get_offset_and_size(pos);

Review Comment:
   nit: the `_size` name is confusing, since the underscore prefix suggests an 
unused value but we do use it. Maybe we should also give it a proper name, like 
`nanos_of_millis`.



##########
bindings/cpp/src/types.rs:
##########
@@ -351,6 +351,7 @@ pub fn resolve_row_types(
             Datum::Time(t) => Datum::Time(*t),
             Datum::TimestampNtz(ts) => Datum::TimestampNtz(*ts),
             Datum::TimestampLtz(ts) => Datum::TimestampLtz(*ts),
+            Datum::Array(a) => Datum::Array(a.clone()),

Review Comment:
   I doubt the C++ bindings will be able to use arrays without a wrapper and 
proper CXX wiring.
   Let's add a TODO to follow up, so we don't forget about it.



##########
crates/fluss/src/row/datum.rs:
##########
@@ -504,6 +602,16 @@ impl Datum<'_> {
                 append_null_to_arrow!(TimestampMillisecondBuilder);
                 append_null_to_arrow!(TimestampMicrosecondBuilder);
                 append_null_to_arrow!(TimestampNanosecondBuilder);
+                // For List (Array) type, append null generically
+                if let arrow_schema::DataType::List(_) = data_type {
+                    if let Some(b) = builder
+                        .as_any_mut()
+                        .downcast_mut::<ListBuilder<Box<dyn ArrayBuilder>>>()
+                    {
+                        b.append_null();
+                    }

Review Comment:
   +1



##########
crates/fluss/src/row/column.rs:
##########
@@ -407,17 +407,115 @@ impl InternalRow for ColumnarRow {
             })?
             .value(self.row_id))
     }
+
+    fn get_array(&self, pos: usize) -> Result<crate::row::FlussArray> {
+        use crate::record::from_arrow_type;
+        use crate::row::binary_array::FlussArrayWriter;
+        use arrow::array::ListArray;
+
+        let column = self.column(pos)?;
+        let list_array =
+            column
+                .as_any()
+                .downcast_ref::<ListArray>()
+                .ok_or_else(|| IllegalArgument {
+                    message: format!("expected List array at position {pos}"),
+                })?;
+
+        let values = list_array.value(self.row_id);
+        let num_elements = values.len();
+        let element_arrow_type = values.data_type();
+        let element_fluss_type = from_arrow_type(element_arrow_type)?;
+
+        let mut writer = FlussArrayWriter::new(num_elements, 
&element_fluss_type);
+        let element_row = ColumnarRow::new(std::sync::Arc::new(
+            arrow::array::RecordBatch::try_from_iter(vec![("v", 
values)]).map_err(|e| {
+                IllegalArgument {
+                    message: format!("Failed to create RecordBatch from list 
values: {e}"),
+                }
+            })?,
+        ));
+
+        for i in 0..num_elements {
+            let mut row = element_row.clone();

Review Comment:
   it's better to just make `element_row` mutable and call `set_row_id` directly



##########
crates/fluss/src/row/binary_array.rs:
##########
@@ -0,0 +1,736 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Binary array format matching Java's `BinaryArray.java` layout.
+//!
+//! Binary layout:
+//! ```text
+//! [size(4B)] + [null bits (4-byte word aligned)] + [fixed-length part] + 
[variable-length part]
+//! ```
+//!
+//! Java reference: `BinaryArray.java`, `BinaryArrayWriter.java`
+
+use crate::error::Error::IllegalArgument;
+use crate::error::Result;
+use crate::metadata::DataType;
+use crate::row::Decimal;
+use crate::row::datum::{Date, Time, TimestampLtz, TimestampNtz};
+use serde::Serialize;
+use std::fmt;
+use std::hash::{Hash, Hasher};
+
+const MAX_FIX_PART_DATA_SIZE: usize = 7;
+const HIGHEST_FIRST_BIT: u64 = 0x80_u64 << 56;
+const HIGHEST_SECOND_TO_EIGHTH_BIT: u64 = 0x7F_u64 << 56;
+
+/// Calculates the header size in bytes: 4 (for element count) + null bits 
(4-byte word aligned).
+/// Matches Java's `BinaryArray.calculateHeaderInBytes(numFields)`.
+pub fn calculate_header_in_bytes(num_elements: usize) -> usize {
+    4 + num_elements.div_ceil(32) * 4
+}
+
+/// Calculates the fixed-length part size per element for a given data type.
+/// Matches Java's `BinaryArray.calculateFixLengthPartSize(DataType)`.
+pub fn calculate_fix_length_part_size(element_type: &DataType) -> usize {
+    match element_type {
+        DataType::Boolean(_) | DataType::TinyInt(_) => 1,
+        DataType::SmallInt(_) => 2,
+        DataType::Int(_) | DataType::Float(_) | DataType::Date(_) | 
DataType::Time(_) => 4,
+        DataType::BigInt(_)
+        | DataType::Double(_)
+        | DataType::Char(_)
+        | DataType::String(_)
+        | DataType::Binary(_)
+        | DataType::Bytes(_)
+        | DataType::Decimal(_)
+        | DataType::Timestamp(_)
+        | DataType::TimestampLTz(_)
+        | DataType::Array(_)
+        | DataType::Map(_)
+        | DataType::Row(_) => 8,
+    }
+}
+
+/// Rounds a byte count up to the nearest 8-byte word boundary.
+/// Matches Java's `roundNumberOfBytesToNearestWord`.
+fn round_to_nearest_word(num_bytes: usize) -> usize {
+    (num_bytes + 7) & !7
+}
+
+/// A Fluss binary array, wire-compatible with Java's `BinaryArray`.
+///
+/// Stores elements in a flat byte buffer with a header (element count + null 
bitmap)
+/// followed by fixed-length slots and an optional variable-length section.
+#[derive(Clone)]
+pub struct FlussArray {
+    data: Vec<u8>,
+    size: usize,
+    element_offset: usize,
+}
+
+impl fmt::Debug for FlussArray {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("FlussArray")
+            .field("size", &self.size)
+            .field("data_len", &self.data.len())
+            .finish()
+    }
+}
+
+impl fmt::Display for FlussArray {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "FlussArray[size={}]", self.size)
+    }
+}
+
+impl PartialEq for FlussArray {
+    fn eq(&self, other: &Self) -> bool {
+        self.data == other.data
+    }
+}
+
+impl Eq for FlussArray {}
+
+impl PartialOrd for FlussArray {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for FlussArray {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        self.data.cmp(&other.data)
+    }
+}
+
+impl Hash for FlussArray {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.data.hash(state);
+    }
+}
+
+impl Serialize for FlussArray {
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, 
S::Error>
+    where
+        S: serde::Serializer,
+    {
+        serializer.serialize_bytes(&self.data)
+    }
+}
+
+impl FlussArray {
+    /// Creates a FlussArray by pointing to existing bytes.
+    pub fn from_bytes(data: &[u8]) -> Result<Self> {
+        if data.len() < 4 {
+            return Err(IllegalArgument {
+                message: format!(
+                    "FlussArray data too short: need at least 4 bytes, got {}",
+                    data.len()
+                ),
+            });
+        }
+        let raw_size = i32::from_ne_bytes(data[0..4].try_into().unwrap());
+        if raw_size < 0 {
+            return Err(IllegalArgument {
+                message: format!("FlussArray size must be non-negative, got 
{raw_size}"),
+            });
+        }
+        let size = raw_size as usize;
+        let element_offset = calculate_header_in_bytes(size);
+        if element_offset > data.len() {
+            return Err(IllegalArgument {
+                message: format!(
+                    "FlussArray header exceeds payload: header={}, payload={}",
+                    element_offset,
+                    data.len()
+                ),
+            });
+        }
+
+        Ok(FlussArray {
+            data: data.to_vec(),
+            size,
+            element_offset,
+        })
+    }
+
+    /// Returns the number of elements.
+    pub fn size(&self) -> usize {
+        self.size
+    }
+
+    /// Returns the raw bytes of this array (the complete binary 
representation).
+    pub fn as_bytes(&self) -> &[u8] {
+        &self.data
+    }
+
+    /// Returns true if the element at position `pos` is null.
+    pub fn is_null_at(&self, pos: usize) -> bool {
+        let byte_index = pos >> 3;
+        let bit = pos & 7;
+        (self.data[4 + byte_index] & (1u8 << bit)) != 0
+    }
+
+    fn element_offset(&self, ordinal: usize, element_size: usize) -> usize {
+        self.element_offset + ordinal * element_size
+    }
+
+    fn checked_slice(&self, start: usize, len: usize, context: &str) -> 
Result<&[u8]> {
+        let end = start.checked_add(len).ok_or_else(|| IllegalArgument {
+            message: format!("Overflow while reading {context}: start={start}, 
len={len}"),
+        })?;
+        if end > self.data.len() {
+            return Err(IllegalArgument {
+                message: format!(
+                    "Out-of-bounds while reading {context}: start={start}, 
len={len}, payload={}",
+                    self.data.len()
+                ),
+            });
+        }
+        Ok(&self.data[start..end])
+    }
+
+    fn read_var_len_bytes(&self, pos: usize) -> Result<&[u8]> {
+        let field_offset = self.element_offset(pos, 8);
+        let packed = self.get_long(pos) as u64;
+        let mark = packed & HIGHEST_FIRST_BIT;
+
+        if mark == 0 {
+            let offset = (packed >> 32) as usize;
+            let len = (packed & 0xFFFF_FFFF) as usize;
+            self.checked_slice(offset, len, "variable-length array element")
+        } else {
+            let len = ((packed & HIGHEST_SECOND_TO_EIGHTH_BIT) >> 56) as usize;
+            if len > MAX_FIX_PART_DATA_SIZE {
+                return Err(IllegalArgument {
+                    message: format!(
+                        "Inline array element length must be <= 
{MAX_FIX_PART_DATA_SIZE}, got {len}"
+                    ),
+                });
+            }
+            // Java stores inline bytes in the 8-byte slot itself.
+            // On little-endian, bytes start at field_offset; on big-endian 
they start at +1.
+            let start = if cfg!(target_endian = "little") {
+                field_offset
+            } else {
+                field_offset + 1
+            };
+            self.checked_slice(start, len, "inline array element")
+        }
+    }
+
+    pub fn get_boolean(&self, pos: usize) -> bool {
+        let offset = self.element_offset(pos, 1);
+        self.data[offset] != 0
+    }
+
+    pub fn get_byte(&self, pos: usize) -> i8 {
+        let offset = self.element_offset(pos, 1);
+        self.data[offset] as i8
+    }
+
+    pub fn get_short(&self, pos: usize) -> i16 {
+        let offset = self.element_offset(pos, 2);
+        i16::from_ne_bytes(self.data[offset..offset + 2].try_into().unwrap())
+    }
+
+    pub fn get_int(&self, pos: usize) -> i32 {
+        let offset = self.element_offset(pos, 4);
+        i32::from_ne_bytes(self.data[offset..offset + 4].try_into().unwrap())
+    }
+
+    pub fn get_long(&self, pos: usize) -> i64 {
+        let offset = self.element_offset(pos, 8);
+        i64::from_ne_bytes(self.data[offset..offset + 8].try_into().unwrap())
+    }
+
+    pub fn get_float(&self, pos: usize) -> f32 {
+        let offset = self.element_offset(pos, 4);
+        f32::from_ne_bytes(self.data[offset..offset + 4].try_into().unwrap())
+    }
+
+    pub fn get_double(&self, pos: usize) -> f64 {
+        let offset = self.element_offset(pos, 8);
+        f64::from_ne_bytes(self.data[offset..offset + 8].try_into().unwrap())
+    }
+
+    /// Reads the offset_and_size packed long for variable-length elements.
+    fn get_offset_and_size(&self, pos: usize) -> (usize, usize) {
+        let packed = self.get_long(pos) as u64;
+        let offset = (packed >> 32) as usize;
+        let size = (packed & 0xFFFF_FFFF) as usize;
+        (offset, size)
+    }
+
+    pub fn get_string(&self, pos: usize) -> Result<&str> {
+        let bytes = self.read_var_len_bytes(pos)?;
+        std::str::from_utf8(bytes).map_err(|e| IllegalArgument {
+            message: format!("Invalid UTF-8 in array element at position 
{pos}: {e}"),
+        })
+    }
+
+    pub fn get_binary(&self, pos: usize) -> Result<&[u8]> {
+        self.read_var_len_bytes(pos)
+    }
+
+    pub fn get_decimal(&self, pos: usize, precision: u32, scale: u32) -> 
Result<Decimal> {
+        if Decimal::is_compact_precision(precision) {
+            let unscaled = self.get_long(pos);
+            Decimal::from_unscaled_long(unscaled, precision, scale)
+        } else {
+            let (offset, size) = self.get_offset_and_size(pos);
+            let bytes = self.checked_slice(offset, size, "decimal bytes")?;
+            Decimal::from_unscaled_bytes(bytes, precision, scale)
+        }
+    }
+
+    pub fn get_date(&self, pos: usize) -> Date {
+        Date::new(self.get_int(pos))
+    }
+
+    pub fn get_time(&self, pos: usize) -> Time {
+        Time::new(self.get_int(pos))
+    }
+
+    pub fn get_timestamp_ntz(&self, pos: usize, precision: u32) -> 
Result<TimestampNtz> {
+        if TimestampNtz::is_compact(precision) {
+            Ok(TimestampNtz::new(self.get_long(pos)))
+        } else {
+            let (offset, _size) = self.get_offset_and_size(pos);
+            let millis_bytes = self.checked_slice(offset, 8, "timestamp ntz 
millis")?;
+            let millis = i64::from_ne_bytes(millis_bytes.try_into().unwrap());
+            let nanos = _size as i32;
+            TimestampNtz::from_millis_nanos(millis, nanos)
+        }
+    }
+
+    pub fn get_timestamp_ltz(&self, pos: usize, precision: u32) -> 
Result<TimestampLtz> {
+        if TimestampLtz::is_compact(precision) {
+            Ok(TimestampLtz::new(self.get_long(pos)))
+        } else {
+            let (offset, _size) = self.get_offset_and_size(pos);
+            let millis_bytes = self.checked_slice(offset, 8, "timestamp ltz 
millis")?;
+            let millis = i64::from_ne_bytes(millis_bytes.try_into().unwrap());
+            let nanos = _size as i32;

Review Comment:
   ditto



##########
crates/fluss/src/record/arrow.rs:
##########
@@ -314,6 +314,10 @@ impl RowAppendRecordBatchBuilder {
             
arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, _) => {
                 Ok(Box::new(TimestampNanosecondBuilder::new()))
             }
+            arrow_schema::DataType::List(field) => {
+                let inner_builder = Self::create_builder(field.data_type())?;
+                Ok(Box::new(ListBuilder::new(inner_builder)))

Review Comment:
   Note that we now use capacity-based, presized builders.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] feat: Add array data type support [fluss-rust]

Reply via email to