PinkCrow007 commented on code in PR #7452:
URL: https://github.com/apache/arrow-rs/pull/7452#discussion_r2074218346


##########
arrow-variant/src/encoder/mod.rs:
##########
@@ -0,0 +1,761 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Core encoding primitives for the Variant binary format
+
+use arrow_schema::ArrowError;
+use std::io::Write;
+
+/// Maximum value that can be stored in a single byte (2^8 - 1).
+///
+/// Used by [`min_bytes_needed`] as the upper bound for a 1-byte width.
+pub const MAX_1BYTE_VALUE: usize = 255;
+
+/// Maximum value that can be stored in two bytes (2^16 - 1).
+///
+/// Used by [`min_bytes_needed`] as the upper bound for a 2-byte width.
+pub const MAX_2BYTE_VALUE: usize = 65535;
+
+/// Maximum value that can be stored in three bytes (2^24 - 1).
+///
+/// Used by [`min_bytes_needed`] as the upper bound for a 3-byte width.
+pub const MAX_3BYTE_VALUE: usize = 16777215;
+
+/// Returns the smallest byte width able to hold `value`.
+///
+/// The thresholds are [`MAX_1BYTE_VALUE`], [`MAX_2BYTE_VALUE`] and
+/// [`MAX_3BYTE_VALUE`]; anything above the last one is reported as 4.
+///
+/// # Arguments
+///
+/// * `value` - The value whose encoded width is wanted
+///
+/// # Returns
+///
+/// `1`, `2`, `3` or `4`. The result is capped at 4, so any value larger
+/// than [`MAX_3BYTE_VALUE`] yields 4 regardless of its magnitude.
+pub fn min_bytes_needed(value: usize) -> usize {
+    match value {
+        v if v <= MAX_1BYTE_VALUE => 1,
+        v if v <= MAX_2BYTE_VALUE => 2,
+        v if v <= MAX_3BYTE_VALUE => 3,
+        _ => 4,
+    }
+}
+
+/// Variant basic types as defined in the Arrow Variant specification
+///
+/// The explicit discriminant of each variant is its basic type ID.
+///
+/// | Basic Type   | ID | Description                                       |
+/// |--------------|----|---------------------------------------------------|
+/// | Primitive    | 0  | One of the primitive types                        |
+/// | Short string | 1  | A string with a length less than 64 bytes         |
+/// | Object       | 2  | A collection of (string-key, variant-value) pairs |
+/// | Array        | 3  | An ordered sequence of variant values             |
+pub enum VariantBasicType {
+    /// Primitive type (0)
+    Primitive = 0,
+    /// Short string (1)
+    ShortString = 1,
+    /// Object (2)
+    Object = 2,
+    /// Array (3)
+    Array = 3,
+}
+
+/// Variant primitive types as defined in the Arrow Variant specification
+///
+/// The explicit discriminant of each variant is its primitive type ID.
+///
+/// | Equivalence Class | Variant Physical Type | Type ID | Equivalent Parquet Type | Binary format |
+/// |---|---|---|---|---|
+/// | NullType | null | 0 | UNKNOWN | none |
+/// | Boolean | boolean (True) | 1 | BOOLEAN | none |
+/// | Boolean | boolean (False) | 2 | BOOLEAN | none |
+/// | Exact Numeric | int8 | 3 | INT(8, signed) | 1 byte |
+/// | Exact Numeric | int16 | 4 | INT(16, signed) | 2 byte little-endian |
+/// | Exact Numeric | int32 | 5 | INT(32, signed) | 4 byte little-endian |
+/// | Exact Numeric | int64 | 6 | INT(64, signed) | 8 byte little-endian |
+/// | Double | double | 7 | DOUBLE | IEEE little-endian |
+/// | Exact Numeric | decimal4 | 8 | DECIMAL(precision, scale) | 1 byte scale in range \[0, 38\], followed by little-endian unscaled value |
+/// | Exact Numeric | decimal8 | 9 | DECIMAL(precision, scale) | 1 byte scale in range \[0, 38\], followed by little-endian unscaled value |
+/// | Exact Numeric | decimal16 | 10 | DECIMAL(precision, scale) | 1 byte scale in range \[0, 38\], followed by little-endian unscaled value |
+/// | Date | date | 11 | DATE | 4 byte little-endian |
+/// | Timestamp | timestamp | 12 | TIMESTAMP(isAdjustedToUTC=true, MICROS) | 8-byte little-endian |
+/// | TimestampNTZ | timestamp without time zone | 13 | TIMESTAMP(isAdjustedToUTC=false, MICROS) | 8-byte little-endian |
+/// | Float | float | 14 | FLOAT | IEEE little-endian |
+/// | Binary | binary | 15 | BINARY | 4 byte little-endian size, followed by bytes |
+/// | String | string | 16 | STRING | 4 byte little-endian size, followed by UTF-8 encoded bytes |
+/// | TimeNTZ | time without time zone | 17 | TIME(isAdjustedToUTC=false, MICROS) | 8-byte little-endian |
+/// | Timestamp | timestamp with time zone | 18 | TIMESTAMP(isAdjustedToUTC=true, NANOS) | 8-byte little-endian |
+/// | TimestampNTZ | timestamp without time zone | 19 | TIMESTAMP(isAdjustedToUTC=false, NANOS) | 8-byte little-endian |
+/// | UUID | uuid | 20 | UUID | 16-byte big-endian |
+pub enum VariantPrimitiveType {
+    /// Null type (0)
+    Null = 0,
+    /// Boolean true (1)
+    BooleanTrue = 1,
+    /// Boolean false (2)
+    BooleanFalse = 2,
+    /// 8-bit signed integer (3)
+    Int8 = 3,
+    /// 16-bit signed integer (4)
+    Int16 = 4,
+    /// 32-bit signed integer (5)
+    Int32 = 5,
+    /// 64-bit signed integer (6)
+    Int64 = 6,
+    /// 64-bit floating point (7)
+    Double = 7,
+    /// 32-bit decimal (8)
+    Decimal4 = 8,
+    /// 64-bit decimal (9)
+    Decimal8 = 9,
+    /// 128-bit decimal (10)
+    Decimal16 = 10,
+    /// Date (11)
+    Date = 11,
+    /// Timestamp with timezone, microsecond precision (12)
+    Timestamp = 12,
+    /// Timestamp without timezone, microsecond precision (13)
+    TimestampNTZ = 13,
+    /// 32-bit floating point (14)
+    Float = 14,
+    /// Binary data (15)
+    Binary = 15,
+    /// UTF-8 string (16)
+    String = 16,
+    /// Time without timezone, microsecond precision (17)
+    TimeNTZ = 17,
+    /// Timestamp with timezone (nanos) (18)
+    TimestampNanos = 18,
+    /// Timestamp without timezone (nanos) (19)
+    TimestampNTZNanos = 19,
+    /// UUID (20)
+    Uuid = 20,
+}
+
+/// Builds the one-byte header that precedes a primitive value.
+///
+/// Layout of the returned byte:
+/// - bits 0-1: basic type (`VariantBasicType::Primitive`)
+/// - bits 2-7: primitive type ID (`type_id` shifted left by two)
+fn primitive_header(type_id: u8) -> u8 {
+    let basic_type = VariantBasicType::Primitive as u8;
+    let shifted_id = type_id << 2;
+    shifted_id | basic_type
+}
+
+/// Builds the one-byte header that precedes a short string value.
+///
+/// Layout of the returned byte:
+/// - bits 0-1: basic type (`VariantBasicType::ShortString`)
+/// - bits 2-7: string length in bytes (`size` shifted left by two)
+fn short_str_header(size: u8) -> u8 {
+    let basic_type = VariantBasicType::ShortString as u8;
+    let shifted_len = size << 2;
+    shifted_len | basic_type
+}
+
+/// Builds the one-byte header that precedes an object value.
+///
+/// Layout of the returned byte:
+/// - bits 0-1: basic type (`VariantBasicType::Object`)
+/// - bits 2-3: `offset_size - 1` (field_offset_size_minus_one)
+/// - bits 4-5: `id_size - 1` (field_id_size_minus_one)
+/// - bit 6: `is_large`
+///
+/// `id_size` and `offset_size` are expected to be in `1..=4`: each is
+/// decremented before being packed into a 2-bit field, so `0` underflows
+/// the subtraction and larger values spill into neighbouring bits.
+pub fn object_header(is_large: bool, id_size: u8, offset_size: u8) -> u8 {
+    let large_bit = u8::from(is_large) << 6;
+    let id_bits = (id_size - 1) << 4;
+    let offset_bits = (offset_size - 1) << 2;
+    let basic_type = VariantBasicType::Object as u8;
+    large_bit | id_bits | offset_bits | basic_type
+}
+
+/// Builds the one-byte header that precedes an array value.
+///
+/// Layout of the returned byte:
+/// - bits 0-1: basic type (`VariantBasicType::Array`)
+/// - bits 2-3: `offset_size - 1` (field_offset_size_minus_one)
+/// - bit 4: `is_large`
+///
+/// `offset_size` is expected to be in `1..=4`: it is decremented before
+/// being packed into a 2-bit field, so `0` underflows the subtraction.
+pub fn array_header(is_large: bool, offset_size: u8) -> u8 {
+    let large_bit = u8::from(is_large) << 4;
+    let offset_bits = (offset_size - 1) << 2;
+    let basic_type = VariantBasicType::Array as u8;
+    large_bit | offset_bits | basic_type
+}
+
+/// Appends the encoding of a null value to `output`.
+///
+/// A null is a single primitive header byte with no payload.
+pub fn encode_null(output: &mut Vec<u8>) {
+    let header = primitive_header(VariantPrimitiveType::Null as u8);
+    output.push(header);
+}
+
+/// Appends the encoding of a boolean to `output`.
+///
+/// The value is carried entirely by the type ID in the header byte
+/// (`BooleanTrue` or `BooleanFalse`); no payload follows.
+pub fn encode_boolean(value: bool, output: &mut Vec<u8>) {
+    let type_id = if value {
+        VariantPrimitiveType::BooleanTrue
+    } else {
+        VariantPrimitiveType::BooleanFalse
+    };
+    output.push(primitive_header(type_id as u8));
+}
+
+/// Appends the encoding of an integer to `output`, using the narrowest of
+/// Int8 / Int16 / Int32 / Int64 whose range contains `value`.
+///
+/// The header byte is followed by the value in little-endian order at the
+/// chosen width.
+pub fn encode_integer(value: i64, output: &mut Vec<u8>) {
+    const INT8_MIN: i64 = i8::MIN as i64;
+    const INT8_MAX: i64 = i8::MAX as i64;
+    const INT16_MIN: i64 = i16::MIN as i64;
+    const INT16_MAX: i64 = i16::MAX as i64;
+    const INT32_MIN: i64 = i32::MIN as i64;
+    const INT32_MAX: i64 = i32::MAX as i64;
+
+    // The casts below cannot lose information: each arm has already
+    // established that `value` fits the target width.
+    match value {
+        INT8_MIN..=INT8_MAX => {
+            output.push(primitive_header(VariantPrimitiveType::Int8 as u8));
+            output.extend_from_slice(&(value as i8).to_le_bytes());
+        }
+        INT16_MIN..=INT16_MAX => {
+            output.push(primitive_header(VariantPrimitiveType::Int16 as u8));
+            output.extend_from_slice(&(value as i16).to_le_bytes());
+        }
+        INT32_MIN..=INT32_MAX => {
+            output.push(primitive_header(VariantPrimitiveType::Int32 as u8));
+            output.extend_from_slice(&(value as i32).to_le_bytes());
+        }
+        _ => {
+            output.push(primitive_header(VariantPrimitiveType::Int64 as u8));
+            output.extend_from_slice(&value.to_le_bytes());
+        }
+    }
+}
+
+/// Appends the encoding of a floating point value to `output`.
+///
+/// The value is always written as a `Double`: the header byte followed by
+/// the eight little-endian bytes of the `f64`.
+pub fn encode_float(value: f64, output: &mut Vec<u8>) {
+    let header = primitive_header(VariantPrimitiveType::Double as u8);
+    let payload = value.to_le_bytes();
+    output.push(header);
+    output.extend_from_slice(&payload);
+}
+
+/// Appends the encoding of a string to `output`.
+///
+/// Strings shorter than 64 bytes use the short-string form, where the
+/// length lives in the header byte. Longer strings use the primitive
+/// `String` type: header byte, 4-byte little-endian length, then the
+/// UTF-8 bytes.
+///
+/// The long form narrows the length with an `as u32` cast, so a length
+/// above `u32::MAX` would be truncated.
+pub fn encode_string(value: &str, output: &mut Vec<u8>) {
+    let payload = value.as_bytes();
+    let byte_len = payload.len();
+
+    if byte_len < 64 {
+        // Short form: the length fits in the six upper header bits.
+        output.push(short_str_header(byte_len as u8));
+        output.extend_from_slice(payload);
+        return;
+    }
+
+    // Long form: primitive string header, explicit length, then the bytes.
+    output.push(primitive_header(VariantPrimitiveType::String as u8));
+    let length_prefix = (byte_len as u32).to_le_bytes();
+    output.extend_from_slice(&length_prefix);
+    output.extend_from_slice(payload);
+}
+
+/// Appends the encoding of a binary blob to `output`.
+///
+/// Layout: primitive `Binary` header byte, 4-byte little-endian length,
+/// then the raw bytes. The length is narrowed with an `as u32` cast, so a
+/// length above `u32::MAX` would be truncated.
+pub fn encode_binary(value: &[u8], output: &mut Vec<u8>) {
+    output.push(primitive_header(VariantPrimitiveType::Binary as u8));
+
+    let length_prefix = (value.len() as u32).to_le_bytes();
+    output.extend_from_slice(&length_prefix);
+    output.extend_from_slice(value);
+}
+
+/// Appends the encoding of a date (days since epoch) to `output`.
+///
+/// Layout: primitive `Date` header byte followed by the four
+/// little-endian bytes of `value`.
+pub fn encode_date(value: i32, output: &mut Vec<u8>) {
+    output.push(primitive_header(VariantPrimitiveType::Date as u8));
+    let payload = value.to_le_bytes();
+    output.extend_from_slice(&payload);
+}
+
+/// Encodes a timestamp value (microseconds since epoch)
+///
+/// The value is written with primitive type ID 12 (`Timestamp`), which the
+/// type table on [`VariantPrimitiveType`] maps to
+/// `TIMESTAMP(isAdjustedToUTC=true, MICROS)`. `value` is written verbatim,
+/// so callers must pass microseconds, not milliseconds.
+///
+/// Layout: header byte followed by the eight little-endian bytes of
+/// `value`.
+pub fn encode_timestamp(value: i64, output: &mut Vec<u8>) {
+    output.push(primitive_header(VariantPrimitiveType::Timestamp as u8));
+    output.extend_from_slice(&value.to_le_bytes());
+}
+
+/// Encodes a timestamp without timezone value (microseconds since epoch)
+///
+/// The value is written with primitive type ID 13 (`TimestampNTZ`), which
+/// the type table on [`VariantPrimitiveType`] maps to
+/// `TIMESTAMP(isAdjustedToUTC=false, MICROS)`. `value` is written verbatim,
+/// so callers must pass microseconds, not milliseconds.
+///
+/// Layout: header byte followed by the eight little-endian bytes of
+/// `value`.
+pub fn encode_timestamp_ntz(value: i64, output: &mut Vec<u8>) {
+    output.push(primitive_header(VariantPrimitiveType::TimestampNTZ as u8));
+    output.extend_from_slice(&value.to_le_bytes());
+}
+
+/// Encodes a time without timezone value (microseconds)
+///
+/// The value is written with primitive type ID 17 (`TimeNTZ`), which the
+/// type table on [`VariantPrimitiveType`] maps to
+/// `TIME(isAdjustedToUTC=false, MICROS)`. `value` is written verbatim, so
+/// callers must pass microseconds, not milliseconds.
+///
+/// Layout: header byte followed by the eight little-endian bytes of
+/// `value`.
+pub fn encode_time_ntz(value: i64, output: &mut Vec<u8>) {
+    output.push(primitive_header(VariantPrimitiveType::TimeNTZ as u8));
+    output.extend_from_slice(&value.to_le_bytes());
+}

Review Comment:
   Thanks @Weijun-H and @alamb! I've refactored the primitive encoding using
an `Encoder` trait for `VariantPrimitiveType` and set its visibility to
`pub(crate)`. How does the overall design look to you now?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to