This is an automated email from the ASF dual-hosted git repository.

etseidl pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 1377761779 Validate FIXED_LEN_BYTE_ARRAY length for DECIMAL and 
INTERVAL types (#9985)
1377761779 is described below

commit 1377761779afb1ce70a7a9a9038a308d2ff1ab88
Author: eunsang <[email protected]>
AuthorDate: Fri May 29 03:05:56 2026 +0900

    Validate FIXED_LEN_BYTE_ARRAY length for DECIMAL and INTERVAL types (#9985)
    
    ## Which issue does this PR close?
    
    - Closes #9984.
    
    ## Rationale for this change
    
    `from_fixed_len_byte_array` in `parquet/src/arrow/schema/primitive.rs`
    does not validate `type_length`. While `PrimitiveTypeBuilder::build()`
    enforces these constraints during schema construction
    (`schema/types.rs:477` for INTERVAL, `:565-580` for DECIMAL), schemas
    decoded directly from Thrift bypass that validation path entirely. As a
    result:
    
    - `DECIMAL` with a `type_length` outside `1..=32` was silently routed
    through `Decimal128` / `Decimal256` using invalid parameters.
    - `INTERVAL` with a `type_length != 12` silently returned
    `Interval(DayTime)` regardless.
    
    The same function already rejects `FLOAT16` when `type_length != 2`.
    This PR mirrors that pattern for DECIMAL and INTERVAL, closing the TODO
    introduced in #1682.
    
    ## What changes are included in this PR?
    
    - Added a `check_decimal_length` helper to reject `type_length` values
    outside `1..=32` for both `LogicalType::Decimal` and
    `ConvertedType::DECIMAL`.
    - Added an inline `type_length == 12` check for
    `ConvertedType::INTERVAL`.
    
    ## Are these changes tested?
    
    Yes. Added five new tests in
    `parquet/src/arrow/schema/primitive.rs::tests` covering:
    - Invalid lengths (`{-1, 0, 33}` for DECIMAL, `{0, 11, 13}` for
    INTERVAL)
    - Valid lengths (16 → `Decimal128`, 32 → `Decimal256`, 12 →
    `Interval(DayTime)`)
    
    To exercise the reader-side check, the tests construct a valid
    `Type::PrimitiveType` via the builder and directly modify the
    `type_length` on the resulting enum, simulating a malformed schema
    decoded from Thrift.
    
    ## Are there any user-facing changes?
    
    No public API changes. The only behavior change is on the reader side:
    schemas with an out-of-range `type_length` for DECIMAL or INTERVAL will
    now return a `ParquetError::General` instead of silently producing a
    mismatched Arrow type.
---
 parquet/src/arrow/schema/primitive.rs | 132 +++++++++++++++++++++++++++++++++-
 1 file changed, 131 insertions(+), 1 deletion(-)

diff --git a/parquet/src/arrow/schema/primitive.rs 
b/parquet/src/arrow/schema/primitive.rs
index 2272014a93..c70885355d 100644
--- a/parquet/src/arrow/schema/primitive.rs
+++ b/parquet/src/arrow/schema/primitive.rs
@@ -170,6 +170,16 @@ fn decimal_256_type(scale: i32, precision: i32) -> 
Result<DataType> {
     Ok(DataType::Decimal256(precision, scale))
 }
 
+#[allow(clippy::manual_range_contains)]
+fn check_decimal_length(type_length: i32) -> Result<()> {
+    if type_length < 1 || type_length > 32 {
+        return Err(ParquetError::General(format!(
+            "DECIMAL must be a Fixed Length Byte Array with length 1 to 32, 
got {type_length}"
+        )));
+    }
+    Ok(())
+}
+
 fn from_int32(info: &BasicTypeInfo, scale: i32, precision: i32) -> 
Result<DataType> {
     match (info.logical_type_ref(), info.converted_type()) {
         (None, ConvertedType::NONE) => Ok(DataType::Int32),
@@ -293,9 +303,10 @@ fn from_fixed_len_byte_array(
     precision: i32,
     type_length: i32,
 ) -> Result<DataType> {
-    // TODO: This should check the type length for the decimal and interval 
types
     match (info.logical_type_ref(), info.converted_type()) {
         (Some(LogicalType::Decimal(decimal)), _) => {
+            check_decimal_length(type_length)?;
+            // lengths 1..=16 map to Decimal128, 17..=32 to Decimal256
             if type_length <= 16 {
                 decimal_128_type(decimal.scale, decimal.precision)
             } else {
@@ -303,6 +314,7 @@ fn from_fixed_len_byte_array(
             }
         }
         (None, ConvertedType::DECIMAL) => {
+            check_decimal_length(type_length)?;
             if type_length <= 16 {
                 decimal_128_type(scale, precision)
             } else {
@@ -310,6 +322,11 @@ fn from_fixed_len_byte_array(
             }
         }
         (None, ConvertedType::INTERVAL) => {
+            if type_length != 12 {
+                return Err(ParquetError::General(format!(
+                    "INTERVAL must be a Fixed Length Byte Array with length 
12, got {type_length}"
+                )));
+            }
             // There is currently no reliable way of determining which 
IntervalUnit
             // to return. Thus without the original Arrow schema, the results
             // would be incorrect if all 12 bytes of the interval are populated
@@ -328,3 +345,116 @@ fn from_fixed_len_byte_array(
         _ => Ok(DataType::FixedSizeBinary(type_length)),
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::basic::{DecimalType, Repetition};
+    use crate::schema::types::Type;
+
+    // The PrimitiveTypeBuilder rejects bad lengths at construction. To 
exercise
+    // the reader-side checks, build a valid type then overwrite its 
type_length,
+    // simulating a schema decoded from a file that wasn't produced via the 
builder.
+    fn with_type_length(ty: Type, type_length: i32) -> Type {
+        match ty {
+            Type::PrimitiveType {
+                basic_info,
+                physical_type,
+                precision,
+                scale,
+                ..
+            } => Type::PrimitiveType {
+                basic_info,
+                physical_type,
+                type_length,
+                precision,
+                scale,
+            },
+            _ => unreachable!(),
+        }
+    }
+
+    fn flba_decimal_logical(type_length: i32) -> Type {
+        let valid = Type::primitive_type_builder("c", 
PhysicalType::FIXED_LEN_BYTE_ARRAY)
+            .with_repetition(Repetition::REQUIRED)
+            .with_logical_type(Some(LogicalType::Decimal(DecimalType {
+                precision: 5,
+                scale: 2,
+            })))
+            .with_length(16)
+            .with_precision(5)
+            .with_scale(2)
+            .build()
+            .unwrap();
+        with_type_length(valid, type_length)
+    }
+
+    fn flba_decimal_converted(type_length: i32) -> Type {
+        let valid = Type::primitive_type_builder("c", 
PhysicalType::FIXED_LEN_BYTE_ARRAY)
+            .with_repetition(Repetition::REQUIRED)
+            .with_converted_type(ConvertedType::DECIMAL)
+            .with_length(16)
+            .with_precision(5)
+            .with_scale(2)
+            .build()
+            .unwrap();
+        with_type_length(valid, type_length)
+    }
+
+    fn flba_interval(type_length: i32) -> Type {
+        let valid = Type::primitive_type_builder("c", 
PhysicalType::FIXED_LEN_BYTE_ARRAY)
+            .with_repetition(Repetition::REQUIRED)
+            .with_converted_type(ConvertedType::INTERVAL)
+            .with_length(12)
+            .build()
+            .unwrap();
+        with_type_length(valid, type_length)
+    }
+
+    fn assert_err_contains(ty: &Type, needle: &str) {
+        let err = convert_primitive(ty, None).expect_err("expected an error");
+        let msg = err.to_string();
+        assert!(msg.contains(needle), "expected {needle:?} in error: {msg}");
+    }
+
+    #[test]
+    fn decimal_logical_rejects_invalid_length() {
+        for bad in [-1, 0, 33] {
+            assert_err_contains(&flba_decimal_logical(bad), "DECIMAL");
+        }
+    }
+
+    #[test]
+    fn decimal_converted_rejects_invalid_length() {
+        for bad in [-1, 0, 33] {
+            assert_err_contains(&flba_decimal_converted(bad), "DECIMAL");
+        }
+    }
+
+    #[test]
+    fn decimal_accepts_valid_lengths() {
+        assert!(matches!(
+            convert_primitive(&flba_decimal_logical(16), None).unwrap(),
+            DataType::Decimal128(_, _)
+        ));
+        assert!(matches!(
+            convert_primitive(&flba_decimal_logical(32), None).unwrap(),
+            DataType::Decimal256(_, _)
+        ));
+    }
+
+    #[test]
+    fn interval_rejects_wrong_length() {
+        for bad in [0, 11, 13] {
+            assert_err_contains(&flba_interval(bad), "INTERVAL");
+        }
+    }
+
+    #[test]
+    fn interval_accepts_length_12() {
+        assert_eq!(
+            convert_primitive(&flba_interval(12), None).unwrap(),
+            DataType::Interval(IntervalUnit::DayTime)
+        );
+    }
+}

Reply via email to