This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new fe126d0ddb make_decoder accepts borrowed DataType instead of owned 
(#9270)
fe126d0ddb is described below

commit fe126d0ddb5a7629dcbfe240145a0f1ec5a365b5
Author: Ryan Johnson <[email protected]>
AuthorDate: Mon Jan 26 18:19:59 2026 -0700

    make_decoder accepts borrowed DataType instead of owned (#9270)
    
    # Which issue does this PR close?
    
    - Part of https://github.com/apache/arrow-rs/issues/8987
    
    # Rationale for this change
    
    Today's JSON decoder helper, `make_decoder`, takes an owned data type
    whose components are cloned at every level during the recursive decoder
    initialization process. This breaks pointer stability of the resulting
    `DataType` instances that a custom JSON decoder factory would see, vs.
    those of the schema it and the reader builder were initialized with.
    
    The lack of pointer stability prevents users from creating "path-based"
    decoder factories that are able to customize decoder behavior based not
    only on type, but also on the field's path in the schema. See the
    `PathBasedDecoderFactory` in arrow-json/tests/custom_decoder_tests.rs of
    https://github.com/apache/arrow-rs/pull/9259, for an example.
    
    # What changes are included in this PR?
    
    By passing `&DataType` instead, we change code like this:
    ```rust
    let decoder = make_decoder(field.data_type().clone(), ...);
    Ok(Self {
        data_type,
        decoder,
          ...
    })
    ```
    
    to this:
    ```rust
    let decoder = make_decoder(field.data_type(), ...);
    Ok(Self {
        data_type: data_type.clone(),
        decoder,
          ...
    })
    ```
    
    Result: Every call to `make_decoder` receives a reference to the actual
    original data type from the builder's input schema. The final decoder
    `Self` is unchanged -- it already received a clone and continues to do
    so.
    
    NOTE: There is one additional clone of the top-level `DataType::Struct`
    we create for normal (`!is_field`) builders. But that's a cheap `Arc`
    clone of a `Fields` member.
    
    # Are these changes tested?
    
    Yes, existing unit tests validate the change.
    
    # Are there any user-facing changes?
    
    No. All functions and data types involved are private -- the array
    decoders are marked `pub` but are defined in a private mod with no
    public re-export that would make them available outside the crate.
---
 arrow-json/src/reader/list_array.rs      |  8 ++++----
 arrow-json/src/reader/map_array.rs       | 10 +++++-----
 arrow-json/src/reader/mod.rs             | 24 +++++++++++++-----------
 arrow-json/src/reader/primitive_array.rs |  4 ++--
 arrow-json/src/reader/struct_array.rs    | 20 +++++++++++---------
 arrow-json/src/reader/timestamp_array.rs |  4 ++--
 6 files changed, 37 insertions(+), 33 deletions(-)

diff --git a/arrow-json/src/reader/list_array.rs 
b/arrow-json/src/reader/list_array.rs
index 81fe4e3ffa..d363b6be97 100644
--- a/arrow-json/src/reader/list_array.rs
+++ b/arrow-json/src/reader/list_array.rs
@@ -34,18 +34,18 @@ pub struct ListArrayDecoder<O> {
 impl<O: OffsetSizeTrait> ListArrayDecoder<O> {
     pub fn new(
         ctx: &DecoderContext,
-        data_type: DataType,
+        data_type: &DataType,
         is_nullable: bool,
     ) -> Result<Self, ArrowError> {
-        let field = match &data_type {
+        let field = match data_type {
             DataType::List(f) if !O::IS_LARGE => f,
             DataType::LargeList(f) if O::IS_LARGE => f,
             _ => unreachable!(),
         };
-        let decoder = ctx.make_decoder(field.data_type().clone(), 
field.is_nullable())?;
+        let decoder = ctx.make_decoder(field.data_type(), 
field.is_nullable())?;
 
         Ok(Self {
-            data_type,
+            data_type: data_type.clone(),
             decoder,
             phantom: Default::default(),
             is_nullable,
diff --git a/arrow-json/src/reader/map_array.rs 
b/arrow-json/src/reader/map_array.rs
index 3c19cb3eb5..ff22b588c5 100644
--- a/arrow-json/src/reader/map_array.rs
+++ b/arrow-json/src/reader/map_array.rs
@@ -33,10 +33,10 @@ pub struct MapArrayDecoder {
 impl MapArrayDecoder {
     pub fn new(
         ctx: &DecoderContext,
-        data_type: DataType,
+        data_type: &DataType,
         is_nullable: bool,
     ) -> Result<Self, ArrowError> {
-        let fields = match &data_type {
+        let fields = match data_type {
             DataType::Map(_, true) => {
                 return Err(ArrowError::NotYetImplemented(
                     "Decoding MapArray with sorted fields".to_string(),
@@ -53,11 +53,11 @@ impl MapArrayDecoder {
             _ => unreachable!(),
         };
 
-        let keys = ctx.make_decoder(fields[0].data_type().clone(), 
fields[0].is_nullable())?;
-        let values = ctx.make_decoder(fields[1].data_type().clone(), 
fields[1].is_nullable())?;
+        let keys = ctx.make_decoder(fields[0].data_type(), 
fields[0].is_nullable())?;
+        let values = ctx.make_decoder(fields[1].data_type(), 
fields[1].is_nullable())?;
 
         Ok(Self {
-            data_type,
+            data_type: data_type.clone(),
             keys,
             values,
             is_nullable,
diff --git a/arrow-json/src/reader/mod.rs b/arrow-json/src/reader/mod.rs
index 6b3ab0097f..1a63ea7565 100644
--- a/arrow-json/src/reader/mod.rs
+++ b/arrow-json/src/reader/mod.rs
@@ -137,6 +137,7 @@ use crate::StructMode;
 use crate::reader::binary_array::{
     BinaryArrayDecoder, BinaryViewDecoder, FixedSizeBinaryArrayDecoder,
 };
+use std::borrow::Cow;
 use std::io::BufRead;
 use std::sync::Arc;
 
@@ -295,12 +296,13 @@ impl ReaderBuilder {
 
     /// Create a [`Decoder`]
     pub fn build_decoder(self) -> Result<Decoder, ArrowError> {
-        let (data_type, nullable) = match self.is_field {
-            false => (DataType::Struct(self.schema.fields.clone()), false),
-            true => {
-                let field = &self.schema.fields[0];
-                (field.data_type().clone(), field.is_nullable())
-            }
+        let (data_type, nullable) = if self.is_field {
+            let field = &self.schema.fields[0];
+            let data_type = Cow::Borrowed(field.data_type());
+            (data_type, field.is_nullable())
+        } else {
+            let data_type = 
Cow::Owned(DataType::Struct(self.schema.fields.clone()));
+            (data_type, false)
         };
 
         let ctx = DecoderContext {
@@ -308,7 +310,7 @@ impl ReaderBuilder {
             strict_mode: self.strict_mode,
             struct_mode: self.struct_mode,
         };
-        let decoder = ctx.make_decoder(data_type, nullable)?;
+        let decoder = ctx.make_decoder(data_type.as_ref(), nullable)?;
 
         let num_fields = self.schema.flattened_fields().len();
 
@@ -713,7 +715,7 @@ impl DecoderContext {
     /// implementation.
     fn make_decoder(
         &self,
-        data_type: DataType,
+        data_type: &DataType,
         is_nullable: bool,
     ) -> Result<Box<dyn ArrayDecoder>, ArrowError> {
         make_decoder(self, data_type, is_nullable)
@@ -728,12 +730,12 @@ macro_rules! primitive_decoder {
 
 fn make_decoder(
     ctx: &DecoderContext,
-    data_type: DataType,
+    data_type: &DataType,
     is_nullable: bool,
 ) -> Result<Box<dyn ArrayDecoder>, ArrowError> {
     let coerce_primitive = ctx.coerce_primitive();
     downcast_integer! {
-        data_type => (primitive_decoder, data_type),
+        *data_type => (primitive_decoder, data_type),
         DataType::Null => Ok(Box::<NullArrayDecoder>::default()),
         DataType::Float16 => primitive_decoder!(Float16Type, data_type),
         DataType::Float32 => primitive_decoder!(Float32Type, data_type),
@@ -792,7 +794,7 @@ fn make_decoder(
         DataType::FixedSizeBinary(len) => 
Ok(Box::new(FixedSizeBinaryArrayDecoder::new(len))),
         DataType::BinaryView => Ok(Box::new(BinaryViewDecoder::default())),
         DataType::Map(_, _) => Ok(Box::new(MapArrayDecoder::new(ctx, 
data_type, is_nullable)?)),
-        d => Err(ArrowError::NotYetImplemented(format!("Support for {d} in 
JSON reader")))
+        _ => Err(ArrowError::NotYetImplemented(format!("Support for 
{data_type} in JSON reader")))
     }
 }
 
diff --git a/arrow-json/src/reader/primitive_array.rs 
b/arrow-json/src/reader/primitive_array.rs
index fa8464aa32..b2bffe45e4 100644
--- a/arrow-json/src/reader/primitive_array.rs
+++ b/arrow-json/src/reader/primitive_array.rs
@@ -80,9 +80,9 @@ pub struct PrimitiveArrayDecoder<P: ArrowPrimitiveType> {
 }
 
 impl<P: ArrowPrimitiveType> PrimitiveArrayDecoder<P> {
-    pub fn new(data_type: DataType) -> Self {
+    pub fn new(data_type: &DataType) -> Self {
         Self {
-            data_type,
+            data_type: data_type.clone(),
             phantom: Default::default(),
         }
     }
diff --git a/arrow-json/src/reader/struct_array.rs 
b/arrow-json/src/reader/struct_array.rs
index 915c0607fd..9191afb8e6 100644
--- a/arrow-json/src/reader/struct_array.rs
+++ b/arrow-json/src/reader/struct_array.rs
@@ -81,20 +81,22 @@ pub struct StructArrayDecoder {
 impl StructArrayDecoder {
     pub fn new(
         ctx: &DecoderContext,
-        data_type: DataType,
+        data_type: &DataType,
         is_nullable: bool,
     ) -> Result<Self, ArrowError> {
-        let struct_mode = ctx.struct_mode();
-        let fields = struct_fields(&data_type);
-
-        // If this struct nullable, need to permit nullability in child array
-        // StructArrayDecoder::decode verifies that if the child is not 
nullable
-        // it doesn't contain any nulls not masked by its parent
+        let fields = struct_fields(data_type);
         let decoders = fields
             .iter()
-            .map(|f| ctx.make_decoder(f.data_type().clone(), f.is_nullable() 
|| is_nullable))
+            .map(|f| {
+                // If this struct nullable, need to permit nullability in 
child array
+                // StructArrayDecoder::decode verifies that if the child is 
not nullable
+                // it doesn't contain any nulls not masked by its parent
+                let nullable = f.is_nullable() || is_nullable;
+                ctx.make_decoder(f.data_type(), nullable)
+            })
             .collect::<Result<Vec<_>, ArrowError>>()?;
 
+        let struct_mode = ctx.struct_mode();
         let field_name_to_index = if struct_mode == StructMode::ObjectOnly {
             build_field_index(fields)
         } else {
@@ -102,7 +104,7 @@ impl StructArrayDecoder {
         };
 
         Ok(Self {
-            data_type,
+            data_type: data_type.clone(),
             decoders,
             strict_mode: ctx.strict_mode(),
             is_nullable,
diff --git a/arrow-json/src/reader/timestamp_array.rs 
b/arrow-json/src/reader/timestamp_array.rs
index 79f2b04eeb..ff24d5391f 100644
--- a/arrow-json/src/reader/timestamp_array.rs
+++ b/arrow-json/src/reader/timestamp_array.rs
@@ -37,9 +37,9 @@ pub struct TimestampArrayDecoder<P: ArrowTimestampType, Tz: 
TimeZone> {
 }
 
 impl<P: ArrowTimestampType, Tz: TimeZone> TimestampArrayDecoder<P, Tz> {
-    pub fn new(data_type: DataType, timezone: Tz) -> Self {
+    pub fn new(data_type: &DataType, timezone: Tz) -> Self {
         Self {
-            data_type,
+            data_type: data_type.clone(),
             timezone,
             phantom: Default::default(),
         }

Reply via email to