This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 22cf1ba998 Factor out json reader's static make_decoder args to a
struct (#9271)
22cf1ba998 is described below
commit 22cf1ba998bbf979888bda79f33f180b4ea19aba
Author: Ryan Johnson <[email protected]>
AuthorDate: Mon Jan 26 15:05:47 2026 -0700
Factor out json reader's static make_decoder args to a struct (#9271)
# Which issue does this PR close?
Part of https://github.com/apache/arrow-rs/issues/8987
# Rationale for this change
Supporting custom type decoders in JSON reads requires some ability to
create nested and/or delegating decoder factories. For example:
1. Quirks-mode decoding of a complex type that attempts to parse string
inputs to the requested type
* Falls back to the default decoder for non-string inputs
2. Convert complex type mismatches to NULL instead of error
* Use the default decoder only if the input type matches
3. Compound decoder factories, e.g.
* Support multiple extension types, each provided by their own factory
* Support a mix of "standard" extension type decoders and custom "quicks
mode" decoders
Such use cases mean decoder factories will need access to the
functionality currently provided by `make_decoder`, and the latter will
need to be aware of the top-level decoder factory in use, even if that
decoder factory has delegated to a more specific decoder factory who
would actually call `make_decoder`.
# What changes are included in this PR?
Create a new `DecoderContext` struct that bundles up the top-level args
that are passed unchanged through the entire schema traversal that
`make_decoder` performs. It reduces the arg count, and also gives a
place to store the decoder factory that `make_decoder` should eventually
know how to use, without churning function signatures a second time. It
will also help insulate the signature of the private `make_decoder` API
from the decoder factory API that will become public.
# Are these changes tested?
Existing unit tests validate the refactor.
# Are there any user-facing changes?
No. All methods and data types involved are private to the arrow-json
crate.
---
arrow-json/src/reader/list_array.rs | 15 ++------
arrow-json/src/reader/map_array.rs | 23 +++---------
arrow-json/src/reader/mod.rs | 68 +++++++++++++++++++++++++++--------
arrow-json/src/reader/struct_array.rs | 47 ++++++++++--------------
4 files changed, 79 insertions(+), 74 deletions(-)
diff --git a/arrow-json/src/reader/list_array.rs
b/arrow-json/src/reader/list_array.rs
index e74fef7917..81fe4e3ffa 100644
--- a/arrow-json/src/reader/list_array.rs
+++ b/arrow-json/src/reader/list_array.rs
@@ -15,9 +15,8 @@
// specific language governing permissions and limitations
// under the License.
-use crate::StructMode;
use crate::reader::tape::{Tape, TapeElement};
-use crate::reader::{ArrayDecoder, make_decoder};
+use crate::reader::{ArrayDecoder, DecoderContext};
use arrow_array::OffsetSizeTrait;
use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder};
use arrow_buffer::buffer::NullBuffer;
@@ -34,24 +33,16 @@ pub struct ListArrayDecoder<O> {
impl<O: OffsetSizeTrait> ListArrayDecoder<O> {
pub fn new(
+ ctx: &DecoderContext,
data_type: DataType,
- coerce_primitive: bool,
- strict_mode: bool,
is_nullable: bool,
- struct_mode: StructMode,
) -> Result<Self, ArrowError> {
let field = match &data_type {
DataType::List(f) if !O::IS_LARGE => f,
DataType::LargeList(f) if O::IS_LARGE => f,
_ => unreachable!(),
};
- let decoder = make_decoder(
- field.data_type().clone(),
- coerce_primitive,
- strict_mode,
- field.is_nullable(),
- struct_mode,
- )?;
+ let decoder = ctx.make_decoder(field.data_type().clone(),
field.is_nullable())?;
Ok(Self {
data_type,
diff --git a/arrow-json/src/reader/map_array.rs
b/arrow-json/src/reader/map_array.rs
index c2068577a0..3c19cb3eb5 100644
--- a/arrow-json/src/reader/map_array.rs
+++ b/arrow-json/src/reader/map_array.rs
@@ -15,9 +15,8 @@
// specific language governing permissions and limitations
// under the License.
-use crate::StructMode;
use crate::reader::tape::{Tape, TapeElement};
-use crate::reader::{ArrayDecoder, make_decoder};
+use crate::reader::{ArrayDecoder, DecoderContext};
use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder};
use arrow_buffer::ArrowNativeType;
use arrow_buffer::buffer::NullBuffer;
@@ -33,11 +32,9 @@ pub struct MapArrayDecoder {
impl MapArrayDecoder {
pub fn new(
+ ctx: &DecoderContext,
data_type: DataType,
- coerce_primitive: bool,
- strict_mode: bool,
is_nullable: bool,
- struct_mode: StructMode,
) -> Result<Self, ArrowError> {
let fields = match &data_type {
DataType::Map(_, true) => {
@@ -56,20 +53,8 @@ impl MapArrayDecoder {
_ => unreachable!(),
};
- let keys = make_decoder(
- fields[0].data_type().clone(),
- coerce_primitive,
- strict_mode,
- fields[0].is_nullable(),
- struct_mode,
- )?;
- let values = make_decoder(
- fields[1].data_type().clone(),
- coerce_primitive,
- strict_mode,
- fields[1].is_nullable(),
- struct_mode,
- )?;
+ let keys = ctx.make_decoder(fields[0].data_type().clone(),
fields[0].is_nullable())?;
+ let values = ctx.make_decoder(fields[1].data_type().clone(),
fields[1].is_nullable())?;
Ok(Self {
data_type,
diff --git a/arrow-json/src/reader/mod.rs b/arrow-json/src/reader/mod.rs
index f5fd1a8e7c..6b3ab0097f 100644
--- a/arrow-json/src/reader/mod.rs
+++ b/arrow-json/src/reader/mod.rs
@@ -303,13 +303,12 @@ impl ReaderBuilder {
}
};
- let decoder = make_decoder(
- data_type,
- self.coerce_primitive,
- self.strict_mode,
- nullable,
- self.struct_mode,
- )?;
+ let ctx = DecoderContext {
+ coerce_primitive: self.coerce_primitive,
+ strict_mode: self.strict_mode,
+ struct_mode: self.struct_mode,
+ };
+ let decoder = ctx.make_decoder(data_type, nullable)?;
let num_fields = self.schema.flattened_fields().len();
@@ -679,6 +678,48 @@ trait ArrayDecoder: Send {
fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData,
ArrowError>;
}
+/// Context for decoder creation, containing configuration.
+///
+/// This context is passed through the decoder creation process and contains
+/// all the configuration needed to create decoders recursively.
+pub struct DecoderContext {
+ /// Whether to coerce primitives to strings
+ coerce_primitive: bool,
+ /// Whether to validate struct fields strictly
+ strict_mode: bool,
+ /// How to decode struct fields
+ struct_mode: StructMode,
+}
+
+impl DecoderContext {
+ /// Returns whether to coerce primitive types (e.g., number to string)
+ pub fn coerce_primitive(&self) -> bool {
+ self.coerce_primitive
+ }
+
+ /// Returns whether to validate struct fields strictly
+ pub fn strict_mode(&self) -> bool {
+ self.strict_mode
+ }
+
+ /// Returns how to decode struct fields
+ pub fn struct_mode(&self) -> StructMode {
+ self.struct_mode
+ }
+
+ /// Create a decoder for a type.
+ ///
+ /// This is the standard way to create child decoders from within a decoder
+ /// implementation.
+ fn make_decoder(
+ &self,
+ data_type: DataType,
+ is_nullable: bool,
+ ) -> Result<Box<dyn ArrayDecoder>, ArrowError> {
+ make_decoder(self, data_type, is_nullable)
+ }
+}
+
macro_rules! primitive_decoder {
($t:ty, $data_type:expr) => {
Ok(Box::new(PrimitiveArrayDecoder::<$t>::new($data_type)))
@@ -686,12 +727,11 @@ macro_rules! primitive_decoder {
}
fn make_decoder(
+ ctx: &DecoderContext,
data_type: DataType,
- coerce_primitive: bool,
- strict_mode: bool,
is_nullable: bool,
- struct_mode: StructMode,
) -> Result<Box<dyn ArrayDecoder>, ArrowError> {
+ let coerce_primitive = ctx.coerce_primitive();
downcast_integer! {
data_type => (primitive_decoder, data_type),
DataType::Null => Ok(Box::<NullArrayDecoder>::default()),
@@ -744,14 +784,14 @@ fn make_decoder(
DataType::Utf8 =>
Ok(Box::new(StringArrayDecoder::<i32>::new(coerce_primitive))),
DataType::Utf8View =>
Ok(Box::new(StringViewArrayDecoder::new(coerce_primitive))),
DataType::LargeUtf8 =>
Ok(Box::new(StringArrayDecoder::<i64>::new(coerce_primitive))),
- DataType::List(_) =>
Ok(Box::new(ListArrayDecoder::<i32>::new(data_type, coerce_primitive,
strict_mode, is_nullable, struct_mode)?)),
- DataType::LargeList(_) =>
Ok(Box::new(ListArrayDecoder::<i64>::new(data_type, coerce_primitive,
strict_mode, is_nullable, struct_mode)?)),
- DataType::Struct(_) => Ok(Box::new(StructArrayDecoder::new(data_type,
coerce_primitive, strict_mode, is_nullable, struct_mode)?)),
+ DataType::List(_) => Ok(Box::new(ListArrayDecoder::<i32>::new(ctx,
data_type, is_nullable)?)),
+ DataType::LargeList(_) =>
Ok(Box::new(ListArrayDecoder::<i64>::new(ctx, data_type, is_nullable)?)),
+ DataType::Struct(_) => Ok(Box::new(StructArrayDecoder::new(ctx,
data_type, is_nullable)?)),
DataType::Binary => Ok(Box::new(BinaryArrayDecoder::<i32>::default())),
DataType::LargeBinary =>
Ok(Box::new(BinaryArrayDecoder::<i64>::default())),
DataType::FixedSizeBinary(len) =>
Ok(Box::new(FixedSizeBinaryArrayDecoder::new(len))),
DataType::BinaryView => Ok(Box::new(BinaryViewDecoder::default())),
- DataType::Map(_, _) => Ok(Box::new(MapArrayDecoder::new(data_type,
coerce_primitive, strict_mode, is_nullable, struct_mode)?)),
+ DataType::Map(_, _) => Ok(Box::new(MapArrayDecoder::new(ctx,
data_type, is_nullable)?)),
d => Err(ArrowError::NotYetImplemented(format!("Support for {d} in
JSON reader")))
}
}
diff --git a/arrow-json/src/reader/struct_array.rs
b/arrow-json/src/reader/struct_array.rs
index df0d5b8a5b..915c0607fd 100644
--- a/arrow-json/src/reader/struct_array.rs
+++ b/arrow-json/src/reader/struct_array.rs
@@ -16,7 +16,7 @@
// under the License.
use crate::reader::tape::{Tape, TapeElement};
-use crate::reader::{ArrayDecoder, StructMode, make_decoder};
+use crate::reader::{ArrayDecoder, DecoderContext, StructMode};
use arrow_array::builder::BooleanBufferBuilder;
use arrow_buffer::buffer::NullBuffer;
use arrow_data::{ArrayData, ArrayDataBuilder};
@@ -80,42 +80,31 @@ pub struct StructArrayDecoder {
impl StructArrayDecoder {
pub fn new(
+ ctx: &DecoderContext,
data_type: DataType,
- coerce_primitive: bool,
- strict_mode: bool,
is_nullable: bool,
- struct_mode: StructMode,
) -> Result<Self, ArrowError> {
- let (decoders, field_name_to_index) = {
- let fields = struct_fields(&data_type);
- let decoders = fields
- .iter()
- .map(|f| {
- // If this struct nullable, need to permit nullability in
child array
- // StructArrayDecoder::decode verifies that if the child
is not nullable
- // it doesn't contain any nulls not masked by its parent
- let nullable = f.is_nullable() || is_nullable;
- make_decoder(
- f.data_type().clone(),
- coerce_primitive,
- strict_mode,
- nullable,
- struct_mode,
- )
- })
- .collect::<Result<Vec<_>, ArrowError>>()?;
- let field_name_to_index = if struct_mode == StructMode::ObjectOnly
{
- build_field_index(fields)
- } else {
- None
- };
- (decoders, field_name_to_index)
+ let struct_mode = ctx.struct_mode();
+ let fields = struct_fields(&data_type);
+
+ // If this struct nullable, need to permit nullability in child array
+ // StructArrayDecoder::decode verifies that if the child is not
nullable
+ // it doesn't contain any nulls not masked by its parent
+ let decoders = fields
+ .iter()
+ .map(|f| ctx.make_decoder(f.data_type().clone(), f.is_nullable()
|| is_nullable))
+ .collect::<Result<Vec<_>, ArrowError>>()?;
+
+ let field_name_to_index = if struct_mode == StructMode::ObjectOnly {
+ build_field_index(fields)
+ } else {
+ None
};
Ok(Self {
data_type,
decoders,
- strict_mode,
+ strict_mode: ctx.strict_mode(),
is_nullable,
struct_mode,
field_name_to_index,