scovich commented on code in PR #9021:
URL: https://github.com/apache/arrow-rs/pull/9021#discussion_r2724352921
##########
arrow-json/src/reader/mod.rs:
##########
@@ -373,6 +386,95 @@ impl<R: BufRead> RecordBatchReader for Reader<R> {
}
}
+/// A trait to create custom decoders for specific data types.
+///
+/// This allows overriding the default decoders for specific data types,
+/// or adding new decoders for custom data types.
+///
+/// # Examples
+///
+/// ```
+/// use arrow_json::{ArrayDecoder, DecoderFactory, TapeElement, Tape, ReaderBuilder, StructMode};
+/// use arrow_schema::ArrowError;
+/// use arrow_schema::{DataType, Field, FieldRef, Fields, Schema};
+/// use arrow_array::cast::AsArray;
+/// use arrow_array::Array;
+/// use arrow_array::builder::StringBuilder;
+/// use arrow_data::ArrayData;
+/// use std::sync::Arc;
+///
+/// struct IncorrectStringAsNullDecoder {}
+///
+/// impl ArrayDecoder for IncorrectStringAsNullDecoder {
+/// fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, ArrowError> {
+/// let mut builder = StringBuilder::new();
+/// for p in pos {
+/// match tape.get(*p) {
+/// TapeElement::String(idx) => {
+/// builder.append_value(tape.get_string(idx));
+/// }
+/// _ => builder.append_null(),
+/// }
+/// }
+/// Ok(builder.finish().into_data())
+/// }
+/// }
+///
+/// #[derive(Debug)]
+/// struct IncorrectStringAsNullDecoderFactory;
+///
+/// impl DecoderFactory for IncorrectStringAsNullDecoderFactory {
+/// fn make_default_decoder<'a>(
+/// &self,
+/// _field: Option<FieldRef>,
+/// data_type: DataType,
+/// _coerce_primitive: bool,
+/// _strict_mode: bool,
+/// _is_nullable: bool,
+/// _struct_mode: StructMode,
+/// ) -> Result<Option<Box<dyn ArrayDecoder>>, ArrowError> {
+/// match data_type {
+/// DataType::Utf8 => Ok(Some(Box::new(IncorrectStringAsNullDecoder {}))),
+/// _ => Ok(None),
+/// }
+/// }
+/// }
+///
+/// let json = r#"
+/// {"a": "a"}
+/// {"a": 12}
+/// "#;
+/// let batch = ReaderBuilder::new(Arc::new(Schema::new(Fields::from(vec![Field::new(
+/// "a",
+/// DataType::Utf8,
+/// true,
+/// )]))))
+/// .with_decoder_factory(Arc::new(IncorrectStringAsNullDecoderFactory))
+/// .build(json.as_bytes())
+/// .unwrap()
+/// .next()
+/// .unwrap()
+/// .unwrap();
+///
+/// let values = batch.column(0).as_string::<i32>();
+/// assert_eq!(values.len(), 2);
+/// assert_eq!(values.value(0), "a");
+/// assert!(values.is_null(1));
+/// ```
+pub trait DecoderFactory: std::fmt::Debug + Send + Sync {
Review Comment:
Exploratory PR here:
* https://github.com/apache/arrow-rs/pull/9259/
Although schema annotation _can_ work, I'm not actually convinced it's the
mechanism of choice after playing around with several options.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]