alamb commented on code in PR #7015: URL: https://github.com/apache/arrow-rs/pull/7015#discussion_r1999482887
########## arrow-json/src/writer/encoder.rs: ########## @@ -362,42 +480,36 @@ impl<O: OffsetSizeTrait> Encoder for ListEncoder<'_, O> { let end = self.offsets[idx + 1].as_usize(); let start = self.offsets[idx].as_usize(); out.push(b'['); - match self.nulls.as_ref() { - Some(n) => (start..end).for_each(|idx| { - if idx != start { - out.push(b',') - } - match n.is_null(idx) { - true => out.extend_from_slice(b"null"), - false => self.encoder.encode(idx, out), - } - }), - None => (start..end).for_each(|idx| { - if idx != start { - out.push(b',') - } + Review Comment: I think the previous formulation is important for performance: 1. It checks for nulls once per output list (rather than once per output list element) 2. The code for the `None` case is simpler and this more likely to be auto vectorized ########## arrow-json/src/writer/encoder.rs: ########## @@ -25,126 +27,230 @@ use arrow_schema::{ArrowError, DataType, FieldRef}; use half::f16; use lexical_core::FormattedSize; use serde::Serializer; -use std::io::Write; +/// Configuration options for the JSON encoder. Review Comment: I verified that this structure is not currently publically exported and thus this change is not a breaking API change: https://docs.rs/arrow-json/54.2.1/arrow_json/index.html?search=EncoderOptions ########## arrow-json/src/writer/mod.rs: ########## @@ -1953,4 +1968,317 @@ mod tests { "#, ); } + + fn make_fallback_encoder_test_data() -> (RecordBatch, Arc<dyn EncoderFactory>) { + // Note: this is not intended to be an efficient implementation. + // Just a simple example to demonstrate how to implement a custom encoder. + #[derive(Debug)] + enum UnionValue { + Null, + Int32(i32), + String(String), + } + + #[derive(Debug)] + struct UnionEncoder { Review Comment: Filed a ticket to track - https://github.com/apache/arrow-rs/issues/7302 ########## arrow-json/src/writer/encoder.rs: ########## @@ -25,126 +27,230 @@ use arrow_schema::{ArrowError, DataType, FieldRef}; use half::f16; use lexical_core::FormattedSize; use serde::Serializer; -use std::io::Write; +/// Configuration options for the JSON encoder. #[derive(Debug, Clone, Default)] pub struct EncoderOptions { - pub explicit_nulls: bool, - pub struct_mode: StructMode, + /// Whether to include nulls in the output or elide them. + explicit_nulls: bool, + /// Whether to encode structs as JSON objects or JSON arrays of their values. + struct_mode: StructMode, + /// An optional hook for customizing encoding behavior. + encoder_factory: Option<Arc<dyn EncoderFactory>>, +} + +impl EncoderOptions { + /// Set whether to include nulls in the output or elide them. + pub fn with_explicit_nulls(mut self, explicit_nulls: bool) -> Self { + self.explicit_nulls = explicit_nulls; + self + } + + /// Set whether to encode structs as JSON objects or JSON arrays of their values. + pub fn with_struct_mode(mut self, struct_mode: StructMode) -> Self { + self.struct_mode = struct_mode; + self + } + + /// Set an optional hook for customizing encoding behavior. + pub fn with_encoder_factory(mut self, encoder_factory: Arc<dyn EncoderFactory>) -> Self { + self.encoder_factory = Some(encoder_factory); + self + } + + /// Get whether to include nulls in the output or elide them. + pub fn explicit_nulls(&self) -> bool { + self.explicit_nulls + } + + /// Get whether to encode structs as JSON objects or JSON arrays of their values. + pub fn struct_mode(&self) -> StructMode { + self.struct_mode + } + + /// Get the optional hook for customizing encoding behavior. + pub fn encoder_factory(&self) -> Option<&Arc<dyn EncoderFactory>> { + self.encoder_factory.as_ref() + } +} + +/// A trait to create custom encoders for specific data types. +/// +/// This allows overriding the default encoders for specific data types, Review Comment: It would be sweet to add some toy doc example of using this `EncoderFactory` in a doc comment -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org