Re: [PR] Add hooks to json encoder to override default encoding or add support for unsupported types [arrow-rs]

via GitHub Sat, 25 Jan 2025 07:55:52 -0800


tustvold commented on code in PR #7015:
URL: https://github.com/apache/arrow-rs/pull/7015#discussion_r1929562915



##########
arrow-json/src/writer/encoder.rs:
##########
@@ -24,11 +26,42 @@ use arrow_schema::{ArrowError, DataType, FieldRef};
 use half::f16;
 use lexical_core::FormattedSize;
 use serde::Serializer;
-use std::io::Write;
 
 #[derive(Debug, Clone, Default)]
 pub struct EncoderOptions {
     pub explicit_nulls: bool,
+    pub encoder_factory: Option<Arc<dyn EncoderFactory>>,
+}
+
+type EncoderFactoryResult<'a> =
+    Result<Option<(Box<dyn Encoder + 'a>, Option<NullBuffer>)>, ArrowError>;
+
+/// A trait to create custom encoders for specific data types.
+///
+/// This allows overriding the default encoders for specific data types,
+/// or adding new encoders for custom data types.
+pub trait EncoderFactory: std::fmt::Debug {
+    /// Make an encoder that if returned runs before all of the default 
encoders.
+    /// This can be used to override how e.g. binary data is encoded so that 
it is an encoded string or an array of integers.
+    fn make_default_encoder<'a>(
+        &self,
+        _array: &'a dyn Array,
+        _data_type: &DataType,

Review Comment:
   Why do we pass both DataType and Array?



##########
arrow-json/src/writer/encoder.rs:
##########
@@ -24,11 +26,42 @@ use arrow_schema::{ArrowError, DataType, FieldRef};
 use half::f16;
 use lexical_core::FormattedSize;
 use serde::Serializer;
-use std::io::Write;
 
 #[derive(Debug, Clone, Default)]
 pub struct EncoderOptions {
     pub explicit_nulls: bool,
+    pub encoder_factory: Option<Arc<dyn EncoderFactory>>,
+}
+
+type EncoderFactoryResult<'a> =
+    Result<Option<(Box<dyn Encoder + 'a>, Option<NullBuffer>)>, ArrowError>;
+
+/// A trait to create custom encoders for specific data types.
+///
+/// This allows overriding the default encoders for specific data types,
+/// or adding new encoders for custom data types.
+pub trait EncoderFactory: std::fmt::Debug {
+    /// Make an encoder that if returned runs before all of the default 
encoders.
+    /// This can be used to override how e.g. binary data is encoded so that 
it is an encoded string or an array of integers.
+    fn make_default_encoder<'a>(
+        &self,
+        _array: &'a dyn Array,
+        _data_type: &DataType,
+        _options: &EncoderOptions,
+    ) -> EncoderFactoryResult<'a> {
+        Ok(None)
+    }
+
+    /// Make an encoder that if returned runs after all of the default 
encoders.
+    /// If this method returns None then an error will be returned.
+    fn make_fallback_encoder<'a>(

Review Comment:
   Why do we need this? AFAICT we could remove this with no real loss of 
functionality, if anything it would be more resilient as adding a new default 
encoder wouldn't break existing overrides.



##########
arrow-json/src/writer/mod.rs:
##########
@@ -1953,4 +1968,317 @@ mod tests {
 "#,
         );
     }
+
+    fn make_fallback_encoder_test_data() -> (RecordBatch, Arc<dyn 
EncoderFactory>) {
+        // Note: this is not intended to be an efficient implementation.
+        // Just a simple example to demonstrate how to implement a custom 
encoder.
+        #[derive(Debug)]
+        enum UnionValue {
+            Null,
+            Int32(i32),
+            String(String),
+        }
+
+        #[derive(Debug)]
+        struct UnionEncoder {

Review Comment:
   FWIW adding a native UnionEncoder is likely not overly complicated 



##########
arrow-json/src/writer/mod.rs:
##########
@@ -426,10 +438,13 @@ mod tests {
 
         let actual: Vec<Option<Value>> = input
             .split(|b| *b == b'\n')
-            .map(|s| (!s.is_empty()).then(|| 
serde_json::from_slice(s).unwrap()))
+            .map(|s| {
+                println!("{:?}", str::from_utf8(s));

Review Comment:
   ?



##########
arrow-json/src/writer/mod.rs:
##########
@@ -1953,4 +1968,317 @@ mod tests {
 "#,
         );
     }
+
+    fn make_fallback_encoder_test_data() -> (RecordBatch, Arc<dyn 
EncoderFactory>) {
+        // Note: this is not intended to be an efficient implementation.
+        // Just a simple example to demonstrate how to implement a custom 
encoder.
+        #[derive(Debug)]
+        enum UnionValue {
+            Null,
+            Int32(i32),
+            String(String),
+        }
+
+        #[derive(Debug)]
+        struct UnionEncoder {
+            array: Vec<UnionValue>,
+        }
+
+        impl Encoder for UnionEncoder {
+            fn encode(&mut self, idx: usize, out: &mut Vec<u8>) {
+                match &self.array[idx] {
+                    UnionValue::Null => out.extend_from_slice(b"null"),
+                    UnionValue::Int32(v) => 
out.extend_from_slice(v.to_string().as_bytes()),
+                    UnionValue::String(v) => 
out.extend_from_slice(format!("\"{}\"", v).as_bytes()),
+                }
+            }
+        }
+
+        #[derive(Debug)]
+        struct UnionEncoderFactory;
+
+        impl EncoderFactory for UnionEncoderFactory {
+            fn make_fallback_encoder(
+                &self,
+                array: &dyn Array,
+                data_type: &DataType,
+                _options: &EncoderOptions,
+            ) -> Result<Option<(Box<dyn Encoder>, Option<NullBuffer>)>, 
ArrowError> {
+                let fields = match data_type {
+                    DataType::Union(fields, UnionMode::Sparse) => fields,
+                    _ => return Ok(None),
+                };
+                // check that the fields are supported
+                let fields = fields.iter().map(|(_, f)| f).collect::<Vec<_>>();
+                for f in fields.iter() {
+                    match f.data_type() {
+                        DataType::Null => {}
+                        DataType::Int32 => {}
+                        DataType::Utf8 => {}
+                        _ => return Ok(None),
+                    }
+                }
+                let (_, type_ids, _, buffers) = array
+                    .as_any()

Review Comment:
   Can we use AsArray instead of manual downcasts, it is much easier to read



##########
arrow-json/src/writer/encoder.rs:
##########
@@ -24,11 +26,42 @@ use arrow_schema::{ArrowError, DataType, FieldRef};
 use half::f16;
 use lexical_core::FormattedSize;
 use serde::Serializer;
-use std::io::Write;
 
 #[derive(Debug, Clone, Default)]
 pub struct EncoderOptions {
     pub explicit_nulls: bool,
+    pub encoder_factory: Option<Arc<dyn EncoderFactory>>,
+}
+
+type EncoderFactoryResult<'a> =
+    Result<Option<(Box<dyn Encoder + 'a>, Option<NullBuffer>)>, ArrowError>;

Review Comment:
   It probably could be for consistency, but it isn't likely to have any 
material performance impact



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] Add hooks to json encoder to override default encoding or add support for unsupported types [arrow-rs]

Reply via email to