This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new b07fd5dc4e4 feat(arrow-json): encode `Binary` and `LargeBinary` types as hex when writing JSON (#5785)
b07fd5dc4e4 is described below

commit b07fd5dc4e4ecddb9bb3b6ed0304217f0251a6ac
Author: Trevor Hilton <[email protected]>
AuthorDate: Tue May 21 03:14:19 2024 -0400

    feat(arrow-json): encode `Binary` and `LargeBinary` types as hex when writing JSON (#5785)
    
    * feat: encode Binary and LargeBinary types in JSON as hex
    
    Added ability to the JSON writer to encode Binary and LargeBinary types
    as hex. This follows the behaviour for FixedSizeBinary.
    
    A test was added to check functionality for both Binary and LargeBinary.
    
    * refactor: use ArrayAccessor instead of custom trait
    
    * refactor: use generic in test instead of macro
    
    * refactor: use const DATA_TYPE from GenericBinaryType
---
 arrow-json/src/writer.rs         | 83 ++++++++++++++++++++++++++++++++++++++++
 arrow-json/src/writer/encoder.rs | 28 +++++++++++---
 2 files changed, 106 insertions(+), 5 deletions(-)

diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs
index c9619f80b4f..ef4141d7ab2 100644
--- a/arrow-json/src/writer.rs
+++ b/arrow-json/src/writer.rs
@@ -1565,6 +1565,89 @@ mod tests {
         Ok(())
     }
 
+    fn binary_encoding_test<O: OffsetSizeTrait>() {
+        // set up schema
+        let schema = SchemaRef::new(Schema::new(vec![Field::new(
+            "bytes",
+            GenericBinaryType::<O>::DATA_TYPE,
+            true,
+        )]));
+
+        // build record batch:
+        let mut builder = GenericByteBuilder::<GenericBinaryType<O>>::new();
+        let values = [Some(b"Ned Flanders"), None, Some(b"Troy McClure")];
+        for value in values {
+            match value {
+                Some(v) => builder.append_value(v),
+                None => builder.append_null(),
+            }
+        }
+        let array = Arc::new(builder.finish()) as ArrayRef;
+        let batch = RecordBatch::try_new(schema, vec![array]).unwrap();
+
+        // encode and check JSON with explicit nulls:
+        {
+            let mut buf = Vec::new();
+            let json_value: Value = {
+                let mut writer = WriterBuilder::new()
+                    .with_explicit_nulls(true)
+                    .build::<_, JsonArray>(&mut buf);
+                writer.write(&batch).unwrap();
+                writer.close().unwrap();
+                serde_json::from_slice(&buf).unwrap()
+            };
+
+            assert_eq!(
+                json!([
+                    {
+                        "bytes": "4e656420466c616e64657273"
+                    },
+                    {
+                        "bytes": null // the explicit null
+                    },
+                    {
+                        "bytes": "54726f79204d63436c757265"
+                    }
+                ]),
+                json_value,
+            );
+        }
+
+        // encode and check JSON with no explicit nulls:
+        {
+            let mut buf = Vec::new();
+            let json_value: Value = {
+                // explicit nulls are off by default, so we don't need
+                // to set that when creating the writer:
+                let mut writer = ArrayWriter::new(&mut buf);
+                writer.write(&batch).unwrap();
+                writer.close().unwrap();
+                serde_json::from_slice(&buf).unwrap()
+            };
+
+            assert_eq!(
+                json!([
+                    {
+                        "bytes": "4e656420466c616e64657273"
+                    },
+                    {}, // empty because nulls are omitted
+                    {
+                        "bytes": "54726f79204d63436c757265"
+                    }
+                ]),
+                json_value
+            );
+        }
+    }
+
+    #[test]
+    fn test_writer_binary() {
+        // Binary:
+        binary_encoding_test::<i32>();
+        // LargeBinary:
+        binary_encoding_test::<i64>();
+    }
+
     #[test]
     fn test_writer_fixed_size_binary() {
         // set up schema:
diff --git a/arrow-json/src/writer/encoder.rs b/arrow-json/src/writer/encoder.rs
index 056ddf3dd96..9b6c9418fa0 100644
--- a/arrow-json/src/writer/encoder.rs
+++ b/arrow-json/src/writer/encoder.rs
@@ -105,7 +105,17 @@ fn make_encoder_impl<'a>(
 
         DataType::FixedSizeBinary(_) => {
             let array = array.as_fixed_size_binary();
-            (Box::new(FixedSizeBinaryEncoder::new(array)) as _, array.nulls().cloned())
+            (Box::new(BinaryEncoder::new(array)) as _, array.nulls().cloned())
+        }
+
+        DataType::Binary => {
+            let array: &BinaryArray = array.as_binary();
+            (Box::new(BinaryEncoder::new(array)) as _, array.nulls().cloned())
+        }
+
+        DataType::LargeBinary => {
+            let array: &LargeBinaryArray = array.as_binary();
+            (Box::new(BinaryEncoder::new(array)) as _, array.nulls().cloned())
         }
 
         DataType::Struct(fields) => {
@@ -509,15 +519,23 @@ impl<'a> Encoder for MapEncoder<'a> {
     }
 }
 
-struct FixedSizeBinaryEncoder<'a>(&'a FixedSizeBinaryArray);
+/// New-type wrapper for encoding the binary types in arrow: `Binary`, `LargeBinary`
+/// and `FixedSizeBinary` as hex strings in JSON.
+struct BinaryEncoder<B>(B);
 
-impl<'a> FixedSizeBinaryEncoder<'a> {
-    fn new(array: &'a FixedSizeBinaryArray) -> Self {
+impl<'a, B> BinaryEncoder<B>
+where
+    B: ArrayAccessor<Item = &'a [u8]>,
+{
+    fn new(array: B) -> Self {
         Self(array)
     }
 }
 
-impl<'a> Encoder for FixedSizeBinaryEncoder<'a> {
+impl<'a, B> Encoder for BinaryEncoder<B>
+where
+    B: ArrayAccessor<Item = &'a [u8]>,
+{
     fn encode(&mut self, idx: usize, out: &mut Vec<u8>) {
         out.push(b'"');
         for byte in self.0.value(idx) {

Reply via email to