alamb commented on code in PR #8737:
URL: https://github.com/apache/arrow-rs/pull/8737#discussion_r2483658217
##########
arrow-json/src/lib.rs:
##########
@@ -261,4 +283,135 @@ mod tests {
assert_eq!(list_input, &list_output);
}
}
+
+ #[test]
+ fn test_json_roundtrip_binary() {
+ let values: [Option<&[u8]>; 3] = [
+ Some(b"Ned Flanders" as &[u8]),
Review Comment:
Can you please also add a value that is not valid UTF-8?
Perhaps something like:
https://github.com/apache/arrow-rs/blob/b8ae8e013d69816c1cdee5b2b6b8833a8b0c6a47/arrow-ipc/src/reader.rs#L3081-L3083
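For illustration, one way that could look (a sketch, not the exact bytes from the
linked arrow-ipc test; the extra value is kept at 12 bytes so the
`FixedSizeBinary(12)` case below still accepts it):
```rust
let values: [Option<&[u8]>; 4] = [
    Some(b"Ned Flanders" as &[u8]),
    None,
    Some(b"Troy McClure" as &[u8]),
    // 12 bytes that are not valid UTF-8 (0xFF never occurs in UTF-8),
    // so this value cannot simply be treated as a string
    Some(b"\xff\xfe\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09" as &[u8]),
];
```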
##########
arrow-json/src/lib.rs:
##########
@@ -261,4 +283,135 @@ mod tests {
assert_eq!(list_input, &list_output);
}
}
+
+ #[test]
+ fn test_json_roundtrip_binary() {
+ let values: [Option<&[u8]>; 3] = [
+ Some(b"Ned Flanders" as &[u8]),
+ None,
+ Some(b"Troy McClure" as &[u8]),
+ ];
+ // Binary:
+ {
+ let batch = build_array_binary::<i32>(&values);
+ assert_binary_json(&batch);
+ }
+ // LargeBinary:
+ {
+ let batch = build_array_binary::<i64>(&values);
+ assert_binary_json(&batch);
+ }
+ // FixedSizeBinary:
+ {
+ let batch = build_array_fixed_size_binary(12, &values);
+ assert_binary_json(&batch);
+ }
+ // BinaryView:
+ {
+ let batch = build_array_binary_view(&values);
+ assert_binary_json(&batch);
+ }
+ }
+
+ fn build_array_binary<O: OffsetSizeTrait>(values: &[Option<&[u8]>]) -> RecordBatch {
+ let schema = SchemaRef::new(Schema::new(vec![Field::new(
+ "bytes",
+ GenericBinaryType::<O>::DATA_TYPE,
+ true,
+ )]));
+ let mut builder = GenericByteBuilder::<GenericBinaryType<O>>::new();
+ for value in values {
+ match value {
+ Some(v) => builder.append_value(v),
+ None => builder.append_null(),
+ }
+ }
+ let array = Arc::new(builder.finish()) as ArrayRef;
+ RecordBatch::try_new(schema, vec![array]).unwrap()
+ }
Review Comment:
I think you can build arrays much more succinctly using this:
```suggestion
    fn build_array_binary<O: OffsetSizeTrait>(values: &[Option<&[u8]>]) -> RecordBatch {
        let array = GenericBinaryArray::<O>::from_iter(values);
        RecordBatch::try_from_iter(vec![(
            "bytes",
            Arc::new(array) as ArrayRef,
        )]).unwrap()
    }
```
You could probably avoid the duplication of RecordBatch creation as well by
changing `assert_binary_json` to take an `ArrayRef` and creating the batch
within it.
The same comment / style applies to the functions for the other array types
below, too.
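For illustration, a rough sketch of that shape (it assumes the types used in the
suggestion above, e.g. `GenericBinaryArray`, are imported in the test module; the
round-trip body itself is unchanged and elided here):
```rust
fn assert_binary_json(array: ArrayRef) {
    // build the single-column batch here instead of in every per-type helper
    let batch = RecordBatch::try_from_iter(vec![("bytes", array)]).unwrap();
    // ... run the existing write / decode round-trip assertions on `batch` ...
}
```
Call sites then shrink to e.g.
`assert_binary_json(Arc::new(GenericBinaryArray::<i32>::from_iter(values.iter().copied())));`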
##########
arrow-json/src/reader/binary_array.rs:
##########
@@ -0,0 +1,152 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow_array::builder::{
+ BinaryViewBuilder, FixedSizeBinaryBuilder, GenericBinaryBuilder, GenericStringBuilder,
+};
+use arrow_array::{Array, GenericStringArray, OffsetSizeTrait};
+use arrow_data::ArrayData;
+use arrow_schema::ArrowError;
+use std::marker::PhantomData;
+
+use crate::reader::ArrayDecoder;
+use crate::reader::tape::{Tape, TapeElement};
+
+/// Decode a hex-encoded string into bytes
+fn decode_hex_string(hex_string: &str) -> Result<Vec<u8>, ArrowError> {
+ let mut decoded = Vec::with_capacity(hex_string.len() / 2);
+ for substr in hex_string.as_bytes().chunks(2) {
+ let str = std::str::from_utf8(substr).map_err(|e| {
Review Comment:
I am pretty sure this code could be made much faster with a custom lookup
table rather than using `u8::from_str_radix`, etc.
That said, it would be a nice thing to improve in a future PR.
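For reference, a minimal sketch of the lookup-table idea (an assumption for such
a follow-up, not code from this PR; it keeps the `decode_hex_string` signature
used above and relies on the `ArrowError` import already at the top of the file):
```rust
/// Maps each ASCII byte to its hex digit value, or 0xFF for non-hex bytes
const HEX_TABLE: [u8; 256] = {
    let mut table = [0xFFu8; 256];
    let mut i = 0u8;
    while i < 10 {
        table[(b'0' + i) as usize] = i;
        i += 1;
    }
    let mut j = 0u8;
    while j < 6 {
        table[(b'a' + j) as usize] = 10 + j;
        table[(b'A' + j) as usize] = 10 + j;
        j += 1;
    }
    table
};

/// Decode a hex-encoded string into bytes using the table above
fn decode_hex_string(hex_string: &str) -> Result<Vec<u8>, ArrowError> {
    let bytes = hex_string.as_bytes();
    if bytes.len() % 2 != 0 {
        return Err(ArrowError::JsonError(format!(
            "hex string has odd length: {hex_string:?}"
        )));
    }
    let mut decoded = Vec::with_capacity(bytes.len() / 2);
    for pair in bytes.chunks_exact(2) {
        let (hi, lo) = (HEX_TABLE[pair[0] as usize], HEX_TABLE[pair[1] as usize]);
        if hi == 0xFF || lo == 0xFF {
            return Err(ArrowError::JsonError(format!(
                "invalid hex characters in {hex_string:?}"
            )));
        }
        decoded.push((hi << 4) | lo);
    }
    Ok(decoded)
}
```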
##########
arrow-json/src/lib.rs:
##########
@@ -261,4 +283,135 @@ mod tests {
assert_eq!(list_input, &list_output);
}
}
+
+ #[test]
+ fn test_json_roundtrip_binary() {
+ let values: [Option<&[u8]>; 3] = [
+ Some(b"Ned Flanders" as &[u8]),
+ None,
+ Some(b"Troy McClure" as &[u8]),
+ ];
+ // Binary:
+ {
+ let batch = build_array_binary::<i32>(&values);
+ assert_binary_json(&batch);
+ }
+ // LargeBinary:
+ {
+ let batch = build_array_binary::<i64>(&values);
+ assert_binary_json(&batch);
+ }
+ // FixedSizeBinary:
+ {
+ let batch = build_array_fixed_size_binary(12, &values);
+ assert_binary_json(&batch);
+ }
+ // BinaryView:
+ {
+ let batch = build_array_binary_view(&values);
+ assert_binary_json(&batch);
+ }
+ }
+
+ fn build_array_binary<O: OffsetSizeTrait>(values: &[Option<&[u8]>]) -> RecordBatch {
+ let schema = SchemaRef::new(Schema::new(vec![Field::new(
+ "bytes",
+ GenericBinaryType::<O>::DATA_TYPE,
+ true,
+ )]));
+ let mut builder = GenericByteBuilder::<GenericBinaryType<O>>::new();
+ for value in values {
+ match value {
+ Some(v) => builder.append_value(v),
+ None => builder.append_null(),
+ }
+ }
+ let array = Arc::new(builder.finish()) as ArrayRef;
+ RecordBatch::try_new(schema, vec![array]).unwrap()
+ }
+
+ fn build_array_binary_view(values: &[Option<&[u8]>]) -> RecordBatch {
+ let schema = SchemaRef::new(Schema::new(vec![Field::new(
+ "bytes",
+ DataType::BinaryView,
+ true,
+ )]));
+ let mut builder = BinaryViewBuilder::new();
+ for value in values {
+ match value {
+ Some(v) => builder.append_value(v),
+ None => builder.append_null(),
+ }
+ }
+ let array = Arc::new(builder.finish()) as ArrayRef;
+ RecordBatch::try_new(schema, vec![array]).unwrap()
+ }
+
+ fn build_array_fixed_size_binary(byte_width: i32, values: &[Option<&[u8]>]) -> RecordBatch {
+ let schema = SchemaRef::new(Schema::new(vec![Field::new(
+ "bytes",
+ DataType::FixedSizeBinary(byte_width),
+ true,
+ )]));
+ let mut builder = FixedSizeBinaryBuilder::new(byte_width);
+ for value in values {
+ match value {
+ Some(v) => builder.append_value(v).unwrap(),
+ None => builder.append_null(),
+ }
+ }
+ let array = Arc::new(builder.finish()) as ArrayRef;
+ RecordBatch::try_new(schema, vec![array]).unwrap()
+ }
+
+ fn assert_binary_json(batch: &RecordBatch) {
+ // encode and check JSON with explicit nulls:
+ {
+ let mut buf = Vec::new();
+ let json_value: Value = {
+ let mut writer = WriterBuilder::new()
+ .with_explicit_nulls(true)
+ .build::<_, JsonArray>(&mut buf);
+ writer.write(batch).unwrap();
+ writer.close().unwrap();
+ serde_json::from_slice(&buf).unwrap()
+ };
+
+ let json_array = json_value.as_array().unwrap();
+
+ let decoded = {
+ let mut decoder = ReaderBuilder::new(batch.schema().clone())
+ .build_decoder()
+ .unwrap();
+ decoder.serialize(json_array).unwrap();
+ decoder.flush().unwrap().unwrap()
+ };
+
+ assert_eq!(batch, &decoded);
+ }
+
+ // encode and check JSON with no explicit nulls:
+ {
+ let mut buf = Vec::new();
+ let json_value: Value = {
+ // explicit nulls are off by default, so we don't need
Review Comment:
I suggest setting it to `false` explicitly, to make it clear from the tests
alone that both paths are covered.
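i.e. something along these lines in the second branch (a small sketch mirroring
the explicit-nulls setup above):
```rust
// spell out the default so it is obvious both configurations are tested
let mut writer = WriterBuilder::new()
    .with_explicit_nulls(false)
    .build::<_, JsonArray>(&mut buf);
```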
##########
arrow-json/src/lib.rs:
##########
@@ -261,4 +283,135 @@ mod tests {
assert_eq!(list_input, &list_output);
}
}
+
+ #[test]
+ fn test_json_roundtrip_binary() {
+ let values: [Option<&[u8]>; 3] = [
+ Some(b"Ned Flanders" as &[u8]),
+ None,
+ Some(b"Troy McClure" as &[u8]),
+ ];
+ // Binary:
+ {
+ let batch = build_array_binary::<i32>(&values);
+ assert_binary_json(&batch);
+ }
+ // LargeBinary:
+ {
+ let batch = build_array_binary::<i64>(&values);
+ assert_binary_json(&batch);
+ }
+ // FixedSizeBinary:
+ {
+ let batch = build_array_fixed_size_binary(12, &values);
+ assert_binary_json(&batch);
+ }
+ // BinaryView:
+ {
+ let batch = build_array_binary_view(&values);
+ assert_binary_json(&batch);
+ }
+ }
+
+ fn build_array_binary<O: OffsetSizeTrait>(values: &[Option<&[u8]>]) -> RecordBatch {
+ let schema = SchemaRef::new(Schema::new(vec![Field::new(
+ "bytes",
+ GenericBinaryType::<O>::DATA_TYPE,
+ true,
+ )]));
+ let mut builder = GenericByteBuilder::<GenericBinaryType<O>>::new();
+ for value in values {
+ match value {
+ Some(v) => builder.append_value(v),
+ None => builder.append_null(),
+ }
+ }
+ let array = Arc::new(builder.finish()) as ArrayRef;
+ RecordBatch::try_new(schema, vec![array]).unwrap()
+ }
+
+ fn build_array_binary_view(values: &[Option<&[u8]>]) -> RecordBatch {
+ let schema = SchemaRef::new(Schema::new(vec![Field::new(
+ "bytes",
+ DataType::BinaryView,
+ true,
+ )]));
+ let mut builder = BinaryViewBuilder::new();
+ for value in values {
+ match value {
+ Some(v) => builder.append_value(v),
+ None => builder.append_null(),
+ }
+ }
+ let array = Arc::new(builder.finish()) as ArrayRef;
+ RecordBatch::try_new(schema, vec![array]).unwrap()
+ }
+
+ fn build_array_fixed_size_binary(byte_width: i32, values: &[Option<&[u8]>]) -> RecordBatch {
+ let schema = SchemaRef::new(Schema::new(vec![Field::new(
+ "bytes",
+ DataType::FixedSizeBinary(byte_width),
+ true,
+ )]));
+ let mut builder = FixedSizeBinaryBuilder::new(byte_width);
+ for value in values {
+ match value {
+ Some(v) => builder.append_value(v).unwrap(),
+ None => builder.append_null(),
+ }
+ }
+ let array = Arc::new(builder.finish()) as ArrayRef;
+ RecordBatch::try_new(schema, vec![array]).unwrap()
+ }
+
+ fn assert_binary_json(batch: &RecordBatch) {
+ // encode and check JSON with explicit nulls:
+ {
+ let mut buf = Vec::new();
Review Comment:
nit: I think it would be easier to follow what is happening if you extracted
the round-trip conversion logic into a function that takes a `WriterBuilder`,
and then set up the writer builder twice.
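For illustration, a rough sketch of that structure (the helper name
`roundtrip_json` is hypothetical, and the same test-module imports are assumed;
`assert_binary_json` keeps its current `&RecordBatch` signature here):
```rust
fn assert_binary_json(batch: &RecordBatch) {
    // run the identical round trip under both writer configurations
    roundtrip_json(batch, WriterBuilder::new().with_explicit_nulls(true));
    roundtrip_json(batch, WriterBuilder::new().with_explicit_nulls(false));
}

fn roundtrip_json(batch: &RecordBatch, builder: WriterBuilder) {
    // encode the batch to a JSON array with the given writer configuration
    let mut buf = Vec::new();
    let mut writer = builder.build::<_, JsonArray>(&mut buf);
    writer.write(batch).unwrap();
    writer.close().unwrap();

    // decode it back and check we get the original batch
    let json_value: Value = serde_json::from_slice(&buf).unwrap();
    let mut decoder = ReaderBuilder::new(batch.schema())
        .build_decoder()
        .unwrap();
    decoder.serialize(json_value.as_array().unwrap()).unwrap();
    assert_eq!(batch, &decoder.flush().unwrap().unwrap());
}
```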
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]