This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 220d0ea7e9 Implement hex decoding of JSON strings to binary arrays 
(#8737)
220d0ea7e9 is described below

commit 220d0ea7e9dadd02f59f8d990a2b9af99e586b06
Author: Philipp Oppermann <[email protected]>
AuthorDate: Thu Nov 6 20:49:59 2025 +0100

    Implement hex decoding of JSON strings to binary arrays (#8737)
    
    # Which issue does this PR close?
    
    - Closes https://github.com/apache/arrow-rs/issues/8736
    
    # Rationale for this change
    
    See linked issue.
    
    
    # What changes are included in this PR?
    
    Add JSON decoders for binary array variants that act as counterparts to
    https://github.com/apache/arrow-rs/pull/5622. This way, it becomes
    possible to do a full round-trip encoding/decoding of binary arrays.
    
    # Are these changes tested?
    
    I added a roundtrip test based on the existing `test_writer_binary` test.
    It verifies that encoding and then decoding leads to the original input again. It
    covers `Binary`, `LargeBinary`, `FixedSizeBinary`, and `BinaryView`
    arrays, all with and without explicit nulls.
    
    # Are there any user-facing changes?
    
    Yes, encoding and decoding binary arrays to/from JSON is now fully
    supported, given the right schema.
    
    One limitation is that schema inference is not able to detect binary
    arrays as they look like normal JSON strings after encoding. However,
    this is already true when encoding other Arrow types, for example it's
    not possible to differentiate integer bit widths.
    
    I updated the docs accordingly.
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 arrow-json/src/lib.rs                 | 103 +++++++++++++++++++++--
 arrow-json/src/reader/binary_array.rs | 152 ++++++++++++++++++++++++++++++++++
 arrow-json/src/reader/mod.rs          |  11 ++-
 arrow-json/src/reader/schema.rs       |   9 ++
 4 files changed, 266 insertions(+), 9 deletions(-)

diff --git a/arrow-json/src/lib.rs b/arrow-json/src/lib.rs
index f5a38bb7fc..1b18e00947 100644
--- a/arrow-json/src/lib.rs
+++ b/arrow-json/src/lib.rs
@@ -20,18 +20,28 @@
 //! See the module level documentation for the
 //! [`reader`] and [`writer`] for usage examples.
 //!
-//! # Binary Data
+//! # Binary Data uses `Base16` Encoding
 //!
-//! As per [RFC7159] JSON cannot encode arbitrary binary data. A common 
approach to workaround
-//! this is to use a [binary-to-text encoding] scheme, such as base64, to 
encode the
-//! input data and then decode it on output.
+//! As per [RFC7159] JSON cannot encode arbitrary binary data. This crate 
works around that
+//! limitation by encoding/decoding binary data as a [hexadecimal] string (i.e.
+//! [`Base16` encoding]).
+//!
+//! Note that `Base16` only has 50% space efficiency (i.e., the encoded data 
is twice as large
+//! as the original). If that is an issue, we recommend to convert binary data 
to/from a different
+//! encoding format such as `Base64` instead. See the following example for 
details.
+//!
+//! ## `Base64` Encoding Example
+//!
+//! [`Base64`] is a common [binary-to-text encoding] scheme with a space 
efficiency of 75%. The
+//! following example shows how to use the [`arrow_cast`] crate to encode 
binary data to `Base64`
+//! before converting it to JSON and how to decode it back.
 //!
 //! ```
 //! # use std::io::Cursor;
 //! # use std::sync::Arc;
 //! # use arrow_array::{BinaryArray, RecordBatch, StringArray};
 //! # use arrow_array::cast::AsArray;
-//! # use arrow_cast::base64::{b64_decode, b64_encode, BASE64_STANDARD};
+//! use arrow_cast::base64::{b64_decode, b64_encode, BASE64_STANDARD};
 //! # use arrow_json::{LineDelimitedWriter, ReaderBuilder};
 //! #
 //! // The data we want to write
@@ -61,7 +71,9 @@
 //!
 //! [RFC7159]: https://datatracker.ietf.org/doc/html/rfc7159#section-8.1
 //! [binary-to-text encoding]: 
https://en.wikipedia.org/wiki/Binary-to-text_encoding
-//!
+//! [hexadecimal]: https://en.wikipedia.org/wiki/Hexadecimal
+//! [`Base16` encoding]: https://en.wikipedia.org/wiki/Base16#Base16
+//! [`Base64`]: https://en.wikipedia.org/wiki/Base64
 
 #![doc(
     html_logo_url = 
"https://arrow.apache.org/img/arrow-logo_chevrons_black-txt_white-bg.svg";,
@@ -167,8 +179,16 @@ impl JsonSerializable for f64 {
 
 #[cfg(test)]
 mod tests {
+    use std::sync::Arc;
+
+    use crate::writer::JsonArray;
+
     use super::*;
 
+    use arrow_array::{
+        ArrayRef, GenericBinaryArray, GenericByteViewArray, RecordBatch, 
RecordBatchWriter,
+        builder::FixedSizeBinaryBuilder, types::BinaryViewType,
+    };
     use serde_json::Value::{Bool, Number as VNumber, String as VString};
 
     #[test]
@@ -261,4 +281,75 @@ mod tests {
             assert_eq!(list_input, &list_output);
         }
     }
+
+    #[test]
+    #[allow(invalid_from_utf8)]
+    fn test_json_roundtrip_binary() {
+        let not_utf8: &[u8] = b"Not UTF8 \xa0\xa1!";
+        assert!(str::from_utf8(not_utf8).is_err());
+
+        let values: &[Option<&[u8]>] = &[
+            Some(b"Ned Flanders" as &[u8]),
+            None,
+            Some(b"Troy McClure" as &[u8]),
+            Some(not_utf8),
+        ];
+        // Binary:
+        
assert_binary_json(Arc::new(GenericBinaryArray::<i32>::from_iter(values)));
+
+        // LargeBinary:
+        
assert_binary_json(Arc::new(GenericBinaryArray::<i64>::from_iter(values)));
+
+        // FixedSizeBinary:
+        assert_binary_json(build_array_fixed_size_binary(12, values));
+
+        // BinaryView:
+        
assert_binary_json(Arc::new(GenericByteViewArray::<BinaryViewType>::from_iter(
+            values,
+        )));
+    }
+
+    fn build_array_fixed_size_binary(byte_width: i32, values: 
&[Option<&[u8]>]) -> ArrayRef {
+        let mut builder = FixedSizeBinaryBuilder::new(byte_width);
+        for value in values {
+            match value {
+                Some(v) => builder.append_value(v).unwrap(),
+                None => builder.append_null(),
+            }
+        }
+        Arc::new(builder.finish())
+    }
+
+    fn assert_binary_json(array: ArrayRef) {
+        // encode and check JSON with and without explicit nulls
+        assert_binary_json_with_writer(
+            array.clone(),
+            WriterBuilder::new().with_explicit_nulls(true),
+        );
+        assert_binary_json_with_writer(array, 
WriterBuilder::new().with_explicit_nulls(false));
+    }
+
+    fn assert_binary_json_with_writer(array: ArrayRef, builder: WriterBuilder) 
{
+        let batch = RecordBatch::try_from_iter([("bytes", array)]).unwrap();
+
+        let mut buf = Vec::new();
+        let json_value: Value = {
+            let mut writer = builder.build::<_, JsonArray>(&mut buf);
+            writer.write(&batch).unwrap();
+            writer.close().unwrap();
+            serde_json::from_slice(&buf).unwrap()
+        };
+
+        let json_array = json_value.as_array().unwrap();
+
+        let decoded = {
+            let mut decoder = ReaderBuilder::new(batch.schema().clone())
+                .build_decoder()
+                .unwrap();
+            decoder.serialize(json_array).unwrap();
+            decoder.flush().unwrap().unwrap()
+        };
+
+        assert_eq!(batch, decoded);
+    }
 }
diff --git a/arrow-json/src/reader/binary_array.rs 
b/arrow-json/src/reader/binary_array.rs
new file mode 100644
index 0000000000..a71569d57f
--- /dev/null
+++ b/arrow-json/src/reader/binary_array.rs
@@ -0,0 +1,152 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow_array::builder::{
+    BinaryViewBuilder, FixedSizeBinaryBuilder, GenericBinaryBuilder, 
GenericStringBuilder,
+};
+use arrow_array::{Array, GenericStringArray, OffsetSizeTrait};
+use arrow_data::ArrayData;
+use arrow_schema::ArrowError;
+use std::marker::PhantomData;
+
+use crate::reader::ArrayDecoder;
+use crate::reader::tape::{Tape, TapeElement};
+
+/// Decode a hex-encoded string into bytes
+fn decode_hex_string(hex_string: &str) -> Result<Vec<u8>, ArrowError> {
+    let mut decoded = Vec::with_capacity(hex_string.len() / 2);
+    for substr in hex_string.as_bytes().chunks(2) {
+        let str = std::str::from_utf8(substr).map_err(|e| {
+            ArrowError::JsonError(format!("invalid utf8 in hex encoded binary 
data: {e}"))
+        })?;
+        let byte = u8::from_str_radix(str, 16).map_err(|e| {
+            ArrowError::JsonError(format!("invalid hex encoding in binary 
data: {e}"))
+        })?;
+        decoded.push(byte);
+    }
+    Ok(decoded)
+}
+
+#[derive(Default)]
+pub struct BinaryArrayDecoder<O: OffsetSizeTrait> {
+    phantom: PhantomData<O>,
+}
+
+impl<O: OffsetSizeTrait> ArrayDecoder for BinaryArrayDecoder<O> {
+    fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, 
ArrowError> {
+        let data_capacity = estimate_data_capacity(tape, pos)?;
+
+        if O::from_usize(data_capacity).is_none() {
+            return Err(ArrowError::JsonError(format!(
+                "offset overflow decoding {}",
+                GenericStringArray::<O>::DATA_TYPE
+            )));
+        }
+
+        let mut builder = GenericBinaryBuilder::<O>::with_capacity(pos.len(), 
data_capacity);
+
+        GenericStringBuilder::<O>::with_capacity(pos.len(), data_capacity);
+
+        for p in pos {
+            match tape.get(*p) {
+                TapeElement::String(idx) => {
+                    let string = tape.get_string(idx);
+                    let decoded = decode_hex_string(string)?;
+                    builder.append_value(&decoded);
+                }
+                TapeElement::Null => builder.append_null(),
+                _ => unreachable!(),
+            }
+        }
+
+        Ok(builder.finish().into_data())
+    }
+}
+
+#[derive(Default)]
+pub struct FixedSizeBinaryArrayDecoder {
+    len: i32,
+}
+
+impl FixedSizeBinaryArrayDecoder {
+    pub fn new(len: i32) -> Self {
+        Self { len }
+    }
+}
+
+impl ArrayDecoder for FixedSizeBinaryArrayDecoder {
+    fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, 
ArrowError> {
+        let mut builder = FixedSizeBinaryBuilder::with_capacity(pos.len(), 
self.len);
+
+        for p in pos {
+            match tape.get(*p) {
+                TapeElement::String(idx) => {
+                    let string = tape.get_string(idx);
+                    let decoded = decode_hex_string(string)?;
+                    builder.append_value(&decoded)?;
+                }
+                TapeElement::Null => builder.append_null(),
+                _ => unreachable!(),
+            }
+        }
+
+        Ok(builder.finish().into_data())
+    }
+}
+
+#[derive(Default)]
+pub struct BinaryViewDecoder {}
+
+impl ArrayDecoder for BinaryViewDecoder {
+    fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, 
ArrowError> {
+        let data_capacity = estimate_data_capacity(tape, pos)?;
+        let mut builder = BinaryViewBuilder::with_capacity(data_capacity);
+
+        for p in pos {
+            match tape.get(*p) {
+                TapeElement::String(idx) => {
+                    let string = tape.get_string(idx);
+                    let decoded = decode_hex_string(string)?;
+                    builder.append_value(&decoded);
+                }
+                TapeElement::Null => builder.append_null(),
+                _ => unreachable!(),
+            }
+        }
+
+        Ok(builder.finish().into_data())
+    }
+}
+
+fn estimate_data_capacity(tape: &Tape<'_>, pos: &[u32]) -> Result<usize, 
ArrowError> {
+    let mut data_capacity = 0;
+    for p in pos {
+        match tape.get(*p) {
+            TapeElement::String(idx) => {
+                let string_len = tape.get_string(idx).len();
+                // two hex characters represent one byte
+                let decoded_len = string_len / 2;
+                data_capacity += decoded_len;
+            }
+            TapeElement::Null => {}
+            _ => {
+                return Err(tape.error(*p, "binary data encoded as string"));
+            }
+        }
+    }
+    Ok(data_capacity)
+}
diff --git a/arrow-json/src/reader/mod.rs b/arrow-json/src/reader/mod.rs
index c47aa65f81..f5fd1a8e7c 100644
--- a/arrow-json/src/reader/mod.rs
+++ b/arrow-json/src/reader/mod.rs
@@ -134,6 +134,9 @@
 //!
 
 use crate::StructMode;
+use crate::reader::binary_array::{
+    BinaryArrayDecoder, BinaryViewDecoder, FixedSizeBinaryArrayDecoder,
+};
 use std::io::BufRead;
 use std::sync::Arc;
 
@@ -159,6 +162,7 @@ use crate::reader::struct_array::StructArrayDecoder;
 use crate::reader::tape::{Tape, TapeDecoder};
 use crate::reader::timestamp_array::TimestampArrayDecoder;
 
+mod binary_array;
 mod boolean_array;
 mod decimal_array;
 mod list_array;
@@ -743,9 +747,10 @@ fn make_decoder(
         DataType::List(_) => 
Ok(Box::new(ListArrayDecoder::<i32>::new(data_type, coerce_primitive, 
strict_mode, is_nullable, struct_mode)?)),
         DataType::LargeList(_) => 
Ok(Box::new(ListArrayDecoder::<i64>::new(data_type, coerce_primitive, 
strict_mode, is_nullable, struct_mode)?)),
         DataType::Struct(_) => Ok(Box::new(StructArrayDecoder::new(data_type, 
coerce_primitive, strict_mode, is_nullable, struct_mode)?)),
-        DataType::Binary | DataType::LargeBinary | 
DataType::FixedSizeBinary(_) => {
-            Err(ArrowError::JsonError(format!("{data_type} is not supported by 
JSON")))
-        }
+        DataType::Binary => Ok(Box::new(BinaryArrayDecoder::<i32>::default())),
+        DataType::LargeBinary => 
Ok(Box::new(BinaryArrayDecoder::<i64>::default())),
+        DataType::FixedSizeBinary(len) => 
Ok(Box::new(FixedSizeBinaryArrayDecoder::new(len))),
+        DataType::BinaryView => Ok(Box::new(BinaryViewDecoder::default())),
         DataType::Map(_, _) => Ok(Box::new(MapArrayDecoder::new(data_type, 
coerce_primitive, strict_mode, is_nullable, struct_mode)?)),
         d => Err(ArrowError::NotYetImplemented(format!("Support for {d} in 
JSON reader")))
     }
diff --git a/arrow-json/src/reader/schema.rs b/arrow-json/src/reader/schema.rs
index c29a7bbe1a..fb7d93a85e 100644
--- a/arrow-json/src/reader/schema.rs
+++ b/arrow-json/src/reader/schema.rs
@@ -250,6 +250,15 @@ pub fn infer_json_schema_from_seekable<R: BufRead + Seek>(
 /// original file's cursor. This function is useful when the `reader`'s cursor 
is not available
 /// (does not implement [`Seek`]), such is the case for compressed streams 
decoders.
 ///
+///
+/// Note that JSON is not able to represent all Arrow data types exactly. So 
the inferred schema
+/// might be different from the schema of the original data that was encoded 
as JSON. For example,
+/// JSON does not have different integer types, so all integers are inferred 
as `Int64`. Another
+/// example is binary data, which is encoded as a [Base16] string in JSON and 
therefore inferred
+/// as String type by this function.
+///
+/// [Base16]: https://en.wikipedia.org/wiki/Base16#Base16
+///
 /// # Examples
 /// ```
 /// use std::fs::File;

Reply via email to