This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 220d0ea7e9 Implement hex decoding of JSON strings to binary arrays
(#8737)
220d0ea7e9 is described below
commit 220d0ea7e9dadd02f59f8d990a2b9af99e586b06
Author: Philipp Oppermann <[email protected]>
AuthorDate: Thu Nov 6 20:49:59 2025 +0100
Implement hex decoding of JSON strings to binary arrays (#8737)
# Which issue does this PR close?
- Closes https://github.com/apache/arrow-rs/issues/8736
# Rationale for this change
See linked issue.
# What changes are included in this PR?
Add JSON decoders for binary array variants that act as counterparts to
https://github.com/apache/arrow-rs/pull/5622. This way, it becomes
possible to do a full round-trip encoding/decoding of binary array.
# Are these changes tested?
I added a roundtrip test based on `test_writer_binary`. It verifies
that encoding and then decoding leads to the original input again. It
covers `Binary`, `LargeBinary`, `FixedSizeBinary`, and `BinaryView`
arrays, all with and without explicit nulls.
# Are there any user-facing changes?
Yes, encoding and decoding binary arrays to/from JSON is now fully
supported, given the right schema.
One limitation is that schema inference is not able to detect binary
arrays as they look like normal JSON strings after encoding. However,
this is already true when encoding other Arrow types; for example, it's
not possible to differentiate integer bit widths.
I updated the docs accordingly.
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
arrow-json/src/lib.rs | 103 +++++++++++++++++++++--
arrow-json/src/reader/binary_array.rs | 152 ++++++++++++++++++++++++++++++++++
arrow-json/src/reader/mod.rs | 11 ++-
arrow-json/src/reader/schema.rs | 9 ++
4 files changed, 266 insertions(+), 9 deletions(-)
diff --git a/arrow-json/src/lib.rs b/arrow-json/src/lib.rs
index f5a38bb7fc..1b18e00947 100644
--- a/arrow-json/src/lib.rs
+++ b/arrow-json/src/lib.rs
@@ -20,18 +20,28 @@
//! See the module level documentation for the
//! [`reader`] and [`writer`] for usage examples.
//!
-//! # Binary Data
+//! # Binary Data uses `Base16` Encoding
//!
-//! As per [RFC7159] JSON cannot encode arbitrary binary data. A common
approach to workaround
-//! this is to use a [binary-to-text encoding] scheme, such as base64, to
encode the
-//! input data and then decode it on output.
+//! As per [RFC7159] JSON cannot encode arbitrary binary data. This crate
works around that
+//! limitation by encoding/decoding binary data as a [hexadecimal] string (i.e.
+//! [`Base16` encoding]).
+//!
+//! Note that `Base16` only has 50% space efficiency (i.e., the encoded data
is twice as large
+//! as the original). If that is an issue, we recommend converting binary data
to/from a different
+//! encoding format such as `Base64` instead. See the following example for
details.
+//!
+//! ## `Base64` Encoding Example
+//!
+//! [`Base64`] is a common [binary-to-text encoding] scheme with a space
efficiency of 75%. The
+//! following example shows how to use the [`arrow_cast`] crate to encode
binary data to `Base64`
+//! before converting it to JSON and how to decode it back.
//!
//! ```
//! # use std::io::Cursor;
//! # use std::sync::Arc;
//! # use arrow_array::{BinaryArray, RecordBatch, StringArray};
//! # use arrow_array::cast::AsArray;
-//! # use arrow_cast::base64::{b64_decode, b64_encode, BASE64_STANDARD};
+//! use arrow_cast::base64::{b64_decode, b64_encode, BASE64_STANDARD};
//! # use arrow_json::{LineDelimitedWriter, ReaderBuilder};
//! #
//! // The data we want to write
@@ -61,7 +71,9 @@
//!
//! [RFC7159]: https://datatracker.ietf.org/doc/html/rfc7159#section-8.1
//! [binary-to-text encoding]:
https://en.wikipedia.org/wiki/Binary-to-text_encoding
-//!
+//! [hexadecimal]: https://en.wikipedia.org/wiki/Hexadecimal
+//! [`Base16` encoding]: https://en.wikipedia.org/wiki/Base16#Base16
+//! [`Base64`]: https://en.wikipedia.org/wiki/Base64
#![doc(
html_logo_url =
"https://arrow.apache.org/img/arrow-logo_chevrons_black-txt_white-bg.svg",
@@ -167,8 +179,16 @@ impl JsonSerializable for f64 {
#[cfg(test)]
mod tests {
+ use std::sync::Arc;
+
+ use crate::writer::JsonArray;
+
use super::*;
+ use arrow_array::{
+ ArrayRef, GenericBinaryArray, GenericByteViewArray, RecordBatch,
RecordBatchWriter,
+ builder::FixedSizeBinaryBuilder, types::BinaryViewType,
+ };
use serde_json::Value::{Bool, Number as VNumber, String as VString};
#[test]
@@ -261,4 +281,75 @@ mod tests {
assert_eq!(list_input, &list_output);
}
}
+
    #[test]
    #[allow(invalid_from_utf8)]
    fn test_json_roundtrip_binary() {
        // Deliberately invalid UTF-8: this data can only round-trip through
        // JSON via the binary (hex) path, never as a plain string.
        let not_utf8: &[u8] = b"Not UTF8 \xa0\xa1!";
        assert!(str::from_utf8(not_utf8).is_err());

        // Mix of valid-ASCII values, an explicit null, and non-UTF-8 bytes.
        // All Some(..) values are 12 bytes long so the same slice can also be
        // used for the FixedSizeBinary(12) case below.
        let values: &[Option<&[u8]>] = &[
            Some(b"Ned Flanders" as &[u8]),
            None,
            Some(b"Troy McClure" as &[u8]),
            Some(not_utf8),
        ];
        // Binary:
        assert_binary_json(Arc::new(GenericBinaryArray::<i32>::from_iter(values)));

        // LargeBinary:
        assert_binary_json(Arc::new(GenericBinaryArray::<i64>::from_iter(values)));

        // FixedSizeBinary:
        assert_binary_json(build_array_fixed_size_binary(12, values));

        // BinaryView:
        assert_binary_json(Arc::new(GenericByteViewArray::<BinaryViewType>::from_iter(
            values,
        )));
    }
+
+ fn build_array_fixed_size_binary(byte_width: i32, values:
&[Option<&[u8]>]) -> ArrayRef {
+ let mut builder = FixedSizeBinaryBuilder::new(byte_width);
+ for value in values {
+ match value {
+ Some(v) => builder.append_value(v).unwrap(),
+ None => builder.append_null(),
+ }
+ }
+ Arc::new(builder.finish())
+ }
+
+ fn assert_binary_json(array: ArrayRef) {
+ // encode and check JSON with and without explicit nulls
+ assert_binary_json_with_writer(
+ array.clone(),
+ WriterBuilder::new().with_explicit_nulls(true),
+ );
+ assert_binary_json_with_writer(array,
WriterBuilder::new().with_explicit_nulls(false));
+ }
+
    /// Encode `array` to a JSON array with `builder`, decode it back using the
    /// batch's own schema, and assert the decoded batch equals the original.
    fn assert_binary_json_with_writer(array: ArrayRef, builder: WriterBuilder) {
        let batch = RecordBatch::try_from_iter([("bytes", array)]).unwrap();

        // Write the batch as a single JSON array and parse it back into a
        // serde_json Value so it can be fed to the decoder.
        let mut buf = Vec::new();
        let json_value: Value = {
            let mut writer = builder.build::<_, JsonArray>(&mut buf);
            writer.write(&batch).unwrap();
            writer.close().unwrap();
            serde_json::from_slice(&buf).unwrap()
        };

        let json_array = json_value.as_array().unwrap();

        // Decode using the original schema (binary types are not inferable
        // from JSON, so the schema must be supplied explicitly).
        let decoded = {
            let mut decoder = ReaderBuilder::new(batch.schema().clone())
                .build_decoder()
                .unwrap();
            decoder.serialize(json_array).unwrap();
            // Outer unwrap: Result; inner unwrap: Option (Some since we fed data).
            decoder.flush().unwrap().unwrap()
        };

        assert_eq!(batch, decoded);
    }
}
diff --git a/arrow-json/src/reader/binary_array.rs
b/arrow-json/src/reader/binary_array.rs
new file mode 100644
index 0000000000..a71569d57f
--- /dev/null
+++ b/arrow-json/src/reader/binary_array.rs
@@ -0,0 +1,152 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
use arrow_array::builder::{
    BinaryViewBuilder, FixedSizeBinaryBuilder, GenericBinaryBuilder, GenericStringBuilder,
};
use arrow_array::{Array, GenericBinaryArray, GenericStringArray, OffsetSizeTrait};
use arrow_data::ArrayData;
use arrow_schema::ArrowError;
use std::marker::PhantomData;

use crate::reader::ArrayDecoder;
use crate::reader::tape::{Tape, TapeElement};
+
+/// Decode a hex-encoded string into bytes
+fn decode_hex_string(hex_string: &str) -> Result<Vec<u8>, ArrowError> {
+ let mut decoded = Vec::with_capacity(hex_string.len() / 2);
+ for substr in hex_string.as_bytes().chunks(2) {
+ let str = std::str::from_utf8(substr).map_err(|e| {
+ ArrowError::JsonError(format!("invalid utf8 in hex encoded binary
data: {e}"))
+ })?;
+ let byte = u8::from_str_radix(str, 16).map_err(|e| {
+ ArrowError::JsonError(format!("invalid hex encoding in binary
data: {e}"))
+ })?;
+ decoded.push(byte);
+ }
+ Ok(decoded)
+}
+
/// [`ArrayDecoder`] for `Binary` (`O = i32`) and `LargeBinary` (`O = i64`)
/// arrays; decodes hex-encoded JSON strings into raw bytes.
#[derive(Default)]
pub struct BinaryArrayDecoder<O: OffsetSizeTrait> {
    // Carries the offset type `O` at the type level; no runtime data.
    phantom: PhantomData<O>,
}
+
+impl<O: OffsetSizeTrait> ArrayDecoder for BinaryArrayDecoder<O> {
+ fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData,
ArrowError> {
+ let data_capacity = estimate_data_capacity(tape, pos)?;
+
+ if O::from_usize(data_capacity).is_none() {
+ return Err(ArrowError::JsonError(format!(
+ "offset overflow decoding {}",
+ GenericStringArray::<O>::DATA_TYPE
+ )));
+ }
+
+ let mut builder = GenericBinaryBuilder::<O>::with_capacity(pos.len(),
data_capacity);
+
+ GenericStringBuilder::<O>::with_capacity(pos.len(), data_capacity);
+
+ for p in pos {
+ match tape.get(*p) {
+ TapeElement::String(idx) => {
+ let string = tape.get_string(idx);
+ let decoded = decode_hex_string(string)?;
+ builder.append_value(&decoded);
+ }
+ TapeElement::Null => builder.append_null(),
+ _ => unreachable!(),
+ }
+ }
+
+ Ok(builder.finish().into_data())
+ }
+}
+
/// [`ArrayDecoder`] for `FixedSizeBinary(len)` arrays; every decoded value
/// must be exactly `len` bytes long.
#[derive(Default)]
pub struct FixedSizeBinaryArrayDecoder {
    // Fixed byte width of each element, from `DataType::FixedSizeBinary(len)`.
    len: i32,
}

impl FixedSizeBinaryArrayDecoder {
    /// Create a decoder for `FixedSizeBinary` values of `len` bytes each.
    pub fn new(len: i32) -> Self {
        Self { len }
    }
}
+
+impl ArrayDecoder for FixedSizeBinaryArrayDecoder {
+ fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData,
ArrowError> {
+ let mut builder = FixedSizeBinaryBuilder::with_capacity(pos.len(),
self.len);
+
+ for p in pos {
+ match tape.get(*p) {
+ TapeElement::String(idx) => {
+ let string = tape.get_string(idx);
+ let decoded = decode_hex_string(string)?;
+ builder.append_value(&decoded)?;
+ }
+ TapeElement::Null => builder.append_null(),
+ _ => unreachable!(),
+ }
+ }
+
+ Ok(builder.finish().into_data())
+ }
+}
+
/// [`ArrayDecoder`] for `BinaryView` arrays; decodes hex-encoded JSON strings.
#[derive(Default)]
pub struct BinaryViewDecoder {}
+
+impl ArrayDecoder for BinaryViewDecoder {
+ fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData,
ArrowError> {
+ let data_capacity = estimate_data_capacity(tape, pos)?;
+ let mut builder = BinaryViewBuilder::with_capacity(data_capacity);
+
+ for p in pos {
+ match tape.get(*p) {
+ TapeElement::String(idx) => {
+ let string = tape.get_string(idx);
+ let decoded = decode_hex_string(string)?;
+ builder.append_value(&decoded);
+ }
+ TapeElement::Null => builder.append_null(),
+ _ => unreachable!(),
+ }
+ }
+
+ Ok(builder.finish().into_data())
+ }
+}
+
+fn estimate_data_capacity(tape: &Tape<'_>, pos: &[u32]) -> Result<usize,
ArrowError> {
+ let mut data_capacity = 0;
+ for p in pos {
+ match tape.get(*p) {
+ TapeElement::String(idx) => {
+ let string_len = tape.get_string(idx).len();
+ // two hex characters represent one byte
+ let decoded_len = string_len / 2;
+ data_capacity += decoded_len;
+ }
+ TapeElement::Null => {}
+ _ => {
+ return Err(tape.error(*p, "binary data encoded as string"));
+ }
+ }
+ }
+ Ok(data_capacity)
+}
diff --git a/arrow-json/src/reader/mod.rs b/arrow-json/src/reader/mod.rs
index c47aa65f81..f5fd1a8e7c 100644
--- a/arrow-json/src/reader/mod.rs
+++ b/arrow-json/src/reader/mod.rs
@@ -134,6 +134,9 @@
//!
use crate::StructMode;
+use crate::reader::binary_array::{
+ BinaryArrayDecoder, BinaryViewDecoder, FixedSizeBinaryArrayDecoder,
+};
use std::io::BufRead;
use std::sync::Arc;
@@ -159,6 +162,7 @@ use crate::reader::struct_array::StructArrayDecoder;
use crate::reader::tape::{Tape, TapeDecoder};
use crate::reader::timestamp_array::TimestampArrayDecoder;
+mod binary_array;
mod boolean_array;
mod decimal_array;
mod list_array;
@@ -743,9 +747,10 @@ fn make_decoder(
DataType::List(_) =>
Ok(Box::new(ListArrayDecoder::<i32>::new(data_type, coerce_primitive,
strict_mode, is_nullable, struct_mode)?)),
DataType::LargeList(_) =>
Ok(Box::new(ListArrayDecoder::<i64>::new(data_type, coerce_primitive,
strict_mode, is_nullable, struct_mode)?)),
DataType::Struct(_) => Ok(Box::new(StructArrayDecoder::new(data_type,
coerce_primitive, strict_mode, is_nullable, struct_mode)?)),
- DataType::Binary | DataType::LargeBinary |
DataType::FixedSizeBinary(_) => {
- Err(ArrowError::JsonError(format!("{data_type} is not supported by
JSON")))
- }
+ DataType::Binary => Ok(Box::new(BinaryArrayDecoder::<i32>::default())),
+ DataType::LargeBinary =>
Ok(Box::new(BinaryArrayDecoder::<i64>::default())),
+ DataType::FixedSizeBinary(len) =>
Ok(Box::new(FixedSizeBinaryArrayDecoder::new(len))),
+ DataType::BinaryView => Ok(Box::new(BinaryViewDecoder::default())),
DataType::Map(_, _) => Ok(Box::new(MapArrayDecoder::new(data_type,
coerce_primitive, strict_mode, is_nullable, struct_mode)?)),
d => Err(ArrowError::NotYetImplemented(format!("Support for {d} in
JSON reader")))
}
diff --git a/arrow-json/src/reader/schema.rs b/arrow-json/src/reader/schema.rs
index c29a7bbe1a..fb7d93a85e 100644
--- a/arrow-json/src/reader/schema.rs
+++ b/arrow-json/src/reader/schema.rs
@@ -250,6 +250,15 @@ pub fn infer_json_schema_from_seekable<R: BufRead + Seek>(
/// original file's cursor. This function is useful when the `reader`'s cursor
is not available
/// (does not implement [`Seek`]), such is the case for compressed streams
decoders.
///
+///
+/// Note that JSON is not able to represent all Arrow data types exactly. So
the inferred schema
+/// might be different from the schema of the original data that was encoded
as JSON. For example,
+/// JSON does not have different integer types, so all integers are inferred
as `Int64`. Another
+/// example is binary data, which is encoded as a [Base16] string in JSON and
therefore inferred
+/// as String type by this function.
+///
+/// [Base16]: https://en.wikipedia.org/wiki/Base16#Base16
+///
/// # Examples
/// ```
/// use std::fs::File;