This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new b07fd5dc4e4 feat(arrow-json): encode `Binary` and `LargeBinary` types as hex when writing JSON (#5785)
b07fd5dc4e4 is described below
commit b07fd5dc4e4ecddb9bb3b6ed0304217f0251a6ac
Author: Trevor Hilton <[email protected]>
AuthorDate: Tue May 21 03:14:19 2024 -0400
feat(arrow-json): encode `Binary` and `LargeBinary` types as hex when writing JSON (#5785)
* feat: encode Binary and LargeBinary types in JSON as hex
Added ability to the JSON writer to encode Binary and LargeBinary types
as hex. This follows the behaviour for FixedSizeBinary.
A test was added to check functionality for both Binary and LargeBinary.
* refactor: use ArrayAccessor instead of custom trait
* refactor: use generic in test instead of macro
* refactor: use const DATA_TYPE from GenericBinaryType
---
arrow-json/src/writer.rs | 83 ++++++++++++++++++++++++++++++++++++++++
arrow-json/src/writer/encoder.rs | 28 +++++++++++---
2 files changed, 106 insertions(+), 5 deletions(-)
diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs
index c9619f80b4f..ef4141d7ab2 100644
--- a/arrow-json/src/writer.rs
+++ b/arrow-json/src/writer.rs
@@ -1565,6 +1565,89 @@ mod tests {
Ok(())
}
+ fn binary_encoding_test<O: OffsetSizeTrait>() {
+ // set up schema
+ let schema = SchemaRef::new(Schema::new(vec![Field::new(
+ "bytes",
+ GenericBinaryType::<O>::DATA_TYPE,
+ true,
+ )]));
+
+ // build record batch:
+ let mut builder = GenericByteBuilder::<GenericBinaryType<O>>::new();
+ let values = [Some(b"Ned Flanders"), None, Some(b"Troy McClure")];
+ for value in values {
+ match value {
+ Some(v) => builder.append_value(v),
+ None => builder.append_null(),
+ }
+ }
+ let array = Arc::new(builder.finish()) as ArrayRef;
+ let batch = RecordBatch::try_new(schema, vec![array]).unwrap();
+
+ // encode and check JSON with explicit nulls:
+ {
+ let mut buf = Vec::new();
+ let json_value: Value = {
+ let mut writer = WriterBuilder::new()
+ .with_explicit_nulls(true)
+ .build::<_, JsonArray>(&mut buf);
+ writer.write(&batch).unwrap();
+ writer.close().unwrap();
+ serde_json::from_slice(&buf).unwrap()
+ };
+
+ assert_eq!(
+ json!([
+ {
+ "bytes": "4e656420466c616e64657273"
+ },
+ {
+ "bytes": null // the explicit null
+ },
+ {
+ "bytes": "54726f79204d63436c757265"
+ }
+ ]),
+ json_value,
+ );
+ }
+
+ // encode and check JSON with no explicit nulls:
+ {
+ let mut buf = Vec::new();
+ let json_value: Value = {
+ // explicit nulls are off by default, so we don't need
+ // to set that when creating the writer:
+ let mut writer = ArrayWriter::new(&mut buf);
+ writer.write(&batch).unwrap();
+ writer.close().unwrap();
+ serde_json::from_slice(&buf).unwrap()
+ };
+
+ assert_eq!(
+ json!([
+ {
+ "bytes": "4e656420466c616e64657273"
+ },
+ {}, // empty because nulls are omitted
+ {
+ "bytes": "54726f79204d63436c757265"
+ }
+ ]),
+ json_value
+ );
+ }
+ }
+
+ #[test]
+ fn test_writer_binary() {
+ // Binary:
+ binary_encoding_test::<i32>();
+ // LargeBinary:
+ binary_encoding_test::<i64>();
+ }
+
#[test]
fn test_writer_fixed_size_binary() {
// set up schema:
diff --git a/arrow-json/src/writer/encoder.rs b/arrow-json/src/writer/encoder.rs
index 056ddf3dd96..9b6c9418fa0 100644
--- a/arrow-json/src/writer/encoder.rs
+++ b/arrow-json/src/writer/encoder.rs
@@ -105,7 +105,17 @@ fn make_encoder_impl<'a>(
DataType::FixedSizeBinary(_) => {
let array = array.as_fixed_size_binary();
- (Box::new(FixedSizeBinaryEncoder::new(array)) as _, array.nulls().cloned())
+ (Box::new(BinaryEncoder::new(array)) as _, array.nulls().cloned())
+ }
+
+ DataType::Binary => {
+ let array: &BinaryArray = array.as_binary();
+ (Box::new(BinaryEncoder::new(array)) as _, array.nulls().cloned())
+ }
+
+ DataType::LargeBinary => {
+ let array: &LargeBinaryArray = array.as_binary();
+ (Box::new(BinaryEncoder::new(array)) as _, array.nulls().cloned())
}
DataType::Struct(fields) => {
@@ -509,15 +519,23 @@ impl<'a> Encoder for MapEncoder<'a> {
}
}
-struct FixedSizeBinaryEncoder<'a>(&'a FixedSizeBinaryArray);
+/// New-type wrapper for encoding the binary types in arrow: `Binary`, `LargeBinary`
+/// and `FixedSizeBinary` as hex strings in JSON.
+struct BinaryEncoder<B>(B);
-impl<'a> FixedSizeBinaryEncoder<'a> {
- fn new(array: &'a FixedSizeBinaryArray) -> Self {
+impl<'a, B> BinaryEncoder<B>
+where
+ B: ArrayAccessor<Item = &'a [u8]>,
+{
+ fn new(array: B) -> Self {
Self(array)
}
}
-impl<'a> Encoder for FixedSizeBinaryEncoder<'a> {
+impl<'a, B> Encoder for BinaryEncoder<B>
+where
+ B: ArrayAccessor<Item = &'a [u8]>,
+{
fn encode(&mut self, idx: usize, out: &mut Vec<u8>) {
out.push(b'"');
for byte in self.0.value(idx) {