alamb commented on code in PR #3979:
URL: https://github.com/apache/arrow-rs/pull/3979#discussion_r1156289716
##########
arrow-json/Cargo.toml:
##########
@@ -42,10 +42,12 @@ arrow-schema = { workspace = true }
half = { version = "2.1", default-features = false }
indexmap = { version = "1.9", default-features = false, features = ["std"] }
num = { version = "0.4", default-features = false, features = ["std"] }
+serde = { version = "1.0", default-features = false }
Review Comment:
I assume this is not a net-new dependency as `serde_json` depends on `serde`
##########
arrow-json/src/raw/tape.rs:
##########
@@ -452,6 +454,26 @@ impl TapeDecoder {
Ok(buf.len() - iter.len())
}
+ pub fn serialize<S: Serialize>(&mut self, rows: &[S]) -> Result<(),
ArrowError> {
Review Comment:
I think we should add a doc string here explaining what this does. Something
like
```suggestion
/// Writes any struct that implements [`Serialize`] into the TapeDecoder
/// so that it can be read as a RecordBatch
pub fn serialize<S: Serialize>(&mut self, rows: &[S]) -> Result<(),
ArrowError> {
```
##########
arrow-json/src/raw/serializer.rs:
##########
@@ -0,0 +1,417 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::raw::tape::TapeElement;
+use lexical_core::FormattedSize;
+use serde::ser::{
+ Impossible, SerializeMap, SerializeSeq, SerializeStruct, SerializeTuple,
+ SerializeTupleStruct,
+};
+use serde::{Serialize, Serializer};
+
+#[derive(Debug)]
+pub struct SerializerError(String);
+
+impl std::error::Error for SerializerError {}
+
+impl std::fmt::Display for SerializerError {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ write!(f, "{}", self.0)
+ }
+}
+
+impl serde::ser::Error for SerializerError {
+ fn custom<T>(msg: T) -> Self
+ where
+ T: std::fmt::Display,
+ {
+ Self(msg.to_string())
+ }
+}
+
+/// [`Serializer`] for [`TapeElement`]
+///
+/// Heavily based on <https://serde.rs/impl-serializer.html>
+pub struct TapeSerializer<'a> {
+ elements: &'a mut Vec<TapeElement>,
+
+ /// A buffer of parsed string data
+ bytes: &'a mut Vec<u8>,
+
+ /// Offsets into `data`
+ offsets: &'a mut Vec<usize>,
+}
+
+impl<'a> TapeSerializer<'a> {
+ pub fn new(
+ elements: &'a mut Vec<TapeElement>,
+ bytes: &'a mut Vec<u8>,
+ offsets: &'a mut Vec<usize>,
+ ) -> Self {
+ Self {
+ elements,
+ bytes,
+ offsets,
+ }
+ }
+}
+
+/// Need to use macro as const generic expressions are unstable
+/// <https://github.com/rust-lang/rust/issues/76560>
+macro_rules! serialize_lexical {
+ ($s:ident, $t:ty, $v:ident) => {{
+ let mut buffer = [0_u8; <$t>::FORMATTED_SIZE];
+ let s = lexical_core::write($v, &mut buffer);
+ $s.serialize_bytes(s)
+ }};
+}
+
+impl<'a, 'b> Serializer for &'a mut TapeSerializer<'b> {
+ type Ok = ();
+
+ type Error = SerializerError;
+
+ type SerializeSeq = ListSerializer<'a, 'b>;
+ type SerializeTuple = ListSerializer<'a, 'b>;
+ type SerializeTupleStruct = ListSerializer<'a, 'b>;
+ type SerializeTupleVariant = Impossible<(), SerializerError>;
+ type SerializeMap = ObjectSerializer<'a, 'b>;
+ type SerializeStruct = ObjectSerializer<'a, 'b>;
+ type SerializeStructVariant = Impossible<(), SerializerError>;
+
+ fn serialize_bool(self, v: bool) -> Result<(), SerializerError> {
+ self.elements.push(match v {
+ true => TapeElement::True,
+ false => TapeElement::False,
+ });
+ Ok(())
+ }
+
+ fn serialize_i8(self, v: i8) -> Result<(), SerializerError> {
+ serialize_lexical!(self, i8, v)
+ }
+
+ fn serialize_i16(self, v: i16) -> Result<(), SerializerError> {
+ serialize_lexical!(self, i16, v)
+ }
+
+ fn serialize_i32(self, v: i32) -> Result<(), SerializerError> {
+ serialize_lexical!(self, i32, v)
+ }
+
+ fn serialize_i64(self, v: i64) -> Result<(), SerializerError> {
+ serialize_lexical!(self, i64, v)
+ }
+
+ fn serialize_u8(self, v: u8) -> Result<(), SerializerError> {
+ serialize_lexical!(self, u8, v)
+ }
+
+ fn serialize_u16(self, v: u16) -> Result<(), SerializerError> {
+ serialize_lexical!(self, u16, v)
+ }
+
+ fn serialize_u32(self, v: u32) -> Result<(), SerializerError> {
+ serialize_lexical!(self, u32, v)
+ }
+
+ fn serialize_u64(self, v: u64) -> Result<(), SerializerError> {
+ serialize_lexical!(self, u64, v)
+ }
+
+ fn serialize_f32(self, v: f32) -> Result<(), SerializerError> {
+ serialize_lexical!(self, f32, v)
+ }
+
+ fn serialize_f64(self, v: f64) -> Result<(), SerializerError> {
+ serialize_lexical!(self, f64, v)
+ }
+
+ fn serialize_char(self, v: char) -> Result<(), SerializerError> {
+ self.serialize_str(&v.to_string())
+ }
+
+ fn serialize_str(self, v: &str) -> Result<(), SerializerError> {
+ self.serialize_bytes(v.as_bytes())
+ }
+
+ fn serialize_bytes(self, v: &[u8]) -> Result<(), SerializerError> {
+ self.bytes.extend_from_slice(v);
+ let idx = self.offsets.len() - 1;
+ self.elements.push(TapeElement::String(idx as _));
+ self.offsets.push(self.bytes.len());
+ Ok(())
+ }
+
+ fn serialize_none(self) -> Result<(), SerializerError> {
+ self.serialize_unit()
+ }
+
+ fn serialize_some<T>(self, value: &T) -> Result<(), SerializerError>
+ where
+ T: ?Sized + Serialize,
+ {
+ value.serialize(self)
+ }
+
+ fn serialize_unit(self) -> Result<(), SerializerError> {
+ self.elements.push(TapeElement::Null);
+ Ok(())
+ }
+
+ fn serialize_unit_struct(self, _name: &'static str) -> Result<(),
SerializerError> {
+ self.serialize_unit()
+ }
+
+ fn serialize_unit_variant(
+ self,
+ _name: &'static str,
+ _variant_index: u32,
+ variant: &'static str,
+ ) -> Result<(), SerializerError> {
+ self.serialize_str(variant)
+ }
+
+ fn serialize_newtype_struct<T>(
+ self,
+ _name: &'static str,
+ value: &T,
+ ) -> Result<(), SerializerError>
+ where
+ T: ?Sized + Serialize,
+ {
+ value.serialize(self)
+ }
+
+ fn serialize_newtype_variant<T>(
+ self,
+ _name: &'static str,
+ _variant_index: u32,
+ variant: &'static str,
+ value: &T,
+ ) -> Result<(), SerializerError>
+ where
+ T: ?Sized + Serialize,
+ {
+ let mut serializer = self.serialize_map(Some(1))?;
+ serializer.serialize_key(variant)?;
+ serializer.serialize_value(value)?;
+ serializer.finish();
+ Ok(())
+ }
+
+ fn serialize_seq(
+ self,
+ _len: Option<usize>,
+ ) -> Result<Self::SerializeSeq, SerializerError> {
+ Ok(ListSerializer::new(self))
+ }
+
+ fn serialize_tuple(
+ self,
+ len: usize,
+ ) -> Result<Self::SerializeTuple, SerializerError> {
+ self.serialize_seq(Some(len))
+ }
+
+ fn serialize_tuple_struct(
+ self,
+ _name: &'static str,
+ len: usize,
+ ) -> Result<Self::SerializeTupleStruct, SerializerError> {
+ self.serialize_seq(Some(len))
+ }
+
+ fn serialize_tuple_variant(
+ self,
+ _name: &'static str,
+ _variant_index: u32,
+ _variant: &'static str,
+ _len: usize,
+ ) -> Result<Self::SerializeTupleVariant, SerializerError> {
+ Err(SerializerError(
+ "serializing tuple variants is not currently
supported".to_string(),
Review Comment:
It would be a better ux I think if the name and variant were in the message
(same comment applies to `serialize_struct_variant`)
```suggestion
format!("serializing tuple variants is not currently supported:
{_name}::{_variant})")
```
##########
arrow-json/src/raw/mod.rs:
##########
@@ -233,6 +235,38 @@ impl RawDecoder {
self.tape_decoder.decode(buf)
}
+ /// Serialize `rows` to this [`RawDecoder`]
Review Comment:
This example is very cool. However, I think it is very hard to find (given
that it is on "RawDecoder" that is part of arrow_json).
I suggest we add a sentence / link to this example from the front page (and
maybe even bring a simpler example to the front page):
https://docs.rs/arrow/latest/arrow/index.html#io
##########
arrow-json/src/raw/serializer.rs:
##########
@@ -0,0 +1,395 @@
+use crate::raw::tape::TapeElement;
+use lexical_core::FormattedSize;
+use serde::ser::{
+ Impossible, SerializeMap, SerializeSeq, SerializeStruct, SerializeTuple,
+ SerializeTupleStruct,
+};
+use serde::{Serialize, Serializer};
+
+#[derive(Debug)]
+pub struct SerializerError(String);
+
+impl std::error::Error for SerializerError {}
+
+impl std::fmt::Display for SerializerError {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ write!(f, "{}", self.0)
+ }
+}
+
+impl serde::ser::Error for SerializerError {
+ fn custom<T>(msg: T) -> Self
+ where
+ T: std::fmt::Display,
+ {
+ Self(msg.to_string())
+ }
+}
+
+/// [`Serializer`] for [`TapeElement`]
+///
+/// Heavily based on <https://serde.rs/impl-serializer.html>
+pub struct TapeSerializer<'a> {
+ elements: &'a mut Vec<TapeElement>,
+
+ /// A buffer of parsed string data
+ bytes: &'a mut Vec<u8>,
+
+ /// Offsets into `data`
+ offsets: &'a mut Vec<usize>,
+}
+
+impl<'a> TapeSerializer<'a> {
+ pub fn new(
+ elements: &'a mut Vec<TapeElement>,
+ bytes: &'a mut Vec<u8>,
+ offsets: &'a mut Vec<usize>,
+ ) -> Self {
+ Self {
+ elements,
+ bytes,
+ offsets,
+ }
+ }
+}
+
+/// Need to use macro as const generic expressions are unstable
+/// <https://github.com/rust-lang/rust/issues/76560>
+macro_rules! serialize_lexical {
+ ($s:ident, $t:ty, $v:ident) => {{
+ let mut buffer = [0_u8; <$t>::FORMATTED_SIZE];
+ let s = lexical_core::write($v, &mut buffer);
+ $s.serialize_bytes(s)
+ }};
+}
+
+impl<'a, 'b> Serializer for &'a mut TapeSerializer<'b> {
+ type Ok = ();
+
+ type Error = SerializerError;
+
+ type SerializeSeq = ListSerializer<'a, 'b>;
+ type SerializeTuple = ListSerializer<'a, 'b>;
+ type SerializeTupleStruct = ListSerializer<'a, 'b>;
+ type SerializeTupleVariant = Impossible<(), SerializerError>;
+ type SerializeMap = ObjectSerializer<'a, 'b>;
+ type SerializeStruct = ObjectSerializer<'a, 'b>;
+ type SerializeStructVariant = Impossible<(), SerializerError>;
+
+ fn serialize_bool(self, v: bool) -> Result<(), SerializerError> {
+ self.elements.push(match v {
+ true => TapeElement::True,
+ false => TapeElement::False,
+ });
+ Ok(())
+ }
+
+ fn serialize_i8(self, v: i8) -> Result<(), SerializerError> {
+ serialize_lexical!(self, i8, v)
+ }
+
+ fn serialize_i16(self, v: i16) -> Result<(), SerializerError> {
+ serialize_lexical!(self, i16, v)
+ }
+
+ fn serialize_i32(self, v: i32) -> Result<(), SerializerError> {
+ serialize_lexical!(self, i32, v)
+ }
+
+ fn serialize_i64(self, v: i64) -> Result<(), SerializerError> {
+ serialize_lexical!(self, i64, v)
+ }
+
+ fn serialize_u8(self, v: u8) -> Result<(), SerializerError> {
+ serialize_lexical!(self, u8, v)
+ }
+
+ fn serialize_u16(self, v: u16) -> Result<(), SerializerError> {
+ serialize_lexical!(self, u16, v)
+ }
+
+ fn serialize_u32(self, v: u32) -> Result<(), SerializerError> {
+ serialize_lexical!(self, u32, v)
+ }
+
+ fn serialize_u64(self, v: u64) -> Result<(), SerializerError> {
+ serialize_lexical!(self, u64, v)
+ }
+
+ fn serialize_f32(self, v: f32) -> Result<(), SerializerError> {
+ serialize_lexical!(self, f32, v)
+ }
+
+ fn serialize_f64(self, v: f64) -> Result<(), SerializerError> {
+ serialize_lexical!(self, f64, v)
Review Comment:
```suggestion
// Formatting to a string only to parse it back again is rather
wasteful,
// could likely tweak the tape representation to allow storing raw
bytes
serialize_lexical!(self, f64, v)
```
##########
arrow-json/src/raw/mod.rs:
##########
@@ -233,6 +235,143 @@ impl RawDecoder {
self.tape_decoder.decode(buf)
}
+ /// Serialize `rows` to this [`RawDecoder`]
+ ///
+ /// Whilst this provides a simple way to convert [`serde`]-compatible
datastructures into arrow
+ /// [`RecordBatch`], performance-critical applications, especially where
the schema is known at
+ /// compile-time, may benefit from instead implementing custom conversion
logic as described
+ /// in [arrow_array::builder]
+ ///
+ /// This can be used with [`serde_json::Value`]
+ ///
+ /// ```
+ /// # use std::sync::Arc;
+ /// # use serde_json::{Value, json};
+ /// # use arrow_array::cast::AsArray;
+ /// # use arrow_array::types::Float32Type;
+ /// # use arrow_json::RawReaderBuilder;
+ /// # use arrow_schema::{DataType, Field, Schema};
+ /// let json = vec![json!({"float": 2.3}), json!({"float": 5.7})];
+ ///
+ /// let schema = Schema::new(vec![Field::new("float", DataType::Float32,
true)]);
+ /// let mut decoder =
RawReaderBuilder::new(Arc::new(schema)).build_decoder().unwrap();
+ ///
+ /// decoder.serialize(&json).unwrap();
+ /// let batch = decoder.flush().unwrap().unwrap();
+ /// assert_eq!(batch.num_rows(), 2);
+ /// assert_eq!(batch.num_columns(), 1);
+ /// let values = batch.column(0).as_primitive::<Float32Type>().values();
+ /// assert_eq!(values, &[2.3, 5.7])
+ /// ```
+ ///
+ /// It can also be used with arbitrary [`Serialize`] types
Review Comment:
I recommend we also make a simpler example with no nested structures (maybe
just an int32 and a string) in addition to the full featured example
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]