alamb commented on code in PR #3979:
URL: https://github.com/apache/arrow-rs/pull/3979#discussion_r1156289716


##########
arrow-json/Cargo.toml:
##########
@@ -42,10 +42,12 @@ arrow-schema = { workspace = true  }
 half = { version = "2.1", default-features = false }
 indexmap = { version = "1.9", default-features = false, features = ["std"] }
 num = { version = "0.4", default-features = false, features = ["std"] }
+serde = { version = "1.0", default-features = false }

Review Comment:
   I assume this is not a net-new dependency as `serde_json` depends on `serde`



##########
arrow-json/src/raw/tape.rs:
##########
@@ -452,6 +454,26 @@ impl TapeDecoder {
         Ok(buf.len() - iter.len())
     }
 
+    pub fn serialize<S: Serialize>(&mut self, rows: &[S]) -> Result<(), 
ArrowError> {

Review Comment:
   I think we should add a doc string here explaining what this does. Something 
like
   
   ```suggestion
       /// Writes any struct that implements [`Serialize`] into the TapeDecoder
       /// so that it can be read as a RecordBatch
       pub fn serialize<S: Serialize>(&mut self, rows: &[S]) -> Result<(), 
ArrowError> {
   ```



##########
arrow-json/src/raw/serializer.rs:
##########
@@ -0,0 +1,417 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::raw::tape::TapeElement;
+use lexical_core::FormattedSize;
+use serde::ser::{
+    Impossible, SerializeMap, SerializeSeq, SerializeStruct, SerializeTuple,
+    SerializeTupleStruct,
+};
+use serde::{Serialize, Serializer};
+
+#[derive(Debug)]
+pub struct SerializerError(String);
+
+impl std::error::Error for SerializerError {}
+
+impl std::fmt::Display for SerializerError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.0)
+    }
+}
+
+impl serde::ser::Error for SerializerError {
+    fn custom<T>(msg: T) -> Self
+    where
+        T: std::fmt::Display,
+    {
+        Self(msg.to_string())
+    }
+}
+
+/// [`Serializer`] for [`TapeElement`]
+///
+/// Heavily based on <https://serde.rs/impl-serializer.html>
+pub struct TapeSerializer<'a> {
+    elements: &'a mut Vec<TapeElement>,
+
+    /// A buffer of parsed string data
+    bytes: &'a mut Vec<u8>,
+
+    /// Offsets into `data`
+    offsets: &'a mut Vec<usize>,
+}
+
+impl<'a> TapeSerializer<'a> {
+    pub fn new(
+        elements: &'a mut Vec<TapeElement>,
+        bytes: &'a mut Vec<u8>,
+        offsets: &'a mut Vec<usize>,
+    ) -> Self {
+        Self {
+            elements,
+            bytes,
+            offsets,
+        }
+    }
+}
+
+/// Need to use macro as const generic expressions are unstable
+/// <https://github.com/rust-lang/rust/issues/76560>
+macro_rules! serialize_lexical {
+    ($s:ident, $t:ty, $v:ident) => {{
+        let mut buffer = [0_u8; <$t>::FORMATTED_SIZE];
+        let s = lexical_core::write($v, &mut buffer);
+        $s.serialize_bytes(s)
+    }};
+}
+
+impl<'a, 'b> Serializer for &'a mut TapeSerializer<'b> {
+    type Ok = ();
+
+    type Error = SerializerError;
+
+    type SerializeSeq = ListSerializer<'a, 'b>;
+    type SerializeTuple = ListSerializer<'a, 'b>;
+    type SerializeTupleStruct = ListSerializer<'a, 'b>;
+    type SerializeTupleVariant = Impossible<(), SerializerError>;
+    type SerializeMap = ObjectSerializer<'a, 'b>;
+    type SerializeStruct = ObjectSerializer<'a, 'b>;
+    type SerializeStructVariant = Impossible<(), SerializerError>;
+
+    fn serialize_bool(self, v: bool) -> Result<(), SerializerError> {
+        self.elements.push(match v {
+            true => TapeElement::True,
+            false => TapeElement::False,
+        });
+        Ok(())
+    }
+
+    fn serialize_i8(self, v: i8) -> Result<(), SerializerError> {
+        serialize_lexical!(self, i8, v)
+    }
+
+    fn serialize_i16(self, v: i16) -> Result<(), SerializerError> {
+        serialize_lexical!(self, i16, v)
+    }
+
+    fn serialize_i32(self, v: i32) -> Result<(), SerializerError> {
+        serialize_lexical!(self, i32, v)
+    }
+
+    fn serialize_i64(self, v: i64) -> Result<(), SerializerError> {
+        serialize_lexical!(self, i64, v)
+    }
+
+    fn serialize_u8(self, v: u8) -> Result<(), SerializerError> {
+        serialize_lexical!(self, u8, v)
+    }
+
+    fn serialize_u16(self, v: u16) -> Result<(), SerializerError> {
+        serialize_lexical!(self, u16, v)
+    }
+
+    fn serialize_u32(self, v: u32) -> Result<(), SerializerError> {
+        serialize_lexical!(self, u32, v)
+    }
+
+    fn serialize_u64(self, v: u64) -> Result<(), SerializerError> {
+        serialize_lexical!(self, u64, v)
+    }
+
+    fn serialize_f32(self, v: f32) -> Result<(), SerializerError> {
+        serialize_lexical!(self, f32, v)
+    }
+
+    fn serialize_f64(self, v: f64) -> Result<(), SerializerError> {
+        serialize_lexical!(self, f64, v)
+    }
+
+    fn serialize_char(self, v: char) -> Result<(), SerializerError> {
+        self.serialize_str(&v.to_string())
+    }
+
+    fn serialize_str(self, v: &str) -> Result<(), SerializerError> {
+        self.serialize_bytes(v.as_bytes())
+    }
+
+    fn serialize_bytes(self, v: &[u8]) -> Result<(), SerializerError> {
+        self.bytes.extend_from_slice(v);
+        let idx = self.offsets.len() - 1;
+        self.elements.push(TapeElement::String(idx as _));
+        self.offsets.push(self.bytes.len());
+        Ok(())
+    }
+
+    fn serialize_none(self) -> Result<(), SerializerError> {
+        self.serialize_unit()
+    }
+
+    fn serialize_some<T>(self, value: &T) -> Result<(), SerializerError>
+    where
+        T: ?Sized + Serialize,
+    {
+        value.serialize(self)
+    }
+
+    fn serialize_unit(self) -> Result<(), SerializerError> {
+        self.elements.push(TapeElement::Null);
+        Ok(())
+    }
+
+    fn serialize_unit_struct(self, _name: &'static str) -> Result<(), 
SerializerError> {
+        self.serialize_unit()
+    }
+
+    fn serialize_unit_variant(
+        self,
+        _name: &'static str,
+        _variant_index: u32,
+        variant: &'static str,
+    ) -> Result<(), SerializerError> {
+        self.serialize_str(variant)
+    }
+
+    fn serialize_newtype_struct<T>(
+        self,
+        _name: &'static str,
+        value: &T,
+    ) -> Result<(), SerializerError>
+    where
+        T: ?Sized + Serialize,
+    {
+        value.serialize(self)
+    }
+
+    fn serialize_newtype_variant<T>(
+        self,
+        _name: &'static str,
+        _variant_index: u32,
+        variant: &'static str,
+        value: &T,
+    ) -> Result<(), SerializerError>
+    where
+        T: ?Sized + Serialize,
+    {
+        let mut serializer = self.serialize_map(Some(1))?;
+        serializer.serialize_key(variant)?;
+        serializer.serialize_value(value)?;
+        serializer.finish();
+        Ok(())
+    }
+
+    fn serialize_seq(
+        self,
+        _len: Option<usize>,
+    ) -> Result<Self::SerializeSeq, SerializerError> {
+        Ok(ListSerializer::new(self))
+    }
+
+    fn serialize_tuple(
+        self,
+        len: usize,
+    ) -> Result<Self::SerializeTuple, SerializerError> {
+        self.serialize_seq(Some(len))
+    }
+
+    fn serialize_tuple_struct(
+        self,
+        _name: &'static str,
+        len: usize,
+    ) -> Result<Self::SerializeTupleStruct, SerializerError> {
+        self.serialize_seq(Some(len))
+    }
+
+    fn serialize_tuple_variant(
+        self,
+        _name: &'static str,
+        _variant_index: u32,
+        _variant: &'static str,
+        _len: usize,
+    ) -> Result<Self::SerializeTupleVariant, SerializerError> {
+        Err(SerializerError(
+            "serializing tuple variants is not currently 
supported".to_string(),

Review Comment:
   It would be a better ux I think if the name and variant were in the message 
(same comment applies to `serialize_struct_variant`)
   
   ```suggestion
               format!("serializing tuple variants is not currently supported: 
{_name}::{_variant})")
   ```



##########
arrow-json/src/raw/mod.rs:
##########
@@ -233,6 +235,38 @@ impl RawDecoder {
         self.tape_decoder.decode(buf)
     }
 
+    /// Serialize `rows` to this [`RawDecoder`]

Review Comment:
   This example is very cool. However, I think it is very hard to find (given 
that it is on "RawDecoder" that is part of arrow_json). 
   
   I suggest we add a sentence / link to this example from the front page (and 
maybe even bring a simpler example to the front page):  
https://docs.rs/arrow/latest/arrow/index.html#io



##########
arrow-json/src/raw/serializer.rs:
##########
@@ -0,0 +1,395 @@
+use crate::raw::tape::TapeElement;
+use lexical_core::FormattedSize;
+use serde::ser::{
+    Impossible, SerializeMap, SerializeSeq, SerializeStruct, SerializeTuple,
+    SerializeTupleStruct,
+};
+use serde::{Serialize, Serializer};
+
+#[derive(Debug)]
+pub struct SerializerError(String);
+
+impl std::error::Error for SerializerError {}
+
+impl std::fmt::Display for SerializerError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.0)
+    }
+}
+
+impl serde::ser::Error for SerializerError {
+    fn custom<T>(msg: T) -> Self
+    where
+        T: std::fmt::Display,
+    {
+        Self(msg.to_string())
+    }
+}
+
+/// [`Serializer`] for [`TapeElement`]
+///
+/// Heavily based on <https://serde.rs/impl-serializer.html>
+pub struct TapeSerializer<'a> {
+    elements: &'a mut Vec<TapeElement>,
+
+    /// A buffer of parsed string data
+    bytes: &'a mut Vec<u8>,
+
+    /// Offsets into `data`
+    offsets: &'a mut Vec<usize>,
+}
+
+impl<'a> TapeSerializer<'a> {
+    pub fn new(
+        elements: &'a mut Vec<TapeElement>,
+        bytes: &'a mut Vec<u8>,
+        offsets: &'a mut Vec<usize>,
+    ) -> Self {
+        Self {
+            elements,
+            bytes,
+            offsets,
+        }
+    }
+}
+
+/// Need to use macro as const generic expressions are unstable
+/// <https://github.com/rust-lang/rust/issues/76560>
+macro_rules! serialize_lexical {
+    ($s:ident, $t:ty, $v:ident) => {{
+        let mut buffer = [0_u8; <$t>::FORMATTED_SIZE];
+        let s = lexical_core::write($v, &mut buffer);
+        $s.serialize_bytes(s)
+    }};
+}
+
+impl<'a, 'b> Serializer for &'a mut TapeSerializer<'b> {
+    type Ok = ();
+
+    type Error = SerializerError;
+
+    type SerializeSeq = ListSerializer<'a, 'b>;
+    type SerializeTuple = ListSerializer<'a, 'b>;
+    type SerializeTupleStruct = ListSerializer<'a, 'b>;
+    type SerializeTupleVariant = Impossible<(), SerializerError>;
+    type SerializeMap = ObjectSerializer<'a, 'b>;
+    type SerializeStruct = ObjectSerializer<'a, 'b>;
+    type SerializeStructVariant = Impossible<(), SerializerError>;
+
+    fn serialize_bool(self, v: bool) -> Result<(), SerializerError> {
+        self.elements.push(match v {
+            true => TapeElement::True,
+            false => TapeElement::False,
+        });
+        Ok(())
+    }
+
+    fn serialize_i8(self, v: i8) -> Result<(), SerializerError> {
+        serialize_lexical!(self, i8, v)
+    }
+
+    fn serialize_i16(self, v: i16) -> Result<(), SerializerError> {
+        serialize_lexical!(self, i16, v)
+    }
+
+    fn serialize_i32(self, v: i32) -> Result<(), SerializerError> {
+        serialize_lexical!(self, i32, v)
+    }
+
+    fn serialize_i64(self, v: i64) -> Result<(), SerializerError> {
+        serialize_lexical!(self, i64, v)
+    }
+
+    fn serialize_u8(self, v: u8) -> Result<(), SerializerError> {
+        serialize_lexical!(self, u8, v)
+    }
+
+    fn serialize_u16(self, v: u16) -> Result<(), SerializerError> {
+        serialize_lexical!(self, u16, v)
+    }
+
+    fn serialize_u32(self, v: u32) -> Result<(), SerializerError> {
+        serialize_lexical!(self, u32, v)
+    }
+
+    fn serialize_u64(self, v: u64) -> Result<(), SerializerError> {
+        serialize_lexical!(self, u64, v)
+    }
+
+    fn serialize_f32(self, v: f32) -> Result<(), SerializerError> {
+        serialize_lexical!(self, f32, v)
+    }
+
+    fn serialize_f64(self, v: f64) -> Result<(), SerializerError> {
+        serialize_lexical!(self, f64, v)

Review Comment:
   ```suggestion
           // Formatting to a string only to parse it back again is rather 
wasteful, 
           // could likely tweak the tape representation to allow storing raw 
bytes
           serialize_lexical!(self, f64, v)
   ```



##########
arrow-json/src/raw/mod.rs:
##########
@@ -233,6 +235,143 @@ impl RawDecoder {
         self.tape_decoder.decode(buf)
     }
 
+    /// Serialize `rows` to this [`RawDecoder`]
+    ///
+    /// Whilst this provides a simple way to convert [`serde`]-compatible 
datastructures into arrow
+    /// [`RecordBatch`], performance-critical applications, especially where 
the schema is known at
+    /// compile-time, may benefit from instead implementing custom conversion 
logic as described
+    /// in [arrow_array::builder]
+    ///
+    /// This can be used with [`serde_json::Value`]
+    ///
+    /// ```
+    /// # use std::sync::Arc;
+    /// # use serde_json::{Value, json};
+    /// # use arrow_array::cast::AsArray;
+    /// # use arrow_array::types::Float32Type;
+    /// # use arrow_json::RawReaderBuilder;
+    /// # use arrow_schema::{DataType, Field, Schema};
+    /// let json = vec![json!({"float": 2.3}), json!({"float": 5.7})];
+    ///
+    /// let schema = Schema::new(vec![Field::new("float", DataType::Float32, 
true)]);
+    /// let mut decoder = 
RawReaderBuilder::new(Arc::new(schema)).build_decoder().unwrap();
+    ///
+    /// decoder.serialize(&json).unwrap();
+    /// let batch = decoder.flush().unwrap().unwrap();
+    /// assert_eq!(batch.num_rows(), 2);
+    /// assert_eq!(batch.num_columns(), 1);
+    /// let values = batch.column(0).as_primitive::<Float32Type>().values();
+    /// assert_eq!(values, &[2.3, 5.7])
+    /// ```
+    ///
+    /// It can also be used with arbitrary [`Serialize`] types

Review Comment:
   I recommend we also make a simpler example with no nested structures (maybe 
just an int32 and a string) in addition to the full featured example



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to