alamb commented on code in PR #7566:
URL: https://github.com/apache/arrow-rs/pull/7566#discussion_r2115928606


##########
arrow-json/src/writer/encoder.rs:
##########
@@ -805,3 +812,131 @@ where
         out.push(b'"');
     }
 }
+
+/// Configuration for how to encode union values.
+///
+/// Controls whether to encode union values with or without type information.
+/// The format choice impacts the JSON output structure and readability.
+#[derive(Debug, Clone, Copy)]
+pub(crate) enum UnionFormat {
+    #[allow(dead_code)]
+    /// Encode as a simple value without type information (e.g., `1` or `"a"`).
+    Simple,
+
+    /// Encode as an object with type_id and value fields (e.g., `{"type_id": 
0, "value": 1}`).
+    /// This is the default format and preserves complete type information.
+    WithTypeInfo,
+}
+
+/// Encoder for Union arrays that handles encoding union values according to 
the specified format.
+/// See [`UnionFormat`] for details on the available formats.
+pub(crate) struct UnionEncoder<'a> {
+    type_ids: &'a ScalarBuffer<i8>,
+    offsets: Option<&'a ScalarBuffer<i32>>,
+    field_encoders: Vec<NullableEncoder<'a>>,
+    format: UnionFormat,
+    type_id_to_index: std::collections::HashMap<i8, usize>,
+}
+
+impl<'a> UnionEncoder<'a> {
+    /// Create a new UnionEncoder for the given UnionArray.
+    ///
+    /// By default, this uses [`UnionFormat::WithTypeInfo`] to preserve 
complete type information.
+    /// For simpler output without type information, use 
[`UnionFormat::Simple`].
+    pub fn try_new(
+        field: &'a FieldRef,
+        array: &'a UnionArray,
+        options: &'a EncoderOptions,
+    ) -> Result<Self, ArrowError> {
+        Self::try_new_with_format(field, array, options, 
UnionFormat::WithTypeInfo)
+    }
+
+    /// Create a new UnionEncoder with a specific format.
+    ///
+    /// This allows explicitly choosing how to encode union values:
+    ///
+    /// - Use [`UnionFormat::Simple`] for cleaner JSON output without type 
information,
+    ///   producing values like: `1` or `"a"`
+    /// - Use [`UnionFormat::WithTypeInfo`] to preserve type information,
+    ///   producing values like: `{"type_id": 0, "value": 1}`
+    ///
+    /// See [`UnionFormat`] for more details about each format.
+    pub fn try_new_with_format(
+        _field: &'a FieldRef,
+        array: &'a UnionArray,
+        options: &'a EncoderOptions,
+        format: UnionFormat,
+    ) -> Result<Self, ArrowError> {
+        let type_ids = array.type_ids();
+
+        let union_fields = match array.data_type() {
+            DataType::Union(fields, _) => fields,
+            _ => unreachable!("Union array's data type is not a union!"),

Review Comment:
   Perhaps this could return a real error rather than panic'ing (given that 
this function already returns a `Result`)



##########
arrow-json/src/writer/encoder.rs:
##########
@@ -805,3 +812,131 @@ where
         out.push(b'"');
     }
 }
+
+/// Configuration for how to encode union values.
+///
+/// Controls whether to encode union values with or without type information.
+/// The format choice impacts the JSON output structure and readability.
+#[derive(Debug, Clone, Copy)]
+pub(crate) enum UnionFormat {
+    #[allow(dead_code)]

Review Comment:
   Why the `dead_code`?



##########
arrow-json/src/writer/mod.rs:
##########
@@ -2058,83 +2059,37 @@ mod tests {
         assert_json_eq(&buf, expected);
     }
 
-    fn make_fallback_encoder_test_data() -> (RecordBatch, Arc<dyn 
EncoderFactory>) {

Review Comment:
   are there any other usecases for a fallback encoder? I feel like that 
usecase is still quite valid
   
   What about `REEArray` or something 🤔 



##########
arrow-json/src/writer/encoder.rs:
##########
@@ -805,3 +812,131 @@ where
         out.push(b'"');
     }
 }
+
+/// Configuration for how to encode union values.
+///
+/// Controls whether to encode union values with or without type information.
+/// The format choice impacts the JSON output structure and readability.
+#[derive(Debug, Clone, Copy)]
+pub(crate) enum UnionFormat {
+    #[allow(dead_code)]
+    /// Encode as a simple value without type information (e.g., `1` or `"a"`).
+    Simple,
+
+    /// Encode as an object with type_id and value fields (e.g., `{"type_id": 
0, "value": 1}`).
+    /// This is the default format and preserves complete type information.

Review Comment:
   👍 



##########
arrow-json/src/writer/encoder.rs:
##########
@@ -805,3 +812,131 @@ where
         out.push(b'"');
     }
 }
+
+/// Configuration for how to encode union values.
+///
+/// Controls whether to encode union values with or without type information.
+/// The format choice impacts the JSON output structure and readability.
+#[derive(Debug, Clone, Copy)]
+pub(crate) enum UnionFormat {
+    #[allow(dead_code)]
+    /// Encode as a simple value without type information (e.g., `1` or `"a"`).
+    Simple,
+
+    /// Encode as an object with type_id and value fields (e.g., `{"type_id": 
0, "value": 1}`).
+    /// This is the default format and preserves complete type information.
+    WithTypeInfo,
+}
+
+/// Encoder for Union arrays that handles encoding union values according to 
the specified format.
+/// See [`UnionFormat`] for details on the available formats.
+pub(crate) struct UnionEncoder<'a> {
+    type_ids: &'a ScalarBuffer<i8>,
+    offsets: Option<&'a ScalarBuffer<i32>>,
+    field_encoders: Vec<NullableEncoder<'a>>,
+    format: UnionFormat,
+    type_id_to_index: std::collections::HashMap<i8, usize>,
+}
+
+impl<'a> UnionEncoder<'a> {
+    /// Create a new UnionEncoder for the given UnionArray.
+    ///
+    /// By default, this uses [`UnionFormat::WithTypeInfo`] to preserve 
complete type information.
+    /// For simpler output without type information, use 
[`UnionFormat::Simple`].
+    pub fn try_new(
+        field: &'a FieldRef,
+        array: &'a UnionArray,
+        options: &'a EncoderOptions,
+    ) -> Result<Self, ArrowError> {
+        Self::try_new_with_format(field, array, options, 
UnionFormat::WithTypeInfo)
+    }
+
+    /// Create a new UnionEncoder with a specific format.
+    ///
+    /// This allows explicitly choosing how to encode union values:
+    ///
+    /// - Use [`UnionFormat::Simple`] for cleaner JSON output without type 
information,
+    ///   producing values like: `1` or `"a"`
+    /// - Use [`UnionFormat::WithTypeInfo`] to preserve type information,
+    ///   producing values like: `{"type_id": 0, "value": 1}`
+    ///
+    /// See [`UnionFormat`] for more details about each format.
+    pub fn try_new_with_format(
+        _field: &'a FieldRef,
+        array: &'a UnionArray,
+        options: &'a EncoderOptions,
+        format: UnionFormat,
+    ) -> Result<Self, ArrowError> {
+        let type_ids = array.type_ids();
+
+        let union_fields = match array.data_type() {
+            DataType::Union(fields, _) => fields,
+            _ => unreachable!("Union array's data type is not a union!"),
+        };
+
+        let mut field_encoders = Vec::new();
+        let mut type_id_to_index = std::collections::HashMap::new();
+
+        for (idx, (type_id, field_ref)) in union_fields.iter().enumerate() {
+            let child = array.child(type_id);
+            let encoder = make_encoder(field_ref, child, options)?;
+            field_encoders.push(encoder);
+            type_id_to_index.insert(type_id, idx);
+        }
+
+        Ok(Self {
+            type_ids,
+            offsets: array.offsets(),
+            field_encoders,
+            format,
+            type_id_to_index,
+        })
+    }
+}
+
+impl Encoder for UnionEncoder<'_> {
+    fn encode(&mut self, idx: usize, out: &mut Vec<u8>) {
+        let type_id = self.type_ids[idx];
+        let value_idx = match self.offsets {
+            Some(offsets) => offsets[idx] as usize,
+            None => idx,
+        };
+
+        match self.format {
+            UnionFormat::Simple => {
+                if let Some(&encoder_idx) = 
self.type_id_to_index.get(&type_id) {

Review Comment:
   when would the type_id ever not be found? It seems like this means invalid 
input data -- and should be a panic in my opinion
   
   Likewise below



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to