alamb commented on code in PR #7566: URL: https://github.com/apache/arrow-rs/pull/7566#discussion_r2115928606
########## arrow-json/src/writer/encoder.rs: ########## @@ -805,3 +812,131 @@ where out.push(b'"'); } } + +/// Configuration for how to encode union values. +/// +/// Controls whether to encode union values with or without type information. +/// The format choice impacts the JSON output structure and readability. +#[derive(Debug, Clone, Copy)] +pub(crate) enum UnionFormat { + #[allow(dead_code)] + /// Encode as a simple value without type information (e.g., `1` or `"a"`). + Simple, + + /// Encode as an object with type_id and value fields (e.g., `{"type_id": 0, "value": 1}`). + /// This is the default format and preserves complete type information. + WithTypeInfo, +} + +/// Encoder for Union arrays that handles encoding union values according to the specified format. +/// See [`UnionFormat`] for details on the available formats. +pub(crate) struct UnionEncoder<'a> { + type_ids: &'a ScalarBuffer<i8>, + offsets: Option<&'a ScalarBuffer<i32>>, + field_encoders: Vec<NullableEncoder<'a>>, + format: UnionFormat, + type_id_to_index: std::collections::HashMap<i8, usize>, +} + +impl<'a> UnionEncoder<'a> { + /// Create a new UnionEncoder for the given UnionArray. + /// + /// By default, this uses [`UnionFormat::WithTypeInfo`] to preserve complete type information. + /// For simpler output without type information, use [`UnionFormat::Simple`]. + pub fn try_new( + field: &'a FieldRef, + array: &'a UnionArray, + options: &'a EncoderOptions, + ) -> Result<Self, ArrowError> { + Self::try_new_with_format(field, array, options, UnionFormat::WithTypeInfo) + } + + /// Create a new UnionEncoder with a specific format. + /// + /// This allows explicitly choosing how to encode union values: + /// + /// - Use [`UnionFormat::Simple`] for cleaner JSON output without type information, + /// producing values like: `1` or `"a"` + /// - Use [`UnionFormat::WithTypeInfo`] to preserve type information, + /// producing values like: `{"type_id": 0, "value": 1}` + /// + /// See [`UnionFormat`] for more details about each format. + pub fn try_new_with_format( + _field: &'a FieldRef, + array: &'a UnionArray, + options: &'a EncoderOptions, + format: UnionFormat, + ) -> Result<Self, ArrowError> { + let type_ids = array.type_ids(); + + let union_fields = match array.data_type() { + DataType::Union(fields, _) => fields, + _ => unreachable!("Union array's data type is not a union!"), Review Comment: Perhaps this could return a real error rather than panic'ing (given that this function already returns a `Result`) ########## arrow-json/src/writer/encoder.rs: ########## @@ -805,3 +812,131 @@ where out.push(b'"'); } } + +/// Configuration for how to encode union values. +/// +/// Controls whether to encode union values with or without type information. +/// The format choice impacts the JSON output structure and readability. +#[derive(Debug, Clone, Copy)] +pub(crate) enum UnionFormat { + #[allow(dead_code)] Review Comment: Why the `dead_code`? ########## arrow-json/src/writer/mod.rs: ########## @@ -2058,83 +2059,37 @@ mod tests { assert_json_eq(&buf, expected); } - fn make_fallback_encoder_test_data() -> (RecordBatch, Arc<dyn EncoderFactory>) { Review Comment: are there any other usecases for a fallback encoder? I feel like that usecase is still quite valid What about `REEArray` or something 🤔 ########## arrow-json/src/writer/encoder.rs: ########## @@ -805,3 +812,131 @@ where out.push(b'"'); } } + +/// Configuration for how to encode union values. +/// +/// Controls whether to encode union values with or without type information. +/// The format choice impacts the JSON output structure and readability. +#[derive(Debug, Clone, Copy)] +pub(crate) enum UnionFormat { + #[allow(dead_code)] + /// Encode as a simple value without type information (e.g., `1` or `"a"`). + Simple, + + /// Encode as an object with type_id and value fields (e.g., `{"type_id": 0, "value": 1}`). + /// This is the default format and preserves complete type information. Review Comment: 👍 ########## arrow-json/src/writer/encoder.rs: ########## @@ -805,3 +812,131 @@ where out.push(b'"'); } } + +/// Configuration for how to encode union values. +/// +/// Controls whether to encode union values with or without type information. +/// The format choice impacts the JSON output structure and readability. +#[derive(Debug, Clone, Copy)] +pub(crate) enum UnionFormat { + #[allow(dead_code)] + /// Encode as a simple value without type information (e.g., `1` or `"a"`). + Simple, + + /// Encode as an object with type_id and value fields (e.g., `{"type_id": 0, "value": 1}`). + /// This is the default format and preserves complete type information. + WithTypeInfo, +} + +/// Encoder for Union arrays that handles encoding union values according to the specified format. +/// See [`UnionFormat`] for details on the available formats. +pub(crate) struct UnionEncoder<'a> { + type_ids: &'a ScalarBuffer<i8>, + offsets: Option<&'a ScalarBuffer<i32>>, + field_encoders: Vec<NullableEncoder<'a>>, + format: UnionFormat, + type_id_to_index: std::collections::HashMap<i8, usize>, +} + +impl<'a> UnionEncoder<'a> { + /// Create a new UnionEncoder for the given UnionArray. + /// + /// By default, this uses [`UnionFormat::WithTypeInfo`] to preserve complete type information. + /// For simpler output without type information, use [`UnionFormat::Simple`]. + pub fn try_new( + field: &'a FieldRef, + array: &'a UnionArray, + options: &'a EncoderOptions, + ) -> Result<Self, ArrowError> { + Self::try_new_with_format(field, array, options, UnionFormat::WithTypeInfo) + } + + /// Create a new UnionEncoder with a specific format. + /// + /// This allows explicitly choosing how to encode union values: + /// + /// - Use [`UnionFormat::Simple`] for cleaner JSON output without type information, + /// producing values like: `1` or `"a"` + /// - Use [`UnionFormat::WithTypeInfo`] to preserve type information, + /// producing values like: `{"type_id": 0, "value": 1}` + /// + /// See [`UnionFormat`] for more details about each format. + pub fn try_new_with_format( + _field: &'a FieldRef, + array: &'a UnionArray, + options: &'a EncoderOptions, + format: UnionFormat, + ) -> Result<Self, ArrowError> { + let type_ids = array.type_ids(); + + let union_fields = match array.data_type() { + DataType::Union(fields, _) => fields, + _ => unreachable!("Union array's data type is not a union!"), + }; + + let mut field_encoders = Vec::new(); + let mut type_id_to_index = std::collections::HashMap::new(); + + for (idx, (type_id, field_ref)) in union_fields.iter().enumerate() { + let child = array.child(type_id); + let encoder = make_encoder(field_ref, child, options)?; + field_encoders.push(encoder); + type_id_to_index.insert(type_id, idx); + } + + Ok(Self { + type_ids, + offsets: array.offsets(), + field_encoders, + format, + type_id_to_index, + }) + } +} + +impl Encoder for UnionEncoder<'_> { + fn encode(&mut self, idx: usize, out: &mut Vec<u8>) { + let type_id = self.type_ids[idx]; + let value_idx = match self.offsets { + Some(offsets) => offsets[idx] as usize, + None => idx, + }; + + match self.format { + UnionFormat::Simple => { + if let Some(&encoder_idx) = self.type_id_to_index.get(&type_id) { Review Comment: when would the type_id ever not be found? It seems like this means invalid input data -- and should be a panic in my opinion Likewise below -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org