alamb commented on code in PR #5488:
URL: https://github.com/apache/arrow-rs/pull/5488#discussion_r1523663384
##########
arrow-flight/src/encode.rs:
##########
@@ -537,19 +599,54 @@ fn prepare_batch_for_flight(
/// Hydrates a dictionary to its underlying type if send_dictionaries is
false. If send_dictionaries
/// is true, dictionaries are sent with every batch which is not as optimal as
described in [DictionaryHandling::Hydrate] above,
/// but does enable sending DictionaryArray's via Flight.
-fn hydrate_dictionary(array: &ArrayRef, send_dictionaries: bool) ->
Result<ArrayRef> {
- let arr = match array.data_type() {
- DataType::Dictionary(_, value) if !send_dictionaries =>
arrow_cast::cast(array, value)?,
+fn hydrate_dictionary(
+ array: &ArrayRef,
+ data_type: &DataType,
+ send_dictionaries: bool,
+) -> Result<ArrayRef> {
+ let arr = match (array.data_type(), data_type) {
+ (DataType::Dictionary(_, value), _) if !send_dictionaries => {
Review Comment:
Is this code needed? It is strange to special case dictionary here, but just
cast to `data_type` below. Maybe it is covered already by the `(tpe,
data_type)` case
##########
arrow-flight/src/encode.rs:
##########
@@ -622,6 +729,311 @@ mod tests {
}
}
+ #[tokio::test]
+ async fn test_dictionary_list_hydration() {
+ let mut builder =
builder::ListBuilder::new(StringDictionaryBuilder::<UInt16Type>::new());
+
+ builder.append_value(vec![Some("a"), None, Some("b")]);
+
+ let arr1 = builder.finish();
+
+ builder.append_value(vec![Some("c"), None, Some("d")]);
+
+ let arr2 = builder.finish();
+
+ let schema = Arc::new(Schema::new(vec![Field::new_list(
+ "dict_list",
+ Field::new_dictionary("item", DataType::UInt16, DataType::Utf8,
true),
+ true,
+ )]));
+
+ let batch1 = RecordBatch::try_new(schema.clone(),
vec![Arc::new(arr1)]).unwrap();
+ let batch2 = RecordBatch::try_new(schema.clone(),
vec![Arc::new(arr2)]).unwrap();
+
+ let stream = futures::stream::iter(vec![Ok(batch1), Ok(batch2)]);
+
+ let encoder = FlightDataEncoderBuilder::default().build(stream);
+
+ let mut decoder = FlightDataDecoder::new(encoder);
+ let expected_schema = Schema::new(vec![Field::new_list(
+ "dict_list",
+ Field::new("item", DataType::Utf8, true),
+ true,
+ )]);
+
+ let expected_schema = Arc::new(expected_schema);
+
+ let mut expected_arrays = vec![
+ StringArray::from_iter(vec![Some("a"), None, Some("b")]),
+ StringArray::from_iter(vec![Some("c"), None, Some("d")]),
+ ]
+ .into_iter();
+
+ while let Some(decoded) = decoder.next().await {
+ let decoded = decoded.unwrap();
+ match decoded.payload {
+ DecodedPayload::None => {}
+ DecodedPayload::Schema(s) => assert_eq!(s, expected_schema),
+ DecodedPayload::RecordBatch(b) => {
+ assert_eq!(b.schema(), expected_schema);
+ let expected_array = expected_arrays.next().unwrap();
+ let list_array =
+
downcast_array::<ListArray>(b.column_by_name("dict_list").unwrap());
+ let elem_array =
downcast_array::<StringArray>(list_array.value(0).as_ref());
+
+ assert_eq!(elem_array, expected_array);
+ }
+ }
+ }
+ }
+
+ #[tokio::test]
+ async fn test_dictionary_struct_hydration() {
+ let struct_fields = vec![Field::new_list(
Review Comment:
The amount of code required to build these nested arrays is quite sad
(nothing wrong with this PR, I am just commenting in general)
##########
arrow-flight/src/encode.rs:
##########
@@ -388,29 +388,39 @@ impl Stream for FlightDataEncoder {
/// Defines how a [`FlightDataEncoder`] encodes [`DictionaryArray`]s
///
/// [`DictionaryArray`]: arrow_array::DictionaryArray
+///
Review Comment:
Should we update this comment maybe to say that the FlightDataEncoder
doesn't really handle sending the same dictionary multiple times?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]