alamb commented on code in PR #3547:
URL: https://github.com/apache/arrow-datafusion/pull/3547#discussion_r979406848
##########
datafusion/proto/proto/datafusion.proto:
##########
@@ -714,6 +714,9 @@ message Union{
}
message ScalarListValue{
+ // encode null explicitly to distinguish a list with a null value
+ // from a list with no values)
+ bool is_null = 3;
Review Comment:
I believe much of the complexity of the code to serialize List values was
related to trying to figure out NULL values. Encoding it explicitly makes the
code much simpler
##########
datafusion/proto/src/from_proto.rs:
##########
@@ -838,6 +754,23 @@ impl TryFrom<&protobuf::ScalarValue> for ScalarValue {
}
}
+/// Ensures that all `values` are of type DataType::List and have the
+/// same type as field
+fn validate_list_values(field: &Field, values: &[ScalarValue]) -> Result<(),
Error> {
Review Comment:
I am not sure how valuable it is to do this validation if serialized
`ScalarValue` always came from DataFusion, but I wanted to keep the same
semantics
##########
datafusion/proto/src/to_proto.rs:
##########
@@ -980,115 +959,30 @@ impl TryFrom<&ScalarValue> for protobuf::ScalarValue {
Value::LargeUtf8Value(s.to_owned())
})
}
- scalar::ScalarValue::List(value, boxed_field) => match value {
- Some(values) => {
- if values.is_empty() {
- protobuf::ScalarValue {
- value:
Some(protobuf::scalar_value::Value::ListValue(
- protobuf::ScalarListValue {
- field:
Some(boxed_field.as_ref().try_into()?),
- values: Vec::new(),
- },
- )),
- }
- } else {
- let scalar_type = match boxed_field.data_type() {
- DataType::List(field) =>
field.as_ref().data_type(),
- unsupported => {
- return Err(Error::General(format!("Proto
serialization error: {:?} not supported to convert to DataType::List",
unsupported)));
- }
- };
+ scalar::ScalarValue::List(values, boxed_field) => {
+ let is_null = values.is_none();
- let type_checked_values: Vec<protobuf::ScalarValue> =
values
- .iter()
- .map(|scalar| match (scalar, scalar_type) {
- (
- scalar::ScalarValue::List(_, list_type),
- DataType::List(field),
- ) => {
- if let DataType::List(list_field) =
- list_type.data_type()
- {
- let scalar_datatype =
field.data_type();
- let list_datatype =
list_field.data_type();
- if
std::mem::discriminant(list_datatype)
- !=
std::mem::discriminant(scalar_datatype)
- {
- return
Err(Error::inconsistent_list_typing(
- list_datatype,
- scalar_datatype,
- ));
- }
- scalar.try_into()
- } else {
-
Err(Error::inconsistent_list_designated(
- scalar,
- boxed_field.data_type(),
- ))
- }
- }
- (scalar::ScalarValue::Boolean(_),
DataType::Boolean) => {
Review Comment:
it is not clear why there was other type specific code to serializing lists
- after this PR the existing code for serializing other types are used
##########
datafusion/proto/src/lib.rs:
##########
@@ -574,125 +579,12 @@ mod roundtrip_tests {
DataType::Utf8,
DataType::LargeUtf8,
// Recursive list tests
- DataType::List(new_box_field("Level1", DataType::Boolean, true)),
- DataType::List(new_box_field(
- "Level1",
- DataType::List(new_box_field("Level2", DataType::Date32,
true)),
- true,
- )),
- ];
-
- let should_fail: Vec<DataType> = vec![
Review Comment:
This test is for serializing into a list specific protobuf that was removed
in this PR
##########
datafusion/proto/src/lib.rs:
##########
@@ -480,14 +492,11 @@ mod roundtrip_tests {
ScalarValue::Float32(Some(2.0)),
ScalarValue::Float32(Some(1.0)),
]),
- DataType::List(new_box_field("level1", DataType::Float32,
true)),
+ DataType::Float32,
Review Comment:
These are not valid `ScalarValue::List` structures -- specifically, the type
of the field on a `List` is the type of the element (aka it is not
DataType::List(type of element), which is what this test was previously doing).
You can see correct usages in the rest of the codebase
https://github.com/apache/arrow-datafusion/search?q=new_list
for example
https://github.com/apache/arrow-datafusion/blob/e0a9fa347f7adb1e6d6157b537e44c574e88244a/datafusion/physical-expr/src/aggregate/array_agg.rs#L174-L183
##########
datafusion/proto/proto/datafusion.proto:
##########
@@ -825,17 +828,6 @@ enum PrimitiveScalarType{
TIME64 = 27;
}
-message ScalarType{
Review Comment:
I am not sure why but the previous code was encoding `ScalarValue::List`
using a different pattern than the struct. This PR changes the serialization
code to use the same structure as `ScalarValue::List` -- the Field and the a
list of scalar values
##########
datafusion/proto/src/lib.rs:
##########
@@ -367,15 +374,20 @@ mod roundtrip_tests {
];
for test_case in should_fail_on_seralize.into_iter() {
- let res: std::result::Result<
- super::protobuf::ScalarValue,
- super::to_proto::Error,
- > = (&test_case).try_into();
- assert!(
- res.is_err(),
- "The value {:?} should not have been able to serialize.
Serialized to :{:?}",
- test_case, res
- );
+ let proto: Result<super::protobuf::ScalarValue,
super::to_proto::Error> =
Review Comment:
Some of the validation is now done during deserialization rather than
serialization so I had to change this test slightly
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]