rangadi commented on code in PR #43767:
URL: https://github.com/apache/spark/pull/43767#discussion_r1391516380
##########
connector/protobuf/src/test/resources/protobuf/functions_suite.proto:
##########
@@ -324,3 +325,19 @@ message Proto3AllTypes {
}
map<string, string> map = 13;
}
+
+message WellKnownWrapperTypes {
+ google.protobuf.BoolValue bool_val = 1;
+ google.protobuf.Int32Value int32_val = 2;
Review Comment:
Could you add a test where `int32_val` is not set? What should the Spark
struct contain:
* When `emit.default.values` is false (default)
* When `emit.default.values` is true
Please comment on the expected behavior.
Please also add a similar test with `int32_val.value` set to `0` (the default value).
##########
connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala:
##########
@@ -247,12 +247,86 @@ private[sql] class ProtobufDeserializer(
updater.setLong(ordinal, micros +
TimeUnit.NANOSECONDS.toMicros(nanoSeconds))
case (MESSAGE, StringType)
- if protoType.getMessageType.getFullName == "google.protobuf.Any" =>
+ if protoType.getMessageType.getFullName == "google.protobuf.Any" =>
(updater, ordinal, value) =>
// Convert 'Any' protobuf message to JSON string.
val jsonStr = jsonPrinter.print(value.asInstanceOf[DynamicMessage])
updater.set(ordinal, UTF8String.fromString(jsonStr))
+ // Handle well known wrapper types. We unpack the value field instead of
keeping
+ // them as nested structs
+ case (MESSAGE, BooleanType)
+ if protoType.getMessageType.getFullName ==
BoolValue.getDescriptor.getFullName =>
+ (updater, ordinal, value) =>
+ val dm = value.asInstanceOf[DynamicMessage]
+ updater.setBoolean(
+ ordinal,
+ dm.getField(
+ dm.getDescriptorForType.findFieldByName("value")
+ ).asInstanceOf[Boolean]
+ )
+ case (MESSAGE, IntegerType)
+ if (protoType.getMessageType.getFullName ==
Int32Value.getDescriptor.getFullName
+ || protoType.getMessageType.getFullName ==
UInt32Value.getDescriptor.getFullName) =>
+ (updater, ordinal, value) =>
+ val dm = value.asInstanceOf[DynamicMessage]
+ updater.setInt(
+ ordinal,
+ dm.getField(
+ dm.getDescriptorForType.findFieldByName("value")
+ ).asInstanceOf[Int]
+ )
+ case (MESSAGE, LongType)
+ if (protoType.getMessageType.getFullName ==
Int64Value.getDescriptor.getFullName
+ || protoType.getMessageType.getFullName ==
UInt64Value.getDescriptor.getFullName) =>
+ (updater, ordinal, value) =>
+ val dm = value.asInstanceOf[DynamicMessage]
+ updater.setLong(
+ ordinal,
+ dm.getField(
+ dm.getDescriptorForType.findFieldByName("value")
Review Comment:
Better to use `getFields().get(0)` rather than looking up the field by name (`"value"`) each time.
##########
connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufOptions.scala:
##########
@@ -168,6 +168,28 @@ private[sql] class ProtobufOptions(
// instead of string, so use caution if changing existing parsing logic.
val enumsAsInts: Boolean =
parameters.getOrElse("enums.as.ints", false.toString).toBoolean
+
+ // Whether to unwrap the struct representation for well known primitve
wrapper types when
+ // deserializing. By default, the wrapper types for primitives (i.e.
google.protobuf.Int32Value,
+ // google.protobuf.Int64Value, etc.) will get deserialized as structs. We
allow the option to
+ // deserialize them as their respective primitives.
+ // https://protobuf.dev/reference/protobuf/google.protobuf/
+ //
+ // For example, given a message like:
+ // ```
+ // syntax = "proto3";
+ // message = Example {
+ // google.protobuf.Int32Value val = 1;
+ // }
+ // ```
+ //
+ // The message Example(Int32Value(1)) would be deserialized by default as
+ // {val: {value: 1}}
+ //
+ // However, with this option set, it would be deserialized as
+ // {val: 1}
+ val unwrapWellKnownTypes: Boolean =
+ parameters.getOrElse("unwrap.protobuf.wkt", false.toString).toBoolean
Review Comment:
Let's find a better name for this.
`unwrap.primitive.value.types` or something along those lines? Better to avoid
abbreviations like `wkt`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]