[
https://issues.apache.org/jira/browse/PARQUET-2305?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17730337#comment-17730337
]
ASF GitHub Bot commented on PARQUET-2305:
-----------------------------------------
wgtmac commented on code in PR #1102:
URL: https://github.com/apache/parquet-mr/pull/1102#discussion_r1222352757
##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java:
##########
@@ -31,10 +32,12 @@
import org.apache.parquet.io.api.Converter;
import org.apache.parquet.io.api.GroupConverter;
import org.apache.parquet.io.api.PrimitiveConverter;
+
import org.apache.parquet.schema.GroupType;
Review Comment:
```suggestion
import org.apache.parquet.io.api.PrimitiveConverter;
import org.apache.parquet.schema.GroupType;
```
##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoConstants.java:
##########
@@ -26,6 +26,7 @@ public final class ProtoConstants {
public static final String METADATA_ENUM_PREFIX = "parquet.proto.enum.";
public static final String METADATA_ENUM_KEY_VALUE_SEPARATOR = ":";
public static final String METADATA_ENUM_ITEM_SEPARATOR = ",";
+ public static final String CONFIG_IGNORE_UNKNOWN_FIELDS =
"parquet.proto.ignore.unknwon.fields";
Review Comment:
```suggestion
public static final String CONFIG_IGNORE_UNKNOWN_FIELDS =
"parquet.proto.ignore.unknown.fields";
```
##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java:
##########
@@ -80,35 +91,77 @@ class ProtoMessageConverter extends GroupConverter {
// For usage in message arrays
ProtoMessageConverter(Configuration conf, ParentValueContainer pvc,
Message.Builder builder, GroupType parquetSchema, Map<String, String>
extraMetadata) {
+ if (pvc == null) {
+ throw new IllegalStateException("Missing parent value container");
+ }
int schemaSize = parquetSchema.getFieldCount();
converters = new Converter[schemaSize];
this.conf = conf;
this.parent = pvc;
this.extraMetadata = extraMetadata;
- int parquetFieldIndex = 1;
-
- if (pvc == null) {
- throw new IllegalStateException("Missing parent value container");
- }
+ boolean ignoreUnknownFields = conf.getBoolean(IGNORE_UNKNOWN_FIELDS,
false);
myBuilder = builder;
- Descriptors.Descriptor protoDescriptor = builder.getDescriptorForType();
+ if (builder == null && ignoreUnknownFields) {
+ IntStream.range(0, parquetSchema.getFieldCount())
+ .forEach(i-> converters[i] = dummyScalarConverter(DUMMY_PVC,
parquetSchema.getType(i), conf, extraMetadata));
+
+ } else {
- for (Type parquetField : parquetSchema.getFields()) {
- Descriptors.FieldDescriptor protoField =
protoDescriptor.findFieldByName(parquetField.getName());
+ int parquetFieldIndex = 0;
+ Descriptors.Descriptor protoDescriptor = builder.getDescriptorForType();
- if (protoField == null) {
- String description = "Scheme mismatch \n\"" + parquetField + "\"" +
- "\n proto descriptor:\n" + protoDescriptor.toProto();
- throw new IncompatibleSchemaModificationException("Cant find \"" +
parquetField.getName() + "\" " + description);
+ for (Type parquetField : parquetSchema.getFields()) {
+
+ Descriptors.FieldDescriptor protoField =
protoDescriptor.findFieldByName(parquetField.getName());
Review Comment:
```suggestion
for (Type parquetField : parquetSchema.getFields()) {
Descriptors.FieldDescriptor protoField =
protoDescriptor.findFieldByName(parquetField.getName());
```
##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoConstants.java:
##########
@@ -26,6 +26,7 @@ public final class ProtoConstants {
public static final String METADATA_ENUM_PREFIX = "parquet.proto.enum.";
public static final String METADATA_ENUM_KEY_VALUE_SEPARATOR = ":";
public static final String METADATA_ENUM_ITEM_SEPARATOR = ",";
+ public static final String CONFIG_IGNORE_UNKNOWN_FIELDS =
"parquet.proto.ignore.unknwon.fields";
Review Comment:
Could you please add a comment for `CONFIG_IGNORE_UNKNOWN_FIELDS`? Like what
we did for `CONFIG_ACCEPT_UNKNOWN_ENUM`.
##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java:
##########
@@ -80,35 +91,77 @@ class ProtoMessageConverter extends GroupConverter {
// For usage in message arrays
ProtoMessageConverter(Configuration conf, ParentValueContainer pvc,
Message.Builder builder, GroupType parquetSchema, Map<String, String>
extraMetadata) {
+ if (pvc == null) {
+ throw new IllegalStateException("Missing parent value container");
+ }
int schemaSize = parquetSchema.getFieldCount();
converters = new Converter[schemaSize];
this.conf = conf;
this.parent = pvc;
this.extraMetadata = extraMetadata;
- int parquetFieldIndex = 1;
-
- if (pvc == null) {
- throw new IllegalStateException("Missing parent value container");
- }
+ boolean ignoreUnknownFields = conf.getBoolean(IGNORE_UNKNOWN_FIELDS,
false);
myBuilder = builder;
- Descriptors.Descriptor protoDescriptor = builder.getDescriptorForType();
+ if (builder == null && ignoreUnknownFields) {
+ IntStream.range(0, parquetSchema.getFieldCount())
+ .forEach(i-> converters[i] = dummyScalarConverter(DUMMY_PVC,
parquetSchema.getType(i), conf, extraMetadata));
+
+ } else {
- for (Type parquetField : parquetSchema.getFields()) {
- Descriptors.FieldDescriptor protoField =
protoDescriptor.findFieldByName(parquetField.getName());
+ int parquetFieldIndex = 0;
+ Descriptors.Descriptor protoDescriptor = builder.getDescriptorForType();
- if (protoField == null) {
- String description = "Scheme mismatch \n\"" + parquetField + "\"" +
- "\n proto descriptor:\n" + protoDescriptor.toProto();
- throw new IncompatibleSchemaModificationException("Cant find \"" +
parquetField.getName() + "\" " + description);
+ for (Type parquetField : parquetSchema.getFields()) {
+
+ Descriptors.FieldDescriptor protoField =
protoDescriptor.findFieldByName(parquetField.getName());
+
+ validateProtoField(ignoreUnknownFields, protoDescriptor.toProto(),
parquetField, protoField);
+
+ converters[parquetFieldIndex] = protoField != null ?
+ newMessageConverter(myBuilder, protoField, parquetField) :
+ dummyScalarConverter(DUMMY_PVC, parquetField, conf, extraMetadata);
+
+ parquetFieldIndex++;
}
+ }
+ }
+
+ private void validateProtoField(boolean ignoreUnknownFields,
+ DescriptorProtos.DescriptorProto
protoDescriptor,
+ Type parquetField,
+ Descriptors.FieldDescriptor protoField) {
+ if (protoField == null && !ignoreUnknownFields) {
+ String description = "Schema mismatch \n\"" + parquetField + "\"" +
+ "\n proto descriptor:\n" + protoDescriptor;
+ throw new IncompatibleSchemaModificationException("Cant find \"" +
parquetField.getName() + "\" " + description);
+ }
+ }
- converters[parquetFieldIndex - 1] = newMessageConverter(myBuilder,
protoField, parquetField);
- parquetFieldIndex++;
+ private Converter dummyScalarConverter(ParentValueContainer pvc,
+ Type parquetField, Configuration conf,
+ Map<String, String> extraMetadata) {
+
+ if(parquetField.isPrimitive()) {
Review Comment:
```suggestion
if (parquetField.isPrimitive()) {
```
> Allow Parquet to Proto conversion even though Target Schema has less fields
> ---------------------------------------------------------------------------
>
> Key: PARQUET-2305
> URL: https://issues.apache.org/jira/browse/PARQUET-2305
> Project: Parquet
> Issue Type: Improvement
> Components: parquet-protobuf
> Reporter: Sanjay Sharma
> Priority: Major
>
> If Parquet has any field which has been removed from the schema and Parquet
> to Proto conversion happens, it errors out due to Unknown fields. There could
> be some scenarios that we want to still convert PARQUET into the target proto
> schema object which has lesser fields.
> If specified "ignoreUnknownFields" as an argument, this should allow the
> conversion which ignore fields it can't convert and not error out.
> Similar functionality exist in
> [https://github.com/protocolbuffers/protobuf/blob/main/java/util/src/main/java/com/google/protobuf/util/JsonFormat.java]
> with field "ignoringUnknownFields"
--
This message was sent by Atlassian Jira
(v8.20.10#820010)