[ 
https://issues.apache.org/jira/browse/PARQUET-2305?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17730337#comment-17730337
 ] 

ASF GitHub Bot commented on PARQUET-2305:
-----------------------------------------

wgtmac commented on code in PR #1102:
URL: https://github.com/apache/parquet-mr/pull/1102#discussion_r1222352757


##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java:
##########
@@ -31,10 +32,12 @@
 import org.apache.parquet.io.api.Converter;
 import org.apache.parquet.io.api.GroupConverter;
 import org.apache.parquet.io.api.PrimitiveConverter;
+
 import org.apache.parquet.schema.GroupType;

Review Comment:
   ```suggestion
   import org.apache.parquet.io.api.PrimitiveConverter;
   import org.apache.parquet.schema.GroupType;
   ```



##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoConstants.java:
##########
@@ -26,6 +26,7 @@ public final class ProtoConstants {
   public static final String METADATA_ENUM_PREFIX = "parquet.proto.enum.";
   public static final String METADATA_ENUM_KEY_VALUE_SEPARATOR = ":";
   public static final String METADATA_ENUM_ITEM_SEPARATOR = ",";
+  public static final String CONFIG_IGNORE_UNKNOWN_FIELDS = 
"parquet.proto.ignore.unknwon.fields";

Review Comment:
   ```suggestion
     public static final String CONFIG_IGNORE_UNKNOWN_FIELDS = 
"parquet.proto.ignore.unknown.fields";
   ```



##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java:
##########
@@ -80,35 +91,77 @@ class ProtoMessageConverter extends GroupConverter {
 
   // For usage in message arrays
   ProtoMessageConverter(Configuration conf, ParentValueContainer pvc, 
Message.Builder builder, GroupType parquetSchema, Map<String, String> 
extraMetadata) {
+    if (pvc == null) {
+      throw new IllegalStateException("Missing parent value container");
+    }
 
     int schemaSize = parquetSchema.getFieldCount();
     converters = new Converter[schemaSize];
     this.conf = conf;
     this.parent = pvc;
     this.extraMetadata = extraMetadata;
-    int parquetFieldIndex = 1;
-
-    if (pvc == null) {
-      throw new IllegalStateException("Missing parent value container");
-    }
+    boolean ignoreUnknownFields = conf.getBoolean(IGNORE_UNKNOWN_FIELDS, 
false);
 
     myBuilder = builder;
 
-    Descriptors.Descriptor protoDescriptor = builder.getDescriptorForType();
+    if (builder == null && ignoreUnknownFields) {
+      IntStream.range(0, parquetSchema.getFieldCount())
+        .forEach(i-> converters[i] = dummyScalarConverter(DUMMY_PVC, 
parquetSchema.getType(i), conf, extraMetadata));
+
+    } else {
 
-    for (Type parquetField : parquetSchema.getFields()) {
-      Descriptors.FieldDescriptor protoField = 
protoDescriptor.findFieldByName(parquetField.getName());
+      int parquetFieldIndex = 0;
+      Descriptors.Descriptor protoDescriptor =  builder.getDescriptorForType();
 
-      if (protoField == null) {
-        String description = "Scheme mismatch \n\"" + parquetField + "\"" +
-                "\n proto descriptor:\n" + protoDescriptor.toProto();
-        throw new IncompatibleSchemaModificationException("Cant find \"" + 
parquetField.getName() + "\" " + description);
+      for (Type parquetField : parquetSchema.getFields()) {
+
+        Descriptors.FieldDescriptor protoField = 
protoDescriptor.findFieldByName(parquetField.getName());

Review Comment:
   ```suggestion
         for (Type parquetField : parquetSchema.getFields()) {
           Descriptors.FieldDescriptor protoField = 
protoDescriptor.findFieldByName(parquetField.getName());
   ```



##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoConstants.java:
##########
@@ -26,6 +26,7 @@ public final class ProtoConstants {
   public static final String METADATA_ENUM_PREFIX = "parquet.proto.enum.";
   public static final String METADATA_ENUM_KEY_VALUE_SEPARATOR = ":";
   public static final String METADATA_ENUM_ITEM_SEPARATOR = ",";
+  public static final String CONFIG_IGNORE_UNKNOWN_FIELDS = 
"parquet.proto.ignore.unknwon.fields";

Review Comment:
   Could you please add a comment for `CONFIG_IGNORE_UNKNOWN_FIELDS`? Like what 
we did for `CONFIG_ACCEPT_UNKNOWN_ENUM`.



##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java:
##########
@@ -80,35 +91,77 @@ class ProtoMessageConverter extends GroupConverter {
 
   // For usage in message arrays
   ProtoMessageConverter(Configuration conf, ParentValueContainer pvc, 
Message.Builder builder, GroupType parquetSchema, Map<String, String> 
extraMetadata) {
+    if (pvc == null) {
+      throw new IllegalStateException("Missing parent value container");
+    }
 
     int schemaSize = parquetSchema.getFieldCount();
     converters = new Converter[schemaSize];
     this.conf = conf;
     this.parent = pvc;
     this.extraMetadata = extraMetadata;
-    int parquetFieldIndex = 1;
-
-    if (pvc == null) {
-      throw new IllegalStateException("Missing parent value container");
-    }
+    boolean ignoreUnknownFields = conf.getBoolean(IGNORE_UNKNOWN_FIELDS, 
false);
 
     myBuilder = builder;
 
-    Descriptors.Descriptor protoDescriptor = builder.getDescriptorForType();
+    if (builder == null && ignoreUnknownFields) {
+      IntStream.range(0, parquetSchema.getFieldCount())
+        .forEach(i-> converters[i] = dummyScalarConverter(DUMMY_PVC, 
parquetSchema.getType(i), conf, extraMetadata));
+
+    } else {
 
-    for (Type parquetField : parquetSchema.getFields()) {
-      Descriptors.FieldDescriptor protoField = 
protoDescriptor.findFieldByName(parquetField.getName());
+      int parquetFieldIndex = 0;
+      Descriptors.Descriptor protoDescriptor =  builder.getDescriptorForType();
 
-      if (protoField == null) {
-        String description = "Scheme mismatch \n\"" + parquetField + "\"" +
-                "\n proto descriptor:\n" + protoDescriptor.toProto();
-        throw new IncompatibleSchemaModificationException("Cant find \"" + 
parquetField.getName() + "\" " + description);
+      for (Type parquetField : parquetSchema.getFields()) {
+
+        Descriptors.FieldDescriptor protoField = 
protoDescriptor.findFieldByName(parquetField.getName());
+
+        validateProtoField(ignoreUnknownFields, protoDescriptor.toProto(), 
parquetField, protoField);
+
+        converters[parquetFieldIndex] = protoField != null ?
+            newMessageConverter(myBuilder, protoField, parquetField) :
+            dummyScalarConverter(DUMMY_PVC, parquetField, conf, extraMetadata);
+
+        parquetFieldIndex++;
       }
+    }
+  }
+
+  private void validateProtoField(boolean ignoreUnknownFields,
+                                  DescriptorProtos.DescriptorProto 
protoDescriptor,
+                                  Type parquetField,
+                                  Descriptors.FieldDescriptor protoField) {
+    if (protoField == null && !ignoreUnknownFields) {
+      String description = "Schema mismatch \n\"" + parquetField + "\"" +
+        "\n proto descriptor:\n" + protoDescriptor;
+      throw new IncompatibleSchemaModificationException("Cant find \"" + 
parquetField.getName() + "\" " + description);
+    }
+  }
 
-      converters[parquetFieldIndex - 1] = newMessageConverter(myBuilder, 
protoField, parquetField);
 
-      parquetFieldIndex++;
+  private Converter dummyScalarConverter(ParentValueContainer pvc,
+                                         Type parquetField, Configuration conf,
+                                         Map<String, String> extraMetadata) {
+
+    if(parquetField.isPrimitive()) {

Review Comment:
   ```suggestion
       if (parquetField.isPrimitive()) {
   ```





> Allow Parquet to Proto conversion even though Target Schema has less fields
> ---------------------------------------------------------------------------
>
>                 Key: PARQUET-2305
>                 URL: https://issues.apache.org/jira/browse/PARQUET-2305
>             Project: Parquet
>          Issue Type: Improvement
>          Components: parquet-protobuf
>            Reporter: Sanjay Sharma
>            Priority: Major
>
> If Parquet has any field which has been removed from the schema and Parquet 
> to Proto conversion happens, it errors out due to Unknown fields. There could 
> be some scenarios that we want to still convert PARQUET into the target proto 
> schema object which has lesser fields.
> If specified "ignoreUnknownFields" as an argument, this should allow the 
> conversion which ignore fields it can't convert and not error out.
> Similar functionality exist in 
> [https://github.com/protocolbuffers/protobuf/blob/main/java/util/src/main/java/com/google/protobuf/util/JsonFormat.java]
>  with field "ignoringUnknownFields"



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to