[ 
https://issues.apache.org/jira/browse/PARQUET-2305?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17729884#comment-17729884
 ] 

ASF GitHub Bot commented on PARQUET-2305:
-----------------------------------------

wgtmac commented on code in PR #1102:
URL: https://github.com/apache/parquet-mr/pull/1102#discussion_r1220671204


##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java:
##########
@@ -31,9 +29,7 @@
 import org.apache.parquet.io.api.Converter;
 import org.apache.parquet.io.api.GroupConverter;
 import org.apache.parquet.io.api.PrimitiveConverter;
-import org.apache.parquet.schema.GroupType;
-import org.apache.parquet.schema.IncompatibleSchemaModificationException;
-import org.apache.parquet.schema.LogicalTypeAnnotation;
+import org.apache.parquet.schema.*;

Review Comment:
   ditto



##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java:
##########
@@ -18,9 +18,7 @@
  */
 package org.apache.parquet.proto;
 
-import com.google.protobuf.ByteString;
-import com.google.protobuf.Descriptors;
-import com.google.protobuf.Message;
+import com.google.protobuf.*;

Review Comment:
   Please do not use import *



##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoParquetReader.java:
##########
@@ -71,6 +77,13 @@ protected Builder(InputFile file) {
       super(file);
     }
 
+    private Builder setIgnoreUnknownFields(boolean ignoreUnknownFields) {
+      if(ignoreUnknownFields) {
+        this.set("IGNORE_UNKNOWN_FIELDS", "TRUE");

Review Comment:
   We may define `IGNORE_UNKNOWN_FIELDS` constant in ProtoConstants.java like 
this: 
https://github.com/apache/parquet-mr/blob/master/parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoConstants.java#L35



##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoParquetReader.java:
##########
@@ -71,6 +77,13 @@ protected Builder(InputFile file) {
       super(file);
     }
 
+    private Builder setIgnoreUnknownFields(boolean ignoreUnknownFields) {
+      if(ignoreUnknownFields) {

Review Comment:
   ```suggestion
         if (ignoreUnknownFields) {
   ```



##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java:
##########
@@ -86,32 +89,71 @@ class ProtoMessageConverter extends GroupConverter {
     this.conf = conf;
     this.parent = pvc;
     this.extraMetadata = extraMetadata;
-    int parquetFieldIndex = 1;
+    boolean ignoreUnknownFields = conf.getBoolean("IGNORE_UNKNOWN_FIELDS", 
false);
+
+    myBuilder = builder;
 
     if (pvc == null) {
       throw new IllegalStateException("Missing parent value container");
     }
 
-    myBuilder = builder;
+    if(builder == null && ignoreUnknownFields) {

Review Comment:
   ```suggestion
       if (builder == null && ignoreUnknownFields) {
   ```



##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java:
##########
@@ -86,32 +89,71 @@ class ProtoMessageConverter extends GroupConverter {
     this.conf = conf;
     this.parent = pvc;
     this.extraMetadata = extraMetadata;
-    int parquetFieldIndex = 1;
+    boolean ignoreUnknownFields = conf.getBoolean("IGNORE_UNKNOWN_FIELDS", 
false);
+
+    myBuilder = builder;
 
     if (pvc == null) {
       throw new IllegalStateException("Missing parent value container");
     }
 
-    myBuilder = builder;
+    if(builder == null && ignoreUnknownFields) {
+      IntStream.range(0, parquetSchema.getFieldCount())
+        .forEach(i-> converters[i] = dummyScalarConverter(DUMMY_PVC, 
parquetSchema.getType(i), conf, extraMetadata));
 
-    Descriptors.Descriptor protoDescriptor = builder.getDescriptorForType();
+    } else {
 
-    for (Type parquetField : parquetSchema.getFields()) {
-      Descriptors.FieldDescriptor protoField = 
protoDescriptor.findFieldByName(parquetField.getName());
+      int parquetFieldIndex = 0;
+      Descriptors.Descriptor protoDescriptor =  builder.getDescriptorForType();
 
-      if (protoField == null) {
-        String description = "Scheme mismatch \n\"" + parquetField + "\"" +
-                "\n proto descriptor:\n" + protoDescriptor.toProto();
-        throw new IncompatibleSchemaModificationException("Cant find \"" + 
parquetField.getName() + "\" " + description);
-      }
+      for (Type parquetField : parquetSchema.getFields()) {

Review Comment:
   The indentation in the for loop looks weird to me. Could you please format 
it a little bit?



##########
parquet-protobuf/src/test/java/org/apache/parquet/proto/ProtoSchemaEvolutionTest.java:
##########
@@ -65,4 +66,68 @@ public void testEnumSchemaWriteV1ReadV2() throws IOException 
{
     assertEquals(messagesV2.size(), 1);
     assertSame(messagesV2.get(0).getOptionalLabelNumberPair(), 
TestProto3SchemaV2.MessageSchema.LabelNumberPair.SECOND);
   }
+
+  /**

Review Comment:
   Thanks for adding the test!



##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoParquetReader.java:
##########
@@ -37,11 +37,17 @@
   public static <T> ParquetReader.Builder<T> builder(Path file) {
     return new ProtoParquetReader.Builder<T>(file);
   }
-
+  public static <T> ParquetReader.Builder<T> builder(Path file, boolean 
ignoreUnknownFields) {

Review Comment:
   Please add one blank line before and after this new method.



##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoParquetReader.java:
##########
@@ -71,6 +77,13 @@ protected Builder(InputFile file) {
       super(file);
     }
 
+    private Builder setIgnoreUnknownFields(boolean ignoreUnknownFields) {

Review Comment:
   What about making it protected?



##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java:
##########
@@ -86,32 +89,71 @@ class ProtoMessageConverter extends GroupConverter {
     this.conf = conf;
     this.parent = pvc;
     this.extraMetadata = extraMetadata;
-    int parquetFieldIndex = 1;
+    boolean ignoreUnknownFields = conf.getBoolean("IGNORE_UNKNOWN_FIELDS", 
false);
+
+    myBuilder = builder;
 
     if (pvc == null) {
       throw new IllegalStateException("Missing parent value container");
     }
 
-    myBuilder = builder;
+    if(builder == null && ignoreUnknownFields) {
+      IntStream.range(0, parquetSchema.getFieldCount())
+        .forEach(i-> converters[i] = dummyScalarConverter(DUMMY_PVC, 
parquetSchema.getType(i), conf, extraMetadata));
 
-    Descriptors.Descriptor protoDescriptor = builder.getDescriptorForType();
+    } else {
 
-    for (Type parquetField : parquetSchema.getFields()) {
-      Descriptors.FieldDescriptor protoField = 
protoDescriptor.findFieldByName(parquetField.getName());
+      int parquetFieldIndex = 0;
+      Descriptors.Descriptor protoDescriptor =  builder.getDescriptorForType();
 
-      if (protoField == null) {
-        String description = "Scheme mismatch \n\"" + parquetField + "\"" +
-                "\n proto descriptor:\n" + protoDescriptor.toProto();
-        throw new IncompatibleSchemaModificationException("Cant find \"" + 
parquetField.getName() + "\" " + description);
-      }
+      for (Type parquetField : parquetSchema.getFields()) {
+
+        Descriptors.FieldDescriptor protoField = 
protoDescriptor.findFieldByName(parquetField.getName());
+
+        validateProtoField(ignoreUnknownFields, protoDescriptor.toProto(), 
parquetField, protoField);
+
+        converters[parquetFieldIndex] = protoField != null ?
+            newMessageConverter(myBuilder, protoField, parquetField) :
+            dummyScalarConverter(DUMMY_PVC, parquetField, conf, extraMetadata);
 
-      converters[parquetFieldIndex - 1] = newMessageConverter(myBuilder, 
protoField, parquetField);
+          parquetFieldIndex++;
+        }
+
+    }
+  }
 
-      parquetFieldIndex++;
+  private void validateProtoField(boolean ignoreUnknownFields, 
DescriptorProtos.DescriptorProto protoDescriptor, Type parquetField, 
Descriptors.FieldDescriptor protoField) {

Review Comment:
   This line is too long and should be split.



##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java:
##########
@@ -86,32 +89,71 @@ class ProtoMessageConverter extends GroupConverter {
     this.conf = conf;
     this.parent = pvc;
     this.extraMetadata = extraMetadata;
-    int parquetFieldIndex = 1;
+    boolean ignoreUnknownFields = conf.getBoolean("IGNORE_UNKNOWN_FIELDS", 
false);
+
+    myBuilder = builder;
 
     if (pvc == null) {
       throw new IllegalStateException("Missing parent value container");
     }
 
-    myBuilder = builder;
+    if(builder == null && ignoreUnknownFields) {
+      IntStream.range(0, parquetSchema.getFieldCount())
+        .forEach(i-> converters[i] = dummyScalarConverter(DUMMY_PVC, 
parquetSchema.getType(i), conf, extraMetadata));
 
-    Descriptors.Descriptor protoDescriptor = builder.getDescriptorForType();
+    } else {
 
-    for (Type parquetField : parquetSchema.getFields()) {
-      Descriptors.FieldDescriptor protoField = 
protoDescriptor.findFieldByName(parquetField.getName());
+      int parquetFieldIndex = 0;
+      Descriptors.Descriptor protoDescriptor =  builder.getDescriptorForType();
 
-      if (protoField == null) {
-        String description = "Scheme mismatch \n\"" + parquetField + "\"" +
-                "\n proto descriptor:\n" + protoDescriptor.toProto();
-        throw new IncompatibleSchemaModificationException("Cant find \"" + 
parquetField.getName() + "\" " + description);
-      }
+      for (Type parquetField : parquetSchema.getFields()) {
+
+        Descriptors.FieldDescriptor protoField = 
protoDescriptor.findFieldByName(parquetField.getName());
+
+        validateProtoField(ignoreUnknownFields, protoDescriptor.toProto(), 
parquetField, protoField);
+
+        converters[parquetFieldIndex] = protoField != null ?
+            newMessageConverter(myBuilder, protoField, parquetField) :
+            dummyScalarConverter(DUMMY_PVC, parquetField, conf, extraMetadata);
 
-      converters[parquetFieldIndex - 1] = newMessageConverter(myBuilder, 
protoField, parquetField);
+          parquetFieldIndex++;
+        }
+
+    }
+  }
 
-      parquetFieldIndex++;
+  private void validateProtoField(boolean ignoreUnknownFields, 
DescriptorProtos.DescriptorProto protoDescriptor, Type parquetField, 
Descriptors.FieldDescriptor protoField) {
+    if (protoField == null && !ignoreUnknownFields) {
+      String description = "Schema mismatch \n\"" + parquetField + "\"" +
+        "\n proto descriptor:\n" + protoDescriptor;
+      throw new IncompatibleSchemaModificationException("Cant find \"" + 
parquetField.getName() + "\" " + description);
     }
   }
 
 
+  private Converter dummyScalarConverter(ParentValueContainer pvc,
+                                         Type parquetField, Configuration conf,
+                                         Map<String, String> extraMetadata) {
+
+    if(parquetField.isPrimitive()) {
+      PrimitiveType primitiveType = parquetField.asPrimitiveType();
+      PrimitiveType.PrimitiveTypeName primitiveTypeName = 
primitiveType.getPrimitiveTypeName();
+      switch (primitiveTypeName) {
+        case BINARY: return new ProtoStringConverter(pvc);

Review Comment:
   CMIW, do we miss some cases? It looks shorter than here: 
https://github.com/apache/parquet-mr/blob/master/parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java#L176-L184



##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java:
##########
@@ -86,32 +89,71 @@ class ProtoMessageConverter extends GroupConverter {
     this.conf = conf;
     this.parent = pvc;
     this.extraMetadata = extraMetadata;
-    int parquetFieldIndex = 1;
+    boolean ignoreUnknownFields = conf.getBoolean("IGNORE_UNKNOWN_FIELDS", 
false);

Review Comment:
   Seems line 92-94 can be moved under line 98?



##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java:
##########
@@ -124,13 +166,15 @@ public void start() {
 
   @Override
   public void end() {
-    parent.add(myBuilder.build());
-    myBuilder.clear();
+    if(myBuilder != null) {
+      parent.add(myBuilder.build());
+      myBuilder.clear();
+    }
   }
 
   protected Converter newMessageConverter(final Message.Builder parentBuilder, 
final Descriptors.FieldDescriptor fieldDescriptor, Type parquetType) {
 
-    boolean isRepeated = fieldDescriptor.isRepeated();
+    boolean isRepeated = fieldDescriptor==null ? false : 
fieldDescriptor.isRepeated();

Review Comment:
   ```suggestion
       boolean isRepeated = fieldDescriptor != null && 
fieldDescriptor.isRepeated();
   ```



##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java:
##########
@@ -124,13 +166,15 @@ public void start() {
 
   @Override
   public void end() {
-    parent.add(myBuilder.build());
-    myBuilder.clear();
+    if(myBuilder != null) {

Review Comment:
   ```suggestion
       if (myBuilder != null) {
   ```



##########
parquet-protobuf/src/test/java/org/apache/parquet/proto/TestUtils.java:
##########
@@ -195,6 +196,18 @@ public static <T extends MessageOrBuilder> List<T> 
readMessages(Path file, Class
     }
   }
 
+  /**
+   * Read messages from given file into the expected proto class.
+   * @param file
+   * @param messageClass
+   * @param <T>
+   * @return List of protobuf messages for the given type.
+   */
+  public static <T extends MessageOrBuilder> List<T> readMessages(Path file, 
Class<T> messageClass) throws IOException {
+    return readMessages(file, messageClass, false);
+

Review Comment:
   Please remove this blank line.





> Allow Parquet to Proto conversion even though Target Schema has less fields
> ---------------------------------------------------------------------------
>
>                 Key: PARQUET-2305
>                 URL: https://issues.apache.org/jira/browse/PARQUET-2305
>             Project: Parquet
>          Issue Type: Improvement
>          Components: parquet-protobuf
>            Reporter: Sanjay Sharma
>            Priority: Major
>
> If Parquet has any field which has been removed from the schema and Parquet 
> to Proto conversion happens, it errors out due to Unknown fields. There could 
> be some scenarios that we want to still convert PARQUET into the target proto 
> schema object which has lesser fields.
> If specified "ignoreUnknownFields" as an argument, this should allow the 
> conversion which ignore fields it can't convert and not error out.
> Similar functionality exist in 
> [https://github.com/protocolbuffers/protobuf/blob/main/java/util/src/main/java/com/google/protobuf/util/JsonFormat.java]
>  with field "ignoringUnknownFields"



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to