wgtmac commented on code in PR #1102:
URL: https://github.com/apache/parquet-mr/pull/1102#discussion_r1220671204


##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java:
##########
@@ -31,9 +29,7 @@
 import org.apache.parquet.io.api.Converter;
 import org.apache.parquet.io.api.GroupConverter;
 import org.apache.parquet.io.api.PrimitiveConverter;
-import org.apache.parquet.schema.GroupType;
-import org.apache.parquet.schema.IncompatibleSchemaModificationException;
-import org.apache.parquet.schema.LogicalTypeAnnotation;
+import org.apache.parquet.schema.*;

Review Comment:
   ditto



##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java:
##########
@@ -18,9 +18,7 @@
  */
 package org.apache.parquet.proto;
 
-import com.google.protobuf.ByteString;
-import com.google.protobuf.Descriptors;
-import com.google.protobuf.Message;
+import com.google.protobuf.*;

Review Comment:
   Please do not use import *



##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoParquetReader.java:
##########
@@ -71,6 +77,13 @@ protected Builder(InputFile file) {
       super(file);
     }
 
+    private Builder setIgnoreUnknownFields(boolean ignoreUnknownFields) {
+      if(ignoreUnknownFields) {
+        this.set("IGNORE_UNKNOWN_FIELDS", "TRUE");

Review Comment:
   We may define `IGNORE_UNKNOWN_FIELDS` constant in ProtoConstants.java like 
this: 
https://github.com/apache/parquet-mr/blob/master/parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoConstants.java#L35



##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoParquetReader.java:
##########
@@ -71,6 +77,13 @@ protected Builder(InputFile file) {
       super(file);
     }
 
+    private Builder setIgnoreUnknownFields(boolean ignoreUnknownFields) {
+      if(ignoreUnknownFields) {

Review Comment:
   ```suggestion
         if (ignoreUnknownFields) {
   ```



##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java:
##########
@@ -86,32 +89,71 @@ class ProtoMessageConverter extends GroupConverter {
     this.conf = conf;
     this.parent = pvc;
     this.extraMetadata = extraMetadata;
-    int parquetFieldIndex = 1;
+    boolean ignoreUnknownFields = conf.getBoolean("IGNORE_UNKNOWN_FIELDS", 
false);
+
+    myBuilder = builder;
 
     if (pvc == null) {
       throw new IllegalStateException("Missing parent value container");
     }
 
-    myBuilder = builder;
+    if(builder == null && ignoreUnknownFields) {

Review Comment:
   ```suggestion
       if (builder == null && ignoreUnknownFields) {
   ```



##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java:
##########
@@ -86,32 +89,71 @@ class ProtoMessageConverter extends GroupConverter {
     this.conf = conf;
     this.parent = pvc;
     this.extraMetadata = extraMetadata;
-    int parquetFieldIndex = 1;
+    boolean ignoreUnknownFields = conf.getBoolean("IGNORE_UNKNOWN_FIELDS", 
false);
+
+    myBuilder = builder;
 
     if (pvc == null) {
       throw new IllegalStateException("Missing parent value container");
     }
 
-    myBuilder = builder;
+    if(builder == null && ignoreUnknownFields) {
+      IntStream.range(0, parquetSchema.getFieldCount())
+        .forEach(i-> converters[i] = dummyScalarConverter(DUMMY_PVC, 
parquetSchema.getType(i), conf, extraMetadata));
 
-    Descriptors.Descriptor protoDescriptor = builder.getDescriptorForType();
+    } else {
 
-    for (Type parquetField : parquetSchema.getFields()) {
-      Descriptors.FieldDescriptor protoField = 
protoDescriptor.findFieldByName(parquetField.getName());
+      int parquetFieldIndex = 0;
+      Descriptors.Descriptor protoDescriptor =  builder.getDescriptorForType();
 
-      if (protoField == null) {
-        String description = "Scheme mismatch \n\"" + parquetField + "\"" +
-                "\n proto descriptor:\n" + protoDescriptor.toProto();
-        throw new IncompatibleSchemaModificationException("Cant find \"" + 
parquetField.getName() + "\" " + description);
-      }
+      for (Type parquetField : parquetSchema.getFields()) {

Review Comment:
   The indentation in the for loop looks weird to me. Could you please format 
it a little bit?



##########
parquet-protobuf/src/test/java/org/apache/parquet/proto/ProtoSchemaEvolutionTest.java:
##########
@@ -65,4 +66,68 @@ public void testEnumSchemaWriteV1ReadV2() throws IOException 
{
     assertEquals(messagesV2.size(), 1);
     assertSame(messagesV2.get(0).getOptionalLabelNumberPair(), 
TestProto3SchemaV2.MessageSchema.LabelNumberPair.SECOND);
   }
+
+  /**

Review Comment:
   Thanks for adding the test!



##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoParquetReader.java:
##########
@@ -37,11 +37,17 @@
   public static <T> ParquetReader.Builder<T> builder(Path file) {
     return new ProtoParquetReader.Builder<T>(file);
   }
-
+  public static <T> ParquetReader.Builder<T> builder(Path file, boolean 
ignoreUnknownFields) {

Review Comment:
   Please add one blank line before and after this new method.



##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoParquetReader.java:
##########
@@ -71,6 +77,13 @@ protected Builder(InputFile file) {
       super(file);
     }
 
+    private Builder setIgnoreUnknownFields(boolean ignoreUnknownFields) {

Review Comment:
   What about making it protected?



##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java:
##########
@@ -86,32 +89,71 @@ class ProtoMessageConverter extends GroupConverter {
     this.conf = conf;
     this.parent = pvc;
     this.extraMetadata = extraMetadata;
-    int parquetFieldIndex = 1;
+    boolean ignoreUnknownFields = conf.getBoolean("IGNORE_UNKNOWN_FIELDS", 
false);
+
+    myBuilder = builder;
 
     if (pvc == null) {
       throw new IllegalStateException("Missing parent value container");
     }
 
-    myBuilder = builder;
+    if(builder == null && ignoreUnknownFields) {
+      IntStream.range(0, parquetSchema.getFieldCount())
+        .forEach(i-> converters[i] = dummyScalarConverter(DUMMY_PVC, 
parquetSchema.getType(i), conf, extraMetadata));
 
-    Descriptors.Descriptor protoDescriptor = builder.getDescriptorForType();
+    } else {
 
-    for (Type parquetField : parquetSchema.getFields()) {
-      Descriptors.FieldDescriptor protoField = 
protoDescriptor.findFieldByName(parquetField.getName());
+      int parquetFieldIndex = 0;
+      Descriptors.Descriptor protoDescriptor =  builder.getDescriptorForType();
 
-      if (protoField == null) {
-        String description = "Scheme mismatch \n\"" + parquetField + "\"" +
-                "\n proto descriptor:\n" + protoDescriptor.toProto();
-        throw new IncompatibleSchemaModificationException("Cant find \"" + 
parquetField.getName() + "\" " + description);
-      }
+      for (Type parquetField : parquetSchema.getFields()) {
+
+        Descriptors.FieldDescriptor protoField = 
protoDescriptor.findFieldByName(parquetField.getName());
+
+        validateProtoField(ignoreUnknownFields, protoDescriptor.toProto(), 
parquetField, protoField);
+
+        converters[parquetFieldIndex] = protoField != null ?
+            newMessageConverter(myBuilder, protoField, parquetField) :
+            dummyScalarConverter(DUMMY_PVC, parquetField, conf, extraMetadata);
 
-      converters[parquetFieldIndex - 1] = newMessageConverter(myBuilder, 
protoField, parquetField);
+          parquetFieldIndex++;
+        }
+
+    }
+  }
 
-      parquetFieldIndex++;
+  private void validateProtoField(boolean ignoreUnknownFields, 
DescriptorProtos.DescriptorProto protoDescriptor, Type parquetField, 
Descriptors.FieldDescriptor protoField) {

Review Comment:
   This line is too long and should be split.



##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java:
##########
@@ -86,32 +89,71 @@ class ProtoMessageConverter extends GroupConverter {
     this.conf = conf;
     this.parent = pvc;
     this.extraMetadata = extraMetadata;
-    int parquetFieldIndex = 1;
+    boolean ignoreUnknownFields = conf.getBoolean("IGNORE_UNKNOWN_FIELDS", 
false);
+
+    myBuilder = builder;
 
     if (pvc == null) {
       throw new IllegalStateException("Missing parent value container");
     }
 
-    myBuilder = builder;
+    if(builder == null && ignoreUnknownFields) {
+      IntStream.range(0, parquetSchema.getFieldCount())
+        .forEach(i-> converters[i] = dummyScalarConverter(DUMMY_PVC, 
parquetSchema.getType(i), conf, extraMetadata));
 
-    Descriptors.Descriptor protoDescriptor = builder.getDescriptorForType();
+    } else {
 
-    for (Type parquetField : parquetSchema.getFields()) {
-      Descriptors.FieldDescriptor protoField = 
protoDescriptor.findFieldByName(parquetField.getName());
+      int parquetFieldIndex = 0;
+      Descriptors.Descriptor protoDescriptor =  builder.getDescriptorForType();
 
-      if (protoField == null) {
-        String description = "Scheme mismatch \n\"" + parquetField + "\"" +
-                "\n proto descriptor:\n" + protoDescriptor.toProto();
-        throw new IncompatibleSchemaModificationException("Cant find \"" + 
parquetField.getName() + "\" " + description);
-      }
+      for (Type parquetField : parquetSchema.getFields()) {
+
+        Descriptors.FieldDescriptor protoField = 
protoDescriptor.findFieldByName(parquetField.getName());
+
+        validateProtoField(ignoreUnknownFields, protoDescriptor.toProto(), 
parquetField, protoField);
+
+        converters[parquetFieldIndex] = protoField != null ?
+            newMessageConverter(myBuilder, protoField, parquetField) :
+            dummyScalarConverter(DUMMY_PVC, parquetField, conf, extraMetadata);
 
-      converters[parquetFieldIndex - 1] = newMessageConverter(myBuilder, 
protoField, parquetField);
+          parquetFieldIndex++;
+        }
+
+    }
+  }
 
-      parquetFieldIndex++;
+  private void validateProtoField(boolean ignoreUnknownFields, 
DescriptorProtos.DescriptorProto protoDescriptor, Type parquetField, 
Descriptors.FieldDescriptor protoField) {
+    if (protoField == null && !ignoreUnknownFields) {
+      String description = "Schema mismatch \n\"" + parquetField + "\"" +
+        "\n proto descriptor:\n" + protoDescriptor;
+      throw new IncompatibleSchemaModificationException("Cant find \"" + 
parquetField.getName() + "\" " + description);
     }
   }
 
 
+  private Converter dummyScalarConverter(ParentValueContainer pvc,
+                                         Type parquetField, Configuration conf,
+                                         Map<String, String> extraMetadata) {
+
+    if(parquetField.isPrimitive()) {
+      PrimitiveType primitiveType = parquetField.asPrimitiveType();
+      PrimitiveType.PrimitiveTypeName primitiveTypeName = 
primitiveType.getPrimitiveTypeName();
+      switch (primitiveTypeName) {
+        case BINARY: return new ProtoStringConverter(pvc);

Review Comment:
   CMIW, do we miss some cases? It looks shorter than here: 
https://github.com/apache/parquet-mr/blob/master/parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java#L176-L184



##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java:
##########
@@ -86,32 +89,71 @@ class ProtoMessageConverter extends GroupConverter {
     this.conf = conf;
     this.parent = pvc;
     this.extraMetadata = extraMetadata;
-    int parquetFieldIndex = 1;
+    boolean ignoreUnknownFields = conf.getBoolean("IGNORE_UNKNOWN_FIELDS", 
false);

Review Comment:
   Seems line 92-94 can be moved under line 98?



##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java:
##########
@@ -124,13 +166,15 @@ public void start() {
 
   @Override
   public void end() {
-    parent.add(myBuilder.build());
-    myBuilder.clear();
+    if(myBuilder != null) {
+      parent.add(myBuilder.build());
+      myBuilder.clear();
+    }
   }
 
   protected Converter newMessageConverter(final Message.Builder parentBuilder, 
final Descriptors.FieldDescriptor fieldDescriptor, Type parquetType) {
 
-    boolean isRepeated = fieldDescriptor.isRepeated();
+    boolean isRepeated = fieldDescriptor==null ? false : 
fieldDescriptor.isRepeated();

Review Comment:
   ```suggestion
       boolean isRepeated = fieldDescriptor != null && 
fieldDescriptor.isRepeated();
   ```



##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java:
##########
@@ -124,13 +166,15 @@ public void start() {
 
   @Override
   public void end() {
-    parent.add(myBuilder.build());
-    myBuilder.clear();
+    if(myBuilder != null) {

Review Comment:
   ```suggestion
       if (myBuilder != null) {
   ```



##########
parquet-protobuf/src/test/java/org/apache/parquet/proto/TestUtils.java:
##########
@@ -195,6 +196,18 @@ public static <T extends MessageOrBuilder> List<T> 
readMessages(Path file, Class
     }
   }
 
+  /**
+   * Read messages from given file into the expected proto class.
+   * @param file
+   * @param messageClass
+   * @param <T>
+   * @return List of protobuf messages for the given type.
+   */
+  public static <T extends MessageOrBuilder> List<T> readMessages(Path file, 
Class<T> messageClass) throws IOException {
+    return readMessages(file, messageClass, false);
+

Review Comment:
   Please remove this blank line.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to