[
https://issues.apache.org/jira/browse/PARQUET-968?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16453965#comment-16453965
]
ASF GitHub Bot commented on PARQUET-968:
julienledem closed pull request #411: PARQUET-968 Add Hive/Presto support in
ProtoParquet
URL: https://github.com/apache/parquet-mr/pull/411
This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:
As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):
diff --git
a/parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java
b/parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java
index b5649a05b..d5f43e6b3 100644
---
a/parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java
+++
b/parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java
@@ -24,12 +24,14 @@
import com.twitter.elephantbird.util.Protobufs;
import org.apache.parquet.column.Dictionary;
import org.apache.parquet.io.InvalidRecordException;
+import org.apache.parquet.io.ParquetDecodingException;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.io.api.Converter;
import org.apache.parquet.io.api.GroupConverter;
import org.apache.parquet.io.api.PrimitiveConverter;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.IncompatibleSchemaModificationException;
+import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.Type;
import java.util.HashMap;
@@ -129,10 +131,15 @@ public void add(Object value) {
};
}
+if (OriginalType.LIST == parquetType.getOriginalType()) {
+ return new ListConverter(parentBuilder, fieldDescriptor, parquetType);
+}
+if (OriginalType.MAP == parquetType.getOriginalType()) {
+ return new MapConverter(parentBuilder, fieldDescriptor, parquetType);
+}
return newScalarConverter(parent, parentBuilder, fieldDescriptor,
parquetType);
}
-
private Converter newScalarConverter(ParentValueContainer pvc,
Message.Builder parentBuilder, Descriptors.FieldDescriptor fieldDescriptor,
Type parquetType) {
JavaType javaType = fieldDescriptor.getJavaType();
@@ -345,4 +352,121 @@ public void addBinary(Binary binary) {
}
}
+
+ /**
+ * This class unwraps the additional LIST wrapper and makes it possible to
read the underlying data and then convert
+ * it to protobuf.
+ *
+ * Consider the following protobuf schema:
+ * message SimpleList {
+ * repeated int64 first_array = 1;
+ * }
+ *
+ * A LIST wrapper is created in parquet for the above mentioned protobuf
schema:
+ * message SimpleList {
+ * optional group first_array (LIST) = 1 {
+ * repeated group list {
+ * optional int64 element;
+ * }
+ * }
+ * }
+ *
+ * The LIST wrappers are used by 3rd party tools, such as Hive, to read
parquet arrays. The wrapper contains
+ * a repeated group named 'list', itself containing only one field called
'element' of the type of the repeated
+ * object (can be a primitive as in this example or a group in case of a
repeated message in protobuf).
+ */
+ final class ListConverter extends GroupConverter {
+private final Converter converter;
+
+/**
+ * Creates the converter for a field stored under a Parquet LIST wrapper.
+ *
+ * The constructor checks that {@code parquetType} is a LIST-annotated group
+ * holding a group field named 'list', which itself holds a field named
+ * 'element'. The converter for that 'element' type is then obtained from
+ * {@code newMessageConverter} and kept for later use.
+ *
+ * @param parentBuilder builder passed on to {@code newMessageConverter}
+ * @param fieldDescriptor protobuf field descriptor passed on to
+ *        {@code newMessageConverter}
+ * @param parquetType Parquet type expected to be a LIST-annotated group
+ * @throws ParquetDecodingException if {@code parquetType} is not a
+ *         LIST-annotated group, has no group field named 'list', or that
+ *         group has no field named 'element'
+ */
+public ListConverter(Message.Builder parentBuilder,
+    Descriptors.FieldDescriptor fieldDescriptor, Type parquetType) {
+  OriginalType originalType = parquetType.getOriginalType();
+  if (originalType != OriginalType.LIST || parquetType.isPrimitive()) {
+    throw new ParquetDecodingException(
+        "Expected LIST wrapper. Found: " + originalType + " instead.");
+  }
+
+  GroupType rootWrapperType = parquetType.asGroupType();
+  // Only presence and group-ness of 'list' are verified here; its
+  // repetition is not inspected.
+  if (!rootWrapperType.containsField("list")
+      || rootWrapperType.getType("list").isPrimitive()) {
+    throw new ParquetDecodingException(
+        "Expected repeated 'list' group inside LIST wrapper but got: "
+            + rootWrapperType);
+  }
+
+  GroupType listType = rootWrapperType.getType("list").asGroupType();
+  if (!listType.containsField("element")) {
+    throw new ParquetDecodingException(
+        "Expected 'element' inside repeated list group but got: "
+            + listType);
+  }
+
+  // The wrapper levels carry no data of their own; the element type is
+  // what actually gets converted.
+  Type elementType = listType.getType("element");
+  converter = newMessageConverter(parentBuilder, fieldDescriptor,
+      elementType);
+}
+
+@Override
+public Converter getConverter(int fieldIndex) {
+ if (fieldIndex > 0) {
+throw new ParquetDecodingException("Unexpected multiple fields in the
LIST wrapper");
+ }
+
+ return new GroupConverter() {
+@Override
+public Converter getConverter(int fieldIndex) {
+ return converter;
+}
+
+@Override
+public void start() {
+
+}
+
+