arina-ielchiieva commented on a change in pull request #1844: DRILL-7326:
Support repeated lists for CTAS parquet format
URL: https://github.com/apache/drill/pull/1844#discussion_r316138655
##########
File path:
exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetRecordWriter.java
##########
@@ -287,40 +290,70 @@ private Type getType(MaterializedField field) {
DataMode dataMode = field.getType().getMode();
switch (minorType) {
case MAP:
- List<Type> types = Lists.newArrayList();
- for (MaterializedField childField : field.getChildren()) {
- types.add(getType(childField));
- }
+ List<Type> types = getChildrenTypes(field);
return new GroupType(dataMode == DataMode.REPEATED ?
Repetition.REPEATED : Repetition.OPTIONAL, field.getName(), types);
case LIST:
- MaterializedField elementField = field.getChildren().iterator().next();
+ MaterializedField elementField = getDataField(field);
ListBuilder<GroupType> listBuilder = org.apache.parquet.schema.Types
.list(dataMode == DataMode.OPTIONAL ? Repetition.OPTIONAL :
Repetition.REQUIRED);
addElementType(listBuilder, elementField);
GroupType listType = listBuilder.named(field.getName());
return listType;
case NULL:
MaterializedField newField = field.withType(
-
TypeProtos.MajorType.newBuilder().setMinorType(MinorType.INT).setMode(DataMode.OPTIONAL).build());
+
TypeProtos.MajorType.newBuilder().setMinorType(MinorType.INT).setMode(DataMode.OPTIONAL).build());
return getPrimitiveType(newField);
default:
return getPrimitiveType(field);
}
}
+ /**
+ * Helper method for conversion of map child
+ * fields.
+ *
+ * @param field map
+ * @return converted child fields
+ */
+ private List<Type> getChildrenTypes(MaterializedField field) {
+ return field.getChildren().stream()
+ .map(this::getType)
+ .collect(Collectors.toList());
+ }
+
+ /**
+ * Finds data child field of list or repeated type.
+ *
+ * @param field parent repeated field
+ * @return child data field
+ */
+ private MaterializedField getDataField(MaterializedField field) {
+ return field.getChildren().stream()
+ .filter(child ->
BaseRepeatedValueVector.DATA_VECTOR_NAME.equals(child.getName()))
+ .findFirst()
Review comment:
For parallel streams, it's a big difference; for a sequential stream it does
not matter. I understand that you are using a sequential stream, but using
`findAny()` will clearly indicate that the order does not matter to you.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services