arina-ielchiieva commented on a change in pull request #1844: DRILL-7326:
Support repeated lists for CTAS parquet format
URL: https://github.com/apache/drill/pull/1844#discussion_r316131846
##########
File path:
exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetRecordWriter.java
##########
@@ -287,40 +290,70 @@ private Type getType(MaterializedField field) {
DataMode dataMode = field.getType().getMode();
switch (minorType) {
case MAP:
- List<Type> types = Lists.newArrayList();
- for (MaterializedField childField : field.getChildren()) {
- types.add(getType(childField));
- }
+ List<Type> types = getChildrenTypes(field);
return new GroupType(dataMode == DataMode.REPEATED ?
Repetition.REPEATED : Repetition.OPTIONAL, field.getName(), types);
case LIST:
- MaterializedField elementField = field.getChildren().iterator().next();
+ MaterializedField elementField = getDataField(field);
ListBuilder<GroupType> listBuilder = org.apache.parquet.schema.Types
.list(dataMode == DataMode.OPTIONAL ? Repetition.OPTIONAL :
Repetition.REQUIRED);
addElementType(listBuilder, elementField);
GroupType listType = listBuilder.named(field.getName());
return listType;
case NULL:
MaterializedField newField = field.withType(
-
TypeProtos.MajorType.newBuilder().setMinorType(MinorType.INT).setMode(DataMode.OPTIONAL).build());
+
TypeProtos.MajorType.newBuilder().setMinorType(MinorType.INT).setMode(DataMode.OPTIONAL).build());
return getPrimitiveType(newField);
default:
return getPrimitiveType(field);
}
}
+ /**
+ * Helper method for conversion of map child
+ * fields.
+ *
+ * @param field map
+ * @return converted child fields
+ */
+ private List<Type> getChildrenTypes(MaterializedField field) {
+ return field.getChildren().stream()
+ .map(this::getType)
+ .collect(Collectors.toList());
+ }
+
+ /**
+ * Finds data child field of list or repeated type.
+ *
+ * @param field parent repeated field
+ * @return child data field
+ */
+ private MaterializedField getDataField(MaterializedField field) {
+ return field.getChildren().stream()
+ .filter(child ->
BaseRepeatedValueVector.DATA_VECTOR_NAME.equals(child.getName()))
+ .findFirst()
Review comment:
Do we need exactly the first match, or will any match do? If any, it is better
to use `.findAny()`. Why does it not matter if there are several? Please improve
the Javadoc to answer these questions.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services