KazydubB commented on a change in pull request #1954: DRILL-7509: Incorrect TupleSchema is created for DICT column when querying Parquet files URL: https://github.com/apache/drill/pull/1954#discussion_r369156536
########## File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetTableMetadataUtils.java ########## @@ -507,29 +507,67 @@ private static long convertToDrillDateValue(int dateValue) { MetadataBase.ParquetTableMetadataBase parquetTableMetadata) { int precision = 0; int scale = 0; - int definitionLevel = 1; - int repetitionLevel = 0; MetadataVersion metadataVersion = new MetadataVersion(parquetTableMetadata.getMetadataVersion()); // only ColumnTypeMetadata_v3 and ColumnTypeMetadata_v4 store information about scale, precision, repetition level and definition level if (parquetTableMetadata.hasColumnMetadata() && (metadataVersion.compareTo(new MetadataVersion(3, 0)) >= 0)) { scale = parquetTableMetadata.getScale(name); precision = parquetTableMetadata.getPrecision(name); - repetitionLevel = parquetTableMetadata.getRepetitionLevel(name); - definitionLevel = parquetTableMetadata.getDefinitionLevel(name); - } - TypeProtos.DataMode mode; - if (repetitionLevel >= 1) { - mode = TypeProtos.DataMode.REPEATED; - } else if (repetitionLevel == 0 && definitionLevel == 0) { - mode = TypeProtos.DataMode.REQUIRED; - } else { - mode = TypeProtos.DataMode.OPTIONAL; } + + TypeProtos.DataMode mode = getDataMode(parquetTableMetadata, metadataVersion, name); return TypeProtos.MajorType.newBuilder(ParquetReaderUtility.getType(primitiveType, originalType, precision, scale)) .setMode(mode) .build(); } + /** + * Obtain data mode from table metadata for a column. Algorithm for retrieving data mode depends on metadata version: + * <ul> + * <li>starting from version {@code 4.2}, Parquet's {@link org.apache.parquet.schema.Type.Repetition} + * is stored in table metadata itself;</li> + * <li>starting from {@code 3.0} to {@code 4.2} (exclusively) the data mode is + * computed based on max {@code definition} and {@code repetition} levels + * ({@link MetadataBase.ParquetTableMetadataBase#getDefinitionLevel(String[])} and + * {@link MetadataBase.ParquetTableMetadataBase#getRepetitionLevel(String[])} respectively) + * obtained from Parquet's schema; + * + * <p><strong>Note:</strong> this computation may lead to erroneous results, + * when there are few nesting levels.</p> + * </li> + * <li>prior to {@code 3.0} {@code DataMode.OPTIONAL} is returned.</li> + * </ul> + * @param tableMetadata Parquet table metadata + * @param metadataVersion version of Parquet table metadata + * @param name (leaf) column to obtain data mode for + * @return data mode of the specified column + */ + private static TypeProtos.DataMode getDataMode(MetadataBase.ParquetTableMetadataBase tableMetadata, Review comment: Refactored the method. Added convenience methods to `MetadataVersion`: `atLeast(int major, int minor)`, `isEqualTo(int major, int minor)` and `after(int major, int minor)`. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services