[ https://issues.apache.org/jira/browse/DRILL-5797?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16540356#comment-16540356 ]

ASF GitHub Bot commented on DRILL-5797:
---------------------------------------

arina-ielchiieva closed pull request #1370: DRILL-5797: Use Parquet new reader 
in all non-complex column queries
URL: https://github.com/apache/drill/pull/1370
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetScanBatchCreator.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetScanBatchCreator.java
index dc09ce1b695..47f2e18a9a7 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetScanBatchCreator.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetScanBatchCreator.java
@@ -25,22 +25,21 @@
 import org.apache.drill.exec.ops.ExecutorFragmentContext;
 import org.apache.drill.exec.ops.OperatorContext;
 import org.apache.drill.exec.physical.impl.ScanBatch;
+import org.apache.drill.exec.proto.helper.QueryIdHelper;
 import org.apache.drill.exec.server.options.OptionManager;
 import org.apache.drill.exec.store.ColumnExplorer;
 import org.apache.drill.exec.store.RecordReader;
 import org.apache.drill.exec.store.dfs.DrillFileSystem;
+import org.apache.drill.exec.store.parquet.ParquetReaderUtility;
 import org.apache.drill.exec.store.parquet.columnreaders.ParquetRecordReader;
 import org.apache.drill.exec.store.parquet2.DrillParquetReader;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.parquet.ParquetReadOptions;
-import org.apache.parquet.column.ColumnDescriptor;
 import org.apache.parquet.hadoop.CodecFactory;
 import org.apache.parquet.hadoop.ParquetFileReader;
 import org.apache.parquet.hadoop.metadata.ParquetMetadata;
 import org.apache.parquet.hadoop.util.HadoopInputFile;
-import org.apache.parquet.schema.MessageType;
-import org.apache.parquet.schema.Type;
 
 import java.io.IOException;
 import java.util.ArrayList;
@@ -107,7 +106,10 @@ protected ScanBatch getBatch(ExecutorFragmentContext context, AbstractParquetRow
           ParquetReaderUtility.detectCorruptDates(footer, rowGroupScan.getColumns(), autoCorrectCorruptDates);
         logger.debug("Contains corrupt dates: {}", containsCorruptDates);
 
-        if (!context.getOptions().getBoolean(ExecConstants.PARQUET_NEW_RECORD_READER) && !isComplex(footer)) {
+        if (!context.getOptions().getBoolean(ExecConstants.PARQUET_NEW_RECORD_READER)
+            && !ParquetReaderUtility.containsComplexColumn(footer, rowGroupScan.getColumns())) {
+          logger.debug("Query {} qualifies for new Parquet reader",
+              QueryIdHelper.getQueryId(oContext.getFragmentContext().getHandle().getQueryId()));
           readers.add(new ParquetRecordReader(context,
               rowGroup.getPath(),
               rowGroup.getRowGroupIndex(),
@@ -118,6 +120,8 @@ protected ScanBatch getBatch(ExecutorFragmentContext context, AbstractParquetRow
               rowGroupScan.getColumns(),
               containsCorruptDates));
         } else {
+          logger.debug("Query {} doesn't qualify for new reader, using old one",
+              QueryIdHelper.getQueryId(oContext.getFragmentContext().getHandle().getQueryId()));
           readers.add(new DrillParquetReader(context,
               footer,
               rowGroup,
@@ -161,22 +165,6 @@ private ParquetMetadata readFooter(Configuration conf, String path) throws IOExc
     }
   }
 
-  private boolean isComplex(ParquetMetadata footer) {
-    MessageType schema = footer.getFileMetaData().getSchema();
-
-    for (Type type : schema.getFields()) {
-      if (!type.isPrimitive()) {
-        return true;
-      }
-    }
-    for (ColumnDescriptor col : schema.getColumns()) {
-      if (col.getMaxRepetitionLevel() > 0) {
-        return true;
-      }
-    }
-    return false;
-  }
-
   /**
    * Helper class responsible for creating and managing DrillFileSystem.
    */
@@ -190,5 +178,4 @@ protected AbstractDrillFileSystemManager(OperatorContext operatorContext) {
 
     protected abstract DrillFileSystem get(Configuration config, String path) throws ExecutionSetupException;
   }
-
 }
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetReaderUtility.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetReaderUtility.java
index a7f78fb40c6..6960b35a031 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetReaderUtility.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetReaderUtility.java
@@ -20,7 +20,6 @@
 import com.google.common.collect.Sets;
 import org.apache.commons.codec.binary.Base64;
 import org.apache.drill.common.exceptions.UserException;
-import org.apache.drill.common.expression.PathSegment;
 import org.apache.drill.common.expression.SchemaPath;
 import org.apache.drill.common.types.TypeProtos;
 import org.apache.drill.common.types.Types;
@@ -41,8 +40,10 @@
 import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
 import org.apache.parquet.hadoop.metadata.ColumnPath;
 import org.apache.parquet.hadoop.metadata.ParquetMetadata;
+import org.apache.parquet.schema.MessageType;
 import org.apache.parquet.schema.OriginalType;
 import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
+import org.apache.parquet.schema.Type;
 import org.joda.time.Chronology;
 import org.joda.time.DateTimeConstants;
 import org.apache.parquet.example.data.simple.NanoTime;
@@ -51,6 +52,7 @@
 
 import java.util.Arrays;
 import java.util.HashMap;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
@@ -140,15 +142,90 @@ public static int getIntFromLEBytes(byte[] input, int start) {
     return out;
   }
 
+  /**
+   * Map full schema paths in format `a`.`b`.`c` to respective SchemaElement objects.
+   *
+   * @param footer Parquet file metadata
+   * @return       schema full path to SchemaElement map
+   */
   public static Map<String, SchemaElement> getColNameToSchemaElementMapping(ParquetMetadata footer) {
-    HashMap<String, SchemaElement> schemaElements = new HashMap<>();
+    Map<String, SchemaElement> schemaElements = new HashMap<>();
     FileMetaData fileMetaData = new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);
-    for (SchemaElement se : fileMetaData.getSchema()) {
-      schemaElements.put(se.getName(), se);
+
+    Iterator<SchemaElement> iter = fileMetaData.getSchema().iterator();
+
+    // First element in collection is default `root` element. We skip it to maintain key in `a` format instead of `root`.`a`,
+    // and thus to avoid the need to cut it out again when comparing with SchemaPath string representation
+    if (iter.hasNext()) {
+      iter.next();
+    }
+    while (iter.hasNext()) {
+      addSchemaElementMapping(iter, new StringBuilder(), schemaElements);
     }
     return schemaElements;
   }
 
+  /**
+   * Populate full path to SchemaElement map by recursively traversing schema elements referenced by the given iterator
+   *
+   * @param iter file schema values iterator
+   * @param path parent schema element path
+   * @param schemaElements schema elements map to insert next iterator element into
+   */
+  private static void addSchemaElementMapping(Iterator<SchemaElement> iter, StringBuilder path,
+      Map<String, SchemaElement> schemaElements) {
+    SchemaElement schemaElement = iter.next();
+    path.append('`').append(schemaElement.getName().toLowerCase()).append('`');
+    schemaElements.put(path.toString(), schemaElement);
+
+    // for each element that has children we need to maintain remaining children count
+    // to exit current recursion level when no more children is left
+    int remainingChildren = schemaElement.getNum_children();
+
+    while (remainingChildren > 0 && iter.hasNext()) {
+      addSchemaElementMapping(iter, new StringBuilder(path).append('.'), schemaElements);
+      remainingChildren--;
+    }
+    return;
+  }
+
+  /**
+   * generate full path of the column in format `a`.`b`.`c`
+   *
+   * @param column ColumnDescriptor object
+   * @return       full path in format `a`.`b`.`c`
+   */
+  public static String getFullColumnPath(ColumnDescriptor column) {
+    StringBuilder sb = new StringBuilder();
+    String[] path = column.getPath();
+    for (int i = 0; i < path.length; i++) {
+      sb.append("`").append(path[i].toLowerCase()).append("`").append(".");
+    }
+
+    // remove trailing dot
+    if (sb.length() > 0) {
+      sb.deleteCharAt(sb.length() - 1);
+    }
+
+    return sb.toString();
+  }
+
+  /**
+   * Map full column paths to all ColumnDescriptors in file schema
+   *
+   * @param footer Parquet file metadata
+   * @return       column full path to ColumnDescriptor object map
+   */
+  public static Map<String, ColumnDescriptor> getColNameToColumnDescriptorMapping(ParquetMetadata footer) {
+    Map<String, ColumnDescriptor> colDescMap = new HashMap<>();
+    List<ColumnDescriptor> columns = footer.getFileMetaData().getSchema().getColumns();
+
+    for (ColumnDescriptor column : columns) {
+      colDescMap.put(getFullColumnPath(column), column);
+    }
+    return colDescMap;
+  }
+
   public static int autoCorrectCorruptedDate(int corruptedDate) {
     return (int) (corruptedDate - CORRECT_CORRUPT_DATE_SHIFT);
   }
@@ -361,7 +438,6 @@ else if (Boolean.valueOf(isDateCorrect)) {
     }
   }
 
-
   /**
   * Detect corrupt date values by looking at the min/max values in the metadata.
   *
@@ -401,9 +477,9 @@ public static DateCorruptionStatus checkForCorruptDateValuesInStatistics(Parquet
         // creating a NameSegment makes sure we are using the standard code for comparing names,
         // currently it is all case-insensitive
         if (Utilities.isStarQuery(columns)
-            || new PathSegment.NameSegment(column.getPath()[0]).equals(schemaPath.getRootSegment())) {
+            || getFullColumnPath(column).equalsIgnoreCase(schemaPath.getUnIndexed().toString())) {
           int colIndex = -1;
-          ConvertedType convertedType = schemaElements.get(column.getPath()[0]).getConverted_type();
+          ConvertedType convertedType = schemaElements.get(getFullColumnPath(column)).getConverted_type();
           if (convertedType != null && convertedType.equals(ConvertedType.DATE)) {
             List<ColumnChunkMetaData> colChunkList = footer.getBlocks().get(rowGroupIndex).getColumns();
             for (int j = 0; j < colChunkList.size(); j++) {
@@ -525,4 +601,57 @@ public static long getDateTimeValueFromBinary(Binary binaryTimeStampValue, boole
     }
   }
 
+  /**
+   * Check whether any of columns in the given list is either nested or repetitive.
+   *
+   * @param footer  Parquet file schema
+   * @param columns list of query SchemaPath objects
+   */
+  public static boolean containsComplexColumn(ParquetMetadata footer, List<SchemaPath> columns) {
+
+    MessageType schema = footer.getFileMetaData().getSchema();
+
+    if (Utilities.isStarQuery(columns)) {
+      for (Type type : schema.getFields()) {
+        if (!type.isPrimitive()) {
+          return true;
+        }
+      }
+      for (ColumnDescriptor col : schema.getColumns()) {
+        if (col.getMaxRepetitionLevel() > 0) {
+          return true;
+        }
+      }
+      return false;
+    } else {
+      Map<String, ColumnDescriptor> colDescMap = ParquetReaderUtility.getColNameToColumnDescriptorMapping(footer);
+      Map<String, SchemaElement> schemaElements = ParquetReaderUtility.getColNameToSchemaElementMapping(footer);
+
+      for (SchemaPath schemaPath : columns) {
+        // Schema path which is non-leaf is complex column
+        if (!schemaPath.isLeaf()) {
+          logger.trace("rowGroupScan contains complex column: {}", 
schemaPath.getUnIndexed().toString());
+          return true;
+        }
+
+        // following column descriptor lookup failure may mean two cases, depending on subsequent SchemaElement lookup:
+        // 1. success: queried column is complex, i.e. GroupType
+        // 2. failure: queried column is not in schema and thus is non-complex
+        ColumnDescriptor column = colDescMap.get(schemaPath.getUnIndexed().toString().toLowerCase());
+
+        if (column == null) {
+          SchemaElement schemaElement = schemaElements.get(schemaPath.getUnIndexed().toString().toLowerCase());
+          if (schemaElement != null) {
+            return true;
+          }
+        } else {
+          if (column.getMaxRepetitionLevel() > 0) {
+            logger.trace("rowGroupScan contains repetitive column: {}", 
schemaPath.getUnIndexed().toString());
+            return true;
+          }
+        }
+      }
+    }
+    return false;
+  }
 }
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/ParquetColumnMetadata.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/ParquetColumnMetadata.java
index 147938d4b93..9c2444c2837 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/ParquetColumnMetadata.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/ParquetColumnMetadata.java
@@ -29,6 +29,7 @@
 import org.apache.drill.exec.physical.impl.OutputMutator;
 import org.apache.drill.exec.record.MaterializedField;
 import org.apache.drill.exec.server.options.OptionManager;
+import org.apache.drill.exec.store.parquet.ParquetReaderUtility;
 import org.apache.drill.exec.vector.ValueVector;
 import org.apache.drill.exec.vector.complex.RepeatedValueVector;
 import org.apache.parquet.column.ColumnDescriptor;
@@ -56,8 +57,8 @@ public ParquetColumnMetadata(ColumnDescriptor column) {
   }
 
   public void resolveDrillType(Map<String, SchemaElement> schemaElements, OptionManager options) {
-    se = schemaElements.get(column.getPath()[0]);
-    type = ParquetToDrillTypeConverter.toMajorType(column.getType(), se.getType_length(),
+    se = schemaElements.get(ParquetReaderUtility.getFullColumnPath(column));
+    type = ParquetToDrillTypeConverter.toMajorType(column.getType(), column.getTypeLength(),
         getDataMode(column), se, options);
     field = MaterializedField.create(toFieldName(column.getPath()).getLastSegment().getNameSegment().getPath(), type);
     length = getDataTypeLength();
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/ParquetSchema.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/ParquetSchema.java
index 67174459e0a..c5f70f3c5b5 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/ParquetSchema.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/ParquetSchema.java
@@ -130,7 +130,7 @@ private void loadParquetSchema() {
     for (ColumnDescriptor column : footer.getFileMetaData().getSchema().getColumns()) {
       ParquetColumnMetadata columnMetadata = new ParquetColumnMetadata(column);
       columnMetadata.resolveDrillType(schemaElements, options);
-      if (! fieldSelected(columnMetadata.field)) {
+      if (! columnSelected(column)) {
         continue;
       }
       selectedColumnMetadata.add(columnMetadata);
@@ -174,23 +174,22 @@ public BlockMetaData getRowGroupMetadata() {
   }
 
   /**
-   * Determine if a Parquet field is selected for the query. It is selected
+   * Determine if a Parquet column is selected for the query. It is selected
    * either if this is a star query (we want all columns), or the column
    * appears in the select list.
    *
-   * @param field the Parquet column expressed as as Drill field.
+   * @param column the Parquet column expressed as column descriptor
    * @return true if the column is to be included in the scan, false
    * if not
    */
-
-  private boolean fieldSelected(MaterializedField field) {
+  private boolean columnSelected(ColumnDescriptor column) {
     if (isStarQuery()) {
       return true;
     }
 
     int i = 0;
     for (SchemaPath expr : selectedCols) {
-      if (field.getName().equalsIgnoreCase(expr.getRootSegmentPath())) {
+      if (ParquetReaderUtility.getFullColumnPath(column).equalsIgnoreCase(expr.getUnIndexed().toString())) {
         columnsFound[i] = true;
         return true;
       }
diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet/TestComplexColumnInSchema.java b/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet/TestComplexColumnInSchema.java
new file mode 100644
index 00000000000..d0977b8f38f
--- /dev/null
+++ b/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet/TestComplexColumnInSchema.java
@@ -0,0 +1,168 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.exec.store.parquet.ParquetReaderUtility;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.parquet.hadoop.metadata.ParquetMetadata;
+import org.apache.parquet.hadoop.ParquetFileReader;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.util.List;
+import java.util.ArrayList;
+
+import java.io.IOException;
+
+/**
+ * This test checks correctness of complex column detection in the Parquet file schema.
+ */
+public class TestComplexColumnInSchema {
+
+  /*
+  Parquet schema:
+    message root {
+      optional int64 id;
+      optional binary a (UTF8);
+      repeated int64 repeated;
+      optional binary VariableCase (UTF8);
+      optional group nested {
+        optional int64 id;
+        repeated int64 repeated;
+        optional binary VaRiAbLeCaSe (UTF8);
+      }
+    }
+
+   Data set:
+   complex_special_cases.parquet
+   {
+     "id": 1,
+     "a": "some string",
+     "repeated": [1, 2],
+     "VariableCase": "top level variable case column",
+     "nested": {
+       "id": 2,
+       "repeated": [3, 4],
+       "VaRiAbLeCaSe": "nested variable case column"
+     }
+   }
+   */
+  private static final String path = "src/test/resources/store/parquet/complex/complex_special_cases.parquet";
+  private static ParquetMetadata footer;
+
+  @BeforeClass
+  public static void setup() throws IOException {
+    Configuration conf = new Configuration();
+
+    footer = ParquetFileReader.readFooter(conf, new Path(path));
+  }
+
+  @Test
+  public void testGroupTypeColumn() {
+    List<SchemaPath> columns = new ArrayList<>();
+    columns.add(SchemaPath.getCompoundPath("nested"));
+    assertTrue("GroupType column must be detected as complex",
+        ParquetReaderUtility.containsComplexColumn(footer, columns));
+  }
+
+  @Test
+  public void testNestedColumn() {
+    List<SchemaPath> columns = new ArrayList<>();
+    columns.add(SchemaPath.getCompoundPath("nested", "id"));
+    assertTrue("Nested column must be detected as complex",
+        ParquetReaderUtility.containsComplexColumn(footer, columns));
+  }
+
+  @Test
+  public void testCombinedColumns() {
+    List<SchemaPath> columns = new ArrayList<>();
+    columns.add(SchemaPath.getCompoundPath("id"));
+    columns.add(SchemaPath.getCompoundPath("nested", "id"));
+    assertTrue("Nested column in the list list must be detected as complex",
+        ParquetReaderUtility.containsComplexColumn(footer, columns));
+  }
+
+  @Test
+  public void testSimpleColumn() {
+    List<SchemaPath> columns = new ArrayList<>();
+    columns.add(SchemaPath.getCompoundPath("id"));
+    assertFalse("No complex column must be detected",
+        ParquetReaderUtility.containsComplexColumn(footer, columns));
+  }
+
+  @Test
+  public void testSimpleColumns() {
+    List<SchemaPath> columns = new ArrayList<>();
+    columns.add(SchemaPath.getCompoundPath("id"));
+    columns.add(SchemaPath.getCompoundPath("a"));
+    assertFalse("No complex columns must be detected",
+        ParquetReaderUtility.containsComplexColumn(footer, columns));
+  }
+
+  @Test
+  public void testNonexistentColumn() {
+    List<SchemaPath> columns = new ArrayList<>();
+    columns.add(SchemaPath.getCompoundPath("nonexistent"));
+    assertFalse("No complex column must be detected",
+        ParquetReaderUtility.containsComplexColumn(footer, columns));
+  }
+
+  @Test
+  public void testVariableCaseColumn() {
+    List<SchemaPath> columns = new ArrayList<>();
+    columns.add(SchemaPath.getCompoundPath("variablecase"));
+    assertFalse("No complex column must be detected",
+        ParquetReaderUtility.containsComplexColumn(footer, columns));
+  }
+
+  @Test
+  public void testVariableCaseSchemaPath() {
+    List<SchemaPath> columns = new ArrayList<>();
+    columns.add(SchemaPath.getCompoundPath("VaRiAbLeCaSe"));
+    assertFalse("No complex column must be detected",
+        ParquetReaderUtility.containsComplexColumn(footer, columns));
+  }
+
+  @Test
+  public void testNestedVariableCaseColumn() {
+    List<SchemaPath> columns = new ArrayList<>();
+    columns.add(SchemaPath.getCompoundPath("nested", "variablecase"));
+    assertTrue("Nested variable case column must be detected as complex",
+        ParquetReaderUtility.containsComplexColumn(footer, columns));
+  }
+
+  @Test
+  public void testRepeatedColumn() {
+    List<SchemaPath> columns = new ArrayList<>();
+    columns.add(SchemaPath.getCompoundPath("repeated"));
+    assertTrue("Repeated column must be detected as complex",
+        ParquetReaderUtility.containsComplexColumn(footer, columns));
+  }
+
+  @Test
+  public void testNestedRepeatedColumn() {
+    List<SchemaPath> columns = new ArrayList<>();
+    columns.add(SchemaPath.getCompoundPath("nested", "repeated"));
+    assertTrue("Nested repeated column must be detected as complex",
+        ParquetReaderUtility.containsComplexColumn(footer, columns));
+  }
+}
diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet/TestParquetReaderUtility.java b/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet/TestParquetReaderUtility.java
new file mode 100644
index 00000000000..4b24212c378
--- /dev/null
+++ b/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet/TestParquetReaderUtility.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import org.apache.drill.exec.store.parquet.ParquetReaderUtility;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+
+import org.apache.parquet.column.ColumnDescriptor;
+import org.apache.parquet.format.SchemaElement;
+import org.apache.parquet.hadoop.metadata.ParquetMetadata;
+import org.apache.parquet.hadoop.ParquetFileReader;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.util.Map;
+
+public class TestParquetReaderUtility {
+
+  private static final String path = "src/test/resources/store/parquet/complex/complex.parquet";
+  private static ParquetMetadata footer;
+
+  @BeforeClass
+  public static void setup() throws IOException {
+    Configuration conf = new Configuration();
+
+    footer = ParquetFileReader.readFooter(conf, new Path(path));
+  }
+
+  @Test
+  public void testSchemaElementsMap() {
+    Map<String, SchemaElement> schemaElements = ParquetReaderUtility.getColNameToSchemaElementMapping(footer);
+    assertEquals("Schema elements map size must be 14", schemaElements.size(), 14);
+
+    SchemaElement schemaElement = schemaElements.get("`marketing_info`.`camp_id`");
+    assertNotNull("Schema element must be not null", schemaElement);
+    assertEquals("Schema element must be named 'camp_id'", schemaElement.getName(), "camp_id");
+
+    schemaElement = schemaElements.get("`marketing_info`");
+    assertNotNull("Schema element must be not null", schemaElement);
+    assertEquals("Schema element name match lookup key", schemaElement.getName(), "marketing_info");
+  }
+
+  @Test
+  public void testColumnDescriptorMap() {
+    Map<String, ColumnDescriptor> colDescMap = ParquetReaderUtility.getColNameToColumnDescriptorMapping(footer);
+    assertEquals("Column descriptors map size must be 11", colDescMap.size(), 11);
+
+    assertNotNull("column descriptor lookup must return not null", colDescMap.get("`marketing_info`.`camp_id`"));
+    assertNull("column descriptor lookup must return null on GroupType column", colDescMap.get("`marketing_info`"));
+  }
+}
diff --git a/exec/java-exec/src/test/resources/store/parquet/complex/complex_special_cases.parquet b/exec/java-exec/src/test/resources/store/parquet/complex/complex_special_cases.parquet
new file mode 100644
index 00000000000..bbfb6d4096c
Binary files /dev/null and b/exec/java-exec/src/test/resources/store/parquet/complex/complex_special_cases.parquet differ
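
A side note on the lookup keys used above: the new helpers key their maps by a
normalized full column path, back-quoted, dot-separated and lower-cased. Below is a
minimal standalone sketch of that normalization in plain Java; it mirrors what
ParquetReaderUtility.getFullColumnPath() builds from ColumnDescriptor.getPath(), and
the wrapper class name is illustrative only:

    import java.util.StringJoiner;

    public class FullColumnPathSketch {
      // Builds keys of the form `a`.`b`.`c` (lower-cased), the format produced by
      // the PR's ParquetReaderUtility.getFullColumnPath(ColumnDescriptor).
      static String fullColumnPath(String[] path) {
        StringJoiner joiner = new StringJoiner(".");
        for (String segment : path) {
          joiner.add("`" + segment.toLowerCase() + "`");
        }
        return joiner.toString();
      }

      public static void main(String[] args) {
        // Nested column from the complex.parquet fixture used in TestParquetReaderUtility
        System.out.println(fullColumnPath(new String[] {"marketing_info", "camp_id"}));
        // prints: `marketing_info`.`camp_id`
      }
    }

Keys in this form line up with SchemaPath.getUnIndexed().toString(), which is why the
lookups in containsComplexColumn() lower-case the schema path before probing the maps.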


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> Use more often the new parquet reader
> -------------------------------------
>
>                 Key: DRILL-5797
>                 URL: https://issues.apache.org/jira/browse/DRILL-5797
>             Project: Apache Drill
>          Issue Type: Improvement
>          Components: Storage - Parquet
>            Reporter: Damien Profeta
>            Assignee: Oleksandr Kalinin
>            Priority: Major
>              Labels: ready-to-commit
>             Fix For: 1.14.0
>
>
> The choice between the regular parquet reader and the optimized one is based 
> on what types of columns are present in the file, but the columns actually read 
> by the query are not taken into account. We can extend the cases where the 
> optimized reader is used by checking whether the projected columns are simple 
> or not.
> This is an interim optimization until the fast parquet reader can handle 
> complex structures.
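
For context on the description above: the file-wide check that reader selection
previously relied on (the isComplex() helper removed by this PR) flags the whole file
as complex whenever any column is nested or repeated, regardless of which columns the
query projects. A minimal sketch of that check against the parquet-mr schema API
follows; the wrapper class name is illustrative only:

    import org.apache.parquet.column.ColumnDescriptor;
    import org.apache.parquet.schema.MessageType;
    import org.apache.parquet.schema.Type;

    public class FileWideComplexityCheck {
      // True if the file schema contains any nested (group) or repeated column,
      // no matter which columns the query actually projects.
      public static boolean isComplex(MessageType schema) {
        for (Type type : schema.getFields()) {
          if (!type.isPrimitive()) {
            return true;   // group type => complex
          }
        }
        for (ColumnDescriptor col : schema.getColumns()) {
          if (col.getMaxRepetitionLevel() > 0) {
            return true;   // repeated field => complex
          }
        }
        return false;
      }
    }

The merged containsComplexColumn() keeps this logic for star queries and otherwise
inspects only the projected columns, which is what lets more queries fall through to
the new ParquetRecordReader.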



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)
