[orc] branch main updated: ORC-1121: Fix column coversion check bug which causes column filters don't work (#1055)

dongjoon Tue, 08 Mar 2022 12:41:35 -0800

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git



The following commit(s) were added to refs/heads/main by this push:
     new e22f537  ORC-1121: Fix column coversion check bug which causes column 
filters don't work (#1055)
e22f537 is described below

commit e22f5370fa7e579573e1c627cbdfa6173b206b76
Author: shipenglei <shipengle...@163.com>
AuthorDate: Wed Mar 9 04:41:25 2022 +0800

    ORC-1121: Fix column coversion check bug which causes column filters don't 
work (#1055)
    
    ### What changes were proposed in this pull request?
    
    Add a map in `SchemaEvolution` which contains the mapping from the file 
column id to the reader column id, the mapping will be used in 
`SchemaEvolution.isPPDSafeConversion()`
    
    ### Why are the changes needed?
    
    `RecordReaderImpl.pickRowGroups()`  calls 
`SchemaEvolution.isPPDSafeConversion()` with file column id rather than reader 
column id which is required, this causes column filters can't work effectively 
and recordReader can't skip row groups which are not matched, so we need find 
the corresponding reader column id via file column id to ensure 
`SchemaEvolution.isPPDSafeConversion()` can work correctly.
    
    ### How was this patch tested?
    
    UT
---
 .../java/org/apache/orc/impl/SchemaEvolution.java    | 20 +++++++++++---------
 .../org/apache/orc/impl/TestSchemaEvolution.java     | 10 ++++++++++
 2 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java 
b/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java
index 168f391..cb35f2a 100644
--- a/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java
+++ b/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java
@@ -56,7 +56,7 @@ public class SchemaEvolution {
    */
   private final boolean includeAcidColumns;
 
-  // indexed by reader column id
+  // indexed by file column id
   private final boolean[] ppdSafeConversion;
 
   // columns are indexed, not named between Reader & File schema
@@ -296,13 +296,13 @@ public class SchemaEvolution {
 
   /**
    * Check if column is safe for ppd evaluation
-   * @param colId reader column id
+   * @param fileColId file column id
    * @return true if the specified column is safe for ppd evaluation else false
    */
-  public boolean isPPDSafeConversion(final int colId) {
+  public boolean isPPDSafeConversion(final int fileColId) {
     if (hasConversion()) {
-      return !(colId < 0 || colId >= ppdSafeConversion.length) &&
-          ppdSafeConversion[colId];
+      return !(fileColId < 0 || fileColId >= ppdSafeConversion.length) &&
+          ppdSafeConversion[fileColId];
     }
 
     // when there is no schema evolution PPD is safe
@@ -314,9 +314,9 @@ public class SchemaEvolution {
       return null;
     }
 
-    boolean[] result = new boolean[readerSchema.getMaximumId() + 1];
+    boolean[] result = new boolean[fileSchema.getMaximumId() + 1];
     boolean safePpd = validatePPDConversion(fileSchema, readerSchema);
-    result[readerSchema.getId()] = safePpd;
+    result[fileSchema.getId()] = safePpd;
     return populatePpdSafeConversionForChildren(result,
         readerSchema.getChildren());
   }
@@ -337,12 +337,14 @@ public class SchemaEvolution {
       for (TypeDescription child : children) {
         TypeDescription fileType = getFileType(child.getId());
         safePpd = validatePPDConversion(fileType, child);
-        ppdSafeConversion[child.getId()] = safePpd;
+        if (fileType != null) {
+          ppdSafeConversion[fileType.getId()] = safePpd;
+        }
         populatePpdSafeConversionForChildren(ppdSafeConversion,
             child.getChildren());
       }
     }
-    return  ppdSafeConversion;
+    return ppdSafeConversion;
   }
 
   private boolean validatePPDConversion(final TypeDescription fileType,
diff --git a/java/core/src/test/org/apache/orc/impl/TestSchemaEvolution.java 
b/java/core/src/test/org/apache/orc/impl/TestSchemaEvolution.java
index 5c2d7d2..3a82fb5 100644
--- a/java/core/src/test/org/apache/orc/impl/TestSchemaEvolution.java
+++ b/java/core/src/test/org/apache/orc/impl/TestSchemaEvolution.java
@@ -853,6 +853,16 @@ public class TestSchemaEvolution {
     assertTrue(both1.isPPDSafeConversion(2));
     assertTrue(both1.isPPDSafeConversion(3));
     assertFalse(both1.isPPDSafeConversion(4));
+
+    // column pruning
+    readerStruct1 = TypeDescription.createStruct()
+        .addField("f2", TypeDescription.createString());
+    both1 = new SchemaEvolution(fileStruct1, readerStruct1, options);
+    assertTrue(both1.hasConversion());
+    assertFalse(both1.isPPDSafeConversion(0));
+    assertFalse(both1.isPPDSafeConversion(1));
+    assertTrue(both1.isPPDSafeConversion(2));
+    assertFalse(both1.isPPDSafeConversion(3));
   }
 
   @Test

[orc] branch main updated: ORC-1121: Fix column coversion check bug which causes column filters don't work (#1055)

Reply via email to