[orc] branch master updated: ORC-696: Consistent TypeDescription handling for quoted field names

dongjoon Mon, 14 Dec 2020 15:11:15 -0800

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/orc.git



The following commit(s) were added to refs/heads/master by this push:
     new 4cad055  ORC-696: Consistent TypeDescription handling for quoted field names
4cad055 is described below

commit 4cad05524077fc5ab93cf3387b5582fc491515a3
Author: Panagiotis Garefalakis <[email protected]>
AuthorDate: Mon Dec 14 23:11:03 2020 +0000

    ORC-696: Consistent TypeDescription handling for quoted field names
    
    ### What changes were proposed in this pull request?
    
    Consistent TypeDescription handling for quoted field names
    
    ### Why are the changes needed?
    
    SARGs failing due to incorrect handling of quoted fieldNames
    
    ### How was this patch tested?
    
    TestVectorOrcFile.testQuotedPredicatePushdown
---
 java/core/src/java/org/apache/orc/OrcUtils.java    |   5 +-
 .../src/java/org/apache/orc/impl/ParserUtils.java  |   2 +-
 .../src/test/org/apache/orc/TestVectorOrcFile.java | 103 +++++++++++++++++++++
 3 files changed, 107 insertions(+), 3 deletions(-)

diff --git a/java/core/src/java/org/apache/orc/OrcUtils.java b/java/core/src/java/org/apache/orc/OrcUtils.java
index e6f10a0..a158e5c 100644
--- a/java/core/src/java/org/apache/orc/OrcUtils.java
+++ b/java/core/src/java/org/apache/orc/OrcUtils.java
@@ -17,6 +17,7 @@
  */
 package org.apache.orc;
 
+import org.apache.orc.impl.ParserUtils;
 import org.apache.orc.impl.ReaderImpl;
 import org.apache.orc.impl.SchemaEvolution;
 
@@ -344,8 +345,8 @@ public class OrcUtils {
       case STRUCT: {
           result = TypeDescription.createStruct();
           for(int f=0; f < type.getSubtypesCount(); ++f) {
-            result.addField(type.getFieldNames(f),
-                convertTypeFromProtobuf(types, type.getSubtypes(f)));
+            String fieldName = ParserUtils.parseName(new ParserUtils.StringPosition(type.getFieldNames(f)));
+            result.addField(fieldName, convertTypeFromProtobuf(types, type.getSubtypes(f)));
           }
         }
         break;
diff --git a/java/core/src/java/org/apache/orc/impl/ParserUtils.java b/java/core/src/java/org/apache/orc/impl/ParserUtils.java
index 1231818..559e3e1 100644
--- a/java/core/src/java/org/apache/orc/impl/ParserUtils.java
+++ b/java/core/src/java/org/apache/orc/impl/ParserUtils.java
@@ -76,7 +76,7 @@ public class ParserUtils {
     return result;
   }
 
-  static String parseName(ParserUtils.StringPosition source) {
+  public static String parseName(ParserUtils.StringPosition source) {
     if (source.position == source.length) {
       throw new IllegalArgumentException("Missing name at " + source);
     }
diff --git a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
index 3b89db9..61786bd 100644
--- a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
+++ b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
@@ -974,6 +974,16 @@ public class TestVectorOrcFile {
         + "complex:struct<int2:int,String1:string>>");
   }
 
+  private static TypeDescription createQuotedSchema() {
+    return TypeDescription.createStruct()
+        .addField("`int1`", TypeDescription.createInt())
+        .addField("`string1`", TypeDescription.createString());
+  }
+
+  private static TypeDescription createQuotedSchemaFromString() {
+    return TypeDescription.fromString("struct<```int1```:int,```string1```:string>");
+  }
+
   private static TypeDescription createBigRowSchema() {
     return TypeDescription.createStruct()
         .addField("boolean1", TypeDescription.createBoolean())
@@ -2594,6 +2604,99 @@ public class TestVectorOrcFile {
     Assert.assertEquals(3500, rows.getRowNumber());
   }
 
+  @Test
+  public void testQuotedPredicatePushdown() throws Exception {
+    TypeDescription schema = createQuotedSchema();
+    Writer writer = OrcFile.createWriter(testFilePath,
+        OrcFile.writerOptions(conf)
+            .setSchema(schema)
+            .stripeSize(400000L)
+            .compress(CompressionKind.NONE)
+            .bufferSize(500)
+            .rowIndexStride(1000)
+            .version(fileFormat));
+    VectorizedRowBatch batch = schema.createRowBatch();
+    batch.ensureSize(3500);
+    batch.size = 3500;
+    for(int i=0; i < 3500; ++i) {
+      ((LongColumnVector) batch.cols[0]).vector[i] = i * 300;
+      ((BytesColumnVector) batch.cols[1]).setVal(i,
+          Integer.toHexString(10*i).getBytes(StandardCharsets.UTF_8));
+    }
+    writer.addRowBatch(batch);
+    writer.close();
+    Reader reader = OrcFile.createReader(testFilePath,
+        OrcFile.readerOptions(conf).filesystem(fs));
+    assertEquals(3500, reader.getNumberOfRows());
+
+    SearchArgument sarg = SearchArgumentFactory.newBuilder()
+        .startAnd()
+        .startNot()
+        .lessThan("`int1`", PredicateLeaf.Type.LONG, 300000L)
+        .end()
+        .lessThan("`int1`", PredicateLeaf.Type.LONG, 600000L)
+        .end()
+        .build();
+    RecordReader rows = reader.rows(reader.options()
+        .range(0L, Long.MAX_VALUE)
+        .include(new boolean[]{true, true, true})
+        .searchArgument(sarg, new String[]{null, "`int1`", "string1"}));
+    batch = reader.getSchema().createRowBatch(2000);
+
+    Assert.assertEquals(1000L, rows.getRowNumber());
+    Assert.assertEquals(true, rows.nextBatch(batch));
+    assertEquals(1000, batch.size);
+
+    // Validate the same behaviour with schemaFromString
+    fs.delete(testFilePath, false);
+    TypeDescription qSchema = createQuotedSchemaFromString();
+    // [`int1`, `string1`]
+    assertEquals(schema.getFieldNames(), qSchema.getFieldNames());
+
+    Writer writerSchemaFromStr = OrcFile.createWriter(testFilePath,
+        OrcFile.writerOptions(conf)
+            .setSchema(qSchema)
+            .stripeSize(400000L)
+            .compress(CompressionKind.NONE)
+            .bufferSize(500)
+            .rowIndexStride(1000)
+            .version(fileFormat));
+    batch = qSchema.createRowBatch();
+    batch.ensureSize(3500);
+    batch.size = 3500;
+    for(int i=0; i < 3500; ++i) {
+      ((LongColumnVector) batch.cols[0]).vector[i] = i * 300;
+      ((BytesColumnVector) batch.cols[1]).setVal(i,
+          Integer.toHexString(10*i).getBytes(StandardCharsets.UTF_8));
+    }
+    writerSchemaFromStr.addRowBatch(batch);
+    writerSchemaFromStr.close();
+    Reader readerSchemaFromStr = OrcFile.createReader(testFilePath,
+        OrcFile.readerOptions(conf).filesystem(fs));
+    assertEquals(3500, readerSchemaFromStr.getNumberOfRows());
+
+    sarg = SearchArgumentFactory.newBuilder()
+        .startAnd()
+        .startNot()
+        .lessThan("`int1`", PredicateLeaf.Type.LONG, 300000L)
+        .end()
+        .lessThan("`int1`", PredicateLeaf.Type.LONG, 600000L)
+        .end()
+        .build();
+    rows = readerSchemaFromStr.rows(readerSchemaFromStr.options()
+        .range(0L, Long.MAX_VALUE)
+        .include(new boolean[]{true, true, true})
+        .searchArgument(sarg, new String[]{null, "`int1`", "string1"}));
+    batch = readerSchemaFromStr.getSchema().createRowBatch(2000);
+
+    Assert.assertEquals(1000L, rows.getRowNumber());
+    Assert.assertEquals(true, rows.nextBatch(batch));
+    assertEquals(1000, batch.size);
+
+    assertEquals(reader.getSchema(), readerSchemaFromStr.getSchema());
+    assertEquals(writer.getSchema(), writerSchemaFromStr.getSchema());
+  }
+
   /**
    * Test all of the types that have distinct ORC writers using the vectorized
    * writer with different combinations of repeating and null values.

[orc] branch master updated: ORC-696: Consistent TypeDescription handling for quoted field names

Reply via email to