This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/master by this push:
new 4cad055 ORC-696: Consistent TypeDescription handling for quoted field names
4cad055 is described below
commit 4cad05524077fc5ab93cf3387b5582fc491515a3
Author: Panagiotis Garefalakis <[email protected]>
AuthorDate: Mon Dec 14 23:11:03 2020 +0000
ORC-696: Consistent TypeDescription handling for quoted field names
### What changes were proposed in this pull request?
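Make TypeDescription handle quoted field names consistently. OrcUtils.convertTypeFromProtobuf now passes each struct field name through ParserUtils.parseName (made public for this purpose) before adding the field to the resulting struct.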
### Why are the changes needed?
Search arguments (SARGs) were failing because quoted field names were handled incorrectly.
### How was this patch tested?
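Added TestVectorOrcFile.testQuotedPredicatePushdown, which exercises predicate pushdown on a backquoted column name using both a programmatically built schema and a schema parsed from a string.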
---
java/core/src/java/org/apache/orc/OrcUtils.java | 5 +-
.../src/java/org/apache/orc/impl/ParserUtils.java | 2 +-
.../src/test/org/apache/orc/TestVectorOrcFile.java | 103 +++++++++++++++++++++
3 files changed, 107 insertions(+), 3 deletions(-)
diff --git a/java/core/src/java/org/apache/orc/OrcUtils.java b/java/core/src/java/org/apache/orc/OrcUtils.java
index e6f10a0..a158e5c 100644
--- a/java/core/src/java/org/apache/orc/OrcUtils.java
+++ b/java/core/src/java/org/apache/orc/OrcUtils.java
@@ -17,6 +17,7 @@
*/
package org.apache.orc;
+import org.apache.orc.impl.ParserUtils;
import org.apache.orc.impl.ReaderImpl;
import org.apache.orc.impl.SchemaEvolution;
@@ -344,8 +345,8 @@ public class OrcUtils {
case STRUCT: {
result = TypeDescription.createStruct();
for(int f=0; f < type.getSubtypesCount(); ++f) {
- result.addField(type.getFieldNames(f),
- convertTypeFromProtobuf(types, type.getSubtypes(f)));
+ String fieldName = ParserUtils.parseName(new
ParserUtils.StringPosition(type.getFieldNames(f)));
+ result.addField(fieldName, convertTypeFromProtobuf(types,
type.getSubtypes(f)));
}
}
break;
diff --git a/java/core/src/java/org/apache/orc/impl/ParserUtils.java b/java/core/src/java/org/apache/orc/impl/ParserUtils.java
index 1231818..559e3e1 100644
--- a/java/core/src/java/org/apache/orc/impl/ParserUtils.java
+++ b/java/core/src/java/org/apache/orc/impl/ParserUtils.java
@@ -76,7 +76,7 @@ public class ParserUtils {
return result;
}
- static String parseName(ParserUtils.StringPosition source) {
+ public static String parseName(ParserUtils.StringPosition source) {
if (source.position == source.length) {
throw new IllegalArgumentException("Missing name at " + source);
}
diff --git a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
index 3b89db9..61786bd 100644
--- a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
+++ b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
@@ -974,6 +974,16 @@ public class TestVectorOrcFile {
+ "complex:struct<int2:int,String1:string>>");
}
+ private static TypeDescription createQuotedSchema() {
+ return TypeDescription.createStruct()
+ .addField("`int1`", TypeDescription.createInt())
+ .addField("`string1`", TypeDescription.createString());
+ }
+
+ private static TypeDescription createQuotedSchemaFromString() {
+ return
TypeDescription.fromString("struct<```int1```:int,```string1```:string>");
+ }
+
private static TypeDescription createBigRowSchema() {
return TypeDescription.createStruct()
.addField("boolean1", TypeDescription.createBoolean())
@@ -2594,6 +2604,99 @@ public class TestVectorOrcFile {
Assert.assertEquals(3500, rows.getRowNumber());
}
+ @Test
+ public void testQuotedPredicatePushdown() throws Exception {
+ TypeDescription schema = createQuotedSchema();
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(400000L)
+ .compress(CompressionKind.NONE)
+ .bufferSize(500)
+ .rowIndexStride(1000)
+ .version(fileFormat));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ batch.ensureSize(3500);
+ batch.size = 3500;
+ for(int i=0; i < 3500; ++i) {
+ ((LongColumnVector) batch.cols[0]).vector[i] = i * 300;
+ ((BytesColumnVector) batch.cols[1]).setVal(i,
+ Integer.toHexString(10*i).getBytes(StandardCharsets.UTF_8));
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ assertEquals(3500, reader.getNumberOfRows());
+
+ SearchArgument sarg = SearchArgumentFactory.newBuilder()
+ .startAnd()
+ .startNot()
+ .lessThan("`int1`", PredicateLeaf.Type.LONG, 300000L)
+ .end()
+ .lessThan("`int1`", PredicateLeaf.Type.LONG, 600000L)
+ .end()
+ .build();
+ RecordReader rows = reader.rows(reader.options()
+ .range(0L, Long.MAX_VALUE)
+ .include(new boolean[]{true, true, true})
+ .searchArgument(sarg, new String[]{null, "`int1`", "string1"}));
+ batch = reader.getSchema().createRowBatch(2000);
+
+ Assert.assertEquals(1000L, rows.getRowNumber());
+ Assert.assertEquals(true, rows.nextBatch(batch));
+ assertEquals(1000, batch.size);
+
+ // Validate the same behaviour with schemaFromString
+ fs.delete(testFilePath, false);
+ TypeDescription qSchema = createQuotedSchemaFromString();
+ // [`int1`, `string1`]
+ assertEquals(schema.getFieldNames(), qSchema.getFieldNames());
+
+ Writer writerSchemaFromStr = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(qSchema)
+ .stripeSize(400000L)
+ .compress(CompressionKind.NONE)
+ .bufferSize(500)
+ .rowIndexStride(1000)
+ .version(fileFormat));
+ batch = qSchema.createRowBatch();
+ batch.ensureSize(3500);
+ batch.size = 3500;
+ for(int i=0; i < 3500; ++i) {
+ ((LongColumnVector) batch.cols[0]).vector[i] = i * 300;
+ ((BytesColumnVector) batch.cols[1]).setVal(i,
+ Integer.toHexString(10*i).getBytes(StandardCharsets.UTF_8));
+ }
+ writerSchemaFromStr.addRowBatch(batch);
+ writerSchemaFromStr.close();
+ Reader readerSchemaFromStr = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ assertEquals(3500, readerSchemaFromStr.getNumberOfRows());
+
+ sarg = SearchArgumentFactory.newBuilder()
+ .startAnd()
+ .startNot()
+ .lessThan("`int1`", PredicateLeaf.Type.LONG, 300000L)
+ .end()
+ .lessThan("`int1`", PredicateLeaf.Type.LONG, 600000L)
+ .end()
+ .build();
+ rows = readerSchemaFromStr.rows(readerSchemaFromStr.options()
+ .range(0L, Long.MAX_VALUE)
+ .include(new boolean[]{true, true, true})
+ .searchArgument(sarg, new String[]{null, "`int1`", "string1"}));
+ batch = readerSchemaFromStr.getSchema().createRowBatch(2000);
+
+ Assert.assertEquals(1000L, rows.getRowNumber());
+ Assert.assertEquals(true, rows.nextBatch(batch));
+ assertEquals(1000, batch.size);
+
+ assertEquals(reader.getSchema(), readerSchemaFromStr.getSchema());
+ assertEquals(writer.getSchema(), writerSchemaFromStr.getSchema());
+ }
+
/**
* Test all of the types that have distinct ORC writers using the vectorized
* writer with different combinations of repeating and null values.