pavibhai commented on code in PR #1482:
URL: https://github.com/apache/orc/pull/1482#discussion_r1192762867
##########
java/core/src/test/org/apache/orc/TestOrcFilterContext.java:
##########
@@ -61,9 +69,24 @@ public class TestOrcFilterContext {
TypeDescription.createList(TypeDescription.createChar()))
)
);
+ private static Configuration configuration;
+ private static FileSystem fileSystem;
+ private static final Path workDir = new
Path(System.getProperty("test.tmp.dir",
+ "target" + File.separator + "test"
+ + File.separator + "tmp"));
+ private static final Path filePath = new Path(workDir,
"orc_filter_file.orc");
+
+ private static final int RowCount = 400;
+
+ private static final int scale = 3;
Review Comment:
Given that the schema you are using is just int and string, we can remove
this `scale` field.
##########
java/core/src/test/org/apache/orc/TestOrcFilterContext.java:
##########
@@ -225,4 +248,114 @@ public void testRepeatingVector() {
assertTrue(OrcFilterContext.isNull(vectorBranch, 1));
assertTrue(OrcFilterContext.isNull(vectorBranch, 2));
}
+
+ @Test
+ public void testACIDTable() {
+ ColumnVector[] columnVector =
filterContextACID.findColumnVector("string1");
+ assertEquals(1, columnVector.length);
+ assertTrue(columnVector[0] instanceof BytesColumnVector, "Expected a
BytesColumnVector, but found "+ columnVector[0].getClass());
+ columnVector = filterContextACID.findColumnVector("int1");
+ assertEquals(1, columnVector.length);
+ assertTrue(columnVector[0] instanceof LongColumnVector, "Expected a
LongColumnVector, but found "+ columnVector[0].getClass());
+ }
+
+
+ @Test
+ public void testRowFilterWithACIDTable() throws IOException {
+ createAcidORCFile();
+ readSingleRowWithFilter(new Random().nextInt(RowCount));
+ fileSystem.delete(filePath, false);
+
+ }
+ private void createAcidORCFile() throws IOException {
Review Comment:
nit: need a blank line before this method.
##########
java/core/src/test/org/apache/orc/TestOrcFilterContext.java:
##########
@@ -225,4 +248,114 @@ public void testRepeatingVector() {
assertTrue(OrcFilterContext.isNull(vectorBranch, 1));
assertTrue(OrcFilterContext.isNull(vectorBranch, 2));
}
+
+ @Test
+ public void testACIDTable() {
+ ColumnVector[] columnVector =
filterContextACID.findColumnVector("string1");
+ assertEquals(1, columnVector.length);
+ assertTrue(columnVector[0] instanceof BytesColumnVector, "Expected a
BytesColumnVector, but found "+ columnVector[0].getClass());
+ columnVector = filterContextACID.findColumnVector("int1");
+ assertEquals(1, columnVector.length);
+ assertTrue(columnVector[0] instanceof LongColumnVector, "Expected a
LongColumnVector, but found "+ columnVector[0].getClass());
+ }
+
+
+ @Test
+ public void testRowFilterWithACIDTable() throws IOException {
+ createAcidORCFile();
+ readSingleRowWithFilter(new Random().nextInt(RowCount));
+ fileSystem.delete(filePath, false);
+
+ }
+ private void createAcidORCFile() throws IOException {
+ configuration = new Configuration();
+ fileSystem = FileSystem.get(configuration);
+
+ try (Writer writer = OrcFile.createWriter(filePath,
+ OrcFile.writerOptions(configuration)
+ .fileSystem(fileSystem)
+ .overwrite(true)
+ .rowIndexStride(8192)
+ .setSchema(acidSchema))) {
+
+ Random random = new Random(1024);
+ VectorizedRowBatch vectorizedRowBatch = acidSchema.createRowBatch();
+ for (int rowId = 0; rowId < RowCount; rowId++) {
+ long v = random.nextLong();
+ populateColumnValues(acidSchema,
vectorizedRowBatch.cols,vectorizedRowBatch.size, v);
+ // Populate the rowId
+ ((LongColumnVector)
vectorizedRowBatch.cols[3]).vector[vectorizedRowBatch.size] = rowId;
+ StructColumnVector row = (StructColumnVector)
vectorizedRowBatch.cols[5];
+ ((LongColumnVector) row.fields[0]).vector[vectorizedRowBatch.size] =
rowId;
+ vectorizedRowBatch.size += 1;
+ if (vectorizedRowBatch.size == vectorizedRowBatch.getMaxSize()) {
+ writer.addRowBatch(vectorizedRowBatch);
+ vectorizedRowBatch.reset();
+ }
+ }
+ if (vectorizedRowBatch.size > 0) {
+ writer.addRowBatch(vectorizedRowBatch);
+ vectorizedRowBatch.reset();
+ }
+ }
+ }
+
+ private void populateColumnValues(TypeDescription typeDescription,
ColumnVector[] columnVectors, int index, long value) {
+ for (int columnId = 0; columnId < typeDescription.getChildren().size() ;
columnId++) {
+ switch (typeDescription.getChildren().get(columnId).getCategory()) {
+ case INT:
+ ((LongColumnVector)columnVectors[columnId]).vector[index] = value;
+ break;
+ case LONG:
+ ((LongColumnVector)columnVectors[columnId]).vector[index] = value;
+ break;
+ case DECIMAL:
+ HiveDecimalWritable hiveDecimalWritable = new HiveDecimalWritable();
+ hiveDecimalWritable.setFromLongAndScale(value, scale);
+ ((DecimalColumnVector) columnVectors[columnId]).vector[index] =
hiveDecimalWritable;
+ break;
+ case STRING:
+ ((BytesColumnVector) columnVectors[columnId]).setVal(index,
+ ("String-"+ index).getBytes(StandardCharsets.UTF_8));
+ break;
+ case STRUCT:
+ populateColumnValues(typeDescription.getChildren().get(columnId),
((StructColumnVector)columnVectors[columnId]).fields, index, value);
+ break;
+ default:
+ throw new IllegalArgumentException();
+ }
+ }
+ }
+ private void readSingleRowWithFilter(long id) throws IOException {
+ Reader reader = OrcFile.createReader(filePath,
OrcFile.readerOptions(configuration).filesystem(fileSystem));
+ SearchArgument searchArgument = SearchArgumentFactory.newBuilder()
+ .in("int1", PredicateLeaf.Type.LONG, id)
+ .build();
+ Reader.Options readerOptions = reader.options()
+ .searchArgument(searchArgument, new String[] {"int1"})
+ .useSelected(true)
+ .allowSARGToFilter(true);
+ VectorizedRowBatch vectorizedRowBatch = acidSchema.createRowBatch();
+ long rowCount = 0;
+ try (RecordReader recordReader = reader.rows(readerOptions)) {
+ assertTrue(recordReader.nextBatch(vectorizedRowBatch));
+ rowCount += vectorizedRowBatch.size;
+ assertEquals(6, vectorizedRowBatch.cols.length);
+ assertTrue(vectorizedRowBatch.cols[5] instanceof StructColumnVector);
+ assertTrue(((StructColumnVector) vectorizedRowBatch.cols[5]).fields[0]
instanceof LongColumnVector);
+ assertTrue(((StructColumnVector) vectorizedRowBatch.cols[5]).fields[1]
instanceof BytesColumnVector);
+ assertEquals(id, ((LongColumnVector) ((StructColumnVector)
vectorizedRowBatch.cols[5]).fields[0]).vector[vectorizedRowBatch.selected[0]]);
+ checkStringColumn(id, vectorizedRowBatch);
+ assertFalse(recordReader.nextBatch(vectorizedRowBatch));
+ }
+ assertEquals(1, rowCount);
+ }
+
+ private static void checkStringColumn(long id, VectorizedRowBatch
vectorizedRowBatch) {
Review Comment:
Can we not just use `BytesColumnVector.toString(rowid)` and compare with the
expected value?
##########
java/core/src/test/org/apache/orc/TestOrcFilterContext.java:
##########
@@ -225,4 +248,114 @@ public void testRepeatingVector() {
assertTrue(OrcFilterContext.isNull(vectorBranch, 1));
assertTrue(OrcFilterContext.isNull(vectorBranch, 2));
}
+
+ @Test
+ public void testACIDTable() {
+ ColumnVector[] columnVector =
filterContextACID.findColumnVector("string1");
+ assertEquals(1, columnVector.length);
+ assertTrue(columnVector[0] instanceof BytesColumnVector, "Expected a
BytesColumnVector, but found "+ columnVector[0].getClass());
+ columnVector = filterContextACID.findColumnVector("int1");
+ assertEquals(1, columnVector.length);
+ assertTrue(columnVector[0] instanceof LongColumnVector, "Expected a
LongColumnVector, but found "+ columnVector[0].getClass());
+ }
+
+
+ @Test
+ public void testRowFilterWithACIDTable() throws IOException {
+ createAcidORCFile();
+ readSingleRowWithFilter(new Random().nextInt(RowCount));
+ fileSystem.delete(filePath, false);
+
+ }
+ private void createAcidORCFile() throws IOException {
+ configuration = new Configuration();
+ fileSystem = FileSystem.get(configuration);
+
+ try (Writer writer = OrcFile.createWriter(filePath,
+ OrcFile.writerOptions(configuration)
+ .fileSystem(fileSystem)
+ .overwrite(true)
+ .rowIndexStride(8192)
+ .setSchema(acidSchema))) {
+
+ Random random = new Random(1024);
+ VectorizedRowBatch vectorizedRowBatch = acidSchema.createRowBatch();
+ for (int rowId = 0; rowId < RowCount; rowId++) {
+ long v = random.nextLong();
+ populateColumnValues(acidSchema,
vectorizedRowBatch.cols,vectorizedRowBatch.size, v);
+ // Populate the rowId
+ ((LongColumnVector)
vectorizedRowBatch.cols[3]).vector[vectorizedRowBatch.size] = rowId;
+ StructColumnVector row = (StructColumnVector)
vectorizedRowBatch.cols[5];
+ ((LongColumnVector) row.fields[0]).vector[vectorizedRowBatch.size] =
rowId;
+ vectorizedRowBatch.size += 1;
+ if (vectorizedRowBatch.size == vectorizedRowBatch.getMaxSize()) {
+ writer.addRowBatch(vectorizedRowBatch);
+ vectorizedRowBatch.reset();
+ }
+ }
+ if (vectorizedRowBatch.size > 0) {
+ writer.addRowBatch(vectorizedRowBatch);
+ vectorizedRowBatch.reset();
+ }
+ }
+ }
+
+ private void populateColumnValues(TypeDescription typeDescription,
ColumnVector[] columnVectors, int index, long value) {
+ for (int columnId = 0; columnId < typeDescription.getChildren().size() ;
columnId++) {
+ switch (typeDescription.getChildren().get(columnId).getCategory()) {
+ case INT:
+ ((LongColumnVector)columnVectors[columnId]).vector[index] = value;
+ break;
+ case LONG:
Review Comment:
I think it would be cleaner to remove the code that is not being used: the
`LONG` and `DECIMAL` cases.
##########
java/core/src/test/org/apache/orc/TestOrcFilterContext.java:
##########
@@ -225,4 +248,114 @@ public void testRepeatingVector() {
assertTrue(OrcFilterContext.isNull(vectorBranch, 1));
assertTrue(OrcFilterContext.isNull(vectorBranch, 2));
}
+
+ @Test
+ public void testACIDTable() {
+ ColumnVector[] columnVector =
filterContextACID.findColumnVector("string1");
+ assertEquals(1, columnVector.length);
+ assertTrue(columnVector[0] instanceof BytesColumnVector, "Expected a
BytesColumnVector, but found "+ columnVector[0].getClass());
+ columnVector = filterContextACID.findColumnVector("int1");
+ assertEquals(1, columnVector.length);
+ assertTrue(columnVector[0] instanceof LongColumnVector, "Expected a
LongColumnVector, but found "+ columnVector[0].getClass());
+ }
+
Review Comment:
nit: please remove the extra blank line.
##########
java/core/src/test/org/apache/orc/TestOrcFilterContext.java:
##########
@@ -225,4 +248,114 @@ public void testRepeatingVector() {
assertTrue(OrcFilterContext.isNull(vectorBranch, 1));
assertTrue(OrcFilterContext.isNull(vectorBranch, 2));
}
+
+ @Test
+ public void testACIDTable() {
+ ColumnVector[] columnVector =
filterContextACID.findColumnVector("string1");
+ assertEquals(1, columnVector.length);
+ assertTrue(columnVector[0] instanceof BytesColumnVector, "Expected a
BytesColumnVector, but found "+ columnVector[0].getClass());
+ columnVector = filterContextACID.findColumnVector("int1");
+ assertEquals(1, columnVector.length);
+ assertTrue(columnVector[0] instanceof LongColumnVector, "Expected a
LongColumnVector, but found "+ columnVector[0].getClass());
+ }
+
+
+ @Test
+ public void testRowFilterWithACIDTable() throws IOException {
+ createAcidORCFile();
+ readSingleRowWithFilter(new Random().nextInt(RowCount));
+ fileSystem.delete(filePath, false);
+
+ }
+ private void createAcidORCFile() throws IOException {
+ configuration = new Configuration();
+ fileSystem = FileSystem.get(configuration);
+
+ try (Writer writer = OrcFile.createWriter(filePath,
+ OrcFile.writerOptions(configuration)
+ .fileSystem(fileSystem)
+ .overwrite(true)
+ .rowIndexStride(8192)
+ .setSchema(acidSchema))) {
+
+ Random random = new Random(1024);
+ VectorizedRowBatch vectorizedRowBatch = acidSchema.createRowBatch();
+ for (int rowId = 0; rowId < RowCount; rowId++) {
+ long v = random.nextLong();
+ populateColumnValues(acidSchema,
vectorizedRowBatch.cols,vectorizedRowBatch.size, v);
+ // Populate the rowId
+ ((LongColumnVector)
vectorizedRowBatch.cols[3]).vector[vectorizedRowBatch.size] = rowId;
+ StructColumnVector row = (StructColumnVector)
vectorizedRowBatch.cols[5];
+ ((LongColumnVector) row.fields[0]).vector[vectorizedRowBatch.size] =
rowId;
+ vectorizedRowBatch.size += 1;
+ if (vectorizedRowBatch.size == vectorizedRowBatch.getMaxSize()) {
+ writer.addRowBatch(vectorizedRowBatch);
+ vectorizedRowBatch.reset();
+ }
+ }
+ if (vectorizedRowBatch.size > 0) {
+ writer.addRowBatch(vectorizedRowBatch);
+ vectorizedRowBatch.reset();
+ }
+ }
+ }
+
+ private void populateColumnValues(TypeDescription typeDescription,
ColumnVector[] columnVectors, int index, long value) {
+ for (int columnId = 0; columnId < typeDescription.getChildren().size() ;
columnId++) {
+ switch (typeDescription.getChildren().get(columnId).getCategory()) {
+ case INT:
+ ((LongColumnVector)columnVectors[columnId]).vector[index] = value;
+ break;
+ case LONG:
+ ((LongColumnVector)columnVectors[columnId]).vector[index] = value;
+ break;
+ case DECIMAL:
+ HiveDecimalWritable hiveDecimalWritable = new HiveDecimalWritable();
+ hiveDecimalWritable.setFromLongAndScale(value, scale);
+ ((DecimalColumnVector) columnVectors[columnId]).vector[index] =
hiveDecimalWritable;
+ break;
+ case STRING:
+ ((BytesColumnVector) columnVectors[columnId]).setVal(index,
+ ("String-"+ index).getBytes(StandardCharsets.UTF_8));
+ break;
+ case STRUCT:
+ populateColumnValues(typeDescription.getChildren().get(columnId),
((StructColumnVector)columnVectors[columnId]).fields, index, value);
+ break;
+ default:
+ throw new IllegalArgumentException();
+ }
+ }
+ }
+ private void readSingleRowWithFilter(long id) throws IOException {
Review Comment:
nit: need a blank line before this method.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]