zratkai commented on code in PR #1482:
URL: https://github.com/apache/orc/pull/1482#discussion_r1193531356
##########
java/core/src/test/org/apache/orc/TestOrcFilterContext.java:
##########
@@ -225,4 +248,114 @@ public void testRepeatingVector() {
assertTrue(OrcFilterContext.isNull(vectorBranch, 1));
assertTrue(OrcFilterContext.isNull(vectorBranch, 2));
}
+
+ @Test
+ public void testACIDTable() {
+ ColumnVector[] columnVector =
filterContextACID.findColumnVector("string1");
+ assertEquals(1, columnVector.length);
+ assertTrue(columnVector[0] instanceof BytesColumnVector, "Expected a
BytesColumnVector, but found "+ columnVector[0].getClass());
+ columnVector = filterContextACID.findColumnVector("int1");
+ assertEquals(1, columnVector.length);
+ assertTrue(columnVector[0] instanceof LongColumnVector, "Expected a
LongColumnVector, but found "+ columnVector[0].getClass());
+ }
+
+
+  @Test
+  public void testRowFilterWithACIDTable() throws IOException {
+    // Write a small ACID ORC file, then read back one randomly chosen row
+    // through a SARG row filter and verify it is the only row returned.
+    createAcidORCFile();
+    try {
+      readSingleRowWithFilter(new Random().nextInt(RowCount));
+    } finally {
+      // Delete the file even when the read or its assertions fail, so a
+      // rerun of the test does not trip over stale output.
+      fileSystem.delete(filePath, false);
+    }
+  }
+  /**
+   * Writes an ORC file at {@code filePath} using the ACID schema, containing
+   * {@code RowCount} rows filled with deterministic pseudo-random values
+   * (fixed seed 1024). The top-level rowId column (col 3) and the nested
+   * row-struct's first field (col 5.0) are overwritten with the row number so
+   * every row is uniquely addressable by id.
+   */
+  private void createAcidORCFile() throws IOException {
+    configuration = new Configuration();
+    fileSystem = FileSystem.get(configuration);
+
+    try (Writer writer = OrcFile.createWriter(filePath,
+        OrcFile.writerOptions(configuration)
+            .fileSystem(fileSystem)
+            .overwrite(true)
+            .rowIndexStride(8192)
+            .setSchema(acidSchema))) {
+
+      Random rand = new Random(1024);
+      VectorizedRowBatch batch = acidSchema.createRowBatch();
+      for (int rowId = 0; rowId < RowCount; rowId++) {
+        long value = rand.nextLong();
+        populateColumnValues(acidSchema, batch.cols, batch.size, value);
+        // Stamp the row number into both rowId locations.
+        ((LongColumnVector) batch.cols[3]).vector[batch.size] = rowId;
+        StructColumnVector rowStruct = (StructColumnVector) batch.cols[5];
+        ((LongColumnVector) rowStruct.fields[0]).vector[batch.size] = rowId;
+        batch.size += 1;
+        if (batch.size == batch.getMaxSize()) {
+          writer.addRowBatch(batch);
+          batch.reset();
+        }
+      }
+      // Flush any trailing partial batch.
+      if (batch.size > 0) {
+        writer.addRowBatch(batch);
+        batch.reset();
+      }
+    }
+  }
+
+  /**
+   * Fills row {@code index} of every child column of {@code typeDescription}
+   * with a value derived from {@code value}, recursing into STRUCT children.
+   *
+   * @param typeDescription schema whose direct children are populated
+   * @param columnVectors   vectors parallel to the schema's children
+   * @param index           row position to write
+   * @param value           seed value for numeric/decimal columns
+   * @throws IllegalArgumentException for column categories this helper does
+   *                                  not support
+   */
+  private void populateColumnValues(TypeDescription typeDescription,
+      ColumnVector[] columnVectors, int index, long value) {
+    for (int columnId = 0; columnId < typeDescription.getChildren().size(); columnId++) {
+      TypeDescription child = typeDescription.getChildren().get(columnId);
+      switch (child.getCategory()) {
+        case INT:
+        case LONG:
+          // INT and LONG share the same LongColumnVector representation,
+          // so the two cases fall through to one assignment.
+          ((LongColumnVector) columnVectors[columnId]).vector[index] = value;
+          break;
+        case DECIMAL:
+          HiveDecimalWritable hiveDecimalWritable = new HiveDecimalWritable();
+          hiveDecimalWritable.setFromLongAndScale(value, scale);
+          ((DecimalColumnVector) columnVectors[columnId]).vector[index] = hiveDecimalWritable;
+          break;
+        case STRING:
+          ((BytesColumnVector) columnVectors[columnId]).setVal(index,
+              ("String-" + index).getBytes(StandardCharsets.UTF_8));
+          break;
+        case STRUCT:
+          // Recurse with the same row index and seed value.
+          populateColumnValues(child,
+              ((StructColumnVector) columnVectors[columnId]).fields, index, value);
+          break;
+        default:
+          // Name the offending category instead of throwing a bare exception.
+          throw new IllegalArgumentException(
+              "Unsupported column category: " + child.getCategory());
+      }
+    }
+  }
+  /**
+   * Reads the ACID file with a SARG filter {@code int1 IN (id)} and asserts
+   * that exactly one row — the one whose rowId equals {@code id} — survives.
+   */
+  private void readSingleRowWithFilter(long id) throws IOException {
+    SearchArgument searchArgument = SearchArgumentFactory.newBuilder()
+        .in("int1", PredicateLeaf.Type.LONG, id)
+        .build();
+    long rowCount = 0;
+    // Close the file Reader as well as the RecordReader; the original code
+    // leaked the Reader (and its underlying file handle).
+    try (Reader reader = OrcFile.createReader(filePath,
+             OrcFile.readerOptions(configuration).filesystem(fileSystem))) {
+      Reader.Options readerOptions = reader.options()
+          .searchArgument(searchArgument, new String[] {"int1"})
+          .useSelected(true)
+          .allowSARGToFilter(true);
+      VectorizedRowBatch vectorizedRowBatch = acidSchema.createRowBatch();
+      try (RecordReader recordReader = reader.rows(readerOptions)) {
+        assertTrue(recordReader.nextBatch(vectorizedRowBatch));
+        rowCount += vectorizedRowBatch.size;
+        assertEquals(6, vectorizedRowBatch.cols.length);
+        assertTrue(vectorizedRowBatch.cols[5] instanceof StructColumnVector);
+        StructColumnVector row = (StructColumnVector) vectorizedRowBatch.cols[5];
+        assertTrue(row.fields[0] instanceof LongColumnVector);
+        assertTrue(row.fields[1] instanceof BytesColumnVector);
+        // With useSelected(true) the surviving row sits at selected[0].
+        assertEquals(id,
+            ((LongColumnVector) row.fields[0]).vector[vectorizedRowBatch.selected[0]]);
+        checkStringColumn(id, vectorizedRowBatch);
+        assertFalse(recordReader.nextBatch(vectorizedRowBatch));
+      }
+    }
+    assertEquals(1, rowCount);
+  }
+
+ private static void checkStringColumn(long id, VectorizedRowBatch
vectorizedRowBatch) {
Review Comment:
Yes, it can be done. Thanks for showing this!
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]