[GitHub] [orc] pavibhai commented on a change in pull request #635: ORC-742: LazyIO for non-filter columns

GitBox Tue, 16 Feb 2021 11:44:38 -0800


pavibhai commented on a change in pull request #635:
URL: https://github.com/apache/orc/pull/635#discussion_r577090098




##########
File path: 
java/core/src/java/org/apache/orc/impl/reader/tree/StructBatchReader.java
##########
@@ -15,62 +15,80 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 package org.apache.orc.impl.reader.tree;
 
 import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.filter.OrcFilterContext;
 import org.apache.orc.impl.TreeReaderFactory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
-import java.util.Set;
 
 public class StructBatchReader extends BatchReader {
+  private static final Logger LOG = 
LoggerFactory.getLogger(StructBatchReader.class);
   // The reader context including row-filtering details
   private final TreeReaderFactory.Context context;
+  private final TreeReaderFactory.StructTreeReader structReader;
+  private final OrcFilterContext fc;
 
-  public StructBatchReader(TreeReaderFactory.StructTreeReader rowReader, 
TreeReaderFactory.Context context) {
+  public StructBatchReader(TypeReader rowReader, TreeReaderFactory.Context 
context) {
     super(rowReader);
     this.context = context;
+    this.fc = new 
OrcFilterContext(context.getSchemaEvolution().getReaderSchema());
+    if (rowReader instanceof TreeReaderFactory.StructTreeReader) {
+      structReader = (TreeReaderFactory.StructTreeReader) rowReader;
+    } else {
+      structReader = (TreeReaderFactory.StructTreeReader) 
((LevelTypeReader)rowReader).getReader();
+    }
   }
 
-  private void readBatchColumn(VectorizedRowBatch batch, TypeReader[] 
children, int batchSize, int index)
-      throws IOException {
+  private void readBatchColumn(VectorizedRowBatch batch,
+                               TypeReader[] children,
+                               int batchSize,
+                               int index,
+                               ReadLevel readLevel)
+    throws IOException {
     ColumnVector colVector = batch.cols[index];
     if (colVector != null) {
       colVector.reset();
       colVector.ensureSize(batchSize, false);
-      children[index].nextVector(colVector, null, batchSize, batch);
+      children[index].nextVector(colVector, null, batchSize, batch, readLevel);
     }
   }
 
   @Override
-  public void nextBatch(VectorizedRowBatch batch, int batchSize) throws 
IOException {
-    TypeReader[] children = ((TreeReaderFactory.StructTreeReader) 
rootType).fields;
-    // Early expand fields --> apply filter --> expand remaining fields
-    Set<Integer> earlyExpandCols = context.getColumnFilterIds();
+  public void nextBatch(VectorizedRowBatch batch, int batchSize, ReadLevel 
readLevel)
+    throws IOException {
+    nextBatchLevel(batch, batchSize, readLevel);
 
-    // Clear selected and early expand columns used in Filter
-    batch.selectedInUse = false;
-    for (int i = 0; i < children.length && !earlyExpandCols.isEmpty() &&
-        (vectorColumnCount == -1 || i < vectorColumnCount); ++i) {
-      if (earlyExpandCols.contains(children[i].getColumnId())) {
-        readBatchColumn(batch, children, batchSize, i);
+    if (readLevel == ReadLevel.LEAD) {
+      // Apply filter callback to reduce number of # rows selected for 
decoding in the next
+      // TreeReaders
+      if (this.context.getColumnFilterCallback() != null) {
+        this.context.getColumnFilterCallback().accept(fc.setBatch(batch));
       }
     }
-    // Since we are going to filter rows based on some column values set 
batch.size earlier here
-    batch.size = batchSize;
+  }
+
+  private void nextBatchLevel(VectorizedRowBatch batch, int batchSize, 
ReadLevel readLevel) throws IOException {
+    TypeReader[] children = structReader.fields;
 
-    // Apply filter callback to reduce number of # rows selected for decoding 
in the next TreeReaders
-    if (!earlyExpandCols.isEmpty() && this.context.getColumnFilterCallback() 
!= null) {
-      this.context.getColumnFilterCallback().accept(batch);
+    if (readLevel != ReadLevel.FOLLOW) {
+      // In case of FOLLOW we leave the selectedInUse untouched.
+      batch.selectedInUse = false;

Review comment:
       `selectedInUse` comes from the VectorizedRowBatch. It indicates that only 
part of the batch is selected, in which case the `selected` vector must be used 
to determine which rows are valid.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [orc] pavibhai commented on a change in pull request #635: ORC-742: LazyIO for non-filter columns

Reply via email to