flyrain commented on issue #3141:
URL: https://github.com/apache/iceberg/issues/3141#issuecomment-932565038
After digging a bit deeper, extending the class ColumnarBatch looks like a dead end. The approach we thought might work is to override the method `ColumnarBatch::rowIterator` so that we can filter out the deleted rows in a customized iterator. It works for the code path without whole-stage codegen, but codegen takes a different path: it reads rows directly from the column vectors of a `ColumnarBatch` object by invoking the public method `ColumnarBatch::column`; see line 35 in the generated code below.
We could try to make codegen use `ColumnarBatch::rowIterator`, but that is not good practice, since Spark would lose the ability to process the rows as a batch and would fall back to the row-by-row model.
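For illustration, here is a minimal sketch of the filtering-iterator idea, written as a standalone helper rather than an actual override of `rowIterator`. The class name and the `isDeleted` flag array are hypothetical stand-ins for whatever per-batch delete index we would build; as noted above, this only helps the non-codegen path.
```java
import java.util.Iterator;
import java.util.NoSuchElementException;

import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.vectorized.ColumnarBatch;

final class DeleteFilteredRows {

  // Returns an iterator over the batch's rows that skips positions flagged as deleted.
  // `isDeleted` is a hypothetical per-row flag array, one entry per row in the batch.
  static Iterator<InternalRow> filteredRows(ColumnarBatch batch, boolean[] isDeleted) {
    int numRows = batch.numRows();
    return new Iterator<InternalRow>() {
      private int next = nextLive(0);

      // Advance to the next position that is not marked deleted.
      private int nextLive(int from) {
        int pos = from;
        while (pos < numRows && isDeleted[pos]) {
          pos++;
        }
        return pos;
      }

      @Override
      public boolean hasNext() {
        return next < numRows;
      }

      @Override
      public InternalRow next() {
        if (next >= numRows) {
          throw new NoSuchElementException();
        }
        InternalRow row = batch.getRow(next); // mutable row view positioned at `next`
        next = nextLive(next + 1);
        return row;
      }
    };
  }

  private DeleteFilteredRows() {
  }
}
```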
The generated code is attached below.
```
Found 1 WholeStageCodegen subtrees.
== Subtree 1 / 1 (maxMethodCodeSize:213; maxConstantPoolSize:129(0.20% used); numInnerClasses:0) ==
*(1) Filter (data#1 = a)
+- *(1) ColumnarToRow
   +- BatchScan[id#0, data#1] default_iceberg.default.test [filters=data = 'a']

Generated code:
/* 001 */ public Object generate(Object[] references) {
/* 002 */   return new GeneratedIteratorForCodegenStage1(references);
/* 003 */ }
/* 004 */
/* 005 */ // codegenStageId=1
/* 006 */ final class GeneratedIteratorForCodegenStage1 extends org.apache.spark.sql.execution.BufferedRowIterator {
/* 007 */   private Object[] references;
/* 008 */   private scala.collection.Iterator[] inputs;
/* 009 */   private int columnartorow_batchIdx_0;
/* 010 */   private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[] columnartorow_mutableStateArray_3 = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[2];
/* 011 */   private org.apache.spark.sql.vectorized.ColumnarBatch[] columnartorow_mutableStateArray_1 = new org.apache.spark.sql.vectorized.ColumnarBatch[1];
/* 012 */   private scala.collection.Iterator[] columnartorow_mutableStateArray_0 = new scala.collection.Iterator[1];
/* 013 */   private org.apache.spark.sql.vectorized.ColumnVector[] columnartorow_mutableStateArray_2 = new org.apache.spark.sql.vectorized.ColumnVector[2];
/* 014 */
/* 015 */   public GeneratedIteratorForCodegenStage1(Object[] references) {
/* 016 */     this.references = references;
/* 017 */   }
/* 018 */
/* 019 */   public void init(int index, scala.collection.Iterator[] inputs) {
/* 020 */     partitionIndex = index;
/* 021 */     this.inputs = inputs;
/* 022 */     columnartorow_mutableStateArray_0[0] = inputs[0];
/* 023 */
/* 024 */     columnartorow_mutableStateArray_3[0] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(2, 32);
/* 025 */     columnartorow_mutableStateArray_3[1] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(2, 32);
/* 026 */
/* 027 */   }
/* 028 */
/* 029 */   private void columnartorow_nextBatch_0() throws java.io.IOException {
/* 030 */     if (columnartorow_mutableStateArray_0[0].hasNext()) {
/* 031 */       columnartorow_mutableStateArray_1[0] = (org.apache.spark.sql.vectorized.ColumnarBatch)columnartorow_mutableStateArray_0[0].next();
/* 032 */       ((org.apache.spark.sql.execution.metric.SQLMetric) references[1] /* numInputBatches */).add(1);
/* 033 */       ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(columnartorow_mutableStateArray_1[0].numRows());
/* 034 */       columnartorow_batchIdx_0 = 0;
/* 035 */       columnartorow_mutableStateArray_2[0] = (org.apache.spark.sql.vectorized.ColumnVector) columnartorow_mutableStateArray_1[0].column(0);
/* 036 */       columnartorow_mutableStateArray_2[1] = (org.apache.spark.sql.vectorized.ColumnVector) columnartorow_mutableStateArray_1[0].column(1);
/* 037 */
/* 038 */     }
/* 039 */   }
/* 040 */
/* 041 */   protected void processNext() throws java.io.IOException {
/* 042 */     if (columnartorow_mutableStateArray_1[0] == null) {
/* 043 */       columnartorow_nextBatch_0();
/* 044 */     }
/* 045 */     while ( columnartorow_mutableStateArray_1[0] != null) {
/* 046 */       int columnartorow_numRows_0 = columnartorow_mutableStateArray_1[0].numRows();
/* 047 */       int columnartorow_localEnd_0 = columnartorow_numRows_0 - columnartorow_batchIdx_0;
/* 048 */       for (int columnartorow_localIdx_0 = 0; columnartorow_localIdx_0 < columnartorow_localEnd_0; columnartorow_localIdx_0++) {
/* 049 */         int columnartorow_rowIdx_0 = columnartorow_batchIdx_0 + columnartorow_localIdx_0;
/* 050 */         do {
/* 051 */           UTF8String columnartorow_value_1 = columnartorow_mutableStateArray_2[1].getUTF8String(columnartorow_rowIdx_0);
/* 052 */
/* 053 */           boolean filter_value_0 = false;
/* 054 */           filter_value_0 = columnartorow_value_1.equals(((UTF8String) references[3] /* literal */));
/* 055 */           if (!filter_value_0) continue;
/* 056 */
/* 057 */           ((org.apache.spark.sql.execution.metric.SQLMetric) references[2] /* numOutputRows */).add(1);
/* 058 */
/* 059 */           int columnartorow_value_0 = columnartorow_mutableStateArray_2[0].getInt(columnartorow_rowIdx_0);
/* 060 */           columnartorow_mutableStateArray_3[1].reset();
/* 061 */
/* 062 */           columnartorow_mutableStateArray_3[1].write(0, columnartorow_value_0);
/* 063 */
/* 064 */           columnartorow_mutableStateArray_3[1].write(1, columnartorow_value_1);
/* 065 */           append((columnartorow_mutableStateArray_3[1].getRow()));
/* 066 */
/* 067 */         } while(false);
/* 068 */         if (shouldStop()) { columnartorow_batchIdx_0 = columnartorow_rowIdx_0 + 1; return; }
/* 069 */       }
/* 070 */       columnartorow_batchIdx_0 = columnartorow_numRows_0;
/* 071 */       columnartorow_mutableStateArray_1[0] = null;
/* 072 */       columnartorow_nextBatch_0();
/* 073 */     }
/* 074 */   }
/* 075 */
/* 076 */ }
```
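For contrast, here is a hypothetical sketch of the two consumption patterns described above (the class name and column ordinals are illustrative, not part of any actual implementation). The non-codegen `ColumnarToRow` path drains `rowIterator()`, so a filtering iterator would take effect there, while the generated code above pins the raw `ColumnVector`s via `column(i)` at line 35 and indexes them directly, bypassing any customized iterator.
```java
import java.util.Iterator;

import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.vectorized.ColumnVector;
import org.apache.spark.sql.vectorized.ColumnarBatch;
import org.apache.spark.unsafe.types.UTF8String;

final class AccessPatterns {

  // Non-codegen path: rowIterator() is drained row by row, so a filtering
  // override (or wrapper) of rowIterator() would take effect here.
  static void rowByRow(ColumnarBatch batch) {
    Iterator<InternalRow> rows = batch.rowIterator();
    while (rows.hasNext()) {
      InternalRow row = rows.next();
      // ... consume row
    }
  }

  // Whole-stage codegen path (compare line 35 of the generated code above):
  // the ColumnVectors are fetched once via column(i) and indexed directly,
  // so a customized rowIterator() never runs.
  static void vectorized(ColumnarBatch batch) {
    ColumnVector idCol = batch.column(0);    // id#0 in the plan above
    ColumnVector dataCol = batch.column(1);  // data#1 in the plan above
    for (int rowIdx = 0; rowIdx < batch.numRows(); rowIdx++) {
      int id = idCol.getInt(rowIdx);
      UTF8String data = dataCol.getUTF8String(rowIdx);
      // ... consume (id, data)
    }
  }

  private AccessPatterns() {
  }
}
```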