sunchao commented on a change in pull request #34659:
URL: https://github.com/apache/spark/pull/34659#discussion_r838971025
##########
File path:
sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedRleValuesReader.java
##########
@@ -210,48 +213,358 @@ private void readBatchInternal(
} else if (rowId > rangeEnd) {
state.nextRange();
} else {
- // the range [rowId, rowId + n) overlaps with the current row range in
state
+ // The range [rowId, rowId + n) overlaps with the current row range in
state
long start = Math.max(rangeStart, rowId);
long end = Math.min(rangeEnd, rowId + n - 1);
- // skip the part [rowId, start)
+ // Skip the part [rowId, start)
int toSkip = (int) (start - rowId);
if (toSkip > 0) {
skipValues(toSkip, state, valueReader, updater);
rowId += toSkip;
leftInPage -= toSkip;
}
- // read the part [start, end]
+ // Read the part [start, end]
n = (int) (end - start + 1);
switch (mode) {
case RLE:
if (currentValue == state.maxDefinitionLevel) {
- updater.readValues(n, offset, values, valueReader);
- } else {
- nulls.putNulls(offset, n);
+ updater.readValues(n, state.valueOffset, values, valueReader);
+ state.valueOffset += n;
+ } else if (!state.isRequired && currentValue ==
state.maxDefinitionLevel - 1) {
+ // Only add null if this represents a null element, but not for
the case where a
+ // struct itself is null
+ nulls.putNulls(state.valueOffset, n);
+ state.valueOffset += n;
}
+ defLevels.putInts(state.levelOffset, n, currentValue);
break;
case PACKED:
for (int i = 0; i < n; ++i) {
- if (currentBuffer[currentBufferIdx++] ==
state.maxDefinitionLevel) {
- updater.readValue(offset + i, values, valueReader);
+ int value = currentBuffer[currentBufferIdx++];
+ if (value == state.maxDefinitionLevel) {
+ updater.readValue(state.valueOffset++, values, valueReader);
} else {
- nulls.putNull(offset + i);
+ // Only add null if this represents a null element, but not
for the case where a
+ // struct itself is null
+ nulls.putNull(state.valueOffset++);
}
+ defLevels.putInt(state.levelOffset + i, value);
}
break;
}
- offset += n;
+ state.levelOffset += n;
Review comment:
This is because `valueOffset` is only advanced when a value or a null element
is written (it is left unchanged when an enclosing struct itself is null),
whereas `levelOffset` is advanced in all cases. I could move this update to the
end of each `case` clause above, but I don't think that would help much.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]