nastra commented on a change in pull request #2746:
URL: https://github.com/apache/iceberg/pull/2746#discussion_r666278668
##########
File path:
arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDictionaryEncodedParquetValuesReader.java
##########
@@ -40,413 +40,183 @@ public VectorizedDictionaryEncodedParquetValuesReader(int
maxDefLevel, boolean s
super(maxDefLevel, setValidityVector);
}
- void readBatchOfDictionaryIds(IntVector intVector, int startOffset, int
numValuesToRead,
- NullabilityHolder nullabilityHolder) {
- int left = numValuesToRead;
- int idx = startOffset;
- while (left > 0) {
- if (this.currentCount == 0) {
- this.readNextGroup();
+ abstract class BaseDictEncodedReader {
+ public void nextBatch(
+ FieldVector vector, int startOffset, int numValuesToRead, Dictionary
dict,
+ NullabilityHolder nullabilityHolder, int typeWidth) {
+ int left = numValuesToRead;
+ int idx = startOffset;
+ while (left > 0) {
+ if (currentCount == 0) {
+ readNextGroup();
+ }
+ int numValues = Math.min(left, currentCount);
+ for (int i = 0; i < numValues; i++) {
+ int index = idx * typeWidth;
+ if (typeWidth == -1) {
+ index = idx;
+ }
+ if (Mode.RLE.equals(mode)) {
+ nextVal(vector, dict, index, currentValue, typeWidth);
+ } else if (Mode.PACKED.equals(mode)) {
+ nextVal(vector, dict, index,
packedValuesBuffer[packedValuesBufferIdx++], typeWidth);
+ }
+ nullabilityHolder.setNotNull(idx);
+ if (setArrowValidityVector) {
+ BitVectorHelper.setBit(vector.getValidityBuffer(), idx);
+ }
+ idx++;
+ }
+ left -= numValues;
+ currentCount -= numValues;
}
- int numValues = Math.min(left, this.currentCount);
- switch (mode) {
- case RLE:
- for (int i = 0; i < numValues; i++) {
- intVector.set(idx, currentValue);
- setNotNull(intVector, nullabilityHolder, idx);
- idx++;
- }
- break;
- case PACKED:
- for (int i = 0; i < numValues; i++) {
- intVector.set(idx, packedValuesBuffer[packedValuesBufferIdx++]);
- setNotNull(intVector, nullabilityHolder, idx);
- idx++;
- }
- break;
- }
- left -= numValues;
- currentCount -= numValues;
}
+
+ protected abstract void nextVal(FieldVector vector, Dictionary dict, int
idx, int currentVal, int typeWidth);
}
- void readBatchOfDictionaryEncodedLongs(FieldVector vector, int startOffset,
int numValuesToRead, Dictionary dict,
- NullabilityHolder nullabilityHolder,
int typeWidth) {
- int left = numValuesToRead;
- int idx = startOffset;
- while (left > 0) {
- if (this.currentCount == 0) {
- this.readNextGroup();
- }
- int numValues = Math.min(left, this.currentCount);
- switch (mode) {
- case RLE:
- for (int i = 0; i < numValues; i++) {
- vector.getDataBuffer().setLong(idx * typeWidth,
dict.decodeToLong(currentValue));
- setNotNull(vector, nullabilityHolder, idx);
- idx++;
- }
- break;
- case PACKED:
- for (int i = 0; i < numValues; i++) {
- vector.getDataBuffer()
- .setLong(idx * typeWidth,
dict.decodeToLong(packedValuesBuffer[packedValuesBufferIdx++]));
- setNotNull(vector, nullabilityHolder, idx);
- idx++;
- }
- break;
- }
- left -= numValues;
- currentCount -= numValues;
+ class DictionaryIdReader extends BaseDictEncodedReader {
+ @Override
+ protected void nextVal(FieldVector vector, Dictionary dict, int idx, int
currentVal, int typeWidth) {
+ ((IntVector) vector).set(idx, currentVal);
Review comment:
Dictionary encoded vectors are always represented as `IntVector`, but
since `BaseDictEncodedReader` uses the more generic `FieldVector` we need to do
a cast to `IntVector` here. Fwiw, here's how it was done in the original code:
https://github.com/apache/iceberg/blob/master/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDictionaryEncodedParquetValuesReader.java#L55
and
https://github.com/apache/iceberg/blob/6bcca16c48cd92dc98640130a28f73431e99e336/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java#L136
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]