[GitHub] spark pull request #21070: [SPARK-23972][BUILD][SQL] Update Parquet to 1.10....

rdblue Tue, 08 May 2018 09:32:36 -0700

Github user rdblue commented on a diff in the pull request:

    https://github.com/apache/spark/pull/21070#discussion_r186789552
  
    --- Diff: sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedRleValuesReader.java ---
    @@ -619,32 +608,37 @@ private int ceil8(int value) {
       /**
        * Reads the next group.
        */
    -  private void readNextGroup()  {
    -    int header = readUnsignedVarInt();
    -    this.mode = (header & 1) == 0 ? MODE.RLE : MODE.PACKED;
    -    switch (mode) {
    -      case RLE:
    -        this.currentCount = header >>> 1;
    -        this.currentValue = readIntLittleEndianPaddedOnBitWidth();
    -        return;
    -      case PACKED:
    -        int numGroups = header >>> 1;
    -        this.currentCount = numGroups * 8;
    -        int bytesToRead = ceil8(this.currentCount * this.bitWidth);
    -
    -        if (this.currentBuffer.length < this.currentCount) {
    -          this.currentBuffer = new int[this.currentCount];
    -        }
    -        currentBufferIdx = 0;
    -        int valueIndex = 0;
    -        for (int byteIndex = offset; valueIndex < this.currentCount; 
byteIndex += this.bitWidth) {
    -          this.packer.unpack8Values(in, byteIndex, this.currentBuffer, 
valueIndex);
    -          valueIndex += 8;
    -        }
    -        offset += bytesToRead;
    -        return;
    -      default:
    -        throw new ParquetDecodingException("not a valid mode " + 
this.mode);
    +  private void readNextGroup() {
    +    try {
    +      int header = readUnsignedVarInt();
    +      this.mode = (header & 1) == 0 ? MODE.RLE : MODE.PACKED;
    +      switch (mode) {
    +        case RLE:
    +          this.currentCount = header >>> 1;
    +          this.currentValue = readIntLittleEndianPaddedOnBitWidth();
    +          return;
    +        case PACKED:
    +          int numGroups = header >>> 1;
    +          this.currentCount = numGroups * 8;
    +
    +          if (this.currentBuffer.length < this.currentCount) {
    +            this.currentBuffer = new int[this.currentCount];
    +          }
    +          currentBufferIdx = 0;
    +          int valueIndex = 0;
    +          while (valueIndex < this.currentCount) {
    +            // values are bit packed 8 at a time, so reading bitWidth will 
always work
    +            ByteBuffer buffer = in.slice(bitWidth);
    +            this.packer.unpack8Values(
    +                buffer, buffer.arrayOffset() + buffer.position(), 
this.currentBuffer, valueIndex);
    --- End diff --
    
    Good catch. I removed the call to `arrayOffset`, so this should now work
    with both on-heap and off-heap buffers.



---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] spark pull request #21070: [SPARK-23972][BUILD][SQL] Update Parquet to 1.10....

Reply via email to