This is an automated email from the ASF dual-hosted git repository.
kunalkapoor pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/carbondata.git
The following commit(s) were added to refs/heads/master by this push:
new a4bd3df [CARBONDATA-3995] Support presto querying older complex type stores
a4bd3df is described below
commit a4bd3df8a29bcadd985bdd2b88bc186c9d74f25e
Author: ajantha-bhat <[email protected]>
AuthorDate: Fri Sep 18 12:41:45 2020 +0530
[CARBONDATA-3995] Support presto querying older complex type stores
Why is this PR needed?
Before Carbon 2.0, the complex child length was stored as SHORT for
string, varchar, binary, date, and decimal types. From 2.0 onwards it
is stored as INT, so the Presto complex query code always assumed INT
and hit an out-of-bounds exception when an older store was queried.
What changes were proposed in this PR?
If the INT_LENGTH_COMPLEX_CHILD_BYTE_ARRAY encoding is present, parse
the length as INT; otherwise parse it as SHORT, so that both old and
new stores can be queried.
This closes #3937
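For context, below is a minimal sketch of the length-parsing rule
described above. The helper readComplexChildLength and its class are
illustrative only, not part of CarbonData; only CarbonUtil.hasEncoding
and Encoding.INT_LENGTH_COMPLEX_CHILD_BYTE_ARRAY mirror the code in
this change.

    import java.nio.ByteBuffer;
    import java.util.List;

    import org.apache.carbondata.core.util.CarbonUtil;
    import org.apache.carbondata.format.Encoding;

    public final class ComplexChildLengthReader {
      // Hypothetical helper: pages written by Carbon 2.0+ carry the
      // INT_LENGTH_COMPLEX_CHILD_BYTE_ARRAY encoding and store the child
      // length as a 4-byte INT; older stores wrote it as a 2-byte SHORT.
      static int readComplexChildLength(byte[] pageData, int offset, List<Encoding> encodings) {
        boolean intLength = encodings != null && !encodings.isEmpty()
            && CarbonUtil.hasEncoding(encodings, Encoding.INT_LENGTH_COMPLEX_CHILD_BYTE_ARRAY);
        if (intLength) {
          return ByteBuffer.wrap(pageData, offset, 4).getInt();
        }
        // pre-2.0 store: the length was written as SHORT
        return ByteBuffer.wrap(pageData, offset, 2).getShort();
      }
    }

The caller then advances its offset by 4 or 2 bytes accordingly before
reading the child value, which is what the diff does via
lengthStoredInBytes.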
---
.../dimension/v3/DimensionChunkReaderV3.java | 2 +
.../encoding/compress/DirectCompressCodec.java | 44 +++++++++++++------
.../metadata/datatype/DecimalConverterFactory.java | 50 ++++++++++++++--------
.../core/scan/result/vector/ColumnVectorInfo.java | 3 ++
4 files changed, 70 insertions(+), 29 deletions(-)
diff --git a/core/src/main/java/org/apache/carbondata/core/datastore/chunk/reader/dimension/v3/DimensionChunkReaderV3.java b/core/src/main/java/org/apache/carbondata/core/datastore/chunk/reader/dimension/v3/DimensionChunkReaderV3.java
index 2538687..53744db 100644
--- a/core/src/main/java/org/apache/carbondata/core/datastore/chunk/reader/dimension/v3/DimensionChunkReaderV3.java
+++ b/core/src/main/java/org/apache/carbondata/core/datastore/chunk/reader/dimension/v3/DimensionChunkReaderV3.java
@@ -253,6 +253,8 @@ public class DimensionChunkReaderV3 extends AbstractDimensionChunkReader {
ColumnPageDecoder decoder = encodingFactory.createDecoder(encodings, encoderMetas,
compressorName, vectorInfo != null);
if (vectorInfo != null) {
+ // set encodings of current page in the vectorInfo, used for decoding the complex child page
+ vectorInfo.encodings = encodings;
decoder
.decodeAndFillVector(pageData.array(), offset, pageMetadata.data_page_length, vectorInfo,
nullBitSet, isLocalDictEncodedPage, pageMetadata.numberOfRowsInpage,
diff --git a/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/compress/DirectCompressCodec.java b/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/compress/DirectCompressCodec.java
index 27520c9..fbf1d73 100644
--- a/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/compress/DirectCompressCodec.java
+++ b/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/compress/DirectCompressCodec.java
@@ -50,6 +50,7 @@ import org.apache.carbondata.core.scan.result.vector.impl.directread.ColumnarVec
import org.apache.carbondata.core.scan.result.vector.impl.directread.ConvertibleVector;
import org.apache.carbondata.core.scan.result.vector.impl.directread.SequentialFill;
import org.apache.carbondata.core.util.ByteUtil;
+import org.apache.carbondata.core.util.CarbonUtil;
import org.apache.carbondata.format.Encoding;
/**
@@ -316,6 +317,17 @@ public class DirectCompressCodec implements ColumnPageCodec {
private void fillPrimitiveType(byte[] pageData, CarbonColumnVector vector,
DataType vectorDataType, DataType pageDataType, int pageSize, ColumnVectorInfo vectorInfo,
BitSet nullBits) {
+ int intSizeInBytes = DataTypes.INT.getSizeInBytes();
+ int shortSizeInBytes = DataTypes.SHORT.getSizeInBytes();
+ int lengthStoredInBytes;
+ if (vectorInfo.encodings != null && vectorInfo.encodings.size() > 0 && CarbonUtil
+ .hasEncoding(vectorInfo.encodings, Encoding.INT_LENGTH_COMPLEX_CHILD_BYTE_ARRAY)) {
+ lengthStoredInBytes = intSizeInBytes;
+ } else {
+ // Before to carbon 2.0, complex child length is stored as SHORT
+ // for string, varchar, binary, date, decimal types
+ lengthStoredInBytes = shortSizeInBytes;
+ }
int rowId = 0;
if (pageDataType == DataTypes.BOOLEAN || pageDataType == DataTypes.BYTE) {
if (vectorDataType == DataTypes.SHORT) {
@@ -345,7 +357,6 @@ public class DirectCompressCodec implements ColumnPageCodec {
}
}
} else if (pageDataType == DataTypes.SHORT) {
- int shortSizeInBytes = DataTypes.SHORT.getSizeInBytes();
int size = pageSize * shortSizeInBytes;
if (vectorDataType == DataTypes.SHORT) {
for (int i = 0; i < size; i += shortSizeInBytes) {
@@ -397,7 +408,6 @@ public class DirectCompressCodec implements ColumnPageCodec {
}
}
} else {
- int intSizeInBytes = DataTypes.INT.getSizeInBytes();
if (pageDataType == DataTypes.INT) {
int size = pageSize * intSizeInBytes;
if (vectorDataType == DataTypes.INT) {
@@ -441,36 +451,46 @@ public class DirectCompressCodec implements ColumnPageCodec {
|| vectorDataType == DataTypes.VARCHAR) {
// for complex primitive string, binary, varchar type
int offset = 0;
+ int length;
for (int i = 0; i < pageSize; i++) {
- int len = ByteBuffer.wrap(pageData, offset, intSizeInBytes).getInt();
- offset += intSizeInBytes;
- if (vectorDataType == DataTypes.BINARY && len == 0) {
+ if (lengthStoredInBytes == intSizeInBytes) {
+ length = ByteBuffer.wrap(pageData, offset, lengthStoredInBytes).getInt();
+ } else {
+ length = ByteBuffer.wrap(pageData, offset, lengthStoredInBytes).getShort();
+ }
+ offset += lengthStoredInBytes;
+ if (vectorDataType == DataTypes.BINARY && length == 0) {
vector.putNull(i);
continue;
}
- byte[] row = new byte[len];
- System.arraycopy(pageData, offset, row, 0, len);
+ byte[] row = new byte[length];
+ System.arraycopy(pageData, offset, row, 0, length);
if (Arrays.equals(row, CarbonCommonConstants.MEMBER_DEFAULT_VAL_ARRAY)) {
vector.putNull(i);
} else {
vector.putObject(i, row);
}
- offset += len;
+ offset += length;
}
} else if (vectorDataType == DataTypes.DATE) {
// for complex primitive date type
int offset = 0;
+ int length;
for (int i = 0; i < pageSize; i++) {
- int len = ByteBuffer.wrap(pageData, offset, intSizeInBytes).getInt();
- offset += intSizeInBytes;
+ if (lengthStoredInBytes == intSizeInBytes) {
+ length = ByteBuffer.wrap(pageData, offset, lengthStoredInBytes).getInt();
+ } else {
+ length = ByteBuffer.wrap(pageData, offset, lengthStoredInBytes).getShort();
+ }
+ offset += lengthStoredInBytes;
int surrogateInternal =
ByteUtil.toXorInt(pageData, offset, intSizeInBytes);
- if (len == 0) {
+ if (length == 0) {
vector.putObject(0, null);
} else {
vector.putObject(0, surrogateInternal -
DateDirectDictionaryGenerator.cutOffDate);
}
- offset += len;
+ offset += length;
}
} else if (DataTypes.isDecimal(vectorDataType)) {
// for complex primitive decimal type
diff --git a/core/src/main/java/org/apache/carbondata/core/metadata/datatype/DecimalConverterFactory.java b/core/src/main/java/org/apache/carbondata/core/metadata/datatype/DecimalConverterFactory.java
index 7659cba..b848898 100644
--- a/core/src/main/java/org/apache/carbondata/core/metadata/datatype/DecimalConverterFactory.java
+++ b/core/src/main/java/org/apache/carbondata/core/metadata/datatype/DecimalConverterFactory.java
@@ -27,7 +27,9 @@ import org.apache.carbondata.core.scan.result.vector.CarbonColumnVector;
import org.apache.carbondata.core.scan.result.vector.ColumnVectorInfo;
import org.apache.carbondata.core.scan.result.vector.impl.directread.ColumnarVectorWrapperDirectFactory;
import org.apache.carbondata.core.util.ByteUtil;
+import org.apache.carbondata.core.util.CarbonUtil;
import org.apache.carbondata.core.util.DataTypeUtil;
+import org.apache.carbondata.format.Encoding;
/**
* Decimal converter to keep the data compact.
@@ -110,6 +112,10 @@ public final class DecimalConverterFactory {
@Override
public void fillVector(Object valuesToBeConverted, int size,
ColumnVectorInfo vectorInfo, BitSet nullBitSet, DataType pageType) {
+ if (!(valuesToBeConverted instanceof byte[])) {
+ throw new UnsupportedOperationException("This object type " + valuesToBeConverted.getClass()
+ + " is not supported in this method");
+ }
// TODO we need to find way to directly set to vector with out conversion. This way is very
// inefficient.
CarbonColumnVector vector = getCarbonColumnVector(vectorInfo, nullBitSet);
@@ -124,9 +130,16 @@ public final class DecimalConverterFactory {
precision = vectorInfo.measure.getMeasure().getPrecision();
newMeasureScale = vectorInfo.measure.getMeasure().getScale();
}
- if (!(valuesToBeConverted instanceof byte[])) {
- throw new UnsupportedOperationException("This object type " + valuesToBeConverted.getClass()
- + " is not supported in this method");
+ int shortSizeInBytes = DataTypes.SHORT.getSizeInBytes();
+ int intSizeInBytes = DataTypes.INT.getSizeInBytes();
+ int lengthStoredInBytes;
+ if (vectorInfo.encodings != null && vectorInfo.encodings.size() > 0 && CarbonUtil
+ .hasEncoding(vectorInfo.encodings, Encoding.INT_LENGTH_COMPLEX_CHILD_BYTE_ARRAY)) {
+ lengthStoredInBytes = intSizeInBytes;
+ } else {
+ // before to carbon 2.0, complex child length is stored as SHORT
+ // for string, varchar, binary, date, decimal types
+ lengthStoredInBytes = shortSizeInBytes;
}
byte[] data = (byte[]) valuesToBeConverted;
if (pageType == DataTypes.BYTE) {
@@ -142,7 +155,6 @@ public final class DecimalConverterFactory {
}
}
} else if (pageType == DataTypes.SHORT) {
- int shortSizeInBytes = DataTypes.SHORT.getSizeInBytes();
for (int i = 0; i < size; i++) {
if (nullBitSet.get(i)) {
vector.putNull(i);
@@ -172,7 +184,6 @@ public final class DecimalConverterFactory {
}
}
} else {
- int intSizeInBytes = DataTypes.INT.getSizeInBytes();
if (pageType == DataTypes.INT) {
for (int i = 0; i < size; i++) {
if (nullBitSet.get(i)) {
@@ -205,39 +216,44 @@ public final class DecimalConverterFactory {
} else if (pageType == DataTypes.BYTE_ARRAY) {
// complex primitive decimal dimension
int offset = 0;
+ int length;
for (int j = 0; j < size; j++) {
// here decimal data will be Length[4 byte], scale[1 byte], value[Length byte]
- int len = ByteBuffer.wrap(data, offset, intSizeInBytes).getInt();
- offset += intSizeInBytes;
- if (len == 0) {
+ if (lengthStoredInBytes == intSizeInBytes) {
+ length = ByteBuffer.wrap(data, offset, lengthStoredInBytes).getInt();
+ } else {
+ length = ByteBuffer.wrap(data, offset, lengthStoredInBytes).getShort();
+ }
+ offset += lengthStoredInBytes;
+ if (length == 0) {
vector.putNull(j);
continue;
}
// jump the scale offset
offset += 1;
// remove scale from the length
- len -= 1;
- byte[] row = new byte[len];
- System.arraycopy(data, offset, row, 0, len);
+ length -= 1;
+ byte[] row = new byte[length];
+ System.arraycopy(data, offset, row, 0, length);
long val;
- if (len == 1) {
+ if (length == 1) {
val = row[0];
- } else if (len == 2) {
+ } else if (length == 2) {
val = ByteUtil.toShort(row, 0);
- } else if (len == 4) {
+ } else if (length == 4) {
val = ByteUtil.toInt(row, 0);
- } else if (len == 3) {
+ } else if (length == 3) {
val = ByteUtil.valueOf3Bytes(row, 0);
} else {
// TODO: check if other value can come
- val = ByteUtil.toLong(row, 0, len);
+ val = ByteUtil.toLong(row, 0, length);
}
BigDecimal value = BigDecimal.valueOf(val, scale);
if (value.scale() < newMeasureScale) {
value = value.setScale(newMeasureScale);
}
vector.putDecimal(j, value, precision);
- offset += len;
+ offset += length;
}
}
}
diff --git a/core/src/main/java/org/apache/carbondata/core/scan/result/vector/ColumnVectorInfo.java b/core/src/main/java/org/apache/carbondata/core/scan/result/vector/ColumnVectorInfo.java
index ed8be52..afccd3c 100644
--- a/core/src/main/java/org/apache/carbondata/core/scan/result/vector/ColumnVectorInfo.java
+++ b/core/src/main/java/org/apache/carbondata/core/scan/result/vector/ColumnVectorInfo.java
@@ -28,6 +28,7 @@ import org.apache.carbondata.core.scan.filter.GenericQueryType;
import org.apache.carbondata.core.scan.model.ProjectionDimension;
import org.apache.carbondata.core.scan.model.ProjectionMeasure;
import org.apache.carbondata.core.scan.result.vector.impl.CarbonColumnVectorImpl;
+import org.apache.carbondata.format.Encoding;
public class ColumnVectorInfo implements Comparable<ColumnVectorInfo> {
public int offset;
@@ -45,6 +46,8 @@ public class ColumnVectorInfo implements Comparable<ColumnVectorInfo> {
public DecimalConverterFactory.DecimalConverter decimalConverter;
// Vector stack is used in complex column vectorInfo to store all the children vectors.
public Stack<CarbonColumnVector> vectorStack = new Stack<>();
+ // store the encoding of the column, used while decoding the page for filling the vector
+ public List<Encoding> encodings;
@Override
public int compareTo(ColumnVectorInfo o) {