This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
new affae443d ORC-1266: DecimalColumnVector resets the isRepeating flag in
the nextVector method
affae443d is described below
commit affae443d337963c59fa5c4b7bbadf2b0477ef7a
Author: sychen <[email protected]>
AuthorDate: Sun Sep 18 20:20:05 2022 -0700
ORC-1266: DecimalColumnVector resets the isRepeating flag in the nextVector
method
### What changes were proposed in this pull request?
`DecimalColumnVector` resets the `isRepeating` flag in the `nextVector`
method.
### Why are the changes needed?
`DecimalColumnVector` does not set `isRepeating` when reading decimal data.
As a result, readers that rely on type promotion to read decimal values may
observe the stale flag and produce incorrect data.
### How was this patch tested?
Added a unit test.
Closes #1244 from cxzl25/ORC-1266.
Authored-by: sychen <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
.../org/apache/orc/impl/TreeReaderFactory.java | 75 +++++++++++---
.../orc/impl/TestConvertTreeReaderFactory.java | 115 +++++++++++++++++++++
2 files changed, 178 insertions(+), 12 deletions(-)
diff --git a/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java
b/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java
index a639e3181..bbbfdfef2 100644
--- a/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java
+++ b/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java
@@ -1551,7 +1551,8 @@ public class TreeReaderFactory {
HiveDecimalWritable[] vector = result.vector;
HiveDecimalWritable decWritable;
if (result.noNulls) {
- for (int r=0; r < batchSize; ++r) {
+ result.isRepeating = true;
+ for (int r = 0; r < batchSize; ++r) {
decWritable = vector[r];
if (!decWritable.serializationUtilsRead(
valueStream, scratchScaleVector[r],
@@ -1559,9 +1560,11 @@ public class TreeReaderFactory {
result.isNull[r] = true;
result.noNulls = false;
}
+ setIsRepeatingIfNeeded(result, r);
}
} else if (!result.isRepeating || !result.isNull[0]) {
- for (int r=0; r < batchSize; ++r) {
+ result.isRepeating = true;
+ for (int r = 0; r < batchSize; ++r) {
if (!result.isNull[r]) {
decWritable = vector[r];
if (!decWritable.serializationUtilsRead(
@@ -1571,6 +1574,7 @@ public class TreeReaderFactory {
result.noNulls = false;
}
}
+ setIsRepeatingIfNeeded(result, r);
}
}
}
@@ -1591,8 +1595,9 @@ public class TreeReaderFactory {
HiveDecimalWritable[] vector = result.vector;
HiveDecimalWritable decWritable;
if (result.noNulls) {
+ result.isRepeating = true;
int previousIdx = 0;
- for (int r=0; r != filterContext.getSelectedSize(); ++r) {
+ for (int r = 0; r != filterContext.getSelectedSize(); ++r) {
int idx = filterContext.getSelected()[r];
if (idx - previousIdx > 0) {
skipStreamRows(idx - previousIdx);
@@ -1604,12 +1609,14 @@ public class TreeReaderFactory {
result.isNull[idx] = true;
result.noNulls = false;
}
+ setIsRepeatingIfNeeded(result, idx);
previousIdx = idx + 1;
}
skipStreamRows(batchSize - previousIdx);
} else if (!result.isRepeating || !result.isNull[0]) {
+ result.isRepeating = true;
int previousIdx = 0;
- for (int r=0; r != filterContext.getSelectedSize(); ++r) {
+ for (int r = 0; r != filterContext.getSelectedSize(); ++r) {
int idx = filterContext.getSelected()[r];
if (idx - previousIdx > 0) {
skipStreamRows(countNonNullRowsInRange(result.isNull, previousIdx,
idx));
@@ -1623,6 +1630,7 @@ public class TreeReaderFactory {
result.noNulls = false;
}
}
+ setIsRepeatingIfNeeded(result, idx);
previousIdx = idx + 1;
}
skipStreamRows(countNonNullRowsInRange(result.isNull, previousIdx,
batchSize));
@@ -1643,16 +1651,20 @@ public class TreeReaderFactory {
// read the scales
scaleReader.nextVector(result, scratchScaleVector, batchSize);
if (result.noNulls) {
- for (int r=0; r < batchSize; ++r) {
+ result.isRepeating = true;
+ for (int r = 0; r < batchSize; ++r) {
final long scaleFactor = powerOfTenTable[scale -
scratchScaleVector[r]];
result.vector[r] = SerializationUtils.readVslong(valueStream) *
scaleFactor;
+ setIsRepeatingIfNeeded(result, r);
}
} else if (!result.isRepeating || !result.isNull[0]) {
- for (int r=0; r < batchSize; ++r) {
+ result.isRepeating = true;
+ for (int r = 0; r < batchSize; ++r) {
if (!result.isNull[r]) {
final long scaleFactor = powerOfTenTable[scale -
scratchScaleVector[r]];
result.vector[r] = SerializationUtils.readVslong(valueStream) *
scaleFactor;
}
+ setIsRepeatingIfNeeded(result, r);
}
}
result.precision = (short) precision;
@@ -1674,8 +1686,9 @@ public class TreeReaderFactory {
// Read all the scales
scaleReader.nextVector(result, scratchScaleVector, batchSize);
if (result.noNulls) {
+ result.isRepeating = true;
int previousIdx = 0;
- for (int r=0; r != filterContext.getSelectedSize(); r++) {
+ for (int r = 0; r != filterContext.getSelectedSize(); r++) {
int idx = filterContext.getSelected()[r];
if (idx - previousIdx > 0) {
skipStreamRows(idx - previousIdx);
@@ -1684,12 +1697,14 @@ public class TreeReaderFactory {
for (int s=scratchScaleVector[idx]; s < scale; ++s) {
result.vector[idx] *= 10;
}
+ setIsRepeatingIfNeeded(result, idx);
previousIdx = idx + 1;
}
skipStreamRows(batchSize - previousIdx);
} else if (!result.isRepeating || !result.isNull[0]) {
+ result.isRepeating = true;
int previousIdx = 0;
- for (int r=0; r != filterContext.getSelectedSize(); r++) {
+ for (int r = 0; r != filterContext.getSelectedSize(); r++) {
int idx = filterContext.getSelected()[r];
if (idx - previousIdx > 0) {
skipStreamRows(countNonNullRowsInRange(result.isNull, previousIdx,
idx));
@@ -1700,6 +1715,7 @@ public class TreeReaderFactory {
result.vector[idx] *= 10;
}
}
+ setIsRepeatingIfNeeded(result, idx);
previousIdx = idx + 1;
}
skipStreamRows(countNonNullRowsInRange(result.isNull, previousIdx,
batchSize));
@@ -1708,6 +1724,24 @@ public class TreeReaderFactory {
result.scale = (short) scale;
}
+ private void setIsRepeatingIfNeeded(Decimal64ColumnVector result, int
index) {
+ if (result.isRepeating
+ && index > 0
+ && (result.vector[0] != result.vector[index]
+ || result.isNull[0] != result.isNull[index])) {
+ result.isRepeating = false;
+ }
+ }
+
+ private void setIsRepeatingIfNeeded(DecimalColumnVector result, int index)
{
+ if (result.isRepeating
+ && index > 0
+ && (!result.vector[0].equals(result.vector[index])
+ || result.isNull[0] != result.isNull[index])) {
+ result.isRepeating = false;
+ }
+ }
+
@Override
public void nextVector(ColumnVector result,
boolean[] isNull,
@@ -1815,6 +1849,7 @@ public class TreeReaderFactory {
final int batchSize) throws IOException {
if (result.noNulls) {
if (filterContext.isSelectedInUse()) {
+ result.isRepeating = true;
int previousIdx = 0;
for (int r = 0; r != filterContext.getSelectedSize(); ++r) {
int idx = filterContext.getSelected()[r];
@@ -1822,16 +1857,20 @@ public class TreeReaderFactory {
valueReader.skip(idx - previousIdx);
}
result.vector[idx].setFromLongAndScale(valueReader.next(), scale);
+ setIsRepeatingIfNeeded(result, idx);
previousIdx = idx + 1;
}
valueReader.skip(batchSize - previousIdx);
} else {
+ result.isRepeating = true;
for (int r = 0; r < batchSize; ++r) {
result.vector[r].setFromLongAndScale(valueReader.next(), scale);
+ setIsRepeatingIfNeeded(result, r);
}
}
} else if (!result.isRepeating || !result.isNull[0]) {
if (filterContext.isSelectedInUse()) {
+ result.isRepeating = true;
int previousIdx = 0;
for (int r = 0; r != filterContext.getSelectedSize(); ++r) {
int idx = filterContext.getSelected()[r];
@@ -1841,16 +1880,19 @@ public class TreeReaderFactory {
if (!result.isNull[r]) {
result.vector[idx].setFromLongAndScale(valueReader.next(),
scale);
}
+ setIsRepeatingIfNeeded(result, idx);
previousIdx = idx + 1;
}
valueReader.skip(countNonNullRowsInRange(result.isNull, previousIdx,
batchSize));
} else {
- for (int r = 0; r < batchSize; ++r) {
- if (!result.isNull[r]) {
- result.vector[r].setFromLongAndScale(valueReader.next(),
scale);
- }
+ result.isRepeating = true;
+ for (int r = 0; r < batchSize; ++r) {
+ if (!result.isNull[r]) {
+ result.vector[r].setFromLongAndScale(valueReader.next(), scale);
}
+ setIsRepeatingIfNeeded(result, r);
}
+ }
}
result.precision = (short) precision;
result.scale = (short) scale;
@@ -1864,6 +1906,15 @@ public class TreeReaderFactory {
result.scale = (short) scale;
}
+ private void setIsRepeatingIfNeeded(DecimalColumnVector result, int index)
{
+ if (result.isRepeating
+ && index > 0
+ && (!result.vector[0].equals(result.vector[index])
+ || result.isNull[0] != result.isNull[index])) {
+ result.isRepeating = false;
+ }
+ }
+
@Override
public void nextVector(ColumnVector result,
boolean[] isNull,
diff --git
a/java/core/src/test/org/apache/orc/impl/TestConvertTreeReaderFactory.java
b/java/core/src/test/org/apache/orc/impl/TestConvertTreeReaderFactory.java
index 49c3bf243..a90a285a6 100644
--- a/java/core/src/test/org/apache/orc/impl/TestConvertTreeReaderFactory.java
+++ b/java/core/src/test/org/apache/orc/impl/TestConvertTreeReaderFactory.java
@@ -31,6 +31,7 @@ import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.OrcConf;
import org.apache.orc.OrcFile;
import org.apache.orc.OrcFile.WriterOptions;
import org.apache.orc.Reader;
@@ -52,6 +53,7 @@ import java.util.GregorianCalendar;
import java.util.concurrent.TimeUnit;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.Mockito.mock;
@@ -639,4 +641,117 @@ public class TestConvertTreeReaderFactory {
private void testConvertToBinaryIncreasingSize() throws Exception {
readORCFileIncreasingBatchSize("binary", BytesColumnVector.class);
}
+
+ @Test
+ public void testDecimalConvertInNullStripe() throws Exception {
+ try {
+ Configuration decimalConf = new Configuration(conf);
+ decimalConf.set(OrcConf.STRIPE_ROW_COUNT.getAttribute(), "1024");
+ decimalConf.set(OrcConf.ROWS_BETWEEN_CHECKS.getAttribute(), "1");
+
+ String typeStr = "decimal(5,1)";
+ TypeDescription schema = TypeDescription.fromString("struct<col1:" +
typeStr + ">");
+ Writer w = OrcFile.createWriter(testFilePath,
OrcFile.writerOptions(decimalConf).setSchema(schema));
+
+ VectorizedRowBatch b = schema.createRowBatch();
+ DecimalColumnVector f1 = (DecimalColumnVector) b.cols[0];
+ f1.isRepeating = true;
+ f1.set(0, (HiveDecimal) null);
+ b.size = 1024;
+ w.addRowBatch(b);
+
+ b.reset();
+ for (int i = 0; i < 1024; i++) {
+ f1.set(i, HiveDecimal.create(i + 1));
+ }
+ b.size = 1024;
+ w.addRowBatch(b);
+
+ b.reset();
+ f1.isRepeating = true;
+ f1.set(0, HiveDecimal.create(1));
+ b.size = 1024;
+ w.addRowBatch(b);
+
+ b.reset();
+ w.close();
+
+ testDecimalConvertToLongInNullStripe();
+ testDecimalConvertToDoubleInNullStripe();
+ testDecimalConvertToStringInNullStripe();
+ testDecimalConvertToTimestampInNullStripe();
+ testDecimalConvertToDecimalInNullStripe();
+ } finally {
+ fs.delete(testFilePath, false);
+ }
+ }
+
+ private void readDecimalInNullStripe(String typeString, Class<?>
expectedColumnType,
+ String[] expectedResult) throws Exception {
+ Reader.Options options = new Reader.Options();
+ TypeDescription schema = TypeDescription.fromString("struct<col1:" +
typeString + ">");
+ options.schema(schema);
+ String expected = options.toString();
+
+ Configuration conf = new Configuration();
+
+ Reader reader = OrcFile.createReader(testFilePath,
OrcFile.readerOptions(conf));
+ RecordReader rows = reader.rows(options);
+ VectorizedRowBatch batch = schema.createRowBatch();
+
+ rows.nextBatch(batch);
+ assertEquals(1024, batch.size);
+ assertEquals(expected, options.toString());
+ assertEquals(batch.cols.length, 1);
+ assertEquals(batch.cols[0].getClass(), expectedColumnType);
+ assertTrue(batch.cols[0].isRepeating);
+ StringBuilder sb = new StringBuilder();
+ batch.cols[0].stringifyValue(sb, 1023);
+ assertEquals(sb.toString(), expectedResult[0]);
+
+ rows.nextBatch(batch);
+ assertEquals(1024, batch.size);
+ assertEquals(expected, options.toString());
+ assertEquals(batch.cols.length, 1);
+ assertEquals(batch.cols[0].getClass(), expectedColumnType);
+ assertFalse(batch.cols[0].isRepeating);
+ StringBuilder sb2 = new StringBuilder();
+ batch.cols[0].stringifyValue(sb2, 1023);
+ assertEquals(sb2.toString(), expectedResult[1]);
+
+ rows.nextBatch(batch);
+ assertEquals(1024, batch.size);
+ assertEquals(expected, options.toString());
+ assertEquals(batch.cols.length, 1);
+ assertEquals(batch.cols[0].getClass(), expectedColumnType);
+ assertTrue(batch.cols[0].isRepeating);
+ StringBuilder sb3 = new StringBuilder();
+ batch.cols[0].stringifyValue(sb3, 1023);
+ assertEquals(sb3.toString(), expectedResult[2]);
+ }
+
+ private void testDecimalConvertToLongInNullStripe() throws Exception {
+ readDecimalInNullStripe("bigint", LongColumnVector.class,
+ new String[]{"null", "1024", "1"});
+ }
+
+ private void testDecimalConvertToDoubleInNullStripe() throws Exception {
+ readDecimalInNullStripe("double", DoubleColumnVector.class,
+ new String[]{"null", "1024.0", "1.0"});
+ }
+
+ private void testDecimalConvertToStringInNullStripe() throws Exception {
+ readDecimalInNullStripe("string", BytesColumnVector.class,
+ new String[]{"null", "\"1024\"", "\"1\""});
+ }
+
+ private void testDecimalConvertToTimestampInNullStripe() throws Exception {
+ readDecimalInNullStripe("timestamp", TimestampColumnVector.class,
+ new String[]{"null", "1970-01-01 00:17:04.0", "1970-01-01
00:00:01.0"});
+ }
+
+ private void testDecimalConvertToDecimalInNullStripe() throws Exception {
+ readDecimalInNullStripe("decimal(18,2)", DecimalColumnVector.class,
+ new String[]{"null", "1024", "1"});
+ }
}