This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/main by this push:
     new affae443d ORC-1266: DecimalColumnVector resets the isRepeating flag in 
the nextVector method
affae443d is described below

commit affae443d337963c59fa5c4b7bbadf2b0477ef7a
Author: sychen <[email protected]>
AuthorDate: Sun Sep 18 20:20:05 2022 -0700

    ORC-1266: DecimalColumnVector resets the isRepeating flag in the nextVector 
method
    
    ### What changes were proposed in this pull request?
    `DecimalColumnVector` resets the `isRepeating` flag in the `nextVector` 
method.
    
    ### Why are the changes needed?
    `DecimalColumnVector` does not set `isRepeating` when reading decimal data, 
which leads to the use of type promotion to read decimal, and the obtained data 
may be wrong.
    
    ### How was this patch tested?
    add UT
    
    Closes #1244 from cxzl25/ORC-1266.
    
    Authored-by: sychen <[email protected]>
    Signed-off-by: Dongjoon Hyun <[email protected]>
---
 .../org/apache/orc/impl/TreeReaderFactory.java     |  75 +++++++++++---
 .../orc/impl/TestConvertTreeReaderFactory.java     | 115 +++++++++++++++++++++
 2 files changed, 178 insertions(+), 12 deletions(-)

diff --git a/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java 
b/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java
index a639e3181..bbbfdfef2 100644
--- a/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java
+++ b/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java
@@ -1551,7 +1551,8 @@ public class TreeReaderFactory {
       HiveDecimalWritable[] vector = result.vector;
       HiveDecimalWritable decWritable;
       if (result.noNulls) {
-        for (int r=0; r < batchSize; ++r) {
+        result.isRepeating = true;
+        for (int r = 0; r < batchSize; ++r) {
           decWritable = vector[r];
           if (!decWritable.serializationUtilsRead(
               valueStream, scratchScaleVector[r],
@@ -1559,9 +1560,11 @@ public class TreeReaderFactory {
             result.isNull[r] = true;
             result.noNulls = false;
           }
+          setIsRepeatingIfNeeded(result, r);
         }
       } else if (!result.isRepeating || !result.isNull[0]) {
-        for (int r=0; r < batchSize; ++r) {
+        result.isRepeating = true;
+        for (int r = 0; r < batchSize; ++r) {
           if (!result.isNull[r]) {
             decWritable = vector[r];
             if (!decWritable.serializationUtilsRead(
@@ -1571,6 +1574,7 @@ public class TreeReaderFactory {
               result.noNulls = false;
             }
           }
+          setIsRepeatingIfNeeded(result, r);
         }
       }
     }
@@ -1591,8 +1595,9 @@ public class TreeReaderFactory {
       HiveDecimalWritable[] vector = result.vector;
       HiveDecimalWritable decWritable;
       if (result.noNulls) {
+        result.isRepeating = true;
         int previousIdx = 0;
-        for (int r=0; r != filterContext.getSelectedSize(); ++r) {
+        for (int r = 0; r != filterContext.getSelectedSize(); ++r) {
           int idx = filterContext.getSelected()[r];
           if (idx - previousIdx > 0) {
             skipStreamRows(idx - previousIdx);
@@ -1604,12 +1609,14 @@ public class TreeReaderFactory {
             result.isNull[idx] = true;
             result.noNulls = false;
           }
+          setIsRepeatingIfNeeded(result, idx);
           previousIdx = idx + 1;
         }
         skipStreamRows(batchSize - previousIdx);
       } else if (!result.isRepeating || !result.isNull[0]) {
+        result.isRepeating = true;
         int previousIdx = 0;
-        for (int r=0; r != filterContext.getSelectedSize(); ++r) {
+        for (int r = 0; r != filterContext.getSelectedSize(); ++r) {
           int idx = filterContext.getSelected()[r];
           if (idx - previousIdx > 0) {
             skipStreamRows(countNonNullRowsInRange(result.isNull, previousIdx, 
idx));
@@ -1623,6 +1630,7 @@ public class TreeReaderFactory {
               result.noNulls = false;
             }
           }
+          setIsRepeatingIfNeeded(result, idx);
           previousIdx = idx + 1;
         }
         skipStreamRows(countNonNullRowsInRange(result.isNull, previousIdx, 
batchSize));
@@ -1643,16 +1651,20 @@ public class TreeReaderFactory {
       // read the scales
       scaleReader.nextVector(result, scratchScaleVector, batchSize);
       if (result.noNulls) {
-        for (int r=0; r < batchSize; ++r) {
+        result.isRepeating = true;
+        for (int r = 0; r < batchSize; ++r) {
           final long scaleFactor = powerOfTenTable[scale - 
scratchScaleVector[r]];
           result.vector[r] = SerializationUtils.readVslong(valueStream) * 
scaleFactor;
+          setIsRepeatingIfNeeded(result, r);
         }
       } else if (!result.isRepeating || !result.isNull[0]) {
-        for (int r=0; r < batchSize; ++r) {
+        result.isRepeating = true;
+        for (int r = 0; r < batchSize; ++r) {
           if (!result.isNull[r]) {
             final long scaleFactor = powerOfTenTable[scale - 
scratchScaleVector[r]];
             result.vector[r] = SerializationUtils.readVslong(valueStream) * 
scaleFactor;
           }
+          setIsRepeatingIfNeeded(result, r);
         }
       }
       result.precision = (short) precision;
@@ -1674,8 +1686,9 @@ public class TreeReaderFactory {
       // Read all the scales
       scaleReader.nextVector(result, scratchScaleVector, batchSize);
       if (result.noNulls) {
+        result.isRepeating = true;
         int previousIdx = 0;
-        for (int r=0; r != filterContext.getSelectedSize(); r++) {
+        for (int r = 0; r != filterContext.getSelectedSize(); r++) {
           int idx = filterContext.getSelected()[r];
           if (idx - previousIdx > 0) {
             skipStreamRows(idx - previousIdx);
@@ -1684,12 +1697,14 @@ public class TreeReaderFactory {
           for (int s=scratchScaleVector[idx]; s < scale; ++s) {
             result.vector[idx] *= 10;
           }
+          setIsRepeatingIfNeeded(result, idx);
           previousIdx = idx + 1;
         }
         skipStreamRows(batchSize - previousIdx);
       } else if (!result.isRepeating || !result.isNull[0]) {
+        result.isRepeating = true;
         int previousIdx = 0;
-        for (int r=0; r != filterContext.getSelectedSize(); r++) {
+        for (int r = 0; r != filterContext.getSelectedSize(); r++) {
           int idx = filterContext.getSelected()[r];
           if (idx - previousIdx > 0) {
             skipStreamRows(countNonNullRowsInRange(result.isNull, previousIdx, 
idx));
@@ -1700,6 +1715,7 @@ public class TreeReaderFactory {
               result.vector[idx] *= 10;
             }
           }
+          setIsRepeatingIfNeeded(result, idx);
           previousIdx = idx + 1;
         }
         skipStreamRows(countNonNullRowsInRange(result.isNull, previousIdx, 
batchSize));
@@ -1708,6 +1724,24 @@ public class TreeReaderFactory {
       result.scale = (short) scale;
     }
 
+    private void setIsRepeatingIfNeeded(Decimal64ColumnVector result, int 
index) {
+      if (result.isRepeating
+          && index > 0
+          && (result.vector[0] != result.vector[index]
+              || result.isNull[0] != result.isNull[index])) {
+        result.isRepeating = false;
+      }
+    }
+
+    private void setIsRepeatingIfNeeded(DecimalColumnVector result, int index) 
{
+      if (result.isRepeating
+          && index > 0
+          && (!result.vector[0].equals(result.vector[index])
+              || result.isNull[0] != result.isNull[index])) {
+        result.isRepeating = false;
+      }
+    }
+
     @Override
     public void nextVector(ColumnVector result,
                            boolean[] isNull,
@@ -1815,6 +1849,7 @@ public class TreeReaderFactory {
                             final int batchSize) throws IOException {
       if (result.noNulls) {
         if (filterContext.isSelectedInUse()) {
+          result.isRepeating = true;
           int previousIdx = 0;
           for (int r = 0; r != filterContext.getSelectedSize(); ++r) {
             int idx = filterContext.getSelected()[r];
@@ -1822,16 +1857,20 @@ public class TreeReaderFactory {
               valueReader.skip(idx - previousIdx);
             }
             result.vector[idx].setFromLongAndScale(valueReader.next(), scale);
+            setIsRepeatingIfNeeded(result, idx);
             previousIdx = idx + 1;
           }
           valueReader.skip(batchSize - previousIdx);
         } else {
+          result.isRepeating = true;
           for (int r = 0; r < batchSize; ++r) {
             result.vector[r].setFromLongAndScale(valueReader.next(), scale);
+            setIsRepeatingIfNeeded(result, r);
           }
         }
       } else if (!result.isRepeating || !result.isNull[0]) {
         if (filterContext.isSelectedInUse()) {
+          result.isRepeating = true;
           int previousIdx = 0;
           for (int r = 0; r != filterContext.getSelectedSize(); ++r) {
             int idx = filterContext.getSelected()[r];
@@ -1841,16 +1880,19 @@ public class TreeReaderFactory {
             if (!result.isNull[r]) {
               result.vector[idx].setFromLongAndScale(valueReader.next(), 
scale);
             }
+            setIsRepeatingIfNeeded(result, idx);
             previousIdx = idx + 1;
           }
           valueReader.skip(countNonNullRowsInRange(result.isNull, previousIdx, 
batchSize));
         } else {
-            for (int r = 0; r < batchSize; ++r) {
-              if (!result.isNull[r]) {
-                result.vector[r].setFromLongAndScale(valueReader.next(), 
scale);
-              }
+          result.isRepeating = true;
+          for (int r = 0; r < batchSize; ++r) {
+            if (!result.isNull[r]) {
+              result.vector[r].setFromLongAndScale(valueReader.next(), scale);
             }
+            setIsRepeatingIfNeeded(result, r);
           }
+        }
       }
       result.precision = (short) precision;
       result.scale = (short) scale;
@@ -1864,6 +1906,15 @@ public class TreeReaderFactory {
       result.scale = (short) scale;
     }
 
+    private void setIsRepeatingIfNeeded(DecimalColumnVector result, int index) 
{
+      if (result.isRepeating
+          && index > 0
+          && (!result.vector[0].equals(result.vector[index])
+              || result.isNull[0] != result.isNull[index])) {
+        result.isRepeating = false;
+      }
+    }
+
     @Override
     public void nextVector(ColumnVector result,
                            boolean[] isNull,
diff --git 
a/java/core/src/test/org/apache/orc/impl/TestConvertTreeReaderFactory.java 
b/java/core/src/test/org/apache/orc/impl/TestConvertTreeReaderFactory.java
index 49c3bf243..a90a285a6 100644
--- a/java/core/src/test/org/apache/orc/impl/TestConvertTreeReaderFactory.java
+++ b/java/core/src/test/org/apache/orc/impl/TestConvertTreeReaderFactory.java
@@ -31,6 +31,7 @@ import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.OrcConf;
 import org.apache.orc.OrcFile;
 import org.apache.orc.OrcFile.WriterOptions;
 import org.apache.orc.Reader;
@@ -52,6 +53,7 @@ import java.util.GregorianCalendar;
 import java.util.concurrent.TimeUnit;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertThrows;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 import static org.mockito.Mockito.mock;
@@ -639,4 +641,117 @@ public class TestConvertTreeReaderFactory {
   private void testConvertToBinaryIncreasingSize() throws Exception {
     readORCFileIncreasingBatchSize("binary", BytesColumnVector.class);
   }
+
+  @Test
+  public void testDecimalConvertInNullStripe() throws Exception {
+    try {
+      Configuration decimalConf = new Configuration(conf);
+      decimalConf.set(OrcConf.STRIPE_ROW_COUNT.getAttribute(), "1024");
+      decimalConf.set(OrcConf.ROWS_BETWEEN_CHECKS.getAttribute(), "1");
+
+      String typeStr = "decimal(5,1)";
+      TypeDescription schema = TypeDescription.fromString("struct<col1:" + 
typeStr + ">");
+      Writer w = OrcFile.createWriter(testFilePath, 
OrcFile.writerOptions(decimalConf).setSchema(schema));
+
+      VectorizedRowBatch b = schema.createRowBatch();
+      DecimalColumnVector f1 = (DecimalColumnVector) b.cols[0];
+      f1.isRepeating = true;
+      f1.set(0, (HiveDecimal) null);
+      b.size = 1024;
+      w.addRowBatch(b);
+
+      b.reset();
+      for (int i = 0; i < 1024; i++) {
+        f1.set(i, HiveDecimal.create(i + 1));
+      }
+      b.size = 1024;
+      w.addRowBatch(b);
+
+      b.reset();
+      f1.isRepeating = true;
+      f1.set(0, HiveDecimal.create(1));
+      b.size = 1024;
+      w.addRowBatch(b);
+
+      b.reset();
+      w.close();
+
+      testDecimalConvertToLongInNullStripe();
+      testDecimalConvertToDoubleInNullStripe();
+      testDecimalConvertToStringInNullStripe();
+      testDecimalConvertToTimestampInNullStripe();
+      testDecimalConvertToDecimalInNullStripe();
+    } finally {
+      fs.delete(testFilePath, false);
+    }
+  }
+
+  private void readDecimalInNullStripe(String typeString, Class<?> 
expectedColumnType,
+      String[] expectedResult) throws Exception {
+    Reader.Options options = new Reader.Options();
+    TypeDescription schema = TypeDescription.fromString("struct<col1:" + 
typeString + ">");
+    options.schema(schema);
+    String expected = options.toString();
+
+    Configuration conf = new Configuration();
+
+    Reader reader = OrcFile.createReader(testFilePath, 
OrcFile.readerOptions(conf));
+    RecordReader rows = reader.rows(options);
+    VectorizedRowBatch batch = schema.createRowBatch();
+
+    rows.nextBatch(batch);
+    assertEquals(1024, batch.size);
+    assertEquals(expected, options.toString());
+    assertEquals(batch.cols.length, 1);
+    assertEquals(batch.cols[0].getClass(), expectedColumnType);
+    assertTrue(batch.cols[0].isRepeating);
+    StringBuilder sb = new StringBuilder();
+    batch.cols[0].stringifyValue(sb, 1023);
+    assertEquals(sb.toString(), expectedResult[0]);
+
+    rows.nextBatch(batch);
+    assertEquals(1024, batch.size);
+    assertEquals(expected, options.toString());
+    assertEquals(batch.cols.length, 1);
+    assertEquals(batch.cols[0].getClass(), expectedColumnType);
+    assertFalse(batch.cols[0].isRepeating);
+    StringBuilder sb2 = new StringBuilder();
+    batch.cols[0].stringifyValue(sb2, 1023);
+    assertEquals(sb2.toString(), expectedResult[1]);
+
+    rows.nextBatch(batch);
+    assertEquals(1024, batch.size);
+    assertEquals(expected, options.toString());
+    assertEquals(batch.cols.length, 1);
+    assertEquals(batch.cols[0].getClass(), expectedColumnType);
+    assertTrue(batch.cols[0].isRepeating);
+    StringBuilder sb3 = new StringBuilder();
+    batch.cols[0].stringifyValue(sb3, 1023);
+    assertEquals(sb3.toString(), expectedResult[2]);
+  }
+
+  private void testDecimalConvertToLongInNullStripe() throws Exception {
+    readDecimalInNullStripe("bigint", LongColumnVector.class,
+            new String[]{"null", "1024", "1"});
+  }
+
+  private void testDecimalConvertToDoubleInNullStripe() throws Exception {
+    readDecimalInNullStripe("double", DoubleColumnVector.class,
+            new String[]{"null", "1024.0", "1.0"});
+  }
+
+  private void testDecimalConvertToStringInNullStripe() throws Exception {
+    readDecimalInNullStripe("string", BytesColumnVector.class,
+            new String[]{"null", "\"1024\"", "\"1\""});
+  }
+
+  private void testDecimalConvertToTimestampInNullStripe() throws Exception {
+    readDecimalInNullStripe("timestamp", TimestampColumnVector.class,
+            new String[]{"null", "1970-01-01 00:17:04.0", "1970-01-01 
00:00:01.0"});
+  }
+
+  private void testDecimalConvertToDecimalInNullStripe() throws Exception {
+    readDecimalInNullStripe("decimal(18,2)", DecimalColumnVector.class,
+            new String[]{"null", "1024", "1"});
+  }
 }

Reply via email to