This is an automated email from the ASF dual-hosted git repository.

dkuzmenko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new 146c9506038 HIVE-28202: Incorrect projected column size after ORC 
upgrade to v1.6.7 (Denys Kuzmenko, reviewed by Laszlo Bodor)
146c9506038 is described below

commit 146c9506038fa9591c5091948ed0407070a0c1b2
Author: Denys Kuzmenko <[email protected]>
AuthorDate: Tue May 28 12:55:47 2024 +0300

    HIVE-28202: Incorrect projected column size after ORC upgrade to v1.6.7 
(Denys Kuzmenko, reviewed by Laszlo Bodor)
    
    Closes #5195
---
 .../ql/txn/compactor/TestCrudCompactorOnTez.java   |  44 ++++++++++-----------
 .../hadoop/hive/ql/io/orc/OrcInputFormat.java      |  30 ++++----------
 .../hive/ql/io/orc/TestInputOutputFormat.java      |  43 ++++++++++++++------
 ql/src/test/resources/bucket_00952_0               | Bin 0 -> 32743456 bytes
 4 files changed, 62 insertions(+), 55 deletions(-)

diff --git 
a/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/txn/compactor/TestCrudCompactorOnTez.java
 
b/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/txn/compactor/TestCrudCompactorOnTez.java
index c2b3a64ee8d..9abac8636de 100644
--- 
a/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/txn/compactor/TestCrudCompactorOnTez.java
+++ 
b/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/txn/compactor/TestCrudCompactorOnTez.java
@@ -177,22 +177,22 @@ public class TestCrudCompactorOnTez extends 
CompactorOnTezTest {
             "{\"writeid\":7,\"bucketid\":536870912,\"rowid\":4}\t13\t13",
         },
         {
-            "{\"writeid\":7,\"bucketid\":536936448,\"rowid\":6}\t4\t4",
+            "{\"writeid\":7,\"bucketid\":536936448,\"rowid\":6}\t6\t4",
             "{\"writeid\":7,\"bucketid\":536936448,\"rowid\":7}\t3\t4",
-            "{\"writeid\":7,\"bucketid\":536936448,\"rowid\":8}\t2\t4",
-            "{\"writeid\":7,\"bucketid\":536936448,\"rowid\":9}\t5\t4",
+            "{\"writeid\":7,\"bucketid\":536936448,\"rowid\":8}\t4\t4",
+            "{\"writeid\":7,\"bucketid\":536936448,\"rowid\":9}\t2\t4",
         },
         {
-            "{\"writeid\":7,\"bucketid\":537001984,\"rowid\":10}\t6\t4",
-            "{\"writeid\":7,\"bucketid\":537001984,\"rowid\":11}\t5\t3",
+            "{\"writeid\":7,\"bucketid\":537001984,\"rowid\":10}\t5\t4",
+            "{\"writeid\":7,\"bucketid\":537001984,\"rowid\":11}\t2\t3",
             "{\"writeid\":7,\"bucketid\":537001984,\"rowid\":12}\t3\t3",
-            "{\"writeid\":7,\"bucketid\":537001984,\"rowid\":13}\t2\t3",
+            "{\"writeid\":7,\"bucketid\":537001984,\"rowid\":13}\t6\t3",
             "{\"writeid\":7,\"bucketid\":537001984,\"rowid\":14}\t4\t3",
         },
         {
-            "{\"writeid\":7,\"bucketid\":537067520,\"rowid\":15}\t6\t3",
-            "{\"writeid\":7,\"bucketid\":537067520,\"rowid\":16}\t5\t2",
-            "{\"writeid\":7,\"bucketid\":537067520,\"rowid\":17}\t6\t2",
+            "{\"writeid\":7,\"bucketid\":537067520,\"rowid\":15}\t5\t3",
+            "{\"writeid\":7,\"bucketid\":537067520,\"rowid\":16}\t6\t2",
+            "{\"writeid\":7,\"bucketid\":537067520,\"rowid\":17}\t5\t2",
         },
     };
     verifyRebalance(testDataProvider, tableName, null, expectedBuckets,
@@ -233,22 +233,22 @@ public class TestCrudCompactorOnTez extends 
CompactorOnTezTest {
         },
         {
             "{\"writeid\":7,\"bucketid\":536936448,\"rowid\":5}\t12\t12",
-            "{\"writeid\":7,\"bucketid\":536936448,\"rowid\":6}\t4\t4",
+            "{\"writeid\":7,\"bucketid\":536936448,\"rowid\":6}\t6\t4",
             "{\"writeid\":7,\"bucketid\":536936448,\"rowid\":7}\t3\t4",
-            "{\"writeid\":7,\"bucketid\":536936448,\"rowid\":8}\t2\t4",
-            "{\"writeid\":7,\"bucketid\":536936448,\"rowid\":9}\t5\t4",
+            "{\"writeid\":7,\"bucketid\":536936448,\"rowid\":8}\t4\t4",
+            "{\"writeid\":7,\"bucketid\":536936448,\"rowid\":9}\t2\t4",
         },
         {
-            "{\"writeid\":7,\"bucketid\":537001984,\"rowid\":10}\t6\t4",
-            "{\"writeid\":7,\"bucketid\":537001984,\"rowid\":11}\t5\t3",
+            "{\"writeid\":7,\"bucketid\":537001984,\"rowid\":10}\t5\t4",
+            "{\"writeid\":7,\"bucketid\":537001984,\"rowid\":11}\t2\t3",
             "{\"writeid\":7,\"bucketid\":537001984,\"rowid\":12}\t3\t3",
-            "{\"writeid\":7,\"bucketid\":537001984,\"rowid\":13}\t2\t3",
+            "{\"writeid\":7,\"bucketid\":537001984,\"rowid\":13}\t6\t3",
             "{\"writeid\":7,\"bucketid\":537001984,\"rowid\":14}\t4\t3",
         },
         {
-            "{\"writeid\":7,\"bucketid\":537067520,\"rowid\":15}\t6\t3",
-            "{\"writeid\":7,\"bucketid\":537067520,\"rowid\":16}\t5\t2",
-            "{\"writeid\":7,\"bucketid\":537067520,\"rowid\":17}\t6\t2",
+            "{\"writeid\":7,\"bucketid\":537067520,\"rowid\":15}\t5\t3",
+            "{\"writeid\":7,\"bucketid\":537067520,\"rowid\":16}\t6\t2",
+            "{\"writeid\":7,\"bucketid\":537067520,\"rowid\":17}\t5\t2",
         },
     };
     verifyRebalance(testDataProvider, tableName, null, expectedBuckets,
@@ -524,8 +524,6 @@ public class TestCrudCompactorOnTez extends 
CompactorOnTezTest {
             "{\"writeid\":1,\"bucketid\":536870912,\"rowid\":1}\t6\t2",
             "{\"writeid\":1,\"bucketid\":536870912,\"rowid\":2}\t6\t3",
             "{\"writeid\":1,\"bucketid\":536870912,\"rowid\":3}\t6\t4",
-            "{\"writeid\":1,\"bucketid\":536870912,\"rowid\":4}\t5\t2",
-            "{\"writeid\":1,\"bucketid\":536870912,\"rowid\":5}\t5\t3",
             "{\"writeid\":2,\"bucketid\":536870912,\"rowid\":0}\t12\t12",
             "{\"writeid\":3,\"bucketid\":536870912,\"rowid\":0}\t13\t13",
             "{\"writeid\":4,\"bucketid\":536870912,\"rowid\":0}\t14\t14",
@@ -534,7 +532,9 @@ public class TestCrudCompactorOnTez extends 
CompactorOnTezTest {
             "{\"writeid\":7,\"bucketid\":536870912,\"rowid\":0}\t17\t17",
         },
         {
-            "{\"writeid\":1,\"bucketid\":536936448,\"rowid\":0}\t2\t4",
+            "{\"writeid\":1,\"bucketid\":536936448,\"rowid\":0}\t5\t2",
+            "{\"writeid\":1,\"bucketid\":536936448,\"rowid\":1}\t5\t3",
+            "{\"writeid\":1,\"bucketid\":536936448,\"rowid\":2}\t2\t4",
         },
         {
             "{\"writeid\":1,\"bucketid\":537001984,\"rowid\":0}\t3\t3",
@@ -562,7 +562,7 @@ public class TestCrudCompactorOnTez extends 
CompactorOnTezTest {
     Assert.assertEquals("Buckets does not match after compaction", 
Arrays.asList(bucketNames),
         CompactorTestUtil.getBucketFileNames(fs, table, partitionName, 
folderName));
     AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf);
-    for(int i = 0; i < expectedBucketContent.length; i++) {
+    for (int i = 0; i < expectedBucketContent.length; i++) {
       Assert.assertEquals("rebalanced bucket " + i, 
Arrays.asList(expectedBucketContent[i]),
           testDataProvider.getBucketData(tableName, 
BucketCodec.V1.encode(options.bucket(i)) + ""));
     }
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java 
b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
index 564836144f0..54d693c2fd4 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
@@ -1725,35 +1725,21 @@ public class OrcInputFormat implements 
InputFormat<NullWritable, OrcStruct>,
     }
 
     private long computeProjectionSize(List<OrcProto.Type> fileTypes,
-        List<OrcProto.ColumnStatistics> stats, boolean[] fileIncluded) throws 
FileFormatException {
-      List<Integer> internalColIds = Lists.newArrayList();
+          List<OrcProto.ColumnStatistics> stats, boolean[] fileIncluded) 
throws FileFormatException {
+      // Exclude ORC <root> and ACID <row> struct elements to avoid full 
schema size estimation  
+      final List<Integer> internalColIds;
       if (fileIncluded == null) {
-        // Add all.
-        for (int i = 0; i < fileTypes.size(); i++) {
-          internalColIds.add(i);
-        }
+        internalColIds = IntStream.range(1, fileTypes.size())
+            .boxed().collect(Collectors.toList());
       } else {
-        for (int i = 0; i < fileIncluded.length; i++) {
-          if (fileIncluded[i]) {
-            internalColIds.add(i);
-          }
-        }
+        internalColIds = IntStream.range(1, fileTypes.size()).filter(i -> 
fileIncluded[i])
+            .filter(i -> i != OrcRecordUpdater.ROW + 1 || isOriginal) 
+            .boxed().collect(Collectors.toList());
       }
       return ReaderImpl.getRawDataSizeFromColIndices(internalColIds, 
fileTypes, stats);
     }
   }
 
-  public static boolean[] shiftReaderIncludedForAcid(boolean[] included) {
-    // We always need the base row
-    included[0] = true;
-    boolean[] newIncluded = new boolean[included.length + 
OrcRecordUpdater.FIELDS];
-    Arrays.fill(newIncluded, 0, OrcRecordUpdater.FIELDS, true);
-    for (int i = 0; i < included.length; ++i) {
-      newIncluded[i + OrcRecordUpdater.FIELDS] = included[i];
-    }
-    return newIncluded;
-  }
-
   /** Class intended to update two values from methods... Java-related cruft. 
*/
   @VisibleForTesting
   static final class CombinedCtx {
diff --git 
a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java 
b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
index c986d10ab4c..456d180b884 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
@@ -122,6 +122,7 @@ import org.apache.hadoop.mapred.RecordWriter;
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.hadoop.security.UserGroupInformation;
 import org.apache.hadoop.util.Progressable;
+import org.apache.hive.common.util.HiveTestUtils;
 import org.apache.orc.FileFormatException;
 import org.apache.orc.OrcConf;
 import org.apache.orc.OrcProto;
@@ -1785,21 +1786,21 @@ public class TestInputOutputFormat {
     OrcInputFormat.SplitGenerator splitter =
         new OrcInputFormat.SplitGenerator(new 
OrcInputFormat.SplitInfo(context, fs,
             fs.getFileStatus(new Path("/a/file")), null, null, true,
-            new ArrayList<AcidInputFormat.DeltaMetaData>(), true, null, null), 
null, true, true);
+            new ArrayList<>(), true, null, null), null, true, true);
     List<OrcSplit> results = splitter.call();
     OrcSplit result = results.get(0);
     assertEquals(3, results.size());
     assertEquals(3, result.getStart());
     assertEquals(400, result.getLength());
-    assertEquals(175168, result.getProjectedColumnsUncompressedSize());
+    assertEquals(167468, result.getProjectedColumnsUncompressedSize());
     result = results.get(1);
     assertEquals(403, result.getStart());
     assertEquals(400, result.getLength());
-    assertEquals(175168, result.getProjectedColumnsUncompressedSize());
+    assertEquals(167468, result.getProjectedColumnsUncompressedSize());
     result = results.get(2);
     assertEquals(803, result.getStart());
     assertEquals(100, result.getLength());
-    assertEquals(43792, result.getProjectedColumnsUncompressedSize());
+    assertEquals(41867, result.getProjectedColumnsUncompressedSize());
 
     // test min = 0, max = 0 generates each stripe
     HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPRED_MAX_SPLIT_SIZE, 0);
@@ -1807,17 +1808,16 @@ public class TestInputOutputFormat {
     context = new OrcInputFormat.Context(conf);
     splitter = new OrcInputFormat.SplitGenerator(new 
OrcInputFormat.SplitInfo(context, fs,
         fs.getFileStatus(new Path("/a/file")), null, null, true,
-        new ArrayList<AcidInputFormat.DeltaMetaData>(),
-        true, null, null), null, true, true);
+        new ArrayList<>(), true, null, null), null, true, true);
     results = splitter.call();
     assertEquals(5, results.size());
     for (int i = 0; i < stripeSizes.length; ++i) {
       assertEquals("checking stripe " + i + " size",
           stripeSizes[i], results.get(i).getLength());
       if (i == stripeSizes.length - 1) {
-        assertEquals(43792, 
results.get(i).getProjectedColumnsUncompressedSize());
+        assertEquals(41867, 
results.get(i).getProjectedColumnsUncompressedSize());
       } else {
-        assertEquals(87584, 
results.get(i).getProjectedColumnsUncompressedSize());
+        assertEquals(83734, 
results.get(i).getProjectedColumnsUncompressedSize());
       }
     }
 
@@ -1827,16 +1827,37 @@ public class TestInputOutputFormat {
     context = new OrcInputFormat.Context(conf);
     splitter = new OrcInputFormat.SplitGenerator(new 
OrcInputFormat.SplitInfo(context, fs,
         fs.getFileStatus(new Path("/a/file")), null, null, true,
-        new ArrayList<AcidInputFormat.DeltaMetaData>(),
-        true, null, null), null, true, true);
+        new ArrayList<>(), true, null, null), null, true, true);
     results = splitter.call();
     assertEquals(1, results.size());
     result = results.get(0);
     assertEquals(3, result.getStart());
     assertEquals(900, result.getLength());
-    assertEquals(394128, result.getProjectedColumnsUncompressedSize());
+    assertEquals(376804, result.getProjectedColumnsUncompressedSize());
   }
 
+  @Test
+  public void testAcidProjectedColumnSize() throws Exception {
+    Path file = new Path(HiveTestUtils.getFileFromClasspath("bucket_00952_0"));
+    FileSystem fs = FileSystem.getLocal(conf).getRaw();
+
+    conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, true);
+    OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
+    
+    OrcInputFormat.SplitInfo splitInfo = new OrcInputFormat.SplitInfo(
+        context, fs, fs.getFileStatus(file), null, null, false, new 
ArrayList<>(), true, null, null);
+    OrcInputFormat.SplitGenerator splitter = new 
OrcInputFormat.SplitGenerator(splitInfo, null, true, true);
+    
+    List<OrcSplit> results = splitter.call();
+    assertEquals(1246255309, 
results.get(0).getProjectedColumnsUncompressedSize());
+
+    conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
+    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "2,4,6,8");
+    
+    results = splitter.call();
+    assertEquals(10509573, 
results.get(0).getProjectedColumnsUncompressedSize());
+  }
+  
   @Test
   public void testInOutFormat() throws Exception {
     Properties properties = new Properties();
diff --git a/ql/src/test/resources/bucket_00952_0 
b/ql/src/test/resources/bucket_00952_0
new file mode 100644
index 00000000000..3b7b6316229
Binary files /dev/null and b/ql/src/test/resources/bucket_00952_0 differ

Reply via email to