This is an automated email from the ASF dual-hosted git repository.
dkuzmenko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 146c9506038 HIVE-28202: Incorrect projected column size after ORC
upgrade to v1.6.7 (Denys Kuzmenko, reviewed by Laszlo Bodor)
146c9506038 is described below
commit 146c9506038fa9591c5091948ed0407070a0c1b2
Author: Denys Kuzmenko <[email protected]>
AuthorDate: Tue May 28 12:55:47 2024 +0300
HIVE-28202: Incorrect projected column size after ORC upgrade to v1.6.7
(Denys Kuzmenko, reviewed by Laszlo Bodor)
Closes #5195
---
.../ql/txn/compactor/TestCrudCompactorOnTez.java | 44 ++++++++++-----------
.../hadoop/hive/ql/io/orc/OrcInputFormat.java | 30 ++++----------
.../hive/ql/io/orc/TestInputOutputFormat.java | 43 ++++++++++++++------
ql/src/test/resources/bucket_00952_0 | Bin 0 -> 32743456 bytes
4 files changed, 62 insertions(+), 55 deletions(-)
diff --git
a/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/txn/compactor/TestCrudCompactorOnTez.java
b/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/txn/compactor/TestCrudCompactorOnTez.java
index c2b3a64ee8d..9abac8636de 100644
---
a/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/txn/compactor/TestCrudCompactorOnTez.java
+++
b/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/txn/compactor/TestCrudCompactorOnTez.java
@@ -177,22 +177,22 @@ public class TestCrudCompactorOnTez extends
CompactorOnTezTest {
"{\"writeid\":7,\"bucketid\":536870912,\"rowid\":4}\t13\t13",
},
{
- "{\"writeid\":7,\"bucketid\":536936448,\"rowid\":6}\t4\t4",
+ "{\"writeid\":7,\"bucketid\":536936448,\"rowid\":6}\t6\t4",
"{\"writeid\":7,\"bucketid\":536936448,\"rowid\":7}\t3\t4",
- "{\"writeid\":7,\"bucketid\":536936448,\"rowid\":8}\t2\t4",
- "{\"writeid\":7,\"bucketid\":536936448,\"rowid\":9}\t5\t4",
+ "{\"writeid\":7,\"bucketid\":536936448,\"rowid\":8}\t4\t4",
+ "{\"writeid\":7,\"bucketid\":536936448,\"rowid\":9}\t2\t4",
},
{
- "{\"writeid\":7,\"bucketid\":537001984,\"rowid\":10}\t6\t4",
- "{\"writeid\":7,\"bucketid\":537001984,\"rowid\":11}\t5\t3",
+ "{\"writeid\":7,\"bucketid\":537001984,\"rowid\":10}\t5\t4",
+ "{\"writeid\":7,\"bucketid\":537001984,\"rowid\":11}\t2\t3",
"{\"writeid\":7,\"bucketid\":537001984,\"rowid\":12}\t3\t3",
- "{\"writeid\":7,\"bucketid\":537001984,\"rowid\":13}\t2\t3",
+ "{\"writeid\":7,\"bucketid\":537001984,\"rowid\":13}\t6\t3",
"{\"writeid\":7,\"bucketid\":537001984,\"rowid\":14}\t4\t3",
},
{
- "{\"writeid\":7,\"bucketid\":537067520,\"rowid\":15}\t6\t3",
- "{\"writeid\":7,\"bucketid\":537067520,\"rowid\":16}\t5\t2",
- "{\"writeid\":7,\"bucketid\":537067520,\"rowid\":17}\t6\t2",
+ "{\"writeid\":7,\"bucketid\":537067520,\"rowid\":15}\t5\t3",
+ "{\"writeid\":7,\"bucketid\":537067520,\"rowid\":16}\t6\t2",
+ "{\"writeid\":7,\"bucketid\":537067520,\"rowid\":17}\t5\t2",
},
};
verifyRebalance(testDataProvider, tableName, null, expectedBuckets,
@@ -233,22 +233,22 @@ public class TestCrudCompactorOnTez extends
CompactorOnTezTest {
},
{
"{\"writeid\":7,\"bucketid\":536936448,\"rowid\":5}\t12\t12",
- "{\"writeid\":7,\"bucketid\":536936448,\"rowid\":6}\t4\t4",
+ "{\"writeid\":7,\"bucketid\":536936448,\"rowid\":6}\t6\t4",
"{\"writeid\":7,\"bucketid\":536936448,\"rowid\":7}\t3\t4",
- "{\"writeid\":7,\"bucketid\":536936448,\"rowid\":8}\t2\t4",
- "{\"writeid\":7,\"bucketid\":536936448,\"rowid\":9}\t5\t4",
+ "{\"writeid\":7,\"bucketid\":536936448,\"rowid\":8}\t4\t4",
+ "{\"writeid\":7,\"bucketid\":536936448,\"rowid\":9}\t2\t4",
},
{
- "{\"writeid\":7,\"bucketid\":537001984,\"rowid\":10}\t6\t4",
- "{\"writeid\":7,\"bucketid\":537001984,\"rowid\":11}\t5\t3",
+ "{\"writeid\":7,\"bucketid\":537001984,\"rowid\":10}\t5\t4",
+ "{\"writeid\":7,\"bucketid\":537001984,\"rowid\":11}\t2\t3",
"{\"writeid\":7,\"bucketid\":537001984,\"rowid\":12}\t3\t3",
- "{\"writeid\":7,\"bucketid\":537001984,\"rowid\":13}\t2\t3",
+ "{\"writeid\":7,\"bucketid\":537001984,\"rowid\":13}\t6\t3",
"{\"writeid\":7,\"bucketid\":537001984,\"rowid\":14}\t4\t3",
},
{
- "{\"writeid\":7,\"bucketid\":537067520,\"rowid\":15}\t6\t3",
- "{\"writeid\":7,\"bucketid\":537067520,\"rowid\":16}\t5\t2",
- "{\"writeid\":7,\"bucketid\":537067520,\"rowid\":17}\t6\t2",
+ "{\"writeid\":7,\"bucketid\":537067520,\"rowid\":15}\t5\t3",
+ "{\"writeid\":7,\"bucketid\":537067520,\"rowid\":16}\t6\t2",
+ "{\"writeid\":7,\"bucketid\":537067520,\"rowid\":17}\t5\t2",
},
};
verifyRebalance(testDataProvider, tableName, null, expectedBuckets,
@@ -524,8 +524,6 @@ public class TestCrudCompactorOnTez extends
CompactorOnTezTest {
"{\"writeid\":1,\"bucketid\":536870912,\"rowid\":1}\t6\t2",
"{\"writeid\":1,\"bucketid\":536870912,\"rowid\":2}\t6\t3",
"{\"writeid\":1,\"bucketid\":536870912,\"rowid\":3}\t6\t4",
- "{\"writeid\":1,\"bucketid\":536870912,\"rowid\":4}\t5\t2",
- "{\"writeid\":1,\"bucketid\":536870912,\"rowid\":5}\t5\t3",
"{\"writeid\":2,\"bucketid\":536870912,\"rowid\":0}\t12\t12",
"{\"writeid\":3,\"bucketid\":536870912,\"rowid\":0}\t13\t13",
"{\"writeid\":4,\"bucketid\":536870912,\"rowid\":0}\t14\t14",
@@ -534,7 +532,9 @@ public class TestCrudCompactorOnTez extends
CompactorOnTezTest {
"{\"writeid\":7,\"bucketid\":536870912,\"rowid\":0}\t17\t17",
},
{
- "{\"writeid\":1,\"bucketid\":536936448,\"rowid\":0}\t2\t4",
+ "{\"writeid\":1,\"bucketid\":536936448,\"rowid\":0}\t5\t2",
+ "{\"writeid\":1,\"bucketid\":536936448,\"rowid\":1}\t5\t3",
+ "{\"writeid\":1,\"bucketid\":536936448,\"rowid\":2}\t2\t4",
},
{
"{\"writeid\":1,\"bucketid\":537001984,\"rowid\":0}\t3\t3",
@@ -562,7 +562,7 @@ public class TestCrudCompactorOnTez extends
CompactorOnTezTest {
Assert.assertEquals("Buckets does not match after compaction",
Arrays.asList(bucketNames),
CompactorTestUtil.getBucketFileNames(fs, table, partitionName,
folderName));
AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf);
- for(int i = 0; i < expectedBucketContent.length; i++) {
+ for (int i = 0; i < expectedBucketContent.length; i++) {
Assert.assertEquals("rebalanced bucket " + i,
Arrays.asList(expectedBucketContent[i]),
testDataProvider.getBucketData(tableName,
BucketCodec.V1.encode(options.bucket(i)) + ""));
}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
index 564836144f0..54d693c2fd4 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
@@ -1725,35 +1725,21 @@ public class OrcInputFormat implements
InputFormat<NullWritable, OrcStruct>,
}
private long computeProjectionSize(List<OrcProto.Type> fileTypes,
- List<OrcProto.ColumnStatistics> stats, boolean[] fileIncluded) throws
FileFormatException {
- List<Integer> internalColIds = Lists.newArrayList();
+ List<OrcProto.ColumnStatistics> stats, boolean[] fileIncluded)
throws FileFormatException {
+ // Exclude ORC <root> and ACID <row> struct elements to avoid full
schema size estimation
+ final List<Integer> internalColIds;
if (fileIncluded == null) {
- // Add all.
- for (int i = 0; i < fileTypes.size(); i++) {
- internalColIds.add(i);
- }
+ internalColIds = IntStream.range(1, fileTypes.size())
+ .boxed().collect(Collectors.toList());
} else {
- for (int i = 0; i < fileIncluded.length; i++) {
- if (fileIncluded[i]) {
- internalColIds.add(i);
- }
- }
+ internalColIds = IntStream.range(1, fileTypes.size()).filter(i ->
fileIncluded[i])
+ .filter(i -> i != OrcRecordUpdater.ROW + 1 || isOriginal)
+ .boxed().collect(Collectors.toList());
}
return ReaderImpl.getRawDataSizeFromColIndices(internalColIds,
fileTypes, stats);
}
}
- public static boolean[] shiftReaderIncludedForAcid(boolean[] included) {
- // We always need the base row
- included[0] = true;
- boolean[] newIncluded = new boolean[included.length +
OrcRecordUpdater.FIELDS];
- Arrays.fill(newIncluded, 0, OrcRecordUpdater.FIELDS, true);
- for (int i = 0; i < included.length; ++i) {
- newIncluded[i + OrcRecordUpdater.FIELDS] = included[i];
- }
- return newIncluded;
- }
-
/** Class intended to update two values from methods... Java-related cruft.
*/
@VisibleForTesting
static final class CombinedCtx {
diff --git
a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
index c986d10ab4c..456d180b884 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
@@ -122,6 +122,7 @@ import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.Progressable;
+import org.apache.hive.common.util.HiveTestUtils;
import org.apache.orc.FileFormatException;
import org.apache.orc.OrcConf;
import org.apache.orc.OrcProto;
@@ -1785,21 +1786,21 @@ public class TestInputOutputFormat {
OrcInputFormat.SplitGenerator splitter =
new OrcInputFormat.SplitGenerator(new
OrcInputFormat.SplitInfo(context, fs,
fs.getFileStatus(new Path("/a/file")), null, null, true,
- new ArrayList<AcidInputFormat.DeltaMetaData>(), true, null, null),
null, true, true);
+ new ArrayList<>(), true, null, null), null, true, true);
List<OrcSplit> results = splitter.call();
OrcSplit result = results.get(0);
assertEquals(3, results.size());
assertEquals(3, result.getStart());
assertEquals(400, result.getLength());
- assertEquals(175168, result.getProjectedColumnsUncompressedSize());
+ assertEquals(167468, result.getProjectedColumnsUncompressedSize());
result = results.get(1);
assertEquals(403, result.getStart());
assertEquals(400, result.getLength());
- assertEquals(175168, result.getProjectedColumnsUncompressedSize());
+ assertEquals(167468, result.getProjectedColumnsUncompressedSize());
result = results.get(2);
assertEquals(803, result.getStart());
assertEquals(100, result.getLength());
- assertEquals(43792, result.getProjectedColumnsUncompressedSize());
+ assertEquals(41867, result.getProjectedColumnsUncompressedSize());
// test min = 0, max = 0 generates each stripe
HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPRED_MAX_SPLIT_SIZE, 0);
@@ -1807,17 +1808,16 @@ public class TestInputOutputFormat {
context = new OrcInputFormat.Context(conf);
splitter = new OrcInputFormat.SplitGenerator(new
OrcInputFormat.SplitInfo(context, fs,
fs.getFileStatus(new Path("/a/file")), null, null, true,
- new ArrayList<AcidInputFormat.DeltaMetaData>(),
- true, null, null), null, true, true);
+ new ArrayList<>(), true, null, null), null, true, true);
results = splitter.call();
assertEquals(5, results.size());
for (int i = 0; i < stripeSizes.length; ++i) {
assertEquals("checking stripe " + i + " size",
stripeSizes[i], results.get(i).getLength());
if (i == stripeSizes.length - 1) {
- assertEquals(43792,
results.get(i).getProjectedColumnsUncompressedSize());
+ assertEquals(41867,
results.get(i).getProjectedColumnsUncompressedSize());
} else {
- assertEquals(87584,
results.get(i).getProjectedColumnsUncompressedSize());
+ assertEquals(83734,
results.get(i).getProjectedColumnsUncompressedSize());
}
}
@@ -1827,16 +1827,37 @@ public class TestInputOutputFormat {
context = new OrcInputFormat.Context(conf);
splitter = new OrcInputFormat.SplitGenerator(new
OrcInputFormat.SplitInfo(context, fs,
fs.getFileStatus(new Path("/a/file")), null, null, true,
- new ArrayList<AcidInputFormat.DeltaMetaData>(),
- true, null, null), null, true, true);
+ new ArrayList<>(), true, null, null), null, true, true);
results = splitter.call();
assertEquals(1, results.size());
result = results.get(0);
assertEquals(3, result.getStart());
assertEquals(900, result.getLength());
- assertEquals(394128, result.getProjectedColumnsUncompressedSize());
+ assertEquals(376804, result.getProjectedColumnsUncompressedSize());
}
+ @Test
+ public void testAcidProjectedColumnSize() throws Exception {
+ Path file = new Path(HiveTestUtils.getFileFromClasspath("bucket_00952_0"));
+ FileSystem fs = FileSystem.getLocal(conf).getRaw();
+
+ conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, true);
+ OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
+
+ OrcInputFormat.SplitInfo splitInfo = new OrcInputFormat.SplitInfo(
+ context, fs, fs.getFileStatus(file), null, null, false, new
ArrayList<>(), true, null, null);
+ OrcInputFormat.SplitGenerator splitter = new
OrcInputFormat.SplitGenerator(splitInfo, null, true, true);
+
+ List<OrcSplit> results = splitter.call();
+ assertEquals(1246255309,
results.get(0).getProjectedColumnsUncompressedSize());
+
+ conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
+ conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "2,4,6,8");
+
+ results = splitter.call();
+ assertEquals(10509573,
results.get(0).getProjectedColumnsUncompressedSize());
+ }
+
@Test
public void testInOutFormat() throws Exception {
Properties properties = new Properties();
diff --git a/ql/src/test/resources/bucket_00952_0
b/ql/src/test/resources/bucket_00952_0
new file mode 100644
index 00000000000..3b7b6316229
Binary files /dev/null and b/ql/src/test/resources/bucket_00952_0 differ