This is an automated email from the ASF dual-hosted git repository.
hashutosh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 13ec7c3 HIVE-21390 : BI split strategy does not work for blob stores
(Prasanth Jayachandran reviewed by Gopal V)
13ec7c3 is described below
commit 13ec7c3ab47001df25e7be87739731192075b0a7
Author: Prasanth Jayachandran <[email protected]>
AuthorDate: Sat Mar 9 09:13:59 2019 -0800
HIVE-21390 : BI split strategy does not work for blob stores (Prasanth
Jayachandran reviewed by Gopal V)
Signed-off-by: Ashutosh Chauhan <[email protected]>
---
.../java/org/apache/hadoop/hive/conf/HiveConf.java | 3 +++
.../hive/hcatalog/streaming/TestStreaming.java | 4 ++++
.../streaming/mutate/StreamingTestUtils.java | 5 ++++
.../hadoop/hive/ql/io/orc/OrcInputFormat.java | 27 ++++++++++++++++------
.../hive/ql/io/orc/TestInputOutputFormat.java | 5 ++++
5 files changed, 37 insertions(+), 7 deletions(-)
diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index 076035b..c33d03e 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -1984,6 +1984,9 @@ public class HiveConf extends Configuration {
" ETL strategy is used when spending little more time in split
generation is acceptable" +
" (split generation reads and caches file footers). HYBRID chooses
between the above strategies" +
" based on heuristics."),
+ HIVE_ORC_BLOB_STORAGE_SPLIT_SIZE("hive.exec.orc.blob.storage.split.size",
128L * 1024 * 1024,
+ "When blob storage is used, BI split strategy does not have block
locations for splitting orc files.\n" +
+ "In such cases, split generation will use this config to split orc
file"),
HIVE_ORC_WRITER_LLAP_MEMORY_MANAGER_ENABLED("hive.exec.orc.writer.llap.memory.manager.enabled",
true,
"Whether orc writers should use llap-aware memory manager. LLAP aware
memory manager will use memory\n" +
"per executor instead of entire heap memory when concurrent orc
writers are involved. This will let\n" +
diff --git a/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/TestStreaming.java b/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/TestStreaming.java
index d0d9759..4dc04f4 100644
--- a/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/TestStreaming.java
+++ b/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/TestStreaming.java
@@ -133,6 +133,10 @@ public class TestStreaming {
return NAME;
}
+ @Override
+ public String getScheme() {
+ return "raw";
+ }
@Override
public FileStatus getFileStatus(Path path) throws IOException {
diff --git a/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/mutate/StreamingTestUtils.java b/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/mutate/StreamingTestUtils.java
index 63690f9..afda7d5 100644
--- a/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/mutate/StreamingTestUtils.java
+++ b/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/mutate/StreamingTestUtils.java
@@ -90,6 +90,11 @@ public class StreamingTestUtils {
}
@Override
+ public String getScheme() {
+ return "raw";
+ }
+
+ @Override
public FileStatus getFileStatus(Path path) throws IOException {
File file = pathToFile(path);
if (!file.exists()) {
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
index ca25449..9dac185 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
@@ -18,6 +18,7 @@
package org.apache.hadoop.hive.ql.io.orc;
+import org.apache.hadoop.hive.common.BlobStorageUtils;
import org.apache.hadoop.hive.common.NoDynamicValuesException;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hdfs.DistributedFileSystem;
@@ -1074,16 +1075,28 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
if (fileKey == null && allowSyntheticFileIds) {
fileKey = new SyntheticFileId(fileStatus);
}
- TreeMap<Long, BlockLocation> blockOffsets =
SHIMS.getLocationsWithOffset(fs, fileStatus);
- for (Map.Entry<Long, BlockLocation> entry : blockOffsets.entrySet())
{
- if(entry.getKey() + entry.getValue().getLength() > logicalLen) {
- //don't create splits for anything past logical EOF
- continue;
+ if (BlobStorageUtils.isBlobStorageFileSystem(conf, fs)) {
+ final long splitSize = HiveConf.getLongVar(conf,
HiveConf.ConfVars.HIVE_ORC_BLOB_STORAGE_SPLIT_SIZE);
+ LOG.info("Blob storage detected for BI split strategy. Splitting
files at boundary {}..", splitSize);
+ long start;
+ for (start = 0; start < logicalLen; start = start + splitSize) {
+ OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), fileKey,
start,
+ Math.min(splitSize, logicalLen - start), null, null,
isOriginal, true,
+ deltas, -1, logicalLen, dir, offsetAndBucket);
+ splits.add(orcSplit);
}
- OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), fileKey,
entry.getKey(),
+ } else {
+ TreeMap<Long, BlockLocation> blockOffsets =
SHIMS.getLocationsWithOffset(fs, fileStatus);
+ for (Map.Entry<Long, BlockLocation> entry :
blockOffsets.entrySet()) {
+ if (entry.getKey() + entry.getValue().getLength() > logicalLen) {
+ //don't create splits for anything past logical EOF
+ continue;
+ }
+ OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), fileKey,
entry.getKey(),
entry.getValue().getLength(), entry.getValue().getHosts(),
null, isOriginal, true,
deltas, -1, logicalLen, dir, offsetAndBucket);
- splits.add(orcSplit);
+ splits.add(orcSplit);
+ }
}
}
}
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
index 25cd657..9a8ae3b 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
@@ -1237,6 +1237,11 @@ public class TestInputOutputFormat {
}
}
+ @Override
+ public String getScheme() {
+ return "mock";
+ }
+
// increments file modification time
public void touch(MockFile file) {
if (fileStatusMap.containsKey(file)) {