abstractdog commented on code in PR #5174:
URL: https://github.com/apache/hive/pull/5174#discussion_r1551593916
##########
ql/src/java/org/apache/hadoop/hive/ql/exec/tez/HiveSplitGenerator.java:
##########
@@ -154,6 +165,92 @@ private void prepare(InputInitializerContext
initializerContext) throws IOExcept
LOG.info("SplitLocationProvider: " + splitLocationProvider);
}
+ /**
+ * SplitSerializer is a helper class for taking care of serializing splits
to the tez scratch dir
+ * when the size criterion defined by
"hive.tez.split.fs.serialization.threshold" is met.
+ * It uses an ExecutorService for parallel writes to prevent a single
split write operation
+ * from becoming the bottleneck (as write() is currently called from a loop).
+ */
+ class SplitSerializer {
+ // fields needed for filepath
+ private String queryId;
+ private String inputName;
+ private int vertexId;
+ private Path appStagingPath;
+ // metrics
+ private AtomicInteger timeSpentWithSplitWriteMs;
+ private AtomicInteger splitsWritten;
+ // lazy initialized filesystem and executor
+ private FileSystem fs;
+ private ExecutorService executor;
+
+ /**
+ * Lazily initializes the filesystem and executor service, so that nothing is
initialized if no split is serialized at all.
+ * No need to synchronize: this is called from a loop.
+ */
+ private void lazyInit() throws IOException {
+ if (fs != null) {
+ return;
+ }
+ queryId = jobConf.get(HiveConf.ConfVars.HIVEQUERYID.varname);
+ inputName = getContext().getInputName();
+ vertexId = getContext().getVertexId();
+ appStagingPath = TezCommonUtils.getTezSystemStagingPath(conf,
getContext().getApplicationId().toString());
+
+ timeSpentWithSplitWriteMs = new AtomicInteger(0);
+ splitsWritten = new AtomicInteger(0);
+
+ fs = appStagingPath.getFileSystem(jobConf);
+ executor = Executors.newFixedThreadPool(8,
+ new ThreadFactoryBuilder().setDaemon(true)
Review Comment:
ack
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]