This is an automated email from the ASF dual-hosted git repository.
baunsgaard pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 6a759ce18f [SYSTEMDS-3482] Parallel Hadoop IO Startup
6a759ce18f is described below
commit 6a759ce18f08d184d98d652f2e68c952f6a610a9
Author: baunsgaard <[email protected]>
AuthorDate: Tue Jan 3 12:37:29 2023 +0100
[SYSTEMDS-3482] Parallel Hadoop IO Startup
I observed that the compile time increases to ~0.6 sec if we include IO
operations, while it is ~0.2 sec if we do not have IO operations. This
is due to the Hadoop IO we are using taking up to 70% of the compile time
in cases where we have simple scripts with only a read and a single operation.
This is a constant overhead on the first IO operation that does not affect
subsequent IO operations. To improve this, I have moved it to a parallel
operation when we construct the JobConfiguration. This improves the
compile time of SystemDS in general from ~0.6 sec when using IO to ~0.2 sec.
Closes #1757
---
.../apache/sysds/conf/ConfigurationManager.java | 23 ++++++++++++++++++++--
1 file changed, 21 insertions(+), 2 deletions(-)
diff --git a/src/main/java/org/apache/sysds/conf/ConfigurationManager.java
b/src/main/java/org/apache/sysds/conf/ConfigurationManager.java
index 12764eacf4..18bd83e959 100644
--- a/src/main/java/org/apache/sysds/conf/ConfigurationManager.java
+++ b/src/main/java/org/apache/sysds/conf/ConfigurationManager.java
@@ -19,11 +19,18 @@
package org.apache.sysds.conf;
+import java.util.concurrent.ExecutorService;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.mapred.JobConf;
import org.apache.sysds.conf.CompilerConfig.ConfigType;
import org.apache.sysds.hops.OptimizerUtils;
import org.apache.sysds.lops.Compression.CompressConfig;
import org.apache.sysds.lops.compile.linearization.ILinearize;
+import
org.apache.sysds.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
+import org.apache.sysds.runtime.io.IOUtilFunctions;
+import org.apache.sysds.runtime.util.CommonThreadPool;
/**
* Singleton for accessing the parsed and merged system configuration.
@@ -31,8 +38,9 @@ import org.apache.sysds.lops.compile.linearization.ILinearize;
* NOTE: parallel execution of multiple DML scripts (in the same JVM) with
different configurations
* would require changes/extensions of this class.
*/
-public class ConfigurationManager
-{
+public class ConfigurationManager{
+ private static final Log LOG =
LogFactory.getLog(ConfigurationManager.class.getName());
+
/** Global cached job conf for read-only operations */
private static JobConf _rJob = null;
@@ -56,6 +64,17 @@ public class ConfigurationManager
//ConfigManager -> OptimizerUtils -> InfrastructureAnalyer ->
ConfigManager
_dmlconf = new DMLConfig();
_cconf = new CompilerConfig();
+
+ final ExecutorService pool =
CommonThreadPool.get(InfrastructureAnalyzer.getLocalParallelism());
+ pool.submit(() ->{
+ try{
+ IOUtilFunctions.getFileSystem(_rJob);
+ }
+ catch(Exception e){
+ LOG.warn(e.getMessage());
+ }
+ });
+ pool.shutdown();
}