Github user cloud-fan commented on a diff in the pull request:
https://github.com/apache/spark/pull/19130#discussion_r139576095
--- Diff: core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala ---
@@ -367,6 +368,54 @@ object SparkSubmit extends CommandLineUtils with
Logging {
}.orNull
}
+ // When running in YARN, for some remote resources with scheme:
+ // 1. Hadoop FileSystem doesn't support them.
+ // 2. We explicitly bypass Hadoop FileSystem with
"spark.yarn.dist.forceDownloadSchemes".
+ // We will download them to local disk prior to add to YARN's
distributed cache.
+ // For yarn client mode, since we already download them with above
code, so we only need to
+ // figure out the local path and replace the remote one.
+ if (clusterManager == YARN) {
+ sparkConf.setIfMissing(SecurityManager.SPARK_AUTH_SECRET_CONF,
"unused")
+ val secMgr = new SecurityManager(sparkConf)
+ val forceDownloadSchemes = sparkConf.get(FORCE_DOWNLOAD_SCHEMES)
+
+ def shouldDownload(scheme: String): Boolean = {
+ val isFsAvailable = () => {
+ Try { FileSystem.getFileSystemClass(scheme, hadoopConf)
}.isSuccess
+ }
+ forceDownloadSchemes.contains(scheme) || !isFsAvailable()
+ }
+
+ def downloadResource(resource: String): String = {
+ val uri = Utils.resolveURI(resource)
+ uri.getScheme match {
+ case "local" | "file" => resource
+ case e if shouldDownload(e) =>
--- End diff --
shall we explicitly list "http" | "https" | "ftp"?
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]