Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/19130#discussion_r139576095 --- Diff: core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala --- @@ -367,6 +368,54 @@ object SparkSubmit extends CommandLineUtils with Logging { }.orNull } + // When running in YARN, for some remote resources with scheme: + // 1. Hadoop FileSystem doesn't support them. + // 2. We explicitly bypass Hadoop FileSystem with "spark.yarn.dist.forceDownloadSchemes". + // We will download them to local disk prior to add to YARN's distributed cache. + // For yarn client mode, since we already download them with above code, so we only need to + // figure out the local path and replace the remote one. + if (clusterManager == YARN) { + sparkConf.setIfMissing(SecurityManager.SPARK_AUTH_SECRET_CONF, "unused") + val secMgr = new SecurityManager(sparkConf) + val forceDownloadSchemes = sparkConf.get(FORCE_DOWNLOAD_SCHEMES) + + def shouldDownload(scheme: String): Boolean = { + val isFsAvailable = () => { + Try { FileSystem.getFileSystemClass(scheme, hadoopConf) }.isSuccess + } + forceDownloadSchemes.contains(scheme) || !isFsAvailable() + } + + def downloadResource(resource: String): String = { + val uri = Utils.resolveURI(resource) + uri.getScheme match { + case "local" | "file" => resource + case e if shouldDownload(e) => --- End diff -- shall we explicitly list "http" | "https" | "ftp"?
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org