GitHub user pwendell commented on a diff in the pull request:

    https://github.com/apache/spark/pull/4215#discussion_r24049531
  
    --- Diff: core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala ---
    @@ -475,6 +500,199 @@ object SparkSubmit {
       }
     }
     
    +/** Provides utility functions to be used inside SparkSubmit. */
    +private[spark] object SparkSubmitUtils {
    +
    +  private val printStream = SparkSubmit.printStream
    +
    +  /**
    +   * Represents a Maven Coordinate
    +   * @param groupId the groupId of the coordinate
    +   * @param artifactId the artifactId of the coordinate
    +   * @param version the version of the coordinate
    +   */
     +  private[spark] case class MavenCoordinate(groupId: String, artifactId: String, version: String)
    +
     +  /**
     +   * Extracts maven coordinates from a comma-delimited string
     +   * @param coordinates Comma-delimited string of maven coordinates
     +   * @return Sequence of Maven coordinates
     +   */
     +  private[spark] def extractMavenCoordinates(coordinates: String): Seq[MavenCoordinate] = {
     +    coordinates.split(",").map { p =>
     +      val splits = p.split(":")
     +      require(splits.length == 3, s"Provided Maven Coordinates must be in the form " +
     +        s"'groupId:artifactId:version'. The coordinate provided is: $p")
     +      require(splits(0) != null && splits(0).trim.nonEmpty, s"The groupId cannot be null or " +
     +        s"be whitespace. The groupId provided is: ${splits(0)}")
     +      require(splits(1) != null && splits(1).trim.nonEmpty, s"The artifactId cannot be null or " +
     +        s"be whitespace. The artifactId provided is: ${splits(1)}")
     +      require(splits(2) != null && splits(2).trim.nonEmpty, s"The version cannot be null or " +
     +        s"be whitespace. The version provided is: ${splits(2)}")
     +      new MavenCoordinate(splits(0), splits(1), splits(2))
     +    }
     +  }
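     +  // Illustrative example (hypothetical coordinates, not part of the change itself):
     +  //   extractMavenCoordinates("com.example:foo:1.0,com.example:bar:2.1")
     +  // returns
     +  //   Seq(MavenCoordinate("com.example", "foo", "1.0"), MavenCoordinate("com.example", "bar", "2.1"))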
    +
    +  /**
     +   * Creates the chain of repository resolvers used by Ivy (Maven Central is always included)
     +   * @param remoteRepos Comma-delimited string of remote repositories
     +   * @return A ChainResolver used by Ivy to search for and resolve dependencies.
     +   */
     +  private[spark] def createRepoResolvers(remoteRepos: Option[String]): ChainResolver = {
    +    // We need a chain resolver if we want to check multiple repositories
    +    val cr = new ChainResolver
    +    cr.setName("list")
    +
    +    // the biblio resolver resolves POM declared dependencies
    +    val br: IBiblioResolver = new IBiblioResolver
    +    br.setM2compatible(true)
    +    br.setUsepoms(true)
    +    br.setName("central")
    +    cr.add(br)
    +
    +    val repositoryList = remoteRepos.getOrElse("")
    +    // add any other remote repositories other than maven central
    +    if (repositoryList.trim.nonEmpty) {
    +      repositoryList.split(",").zipWithIndex.foreach { case (repo, i) =>
    +        val brr: IBiblioResolver = new IBiblioResolver
    +        brr.setM2compatible(true)
    +        brr.setUsepoms(true)
    +        brr.setRoot(repo)
    +        brr.setName(s"repo-${i + 1}")
    +        cr.add(brr)
     +        printStream.println(s"$repo added as a remote repository with the name: ${brr.getName}")
    +      }
    +    }
    +    cr
    +  }
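     +  // Illustrative example (hypothetical URLs): passing
     +  //   remoteRepos = Some("http://repo1.example.com,http://repo2.example.com")
     +  // produces a ChainResolver containing "central" followed by resolvers named "repo-1" and "repo-2".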
    +
    +  /**
     +   * Output a comma-delimited list of paths for the downloaded jars to be added to the classpath
     +   * (will append to jars in SparkSubmit). The name of the jar is given
     +   * after a '!' by Ivy. It also sometimes contains '(bundle)' after '.jar'. Remove that as well.
     +   * @param artifacts Sequence of dependencies that were resolved and retrieved
    +   * @param cacheDirectory directory where jars are cached
    +   * @return a comma-delimited list of paths for the dependencies
    +   */
    +  private[spark] def resolveDependencyPaths(
    +      artifacts: Array[AnyRef],
    +      cacheDirectory: File): String = {
    +    artifacts.map { artifactInfo =>
    +      val artifactString = artifactInfo.toString
     +      val jarName = artifactString.drop(artifactString.lastIndexOf("!") + 1)
    +      cacheDirectory.getAbsolutePath + File.separator +
    +        jarName.substring(0, jarName.lastIndexOf(".jar") + 4)
    +    }.mkString(",")
    +  }
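     +  // Illustrative example (hypothetical artifact): for an Ivy artifact whose string form ends in
     +  //   "!foo.jar (bundle)"
     +  // the text after '!' is kept and truncated at ".jar", yielding "<cacheDirectory>/foo.jar".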
    +
    +  /** Adds the given maven coordinates to Ivy's module descriptor. */
    +  private[spark] def addDependenciesToIvy(
    +      md: DefaultModuleDescriptor,
    +      artifacts: Seq[MavenCoordinate],
    +      ivyConfName: String): Unit = {
    +    artifacts.foreach { mvn =>
     +      val ri = ModuleRevisionId.newInstance(mvn.groupId, mvn.artifactId, mvn.version)
    +      val dd = new DefaultDependencyDescriptor(ri, false, false)
    +      dd.addDependencyConfiguration(ivyConfName, ivyConfName)
    +      printStream.println(s"${dd.getDependencyId} added as a dependency")
    +      md.addDependency(dd)
    +    }
    +  }
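     +  // Illustrative example (hypothetical coordinate): MavenCoordinate("com.example", "foo", "1.0")
     +  // becomes a DefaultDependencyDescriptor for com.example#foo;1.0 attached to the given Ivy configuration.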
    +
     +  /** Creates the module descriptor used for resolution; also used in tests. Values are dummy strings. */
     +  private[spark] def getModuleDescriptor = DefaultModuleDescriptor.newDefaultInstance(
     +    ModuleRevisionId.newInstance("org.apache.spark", "spark-submit-envelope", "1.0"))
    +
    +  /**
    +   * Resolves any dependencies that were supplied through maven coordinates
    +   * @param coordinates Comma-delimited string of maven coordinates
     +   * @param remoteRepos Comma-delimited string of remote repositories other than maven central
     +   * @param ivyPath The path to the local ivy repository
     +   * @return The comma-delimited path to the jars of the given maven artifacts including their
    +   *         transitive dependencies
    +   */
    +  private[spark] def resolveMavenCoordinates(
    +      coordinates: String,
    +      remoteRepos: Option[String],
    +      ivyPath: Option[String],
    +      isTest: Boolean = false): String = {
    +    if (coordinates == null || coordinates.trim.isEmpty) {
    +      ""
    +    } else {
    +      val artifacts = extractMavenCoordinates(coordinates)
    +      // Default configuration name for ivy
    +      val ivyConfName = "default"
    +      // set ivy settings for location of cache
    +      val ivySettings: IvySettings = new IvySettings
     +      // Directories for caching downloads through ivy and storing the jars when maven coordinates
    +      // are supplied to spark-submit
    +      val alternateIvyCache = ivyPath.getOrElse("")
    +      val packagesDirectory: File =
    +        if (alternateIvyCache.trim.isEmpty) {
    +          new File(ivySettings.getDefaultIvyUserDir, "jars")
    +        } else {
    +          ivySettings.setDefaultCache(new File(alternateIvyCache, "cache"))
    +          new File(alternateIvyCache, "jars")
    +        }
    +      printStream.println(
    +        s"Ivy Default Cache set to: 
${ivySettings.getDefaultCache.getAbsolutePath}")
    +      printStream.println(s"The jars for the packages stored in: 
$packagesDirectory")
    +      // create a pattern matcher
    +      ivySettings.addMatcher(new GlobPatternMatcher)
    +      // create the dependency resolvers
    +      val repoResolver = createRepoResolvers(remoteRepos)
    +      ivySettings.addResolver(repoResolver)
    +      ivySettings.setDefaultResolver(repoResolver.getName)
    +
    +      val ivy = Ivy.newInstance(ivySettings)
    +      // Set resolve options to download transitive dependencies as well
    +      val resolveOptions = new ResolveOptions
    +      resolveOptions.setTransitive(true)
    +      val retrieveOptions = new RetrieveOptions
    +      // Turn downloading and logging off for testing
    +      if (isTest) {
    +        resolveOptions.setDownload(false)
    +        resolveOptions.setLog(LogOptions.LOG_QUIET)
    +        retrieveOptions.setLog(LogOptions.LOG_QUIET)
    +      } else {
    +        resolveOptions.setDownload(true)
    +      }
    +
    +      // A Module descriptor must be specified. Entries are dummy strings
    +      val md = getModuleDescriptor
    +      md.setDefaultConf(ivyConfName)
    +
    +      // Add an exclusion rule for Spark
     +      val sparkArtifacts = new ArtifactId(new ModuleId("org.apache.spark", "*"), "*", "*", "*")
     +      val sparkDependencyExcludeRule =
     +        new DefaultExcludeRule(sparkArtifacts, ivySettings.getMatcher("glob"), null)
    +      sparkDependencyExcludeRule.addConfiguration(ivyConfName)
    +
     +      // Exclude any Spark dependencies, and add all supplied maven artifacts as dependencies
    +      md.addExcludeRule(sparkDependencyExcludeRule)
    +      addDependenciesToIvy(md, artifacts, ivyConfName)
    +
    +      // resolve dependencies
    +      val rr: ResolveReport = ivy.resolve(md, resolveOptions)
    +      if (rr.hasError) {
    +        throw new RuntimeException(rr.getAllProblemMessages.toString)
    +      }
    +      // Log the callers for each dependency
    +      rr.getDependencies.toArray.foreach { case dependency: IvyNode =>
    +        var logMsg = s"$dependency will be retrieved as a dependency for:"
    --- End diff --
    
    After running this myself, I think your original instinct is right. Let's not bother
    printing this since there is already fairly thorough printing in Ivy.
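
    For reference, a minimal sketch (with hypothetical coordinate and repository values) of how
    these utilities fit together, based on the signatures in the diff above:

        // Resolve a --packages string into a comma-delimited list of jar paths.
        val jarPaths = SparkSubmitUtils.resolveMavenCoordinates(
          "com.example:foo_2.10:1.0.0",            // --packages value (hypothetical)
          Some("http://repo.example.com/maven"),   // --repositories value (hypothetical)
          None)                                    // use the default Ivy cache
        // jarPaths can then be appended to the jars SparkSubmit puts on the classpath.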

