Github user brennonyork commented on a diff in the pull request:

    https://github.com/apache/spark/pull/5694#discussion_r32694170
  
    --- Diff: dev/run-tests.py ---
    @@ -0,0 +1,536 @@
    +#!/usr/bin/env python2
    +
    +#
    +# Licensed to the Apache Software Foundation (ASF) under one or more
    +# contributor license agreements.  See the NOTICE file distributed with
    +# this work for additional information regarding copyright ownership.
    +# The ASF licenses this file to You under the Apache License, Version 2.0
    +# (the "License"); you may not use this file except in compliance with
    +# the License.  You may obtain a copy of the License at
    +#
    +#    http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +#
    +
    +import os
    +import re
    +import sys
    +import shutil
    +import subprocess
    +from collections import namedtuple
    +
    +SPARK_HOME = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..")
    +USER_HOME = os.environ.get("HOME")
    +
    +
    +def get_error_codes(err_code_file):
    +    """Function to retrieve all block numbers from the `run-tests-codes.sh`
    +    file to maintain backwards compatibility with the `run-tests-jenkins`
    +    script"""
    +
    +    with open(err_code_file, 'r') as f:
    +        err_codes = [e.split()[1].strip().split('=')
    +                     for e in f if e.startswith("readonly")]
    +        return dict(err_codes)
    +
    +
    +ERROR_CODES = get_error_codes(os.path.join(SPARK_HOME, "dev/run-tests-codes.sh"))
    +
    +
    +def exit_from_command_with_retcode(cmd, retcode):
    +    print "[error] running", cmd, "; received return code", retcode
    +    sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))
    +
    +
    +def rm_r(path):
    +    """Given an arbitrary path properly remove it with the correct python
    +    construct if it exists
    +    - from: http://stackoverflow.com/a/9559881"""
    +
    +    if os.path.isdir(path):
    +        shutil.rmtree(path)
    +    elif os.path.exists(path):
    +        os.remove(path)
    +
    +
    +def run_cmd(cmd):
    +    """Given a command as a list of arguments will attempt to execute the
    +    command from the determined SPARK_HOME directory and, on failure, print
    +    an error message"""
    +
    +    if not isinstance(cmd, list):
    +        cmd = cmd.split()
    +    try:
    +        subprocess.check_call(cmd)
    +    except subprocess.CalledProcessError as e:
    +        exit_from_command_with_retcode(e.cmd, e.returncode)
    +
    +
    +def is_exe(path):
    +    """Check if a given path is an executable file
    +    - from: http://stackoverflow.com/a/377028"""
    +
    +    return os.path.isfile(path) and os.access(path, os.X_OK)
    +
    +
    +def which(program):
    +    """Find and return the given program by its absolute path or 'None'
    +    - from: http://stackoverflow.com/a/377028"""
    +
    +    fpath, fname = os.path.split(program)
    +
    +    if fpath:
    +        if is_exe(program):
    +            return program
    +    else:
    +        for path in os.environ.get("PATH").split(os.pathsep):
    +            path = path.strip('"')
    +            exe_file = os.path.join(path, program)
    +            if is_exe(exe_file):
    +                return exe_file
    +    return None
    +
    +
    +def determine_java_executable():
    +    """Will return the path of the java executable that will be used by 
Spark's
    +    tests or `None`"""
    +
    +    # Any changes in the way that Spark's build detects java must be reflected
    +    # here. Currently the build looks for $JAVA_HOME/bin/java then falls back to
    +    # the `java` executable on the path
    +
    +    java_home = os.environ.get("JAVA_HOME")
    +
    +    # check if there is an executable at $JAVA_HOME/bin/java
    +    java_exe = which(os.path.join(java_home, "bin", "java")) if java_home else None
    +    # if the java_exe wasn't set, check for a `java` version on the $PATH
    +    return java_exe if java_exe else which("java")
    +
    +
    +JavaVersion = namedtuple('JavaVersion', ['major', 'minor', 'patch', 'update'])
    +
    +
    +def determine_java_version(java_exe):
    +    """Given a valid java executable will return its version in named 
tuple format
    +    with accessors '.major', '.minor', '.patch', '.update'"""
    +
    +    raw_output = subprocess.check_output([java_exe, "-version"],
    +                                         stderr=subprocess.STDOUT)
    +    raw_version_str = raw_output.split('\n')[0]  # eg 'java version "1.8.0_25"'
    +    version_str = raw_version_str.split()[-1].strip('"')  # eg '1.8.0_25'
    +    version, update = version_str.split('_')  # eg ['1.8.0', '25']
    +
    +    # map over the values and convert them to integers
    +    version_info = [int(x) for x in version.split('.') + [update]]
    +
    +    return JavaVersion(major=version_info[0],
    +                       minor=version_info[1],
    +                       patch=version_info[2],
    +                       update=version_info[3])
    +
    +
    +def set_title_and_block(title, err_block):
    +    os.environ["CURRENT_BLOCK"] = ERROR_CODES[err_block]
    +    line_str = '=' * 72
    +
    +    print
    +    print line_str
    +    print title
    +    print line_str
    +
    +
    +def run_apache_rat_checks():
    +    set_title_and_block("Running Apache RAT checks", "BLOCK_RAT")
    +    run_cmd([os.path.join(SPARK_HOME, "dev", "check-license")])
    +
    +
    +def run_scala_style_checks():
    +    set_title_and_block("Running Scala style checks", "BLOCK_SCALA_STYLE")
    +    run_cmd([os.path.join(SPARK_HOME, "dev", "lint-scala")])
    +
    +
    +def run_python_style_checks():
    +    set_title_and_block("Running Python style checks", 
"BLOCK_PYTHON_STYLE")
    +    run_cmd([os.path.join(SPARK_HOME, "dev", "lint-python")])
    +
    +
    +def build_spark_documentation():
    +    set_title_and_block("Building Spark Documentation", 
"BLOCK_DOCUMENTATION")
    +    os.environ["PRODUCTION"] = "1 jekyll build"
    +
    +    os.chdir(os.path.join(SPARK_HOME, "docs"))
    +
    +    jekyll_bin = which("jekyll")
    +
    +    if not jekyll_bin:
    +        print "[error] Cannot find a version of `jekyll` on the system; 
please",
    +        print "install one and retry to build documentation."
    +        sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))
    +    else:
    +        run_cmd([jekyll_bin, "build"])
    +
    +    os.chdir(SPARK_HOME)
    +
    +
    +def exec_maven(mvn_args=[]):
    +    """Will call Maven in the current directory with the list of mvn_args 
passed
    +    in and returns the subprocess for any further processing"""
    +
    +    run_cmd([os.path.join(SPARK_HOME, "build", "mvn")] + mvn_args)
    +
    +
    +def exec_sbt(sbt_args=[]):
    +    """Will call SBT in the current directory with the list of mvn_args 
passed
    +    in and returns the subprocess for any further processing"""
    +
    +    sbt_cmd = [os.path.join(SPARK_HOME, "build", "sbt")] + sbt_args
    +
    +    sbt_output_filter = re.compile("^.*[info].*Resolving" + "|" +
    +                                   "^.*[warn].*Merging" + "|" +
    +                                   "^.*[info].*Including")
    +
    +    # NOTE: echo "q" is needed because sbt on encountering a build file
    +    # with failure (either resolution or compilation) prompts the user for
    +    # input either q, r, etc to quit or retry. This echo is there to make it
    +    # not block.
    +    echo_proc = subprocess.Popen(["echo", "\"q\n\""], 
stdout=subprocess.PIPE)
    +    sbt_proc = subprocess.Popen(sbt_cmd,
    +                                stdin=echo_proc.stdout,
    +                                stdout=subprocess.PIPE)
    +    echo_proc.wait()
    +    for line in iter(sbt_proc.stdout.readline, ''):
    +        if not sbt_output_filter.match(line):
    +            print line,
    +    retcode = sbt_proc.wait()
    +
    +    if retcode > 0:
    +        exit_from_command_with_retcode(sbt_cmd, retcode)
    +
    +
    +def get_hadoop_profiles(hadoop_version):
    +    """Return a list of profiles indicating which Hadoop version to use 
from
    +    a Hadoop version tag."""
    +
    +    sbt_maven_hadoop_profiles = {
    +        "hadoop1.0": ["-Phadoop-1", "-Dhadoop.version=1.0.4"],
    +        "hadoop2.0": ["-Phadoop-1", "-Dhadoop.version=2.0.0-mr1-cdh4.1.1"],
    +        "hadoop2.2": ["-Pyarn", "-Phadoop-2.2"],
    +        "hadoop2.3": ["-Pyarn", "-Phadoop-2.3", "-Dhadoop.version=2.3.0"],
    +    }
    +
    +    if hadoop_version in sbt_maven_hadoop_profiles:
    +        return sbt_maven_hadoop_profiles[hadoop_version]
    +    else:
    +        print "[error] Could not find", hadoop_version, "in the list. 
Valid options",
    +        print "are", sbt_maven_hadoop_profiles.keys()
    +        sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))
    +
    +
    +def get_build_profiles(hadoop_version="hadoop2.3",
    +                       enable_base_profiles=True,
    +                       enable_hive_profiles=False,
    +                       enable_doc_profiles=False):
    +    """Returns a list of hadoop profiles to be used as looked up from the 
passed in hadoop profile
    +    key with the option of adding on the base and hive profiles."""
    +
    +    base_profiles = ["-Pkinesis-asl"]
    +    hive_profiles = ["-Phive", "-Phive-thriftserver"]
    +    doc_profiles = []
    +    hadoop_profiles = get_hadoop_profiles(hadoop_version)
    +
    +    build_profiles = hadoop_profiles
    +
    +    if enable_base_profiles:
    +        build_profiles += base_profiles
    +
    +    if enable_hive_profiles:
    +        build_profiles += hive_profiles
    +
    +    if enable_doc_profiles:
    +        build_profiles += doc_profiles
    +
    +    return build_profiles
    +
    +
    +def build_spark_maven(hadoop_version):
    +    # we always build with Hive support even if we skip Hive tests in most builds
    +    build_profiles = get_build_profiles(hadoop_version, enable_hive_profiles=True)
    +    mvn_goals = ["clean", "package", "-DskipTests"]
    +    profiles_and_goals = build_profiles + mvn_goals
    +
    +    print "[info] Building Spark (w/Hive 0.13.1) using Maven with these 
arguments:",
    +    print " ".join(profiles_and_goals)
    +
    +    exec_maven(profiles_and_goals)
    +
    +
    +def build_spark_sbt(hadoop_version):
    +    build_profiles = get_build_profiles(hadoop_version, enable_hive_profiles=True)
    +    sbt_goals = ["package",
    +                 "assembly/assembly",
    +                 "streaming-kafka-assembly/assembly"]
    +    profiles_and_goals = build_profiles + sbt_goals
    +
    +    print "[info] Building Spark (w/Hive 0.13.1) using SBT with these 
arguments:",
    +    print " ".join(profiles_and_goals)
    +
    +    exec_sbt(profiles_and_goals)
    +
    +
    +def build_apache_spark(build_tool, hadoop_version):
    +    """Will build Spark against Hive v0.13.1 given the passed in build 
tool (either `sbt` or
    +    `maven`). Defaults to using `sbt`."""
    +
    +    set_title_and_block("Building Spark", "BLOCK_BUILD")
    +
    +    rm_r("lib_managed")
    +
    +    if build_tool == "maven":
    +        build_spark_maven(hadoop_version)
    +    else:
    +        build_spark_sbt(hadoop_version)
    +
    +
    +def detect_binary_inop_with_mima():
    +    set_title_and_block("Detecting binary incompatibilities with MiMa", 
"BLOCK_MIMA")
    +    run_cmd([os.path.join(SPARK_HOME, "dev", "mima")])
    +
    +
    +def identify_changed_modules(test_env):
    +    """Given the passed in environment will determine the changed modules 
and
    +    return them as a set. If the environment is local, will simply run all 
tests.
    +    If run under the `amplab_jenkins` environment will determine the 
changed files
    +    as compared to the `ghprbTargetBranch` and execute the necessary set 
of tests
    +    to provide coverage for the changed code."""
    +    changed_modules = set()
    +
    +    if test_env == "amplab_jenkins":
    +        target_branch = os.environ["ghprbTargetBranch"]
    +
    +        run_cmd(['git', 'fetch', 'origin', str(target_branch+':'+target_branch)])
    +
    +        raw_output = subprocess.check_output(['git', 'diff', '--name-only', target_branch])
    +        # remove any empty strings
    +        changed_files = [f for f in raw_output.split('\n') if f]
    +
    +        sql_files = [f for f in changed_files
    +                     if any(f.startswith(p) for p in
    +                            ["sql/",
    +                             "bin/spark-sql",
    +                             "sbin/start-thriftserver.sh",
    +                             "examples/src/main/java/org/apache/spark/examples/sql/",
    +                             "examples/src/main/scala/org/apache/spark/examples/sql/"])]
    +        mllib_files = [f for f in changed_files
    +                       if any(f.startswith(p) for p in
    +                              ["examples/src/main/java/org/apache/spark/examples/mllib/",
    +                               "examples/src/main/scala/org/apache/spark/examples/mllib",
    +                               "data/mllib/",
    +                               "mllib/"])]
    +        streaming_files = [f for f in changed_files
    +                           if any(f.startswith(p) for p in
    +                                  ["examples/scala-2.10/",
    +                                   "examples/src/main/java/org/apache/spark/examples/streaming/",
    +                                   "examples/src/main/scala/org/apache/spark/examples/streaming/",
    +                                   "external/",
    +                                   "extras/java8-tests/",
    +                                   "extras/kinesis-asl/",
    +                                   "streaming/"])]
    +        graphx_files = [f for f in changed_files
    +                        if any(f.startswith(p) for p in
    +                               ["examples/src/main/scala/org/apache/spark/examples/graphx/",
    +                                "graphx/"])]
    +        doc_files = [f for f in changed_files if f.startswith("docs/")]
    +
    +        # union together all changed top level project files
    +        top_level_project_files = set().union(*[set(f) for f in [sql_files,
    +                                                                 mllib_files,
    +                                                                 streaming_files,
    +                                                                 graphx_files,
    +                                                                 doc_files]])
    +        changed_core_files = set(changed_files).difference(top_level_project_files)
    +
    +        if changed_core_files:
    +            changed_modules.add("CORE")
    +        if sql_files:
    +            print "[info] Detected changes in SQL. Will run Hive test 
suite."
    +            changed_modules.add("SQL")
    +        if mllib_files:
    +            print "[info] Detected changes in MLlib. Will run MLlib test 
suite."
    +            changed_modules.add("MLLIB")
    +        if streaming_files:
    +            print "[info] Detected changes in Streaming. Will run 
Streaming test suite."
    +            changed_modules.add("STREAMING")
    +        if graphx_files:
    +            print "[info] Detected changes in GraphX. Will run GraphX test 
suite."
    +            changed_modules.add("GRAPHX")
    +        if doc_files:
    +            print "[info] Detected changes in documentation. Will build 
spark with documentation."
    +            changed_modules.add("DOCS")
    +
    +        return changed_modules
    +    else:
    +        # we aren't in the Amplab environment so simply run all tests
    +        changed_modules.add("ALL")
    +        return changed_modules
    +
    +
    +def run_scala_tests_maven(test_profiles):
    +    mvn_test_goals = ["test", "--fail-at-end"]
    +    profiles_and_goals = test_profiles + mvn_test_goals
    +
    +    print "[info] Running Spark tests using Maven with these arguments:",
    +    print " ".join(profiles_and_goals)
    +
    +    exec_maven(profiles_and_goals)
    +
    +
    +def run_scala_tests_sbt(test_modules, test_profiles):
    +    # declare the variable for reference
    +    sbt_test_goals = None
    --- End diff ---
    
    Hey Josh, I'm out of pocket until tomorrow morning. Can you get a hotfix in for
    this? I'm not immediately seeing what the issue is or the code to fix it :/ I can
    figure it out tomorrow morning, though, if it can wait.
    
    
    
    -----Original Message-----
    From: Josh Rosen [[email protected]]
    Sent: Wednesday, June 17, 2015 09:58 PM Eastern Standard Time
    To: apache/spark
    Cc: York, Brennon
    Subject: Re: [spark] [SPARK-7017][Build][Project Infra]: Refactor dev/run-tests into Python (#5694)
    
    
    In dev/run-tests.py <https://github.com/apache/spark/pull/5694#discussion_r32693784>:
    
    
    > +        return changed_modules
    > +
    > +
    > +def run_scala_tests_maven(test_profiles):
    > +    mvn_test_goals = ["test", "--fail-at-end"]
    > +    profiles_and_goals = test_profiles + mvn_test_goals
    > +
    > +    print "[info] Running Spark tests using Maven with these arguments:",
    > +    print " ".join(profiles_and_goals)
    > +
    > +    exec_maven(profiles_and_goals)
    > +
    > +
    > +def run_scala_tests_sbt(test_modules, test_profiles):
    > +    # declare the variable for reference
    > +    sbt_test_goals = None
    
    
    We'll end up concatenating a list to this if we only run the MLlib tests in the
    code below. This breaks PRB tests for MLlib-, Streaming-, and GraphX-only PRs.
    
    I'm considering reverting this patch or pushing a hotfix to address this.
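    For what it's worth, a rough sketch of such a hotfix (not code from the patch;
    the per-module goal names below are placeholders) could start the goals off as an
    empty list so the later concatenation never hits `None`, reusing the patch's
    `exec_sbt` helper:
    
        def run_scala_tests_sbt(test_modules, test_profiles):
            # start from an empty list; concatenating a list onto None raises a
            # TypeError, which is what breaks the module-only PRB runs
            sbt_test_goals = []
    
            if "ALL" in test_modules:
                sbt_test_goals = ["test"]
            else:
                # placeholder per-module goals, for illustration only
                if "SQL" in test_modules:
                    sbt_test_goals += ["sql/test", "hive/test"]
                if "MLLIB" in test_modules:
                    sbt_test_goals += ["mllib/test"]
                if "STREAMING" in test_modules:
                    sbt_test_goals += ["streaming/test"]
                if "GRAPHX" in test_modules:
                    sbt_test_goals += ["graphx/test"]
    
            if not sbt_test_goals:
                return
    
            profiles_and_goals = test_profiles + sbt_test_goals
            print "[info] Running Spark tests using SBT with these arguments:",
            print " ".join(profiles_and_goals)
            exec_sbt(profiles_and_goals)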
    
    —
    Reply to this email directly or view it on GitHub <https://github.com/apache/spark/pull/5694/files#r32693784>.
    ________________________________________________________
    
    The information contained in this e-mail is confidential and/or proprietary 
to Capital One and/or its affiliates and may only be used solely in performance 
of work or services for Capital One. The information transmitted herewith is 
intended only for use by the individual or entity to which it is addressed. If 
the reader of this message is not the intended recipient, you are hereby 
notified that any review, retransmission, dissemination, distribution, copying 
or other use of, or taking of any action in reliance upon this information is 
strictly prohibited. If you have received this communication in error, please 
contact the sender and delete the material from your computer.



