Github user davies commented on a diff in the pull request:

    https://github.com/apache/spark/pull/6866#discussion_r32781762
  
    --- Diff: dev/run-tests.py ---
    @@ -28,6 +29,241 @@
     USER_HOME = os.environ.get("HOME")
     
     
    +# 
-------------------------------------------------------------------------------------------------
    +# Test module definitions and functions for traversing module dependency 
graph
    +# 
-------------------------------------------------------------------------------------------------
    +
    +
    +all_modules = []
    +
    +
    +class Module(object):
    +
    +    def __init__(self, name, dependencies, source_file_regexes, 
sbt_test_goals=(),
    +                 should_run_python_tests=False, should_run_r_tests=False):
    +        self.name = name
    +        self.dependencies = dependencies
    +        self.source_file_prefixes = source_file_regexes
    +        self.sbt_test_goals = sbt_test_goals
    +        self.should_run_python_tests = should_run_python_tests
    +        self.should_run_r_tests = should_run_r_tests
    +
    +        self.dependent_modules = set()
    +        for dep in dependencies:
    +            dep.dependent_modules.add(self)
    +        all_modules.append(self)
    +
    +    def contains_file(self, filename):
    +        return any(re.match(p, filename) for p in 
self.source_file_prefixes)
    +
    +
    +root = Module(
    +    name="root",
    +    dependencies=[],
    +    source_file_regexes=[],
    +    sbt_test_goals=[
    +        "test",
    +    ],
    +    should_run_python_tests=True,
    +    should_run_r_tests=True
    +)
    +
    +
    +sql = Module(
    +    name="sql",
    +    dependencies=[],
    +    source_file_regexes=[
    +        "sql/(?!hive-thriftserver)",
    +        "bin/spark-sql",
    +        "examples/src/main/java/org/apache/spark/examples/sql/",
    +        "examples/src/main/scala/org/apache/spark/examples/sql/",
    +    ],
    +    sbt_test_goals=[
    +        "catalyst/test",
    +        "sql/test",
    +        "hive/test",
    +    ])
    +
    +
    +hive_thriftserver = Module(
    +    name="hive-thriftserver",
    +    dependencies=[sql],
    +    source_file_regexes=[
    +        "sql/hive-thriftserver",
    +        "sbin/start-thriftserver.sh",
    +    ],
    +    sbt_test_goals=[
    +        "hive-thriftserver/test",
    +    ]
    +)
    +
    +
    +mllib = Module(
    +    name="mllib",
    +    dependencies=[sql],
    +    source_file_regexes=[
    +        "examples/src/main/java/org/apache/spark/examples/mllib/",
    +        "examples/src/main/scala/org/apache/spark/examples/mllib",
    +        "data/mllib/",
    +        "mllib/",
    +    ],
    +    sbt_test_goals=[
    +        "mllib/test",
    +        "examples/test",
    +    ]
    +)
    +
    +
    +graphx = Module(
    +    name="graphx",
    +    dependencies=[],
    +    source_file_regexes=[
    +        "graphx/",
    +    ],
    +    sbt_test_goals=[
    +        "graphx/test"
    +    ]
    +)
    +
    +
    +streaming = Module(
    +    name="streaming",
    +    dependencies=[],
    +    source_file_regexes=[
    +        "external/",
    +        "extras/java8-tests/",
    +        "extras/kinesis-asl/",
    +        "streaming",
    +    ],
    +    sbt_test_goals=[
    +        "streaming/test",
    +        "streaming-flume/test",
    +        "streaming-flume-sink/test",
    +        "streaming-kafka/test",
    +        "streaming-mqtt/test",
    +        "streaming-twitter/test",
    +        "streaming-zeromq/test",
    +    ]
    +)
    +
    +
    +examples = Module(
    +    name="examples",
    +    dependencies=[graphx, mllib, streaming, sql],
    +    source_file_regexes=[
    +        "examples/",
    +    ],
    +    sbt_test_goals=[
    +        "examples/test",
    +    ]
    +)
    +
    +
    +pyspark = Module(
    +    name="pyspark",
    +    dependencies=[mllib, streaming, sql],
    +    source_file_regexes=[
    +        "python/"
    +    ],
    +    should_run_python_tests=True
    +)
    +
    +
    +sparkr = Module(
    +    name="sparkr",
    +    dependencies=[sql, mllib],
    +    source_file_regexes=[
    +        "R/",
    +    ],
    +    should_run_r_tests=True
    +)
    +
    +
    +docs = Module(
    +    name="docs",
    +    dependencies=[],
    +    source_file_regexes=[
    +        "docs/",
    +    ]
    +)
    +
    +
    +def determine_modules_for_files(filenames):
    +    """
    +    Given a list of filenames, return the set of modules that contain 
those files.
    +    If a file is not associated with a more specific submodule, then this 
method will consider that
    +    file to belong to the 'root' module.
    +
    +    >>> sorted(x.name for x in 
determine_modules_for_files(["python/pyspark/a.py", "sql/test/foo"]))
    +    ['pyspark', 'sql']
    +    >>> [x.name for x in 
determine_modules_for_files(["file_not_matched_by_any_subproject"])]
    +    ['root']
    +    """
    +    changed_modules = set()
    +    for filename in filenames:
    +        matched_at_least_one_module = False
    +        for module in all_modules:
    +            if module.contains_file(filename):
    +                changed_modules.add(module)
    +                matched_at_least_one_module = True
    +        if not matched_at_least_one_module:
    +            changed_modules.add(root)
    +    return changed_modules
    +
    +
    +def identify_changed_modules_from_git_commits(patch_sha, 
target_branch=None, target_ref=None):
    +    """
    +    Given a git commit and target ref, use the set of files changed in the 
diff in order to
    +    determine which modules' tests should be run.
    +
    +    >>> [x.name for x in \
    +         identify_changed_modules_from_git_commits("fc0a1475ef", 
target_ref="5da21f07")]
    +    ['graphx']
    +    >>> 'root' in [x.name for x in \
    +         identify_changed_modules_from_git_commits("50a0496a43", 
target_ref="6765ef9")]
    +    True
    +    """
    +    if target_branch is None and target_ref is None:
    +        raise AttributeError("must specify either target_branch or 
target_ref")
    +    elif target_branch is not None and target_ref is not None:
    +        raise AttributeError("must specify either target_branch or 
target_ref, not both")
    +    if target_branch is not None:
    +        diff_target = target_branch
    +        run_cmd(['git', 'fetch', 'origin', 
str(target_branch+':'+target_branch)])
    +    else:
    +        diff_target = target_ref
    +    raw_output = subprocess.check_output(['git', 'diff', '--name-only', 
patch_sha, diff_target])
    +    # Remove any empty strings
    +    changed_files = [f for f in raw_output.split('\n') if f]
    +    return determine_modules_for_files(changed_files)
    +
    +
    +def determine_modules_to_test(changed_modules):
    +    """
    +    Given a set of modules that have changed, compute the transitive 
closure of those modules'
    +    dependent modules in order to determine the set of modules that should 
be tested.
    +
    +    >>> sorted(x.name for x in determine_modules_to_test([root]))
    +    ['root']
    +    >>> sorted(x.name for x in determine_modules_to_test([graphx]))
    +    ['examples', 'graphx']
    +    >>> sorted(x.name for x in determine_modules_to_test([sql]))
    +    ['examples', 'hive-thriftserver', 'mllib', 'pyspark', 'sparkr', 'sql']
    +    """
    +    modules_to_test = set()
    +    for module in changed_modules:
    +        modules_to_test = 
modules_to_test.union(determine_modules_to_test(module.dependent_modules))
    +    modules_to_test = modules_to_test.union(set(changed_modules))
    +    if root in modules_to_test:
    --- End diff --
    
    No module will depend on `root`, so this check could be moved to the beginning.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to