HyukjinKwon commented on a change in pull request #35112:
URL: https://github.com/apache/spark/pull/35112#discussion_r779961031



##########
File path: dev/is-changed.py
##########
@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+import sys
+from argparse import ArgumentParser
+from sparktestsupport.utils import determine_modules_for_files, 
determine_modules_to_test, identify_changed_files_from_git_commits

Review comment:
       Hm, it actually should complain about the line length... I previously fixed 
the linter script to run on dev scripts too, but it seems there's a regression 
here. I will take a separate look at this one — there should be more places to 
fix per the linter, not only this script.

##########
File path: dev/sparktestsupport/utils.py
##########
@@ -0,0 +1,159 @@
+#!/usr/bin/env python3
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+import sys
+import subprocess
+from sparktestsupport import modules
+from sparktestsupport.shellutils import run_cmd
+from sparktestsupport.toposort import toposort_flatten
+
+# 
-------------------------------------------------------------------------------------------------
+# Functions for traversing module dependency graph
+# 
-------------------------------------------------------------------------------------------------
+
+
+def determine_modules_for_files(filenames):
+    """
+    Given a list of filenames, return the set of modules that contain those 
files.
+    If a file is not associated with a more specific submodule, then this 
method will consider that
+    file to belong to the 'root' module. `.github` directory is counted only 
in GitHub Actions,
+    and `appveyor.yml` is always ignored because this file is dedicated only 
to AppVeyor builds.
+
+    >>> sorted(x.name for x in 
determine_modules_for_files(["python/pyspark/a.py", "sql/core/foo"]))
+    ['pyspark-core', 'sql']
+    >>> [x.name for x in 
determine_modules_for_files(["file_not_matched_by_any_subproject"])]
+    ['root']
+    >>> [x.name for x in determine_modules_for_files(["appveyor.yml"])]
+    []
+    """
+    changed_modules = set()
+    for filename in filenames:
+        if filename in ("appveyor.yml",):
+            continue
+        if ("GITHUB_ACTIONS" not in os.environ) and 
filename.startswith(".github"):
+            continue
+        matched_at_least_one_module = False
+        for module in modules.all_modules:
+            if module.contains_file(filename):
+                changed_modules.add(module)
+                matched_at_least_one_module = True
+        if not matched_at_least_one_module:
+            changed_modules.add(modules.root)
+    return changed_modules
+
+
+def identify_changed_files_from_git_commits(patch_sha, target_branch=None, 
target_ref=None):
+    """
+    Given a git commit and target ref, use the set of files changed in the 
diff in order to
+    determine which modules' tests should be run.
+
+    >>> [x.name for x in determine_modules_for_files( \
+            identify_changed_files_from_git_commits("fc0a1475ef", 
target_ref="5da21f07"))]
+    ['graphx']
+    >>> 'root' in [x.name for x in determine_modules_for_files( \
+         identify_changed_files_from_git_commits("50a0496a43", 
target_ref="6765ef9"))]
+    True
+    """
+    if target_branch is None and target_ref is None:
+        raise AttributeError("must specify either target_branch or target_ref")
+    elif target_branch is not None and target_ref is not None:
+        raise AttributeError("must specify either target_branch or target_ref, 
not both")
+    if target_branch is not None:
+        diff_target = target_branch
+        run_cmd(['git', 'fetch', 'origin', 
str(target_branch+':'+target_branch)])
+    else:
+        diff_target = target_ref
+    raw_output = subprocess.check_output(['git', 'diff', '--name-only', 
patch_sha, diff_target],
+                                         universal_newlines=True)
+    # Remove any empty strings
+    return [f for f in raw_output.split('\n') if f]
+
+
+def determine_modules_to_test(changed_modules, deduplicated=True):
+    """
+    Given a set of modules that have changed, compute the transitive closure 
of those modules'
+    dependent modules in order to determine the set of modules that should be 
tested.
+
+    Returns a topologically-sorted list of modules (ties are broken by sorting 
on module names).
+    If ``deduplicated`` is disabled, the modules are returned without tacking 
the deduplication
+    by dependencies into account.
+
+    >>> [x.name for x in determine_modules_to_test([modules.root])]
+    ['root']
+    >>> [x.name for x in determine_modules_to_test([modules.build])]
+    ['root']
+    >>> [x.name for x in determine_modules_to_test([modules.core])]
+    ['root']
+    >>> [x.name for x in determine_modules_to_test([modules.launcher])]
+    ['root']
+    >>> [x.name for x in determine_modules_to_test([modules.graphx])]
+    ['graphx', 'examples']
+    >>> [x.name for x in determine_modules_to_test([modules.sql])]
+    ... # doctest: +NORMALIZE_WHITESPACE
+    ['sql', 'avro', 'docker-integration-tests', 'hive', 'mllib', 
'sql-kafka-0-10', 'examples',
+     'hive-thriftserver', 'pyspark-sql', 'repl', 'sparkr',
+     'pyspark-mllib', 'pyspark-pandas', 'pyspark-pandas-slow', 'pyspark-ml']
+    >>> sorted([x.name for x in determine_modules_to_test(
+    ...     [modules.sparkr, modules.sql], deduplicated=False)])
+    ... # doctest: +NORMALIZE_WHITESPACE
+    ['avro', 'docker-integration-tests', 'examples', 'hive', 
'hive-thriftserver', 'mllib',
+     'pyspark-ml', 'pyspark-mllib', 'pyspark-pandas', 'pyspark-pandas-slow', 
'pyspark-sql',
+     'repl', 'sparkr', 'sql', 'sql-kafka-0-10']
+    >>> sorted([x.name for x in determine_modules_to_test(
+    ...     [modules.sql, modules.core], deduplicated=False)])
+    ... # doctest: +NORMALIZE_WHITESPACE
+    ['avro', 'catalyst', 'core', 'docker-integration-tests', 'examples', 
'graphx', 'hive',
+     'hive-thriftserver', 'mllib', 'mllib-local', 'pyspark-core', 
'pyspark-ml', 'pyspark-mllib',
+     'pyspark-pandas', 'pyspark-pandas-slow', 'pyspark-resource', 
'pyspark-sql',
+     'pyspark-streaming', 'repl', 'root', 'sparkr', 'sql', 'sql-kafka-0-10', 
'streaming',
+     'streaming-kafka-0-10', 'streaming-kinesis-asl']
+    """
+    modules_to_test = set()
+    for module in changed_modules:
+        modules_to_test = modules_to_test.union(
+            determine_modules_to_test(module.dependent_modules, deduplicated))
+    modules_to_test = modules_to_test.union(set(changed_modules))
+
+    if not deduplicated:
+        return modules_to_test
+
+    # If we need to run all of the tests, then we should short-circuit and 
return 'root'
+    if modules.root in modules_to_test:
+        return [modules.root]
+    return toposort_flatten(
+        {m: set(m.dependencies).intersection(modules_to_test) for m in 
modules_to_test}, sort=True)
+
+
+def determine_tags_to_exclude(changed_modules):
+    tags = []
+    for m in modules.all_modules:
+        if m not in changed_modules:
+            tags += m.test_tags
+    return tags
+
+
+def _test():
+    import doctest
+    failure_count = doctest.testmod()[0]
+    if failure_count:
+        sys.exit(-1)
+
+if __name__ == "__main__":
+    _test()

Review comment:
       ```suggestion
   ```
   
   Maybe we can remove these because these are not supposed to be executed as a 
script.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to