This is an automated email from the ASF dual-hosted git repository.

potiuk pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git


The following commit(s) were added to refs/heads/main by this push:
     new 8494fc7036 Fix regression in ignoring symlinks (#23535)
8494fc7036 is described below

commit 8494fc7036c33683af06a0e57474b8a6157fda05
Author: Ian Buss <[email protected]>
AuthorDate: Fri May 20 07:35:41 2022 +0100

    Fix regression in ignoring symlinks (#23535)
---
 airflow/utils/file.py    | 32 +++++++++++++++++----------
 tests/utils/test_file.py | 56 +++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 76 insertions(+), 12 deletions(-)

diff --git a/airflow/utils/file.py b/airflow/utils/file.py
index 89d0b7d9fc..5a3db7fd48 100644
--- a/airflow/utils/file.py
+++ b/airflow/utils/file.py
@@ -40,11 +40,14 @@ class _IgnoreRule(Protocol):
 
     @staticmethod
     def compile(pattern: str, base_dir: Path, definition_file: Path) -> 
Optional['_IgnoreRule']:
-        pass
+        """
+        Build an ignore rule from the supplied pattern where base_dir
+        and definition_file should be absolute paths.
+        """
 
     @staticmethod
     def match(path: Path, rules: List['_IgnoreRule']) -> bool:
-        pass
+        """Match a candidate absolute path against a list of rules"""
 
 
 class _RegexpIgnoreRule(NamedTuple):
@@ -57,7 +60,7 @@ class _RegexpIgnoreRule(NamedTuple):
     def compile(pattern: str, base_dir: Path, definition_file: Path) -> 
Optional[_IgnoreRule]:
         """Build an ignore rule from the supplied regexp pattern and log a 
useful warning if it is invalid"""
         try:
-            return _RegexpIgnoreRule(re.compile(pattern), base_dir.resolve())
+            return _RegexpIgnoreRule(re.compile(pattern), base_dir)
         except re.error as e:
             log.warning("Ignoring invalid regex '%s' from %s: %s", pattern, 
definition_file, e)
             return None
@@ -65,11 +68,10 @@ class _RegexpIgnoreRule(NamedTuple):
     @staticmethod
     def match(path: Path, rules: List[_IgnoreRule]) -> bool:
         """Match a list of ignore rules against the supplied path"""
-        test_path: Path = path.resolve()
         for rule in rules:
             if not isinstance(rule, _RegexpIgnoreRule):
                 raise ValueError(f"_RegexpIgnoreRule cannot match rules of 
type: {type(rule)}")
-            if rule.pattern.search(str(test_path.relative_to(rule.base_dir))) 
is not None:
+            if rule.pattern.search(str(path.relative_to(rule.base_dir))) is 
not None:
                 return True
         return False
 
@@ -95,21 +97,20 @@ class _GlobIgnoreRule(NamedTuple):
             # > If there is a separator at the beginning or middle (or both) 
of the pattern, then the
             # > pattern is relative to the directory level of the particular 
.gitignore file itself.
             # > Otherwise the pattern may also match at any level below the 
.gitignore level.
-            relative_to = definition_file.resolve().parent
+            relative_to = definition_file.parent
         ignore_pattern = GitWildMatchPattern(pattern)
         return _GlobIgnoreRule(ignore_pattern.regex, pattern, 
ignore_pattern.include, relative_to)
 
     @staticmethod
     def match(path: Path, rules: List[_IgnoreRule]) -> bool:
         """Match a list of ignore rules against the supplied path"""
-        test_path: Path = path.resolve()
         matched = False
         for r in rules:
             if not isinstance(r, _GlobIgnoreRule):
                 raise ValueError(f"_GlobIgnoreRule cannot match rules of type: 
{type(r)}")
             rule: _GlobIgnoreRule = r  # explicit typing to make mypy play 
nicely
-            rel_path = str(test_path.relative_to(rule.relative_to) if 
rule.relative_to else test_path.name)
-            if rule.raw_pattern.endswith("/") and test_path.is_dir():
+            rel_path = str(path.relative_to(rule.relative_to) if 
rule.relative_to else path.name)
+            if rule.raw_pattern.endswith("/") and path.is_dir():
                 # ensure the test path will potentially match a directory 
pattern if it is a directory
                 rel_path += "/"
             if rule.include is not None and rule.pattern.match(rel_path) is 
not None:
@@ -208,10 +209,11 @@ def _find_path_from_directory(
 
     :return: a generator of file paths which should not be ignored.
     """
+    # A Dict of patterns, keyed using resolved, absolute paths
     patterns_by_dir: Dict[Path, List[_IgnoreRule]] = {}
 
     for root, dirs, files in os.walk(base_dir_path, followlinks=True):
-        patterns: List[_IgnoreRule] = patterns_by_dir.get(Path(root), [])
+        patterns: List[_IgnoreRule] = 
patterns_by_dir.get(Path(root).resolve(), [])
 
         ignore_file_path = Path(root) / ignore_file_name
         if ignore_file_path.is_file():
@@ -233,7 +235,15 @@ def _find_path_from_directory(
 
         dirs[:] = [subdir for subdir in dirs if not 
ignore_rule_type.match(Path(root) / subdir, patterns)]
 
-        patterns_by_dir.update({Path(root) / sd: patterns.copy() for sd in 
dirs})
+        # explicit loop for infinite recursion detection since we are 
following symlinks in this walk
+        for sd in dirs:
+            dirpath = (Path(root) / sd).resolve()
+            if dirpath in patterns_by_dir:
+                raise RuntimeError(
+                    "Detected recursive loop when walking DAG directory "
+                    + f"{base_dir_path}: {dirpath} has appeared more than 
once."
+                )
+            patterns_by_dir.update({dirpath: patterns.copy()})
 
         for file in files:
             if file == ignore_file_name:
diff --git a/tests/utils/test_file.py b/tests/utils/test_file.py
index c79836b58a..99f7e90a7d 100644
--- a/tests/utils/test_file.py
+++ b/tests/utils/test_file.py
@@ -16,10 +16,14 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import os
 import os.path
 import unittest
+from pathlib import Path
 from unittest import mock
 
+import pytest
+
 from airflow.utils.file import correct_maybe_zipped, find_path_from_directory, 
open_maybe_zipped
 from tests.models import TEST_DAGS_FOLDER
 
@@ -77,7 +81,24 @@ class TestOpenMaybeZipped(unittest.TestCase):
         assert isinstance(content, str)
 
 
-class TestListPyFilesPath(unittest.TestCase):
+class TestListPyFilesPath:
+    @pytest.fixture()
+    def test_dir(self, tmp_path):
+        # create test tree with symlinks
+        source = os.path.join(tmp_path, "folder")
+        target = os.path.join(tmp_path, "symlink")
+        py_file = os.path.join(source, "hello_world.py")
+        ignore_file = os.path.join(tmp_path, ".airflowignore")
+        os.mkdir(source)
+        os.symlink(source, target)
+        # write ignore files
+        with open(ignore_file, 'w') as f:
+            f.write("folder")
+        # write sample pyfile
+        with open(py_file, 'w') as f:
+            f.write("print('hello world')")
+        return tmp_path
+
     def test_find_path_from_directory_regex_ignore(self):
         should_ignore = [
             "test_invalid_cron.py",
@@ -110,3 +131,36 @@ class TestListPyFilesPath(unittest.TestCase):
         assert len(list(filter(lambda file: os.path.basename(file) in 
should_not_ignore, files))) == len(
             should_not_ignore
         )
+
+    def test_find_path_from_directory_respects_symlinks_regexp_ignore(self, 
test_dir):
+        ignore_list_file = ".airflowignore"
+        found = list(find_path_from_directory(test_dir, ignore_list_file))
+
+        assert os.path.join(test_dir, "symlink", "hello_world.py") in found
+        assert os.path.join(test_dir, "folder", "hello_world.py") not in found
+
+    def test_find_path_from_directory_respects_symlinks_glob_ignore(self, 
test_dir):
+        ignore_list_file = ".airflowignore"
+        found = list(find_path_from_directory(test_dir, ignore_list_file, 
ignore_file_syntax="glob"))
+
+        assert os.path.join(test_dir, "symlink", "hello_world.py") in found
+        assert os.path.join(test_dir, "folder", "hello_world.py") not in found
+
+    def test_find_path_from_directory_fails_on_recursive_link(self, test_dir):
+        # add a recursive link
+        recursing_src = os.path.join(test_dir, "folder2", "recursor")
+        recursing_tgt = os.path.join(test_dir, "folder2")
+        os.mkdir(recursing_tgt)
+        os.symlink(recursing_tgt, recursing_src)
+
+        ignore_list_file = ".airflowignore"
+
+        try:
+            list(find_path_from_directory(test_dir, ignore_list_file, 
ignore_file_syntax="glob"))
+            assert False, "Walking a self-recursive tree should fail"
+        except RuntimeError as err:
+            assert (
+                str(err)
+                == f"Detected recursive loop when walking DAG directory 
{test_dir}: "
+                + f"{Path(recursing_tgt).resolve()} has appeared more than 
once."
+            )

Reply via email to