This is an automated email from the ASF dual-hosted git repository.
potiuk pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git
The following commit(s) were added to refs/heads/main by this push:
new 8494fc7036 Fix regression in ignoring symlinks (#23535)
8494fc7036 is described below
commit 8494fc7036c33683af06a0e57474b8a6157fda05
Author: Ian Buss <[email protected]>
AuthorDate: Fri May 20 07:35:41 2022 +0100
Fix regression in ignoring symlinks (#23535)
---
airflow/utils/file.py | 32 +++++++++++++++++----------
tests/utils/test_file.py | 56 +++++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 76 insertions(+), 12 deletions(-)
diff --git a/airflow/utils/file.py b/airflow/utils/file.py
index 89d0b7d9fc..5a3db7fd48 100644
--- a/airflow/utils/file.py
+++ b/airflow/utils/file.py
@@ -40,11 +40,14 @@ class _IgnoreRule(Protocol):
@staticmethod
def compile(pattern: str, base_dir: Path, definition_file: Path) ->
Optional['_IgnoreRule']:
- pass
+ """
+ Build an ignore rule from the supplied pattern where base_dir
+ and definition_file should be absolute paths.
+ """
@staticmethod
def match(path: Path, rules: List['_IgnoreRule']) -> bool:
- pass
+ """Match a candidate absolute path against a list of rules"""
class _RegexpIgnoreRule(NamedTuple):
@@ -57,7 +60,7 @@ class _RegexpIgnoreRule(NamedTuple):
def compile(pattern: str, base_dir: Path, definition_file: Path) ->
Optional[_IgnoreRule]:
"""Build an ignore rule from the supplied regexp pattern and log a
useful warning if it is invalid"""
try:
- return _RegexpIgnoreRule(re.compile(pattern), base_dir.resolve())
+ return _RegexpIgnoreRule(re.compile(pattern), base_dir)
except re.error as e:
log.warning("Ignoring invalid regex '%s' from %s: %s", pattern,
definition_file, e)
return None
@@ -65,11 +68,10 @@ class _RegexpIgnoreRule(NamedTuple):
@staticmethod
def match(path: Path, rules: List[_IgnoreRule]) -> bool:
"""Match a list of ignore rules against the supplied path"""
- test_path: Path = path.resolve()
for rule in rules:
if not isinstance(rule, _RegexpIgnoreRule):
raise ValueError(f"_RegexpIgnoreRule cannot match rules of
type: {type(rule)}")
- if rule.pattern.search(str(test_path.relative_to(rule.base_dir)))
is not None:
+ if rule.pattern.search(str(path.relative_to(rule.base_dir))) is
not None:
return True
return False
@@ -95,21 +97,20 @@ class _GlobIgnoreRule(NamedTuple):
# > If there is a separator at the beginning or middle (or both)
of the pattern, then the
# > pattern is relative to the directory level of the particular
.gitignore file itself.
# > Otherwise the pattern may also match at any level below the
.gitignore level.
- relative_to = definition_file.resolve().parent
+ relative_to = definition_file.parent
ignore_pattern = GitWildMatchPattern(pattern)
return _GlobIgnoreRule(ignore_pattern.regex, pattern,
ignore_pattern.include, relative_to)
@staticmethod
def match(path: Path, rules: List[_IgnoreRule]) -> bool:
"""Match a list of ignore rules against the supplied path"""
- test_path: Path = path.resolve()
matched = False
for r in rules:
if not isinstance(r, _GlobIgnoreRule):
raise ValueError(f"_GlobIgnoreRule cannot match rules of type:
{type(r)}")
rule: _GlobIgnoreRule = r # explicit typing to make mypy play
nicely
- rel_path = str(test_path.relative_to(rule.relative_to) if
rule.relative_to else test_path.name)
- if rule.raw_pattern.endswith("/") and test_path.is_dir():
+ rel_path = str(path.relative_to(rule.relative_to) if
rule.relative_to else path.name)
+ if rule.raw_pattern.endswith("/") and path.is_dir():
# ensure the test path will potentially match a directory
pattern if it is a directory
rel_path += "/"
if rule.include is not None and rule.pattern.match(rel_path) is
not None:
@@ -208,10 +209,11 @@ def _find_path_from_directory(
:return: a generator of file paths which should not be ignored.
"""
+ # A Dict of patterns, keyed using resolved, absolute paths
patterns_by_dir: Dict[Path, List[_IgnoreRule]] = {}
for root, dirs, files in os.walk(base_dir_path, followlinks=True):
- patterns: List[_IgnoreRule] = patterns_by_dir.get(Path(root), [])
+ patterns: List[_IgnoreRule] =
patterns_by_dir.get(Path(root).resolve(), [])
ignore_file_path = Path(root) / ignore_file_name
if ignore_file_path.is_file():
@@ -233,7 +235,15 @@ def _find_path_from_directory(
dirs[:] = [subdir for subdir in dirs if not
ignore_rule_type.match(Path(root) / subdir, patterns)]
- patterns_by_dir.update({Path(root) / sd: patterns.copy() for sd in
dirs})
+ # explicit loop for infinite recursion detection since we are
following symlinks in this walk
+ for sd in dirs:
+ dirpath = (Path(root) / sd).resolve()
+ if dirpath in patterns_by_dir:
+ raise RuntimeError(
+ "Detected recursive loop when walking DAG directory "
+ + f"{base_dir_path}: {dirpath} has appeared more than
once."
+ )
+ patterns_by_dir.update({dirpath: patterns.copy()})
for file in files:
if file == ignore_file_name:
diff --git a/tests/utils/test_file.py b/tests/utils/test_file.py
index c79836b58a..99f7e90a7d 100644
--- a/tests/utils/test_file.py
+++ b/tests/utils/test_file.py
@@ -16,10 +16,14 @@
# specific language governing permissions and limitations
# under the License.
+import os
import os.path
import unittest
+from pathlib import Path
from unittest import mock
+import pytest
+
from airflow.utils.file import correct_maybe_zipped, find_path_from_directory,
open_maybe_zipped
from tests.models import TEST_DAGS_FOLDER
@@ -77,7 +81,24 @@ class TestOpenMaybeZipped(unittest.TestCase):
assert isinstance(content, str)
-class TestListPyFilesPath(unittest.TestCase):
+class TestListPyFilesPath:
+ @pytest.fixture()
+ def test_dir(self, tmp_path):
+ # create test tree with symlinks
+ source = os.path.join(tmp_path, "folder")
+ target = os.path.join(tmp_path, "symlink")
+ py_file = os.path.join(source, "hello_world.py")
+ ignore_file = os.path.join(tmp_path, ".airflowignore")
+ os.mkdir(source)
+ os.symlink(source, target)
+ # write ignore files
+ with open(ignore_file, 'w') as f:
+ f.write("folder")
+ # write sample pyfile
+ with open(py_file, 'w') as f:
+ f.write("print('hello world')")
+ return tmp_path
+
def test_find_path_from_directory_regex_ignore(self):
should_ignore = [
"test_invalid_cron.py",
@@ -110,3 +131,36 @@ class TestListPyFilesPath(unittest.TestCase):
assert len(list(filter(lambda file: os.path.basename(file) in
should_not_ignore, files))) == len(
should_not_ignore
)
+
+ def test_find_path_from_directory_respects_symlinks_regexp_ignore(self,
test_dir):
+ ignore_list_file = ".airflowignore"
+ found = list(find_path_from_directory(test_dir, ignore_list_file))
+
+ assert os.path.join(test_dir, "symlink", "hello_world.py") in found
+ assert os.path.join(test_dir, "folder", "hello_world.py") not in found
+
+ def test_find_path_from_directory_respects_symlinks_glob_ignore(self,
test_dir):
+ ignore_list_file = ".airflowignore"
+ found = list(find_path_from_directory(test_dir, ignore_list_file,
ignore_file_syntax="glob"))
+
+ assert os.path.join(test_dir, "symlink", "hello_world.py") in found
+ assert os.path.join(test_dir, "folder", "hello_world.py") not in found
+
+ def test_find_path_from_directory_fails_on_recursive_link(self, test_dir):
+ # add a recursive link
+ recursing_src = os.path.join(test_dir, "folder2", "recursor")
+ recursing_tgt = os.path.join(test_dir, "folder2")
+ os.mkdir(recursing_tgt)
+ os.symlink(recursing_tgt, recursing_src)
+
+ ignore_list_file = ".airflowignore"
+
+ try:
+ list(find_path_from_directory(test_dir, ignore_list_file,
ignore_file_syntax="glob"))
+ assert False, "Walking a self-recursive tree should fail"
+ except RuntimeError as err:
+ assert (
+ str(err)
+ == f"Detected recursive loop when walking DAG directory
{test_dir}: "
+ + f"{Path(recursing_tgt).resolve()} has appeared more than
once."
+ )