mik-laj commented on a change in pull request #22051:
URL: https://github.com/apache/airflow/pull/22051#discussion_r820823369
##########
File path: airflow/utils/file.py
##########
@@ -108,46 +111,167 @@ def open_maybe_zipped(fileloc, mode='r'):
return open(fileloc, mode=mode)
-def find_path_from_directory(base_dir_path: str, ignore_file_name: str) ->
Generator[str, None, None]:
+def _find_path_from_directory(
+ base_dir_path: str,
+ ignore_file_name: str,
+ compile_pattern_fn: Callable[[str, str, str], Optional[Union[Pattern,
IgnoreRule]]],
+ match_pattern_fn: Callable[[str, str, List[Union[Pattern, IgnoreRule]]],
bool],
+) -> Generator[str, None, None]:
"""
- Search the file and return the path of the file that should not be ignored.
- :param base_dir_path: the base path to be searched for.
- :param ignore_file_name: the file name in which specifies a regular
expression pattern is written.
-
- :return : file path not to be ignored.
+ Recursively search the base path and return the list of file paths that
should not be ignored by
+ regular expressions in any ignore files at each directory level.
+ :param base_dir_path: the base path to be searched
+ :param ignore_file_name: the file name containing regular expressions for
files that should be ignored.
+ :param compile_pattern_fn: a callable function which accepts a raw pattern
string, the base dir path
+ and path to the ignore file it was found in and returns a regular
expression or ignore rule.
+ :param match_pattern_fn: a callable function which accepts a root
directory a candidate subpath (file
+ or dir) and a list of patterns and returns a boolean to indicate whether
the candidate subpath
+ should be ignored.
+
+ :return : a generator of file paths which should not be ignored.
"""
- patterns_by_dir: Dict[str, List[Pattern[str]]] = {}
+ patterns_by_dir: Dict[str, List[Union[Pattern, IgnoreRule]]] = {}
for root, dirs, files in os.walk(str(base_dir_path), followlinks=True):
- patterns: List[Pattern[str]] = patterns_by_dir.get(root, [])
+ patterns: List[Union[Pattern, IgnoreRule]] = patterns_by_dir.get(root,
[])
ignore_file_path = os.path.join(root, ignore_file_name)
if os.path.isfile(ignore_file_path):
- with open(ignore_file_path) as file:
- lines_no_comments = [re.sub(r"\s*#.*", "", line) for line in
file.read().split("\n")]
- patterns += [re.compile(line) for line in lines_no_comments if
line]
- patterns = list(set(patterns))
-
- dirs[:] = [
- subdir
- for subdir in dirs
- if not any(
- p.search(os.path.join(os.path.relpath(root,
str(base_dir_path)), subdir)) for p in patterns
- )
- ]
+ with open(ignore_file_path) as ifile:
+ lines_no_comments = [re.sub(r"\s*#.*", "", line) for line in
ifile.read().split("\n")]
+ # append new patterns and filter out "None" objects, which are
invalid patterns
+ patterns += [
+ p
+ for p in [
+ compile_pattern_fn(line, root, ignore_file_path) for
line in lines_no_comments if line
+ ]
+ if p is not None
+ ]
+ # evaluation order is important with negation so that later
patterns can
+ # override earlier patterns
+ patterns = list(OrderedDict.fromkeys(patterns).keys())
+
+ dirs[:] = [subdir for subdir in dirs if not match_pattern_fn(root,
subdir, patterns)]
patterns_by_dir.update({os.path.join(root, sd): patterns.copy() for sd
in dirs})
- for file in files: # type: ignore
+ for file in files:
if file == ignore_file_name:
continue
abs_file_path = os.path.join(root, str(file))
- rel_file_path = os.path.join(os.path.relpath(root,
str(base_dir_path)), str(file))
- if any(p.search(rel_file_path) for p in patterns):
+ if match_pattern_fn(root, file, patterns):
continue
yield str(abs_file_path)
+def _regex_syntax_compile_fn(
+ pattern: str, base_path: str, ignore_file_path: str
+) -> Optional[Union[Pattern, IgnoreRule]]:
+ """
+ Compile a regular expression from the supplied pattern and log a useful
warning if it is invalid
+ :param pattern: the string pattern
+ :param base_path: the base directory where the pattern is defined, unused
+ :param ignore_file_path: the ignore file where the pattern was defined,
used for logging warnings
+
+ :return: a regular expression Pattern
+ """
+ p: Optional[Union[Pattern, IgnoreRule]] = None
+ try:
+ p = re.compile(pattern)
+ except re.error as e:
+ log.warning(f"Ignoring invalid regex '{pattern}' from
{ignore_file_path}: {e}")
+ return p
+
+
+def _get_regex_syntax_match_fn(
+ base_dir_path: str,
+) -> Callable[[str, str, List[Pattern]], bool]:
+ """
+ Return a function which can take a root, a candidate subpath and a list of
regular expression
+ patterns for evaluation.
+ :param base_dir_path: the base path for DAG searching, supplied to the
underlying function via closure
+
+ :return: a callable function which can be used to evaluate supplied ignore
expressions
+ """
+
+ def match_function(root: str, subpath: str, patterns: List[Pattern]) ->
bool:
+ test_path: str = os.path.join(os.path.relpath(root,
str(base_dir_path)), subpath)
+ return any(p.search(test_path) for p in patterns)
+
+ return match_function
+
+
+def _glob_syntax_compile_fn(
+ pattern: str, base_path: str, ignore_file_path: str
+) -> Optional[Union[Pattern, IgnoreRule]]:
+ """Build an ignore rule from the supplied pattern and log a useful warning
if it is invalid"""
+ rule = rule_from_pattern(pattern, base_path=Path(base_path).resolve())
+ if not rule:
+ # There are four reasons why a None rule might be returned by
"rule_from_pattern":
+ # 1. The pattern string is empty or is a comment - we can ignore
+ # 2. The pattern contains three or more consecutive '*' chars - should
warn
+ # 3. The pattern contains '**' not at beginning or end and it is not
between '/' chars - should warn
+ # 4. The pattern is '/' which matches nothing - should warn
+ reason = ""
+ if '/' == pattern.strip():
+ reason = "will not match any files or directories"
+ elif '***' in pattern:
+ reason = "more than two consecutive '*'"
+ elif '**' in pattern and '/**/' not in pattern:
+ reason = "'**' must be between '/' chars when not at beginning or
end of pattern"
+ if reason:
+ log.warning(f"Ignoring glob '{pattern}' from {ignore_file_path}:
{reason}")
Review comment:
```suggestion
log.warning(f"Ignoring glob '%s' from %s: %s:', pattern,
ignore_file_path, reason)
```
Please avoid stirring formatting before passing to logger.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]