[GitHub] [airflow] mik-laj commented on a change in pull request #22051: Support glob syntax in .airflowignore files (#21392)

GitBox Mon, 07 Mar 2022 07:34:20 -0800


mik-laj commented on a change in pull request #22051:
URL: https://github.com/apache/airflow/pull/22051#discussion_r820823369




##########
File path: airflow/utils/file.py
##########
@@ -108,46 +111,167 @@ def open_maybe_zipped(fileloc, mode='r'):
         return open(fileloc, mode=mode)
 
 
-def find_path_from_directory(base_dir_path: str, ignore_file_name: str) -> 
Generator[str, None, None]:
+def _find_path_from_directory(
+    base_dir_path: str,
+    ignore_file_name: str,
+    compile_pattern_fn: Callable[[str, str, str], Optional[Union[Pattern, 
IgnoreRule]]],
+    match_pattern_fn: Callable[[str, str, List[Union[Pattern, IgnoreRule]]], 
bool],
+) -> Generator[str, None, None]:
     """
-    Search the file and return the path of the file that should not be ignored.
-    :param base_dir_path: the base path to be searched for.
-    :param ignore_file_name: the file name in which specifies a regular 
expression pattern is written.
-
-    :return : file path not to be ignored.
+    Recursively search the base path and return the list of file paths that 
should not be ignored by
+    regular expressions in any ignore files at each directory level.
+    :param base_dir_path: the base path to be searched
+    :param ignore_file_name: the file name containing regular expressions for 
files that should be ignored.
+    :param compile_pattern_fn: a callable function which accepts a raw pattern 
string, the base dir path
+      and path to the ignore file it was found in and returns a regular 
expression or ignore rule.
+    :param match_pattern_fn: a callable function which accepts a root 
directory a candidate subpath (file
+      or dir) and a list of patterns and returns a boolean to indicate whether 
the candidate subpath
+      should be ignored.
+
+    :return : a generator of file paths which should not be ignored.
     """
-    patterns_by_dir: Dict[str, List[Pattern[str]]] = {}
+    patterns_by_dir: Dict[str, List[Union[Pattern, IgnoreRule]]] = {}
 
     for root, dirs, files in os.walk(str(base_dir_path), followlinks=True):
-        patterns: List[Pattern[str]] = patterns_by_dir.get(root, [])
+        patterns: List[Union[Pattern, IgnoreRule]] = patterns_by_dir.get(root, 
[])
 
         ignore_file_path = os.path.join(root, ignore_file_name)
         if os.path.isfile(ignore_file_path):
-            with open(ignore_file_path) as file:
-                lines_no_comments = [re.sub(r"\s*#.*", "", line) for line in 
file.read().split("\n")]
-                patterns += [re.compile(line) for line in lines_no_comments if 
line]
-                patterns = list(set(patterns))
-
-        dirs[:] = [
-            subdir
-            for subdir in dirs
-            if not any(
-                p.search(os.path.join(os.path.relpath(root, 
str(base_dir_path)), subdir)) for p in patterns
-            )
-        ]
+            with open(ignore_file_path) as ifile:
+                lines_no_comments = [re.sub(r"\s*#.*", "", line) for line in 
ifile.read().split("\n")]
+                # append new patterns and filter out "None" objects, which are 
invalid patterns
+                patterns += [
+                    p
+                    for p in [
+                        compile_pattern_fn(line, root, ignore_file_path) for 
line in lines_no_comments if line
+                    ]
+                    if p is not None
+                ]
+                # evaluation order is important with negation so that later 
patterns can
+                # override earlier patterns
+                patterns = list(OrderedDict.fromkeys(patterns).keys())
+
+        dirs[:] = [subdir for subdir in dirs if not match_pattern_fn(root, 
subdir, patterns)]
 
         patterns_by_dir.update({os.path.join(root, sd): patterns.copy() for sd 
in dirs})
 
-        for file in files:  # type: ignore
+        for file in files:
             if file == ignore_file_name:
                 continue
             abs_file_path = os.path.join(root, str(file))
-            rel_file_path = os.path.join(os.path.relpath(root, 
str(base_dir_path)), str(file))
-            if any(p.search(rel_file_path) for p in patterns):
+            if match_pattern_fn(root, file, patterns):
                 continue
             yield str(abs_file_path)
 
 
+def _regex_syntax_compile_fn(
+    pattern: str, base_path: str, ignore_file_path: str
+) -> Optional[Union[Pattern, IgnoreRule]]:
+    """
+    Compile a regular expression from the supplied pattern and log a useful 
warning if it is invalid
+    :param pattern: the string pattern
+    :param base_path: the base directory where the pattern is defined, unused
+    :param ignore_file_path: the ignore file where the pattern was defined, 
used for logging warnings
+
+    :return: a regular expression Pattern
+    """
+    p: Optional[Union[Pattern, IgnoreRule]] = None
+    try:
+        p = re.compile(pattern)
+    except re.error as e:
+        log.warning(f"Ignoring invalid regex '{pattern}' from 
{ignore_file_path}: {e}")
+    return p
+
+
+def _get_regex_syntax_match_fn(
+    base_dir_path: str,
+) -> Callable[[str, str, List[Pattern]], bool]:
+    """
+    Return a function which can take a root, a candidate subpath and a list of 
regular expression
+    patterns for evaluation.
+    :param base_dir_path: the base path for DAG searching, supplied to the 
underlying function via closure
+
+    :return: a callable function which can be used to evaluate supplied ignore 
expressions
+    """
+
+    def match_function(root: str, subpath: str, patterns: List[Pattern]) -> 
bool:
+        test_path: str = os.path.join(os.path.relpath(root, 
str(base_dir_path)), subpath)
+        return any(p.search(test_path) for p in patterns)
+
+    return match_function
+
+
+def _glob_syntax_compile_fn(
+    pattern: str, base_path: str, ignore_file_path: str
+) -> Optional[Union[Pattern, IgnoreRule]]:
+    """Build an ignore rule from the supplied pattern and log a useful warning 
if it is invalid"""
+    rule = rule_from_pattern(pattern, base_path=Path(base_path).resolve())
+    if not rule:
+        # There are four reasons why a None rule might be returned by 
"rule_from_pattern":
+        # 1. The pattern string is empty or is a comment - we can ignore
+        # 2. The pattern contains three or more consecutive '*' chars - should 
warn
+        # 3. The pattern contains '**' not at beginning or end and it is not 
between '/' chars - should warn
+        # 4. The pattern is '/' which matches nothing - should warn
+        reason = ""
+        if '/' == pattern.strip():
+            reason = "will not match any files or directories"
+        elif '***' in pattern:
+            reason = "more than two consecutive '*'"
+        elif '**' in pattern and '/**/' not in pattern:
+            reason = "'**' must be between '/' chars when not at beginning or 
end of pattern"
+        if reason:
+            log.warning(f"Ignoring glob '{pattern}' from {ignore_file_path}: 
{reason}")

Review comment:
       ```suggestion
               log.warning("Ignoring glob '%s' from %s: %s", pattern, ignore_file_path, reason)
   ```
   Please avoid string formatting before passing the message to the logger.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [airflow] mik-laj commented on a change in pull request #22051: Support glob syntax in .airflowignore files (#21392)

Reply via email to