pierrejeambrun commented on code in PR #47543:
URL: https://github.com/apache/airflow/pull/47543#discussion_r2013921868


##########
airflow/api_fastapi/core_api/middleware.py:
##########
@@ -37,3 +46,130 @@ async def dispatch(self, request: Request, call_next):
                     detail=body["error"],
                 )
         return response
+
+
+class RegexpExceptionMiddleware(BaseHTTPMiddleware):
+    """Middleware that converts exceptions response if any field in the 
request contains regexp pattern."""
+
+    @classmethod
+    def _detect_regexp(cls, key: str | None = None, value: Any = None) -> str 
| None:
+        """Return the key value if the value contains a regexp pattern."""
+        # Common evil regex patterns
+        common_evil_regex = [
+            # Three common regex structures
+            # Please be proactive if there are important regex structures that 
are not included here
+            "(a+)+",  # Nesting quantifiers
+            "(a|a)+",  # Quantified overlapping disjunctions
+            r"\d+\d+",  # Quantified Overlapping Adjacencies
+        ]
+        # There are infinite ways to write a regex pattern, so we are checking 
for common regex structures
+        # If the value contains any of the common regex structures, we will 
consider it as a regex pattern
+        regex_structures = [
+            # Three common regex structures
+            r"\[.*?\]",  # Character classes
+            r"\(.*?\)",  # Grouping
+            r"\{.*?,.*?\}",  # Quantifiers with ranges
+            r"\^.*\$",  # start and end anchors
+            r"\|",  # or operator
+            r"\(\?.*?\)",  # non-capturing groups
+            r"\.\*",  # common wildcard
+            r"\.\+",
+            r"\.\?",
+            r"\\A",  # start of string
+            r"\\b",  # word boundary
+            r"\\B",  # non-word boundary
+            r"\\d",  # digit
+            r"\\D",  # non-digit
+            r"\\s",  # whitespace
+            r"\\S",  # non-whitespace
+            r"\\w",  # word character
+            r"\\W",  # non-word character
+            r"\\Z",  # end of string
+            r"\*",  # quantifier
+            r"\+",  # quantifier
+            r"\?",  # quantifier
+            r"\\.",  # escaped dot
+            r"\\-",  # escaped dash
+            r"\\/",  # escaped slash
+            r"\\\\",  # escaped backslash
+        ]
+
+        # Date-time pattern regex to exclude
+        date_time_pattern = 
r"^\d{4}-\d{2}-\d{2}(T\d{2}:\d{2}:\d{2}(\.\d+)?)?(Z|[\+-]\d{2}:\d{2})?$"
+
+        # Excluded keys to avoid checking for fields that can contain regexp 
like patterns
+        # password and extra are excluded as they can contain any string for 
connections.
+        # We should always ensure these fields not accepting regexp and 
validate accordingly.
+        excluded_keys: set = {"password", "extra"}
+
+        compiled_structures = [re.compile(pattern) for pattern in 
regex_structures]
+
+        if isinstance(value, str):
+            # Early return if the value is empty or the key is in the excluded 
keys
+            if key in excluded_keys or value == "":
+                return None
+            # Check if the string is a valid JSON and call the function 
recursively
+            try:
+                dict_candidate = json.loads(value)
+                return cls._detect_regexp(key=key, value=dict_candidate)
+            except JSONDecodeError:
+                pass
+
+            # Include matching regex indicators and exclude date-time pattern
+            if (
+                any(structure.search(value) for structure in 
compiled_structures)

Review Comment:
   That is an attack vector. (The regexp patterns are fixed, but the value is user 
input; crafting an input that triggers excessive backtracking against one of 
our patterns should be doable.)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to