pierrejeambrun commented on code in PR #47543:
URL: https://github.com/apache/airflow/pull/47543#discussion_r2013921868
##########
airflow/api_fastapi/core_api/middleware.py:
##########
@@ -37,3 +46,130 @@ async def dispatch(self, request: Request, call_next):
detail=body["error"],
)
return response
+
+
+class RegexpExceptionMiddleware(BaseHTTPMiddleware):
+ """Middleware that converts exceptions response if any field in the
request contains regexp pattern."""
+
+ @classmethod
+ def _detect_regexp(cls, key: str | None = None, value: Any = None) -> str
| None:
+ """Return the key value if the value contains a regexp pattern."""
+ # Common evil regex patterns
+ common_evil_regex = [
+ # Three common regex structures
+ # Please be proactive if there are important regex structures that
are not included here
+ "(a+)+", # Nesting quantifiers
+ "(a|a)+", # Quantified overlapping disjunctions
+ r"\d+\d+", # Quantified Overlapping Adjacencies
+ ]
+ # There are infinite ways to write a regex pattern, so we are checking
for common regex structures
+ # If the value contains any of the common regex structures, we will
consider it as a regex pattern
+ regex_structures = [
+ # Three common regex structures
+ r"\[.*?\]", # Character classes
+ r"\(.*?\)", # Grouping
+ r"\{.*?,.*?\}", # Quantifiers with ranges
+ r"\^.*\$", # start and end anchors
+ r"\|", # or operator
+ r"\(\?.*?\)", # non-capturing groups
+ r"\.\*", # common wildcard
+ r"\.\+",
+ r"\.\?",
+ r"\\A", # start of string
+ r"\\b", # word boundary
+ r"\\B", # non-word boundary
+ r"\\d", # digit
+ r"\\D", # non-digit
+ r"\\s", # whitespace
+ r"\\S", # non-whitespace
+ r"\\w", # word character
+ r"\\W", # non-word character
+ r"\\Z", # end of string
+ r"\*", # quantifier
+ r"\+", # quantifier
+ r"\?", # quantifier
+ r"\\.", # escaped dot
+ r"\\-", # escaped dash
+ r"\\/", # escaped slash
+ r"\\\\", # escaped backslash
+ ]
+
+ # Date-time pattern regex to exclude
+ date_time_pattern =
r"^\d{4}-\d{2}-\d{2}(T\d{2}:\d{2}:\d{2}(\.\d+)?)?(Z|[\+-]\d{2}:\d{2})?$"
+
+ # Excluded keys to avoid checking for fields that can contain regexp
like patterns
+ # password and extra are excluded as they can contain any string for
connections.
+ # We should always ensure these fields not accepting regexp and
validate accordingly.
+ excluded_keys: set = {"password", "extra"}
+
+ compiled_structures = [re.compile(pattern) for pattern in
regex_structures]
+
+ if isinstance(value, str):
+ # Early return if the value is empty or the key is in the excluded
keys
+ if key in excluded_keys or value == "":
+ return None
+ # Check if the string is a valid JSON and call the function
recursively
+ try:
+ dict_candidate = json.loads(value)
+ return cls._detect_regexp(key=key, value=dict_candidate)
+ except JSONDecodeError:
+ pass
+
+ # Include matching regex indicators and exclude date-time pattern
+ if (
+ any(structure.search(value) for structure in
compiled_structures)
Review Comment:
That is an attack vector. (The regexp patterns are fixed, but the value is
user input; crafting an input that causes excessive backtracking against one
of our patterns should be doable.)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]