This is an automated email from the ASF dual-hosted git repository.
potiuk pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git
The following commit(s) were added to refs/heads/main by this push:
new 9f5e76f053d Add inventory of providers using DB (#49189)
9f5e76f053d is described below
commit 9f5e76f053dcfe07b86368a52b1bcbe8b5dd7554
Author: Jarek Potiuk <[email protected]>
AuthorDate: Sun Apr 13 22:15:07 2025 +0200
Add inventory of providers using DB (#49189)
---
dev/provider_db_inventory.py | 105 +++++++++++++++++++++++++++++++++++++++++++
1 file changed, 105 insertions(+)
diff --git a/dev/provider_db_inventory.py b/dev/provider_db_inventory.py
new file mode 100755
index 00000000000..296938891bf
--- /dev/null
+++ b/dev/provider_db_inventory.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# PEP 723 compliant inline script metadata
+# /// script
+# requires-python = ">=3.9"
+# dependencies = [
+# "rich",
+# "pyyaml",
+# ]
+# ///
+from __future__ import annotations
+
+import os
+import re
+from collections import defaultdict
+from pathlib import Path
+
+import rich
+import yaml
+
# Repository root: this script lives in dev/, so it is one level up.
AIRFLOW_SOURCES_PATH = Path(__file__).parents[1]

# Directories to scan
# NOTE: rglob returns a lazy generator; it is consumed exactly once, by main().
PYPROJECT_TOML_FILES = AIRFLOW_SOURCES_PATH.rglob("providers/**/pyproject.toml")

# Patterns to identify Airflow metadata DB access
# Each entry is a (match_pattern, exclude_pattern) pair: a line counts as DB
# access when match_pattern is found and exclude_pattern (if not None) is not.
DB_PATTERNS: list[tuple[re.Pattern, re.Pattern | None]] = [
    (re.compile(r"from airflow\.utils\.session"), None),
    (re.compile(r"from airflow\.settings import Session"), None),
    (re.compile(r"@provide_session"), None),
    (re.compile(r"from sqlalchemy\.orm\.session"), None),
    (re.compile(r"session\.query"), None),
]

# Results accumulated during the scan:
# provider package name -> affected files (paths relative to the sources root).
AFFECTED_PROVIDERS: dict[str, list[Path]] = defaultdict(list)
# affected file -> markdown bullet lines, one per matched source line.
MATCHES: dict[Path, list[str]] = defaultdict(list)
+
+
def line_matches_pattern(line: str, patterns: list[tuple[re.Pattern, re.Pattern | None]]) -> bool:
    """Check if a line matches any metadata DB access pattern.

    A ``(pattern, exclude_pattern)`` pair matches when ``pattern`` is found in
    the line and ``exclude_pattern`` (when provided) is not.
    """
    for include_pattern, exclude_pattern in patterns:
        if not include_pattern.search(line):
            continue
        if exclude_pattern is not None and exclude_pattern.search(line):
            # The exclusion pattern vetoes this match - try the next pair.
            continue
        return True
    return False
+
+
def any_line_matches_pattern(filepath: Path) -> bool:
    """Scan a single file for metadata DB access patterns.

    :param filepath: path of the file to scan, relative to the Airflow sources
        root (the generated GitHub links rely on it being relative).
    :return: True if at least one line of the file matched a pattern.

    Side effect: every match is printed and recorded as a markdown bullet in
    the global ``MATCHES`` dict, keyed by ``filepath``.
    """
    # Resolve against the repository root so the script works no matter what
    # the current working directory is (joining an absolute path is a no-op,
    # so absolute arguments still behave as before). Read as UTF-8 explicitly:
    # source files are UTF-8 regardless of the locale's default encoding.
    lines = (AIRFLOW_SOURCES_PATH / filepath).read_text(encoding="utf-8").splitlines()
    matches = False
    for i, line in enumerate(lines, start=1):
        if line_matches_pattern(line, DB_PATTERNS):
            rich.print(f"[bright_blue]Match found[/] in {filepath} -> #{i}:{line}")
            MATCHES[filepath].append(
                f"[Line:{i}](https://github.com/apache/airflow/blob/main/{filepath}#L{i}): {line} "
            )
            matches = True
    return matches
+
+
def scan_directory(directory: Path) -> None:
    """Scan the ``src`` tree of one provider directory for metadata DB access.

    Reads the provider's ``provider.yaml`` to get its package name, then scans
    every ``*.py`` file under ``src`` and records affected files (as paths
    relative to the sources root) in the global ``AFFECTED_PROVIDERS`` dict.

    :param directory: directory containing the provider's ``pyproject.toml``.
    """
    provider_yaml = directory / "provider.yaml"
    if not provider_yaml.exists():
        # Not every directory holding a pyproject.toml is a provider package
        # (e.g. helper sub-projects) - skip those instead of crashing.
        return
    provider_name = yaml.safe_load(provider_yaml.read_text())["package-name"]
    for path in (directory / "src").rglob("*.py"):
        rel_path = path.relative_to(AIRFLOW_SOURCES_PATH)
        if any_line_matches_pattern(rel_path):
            rich.print(f"[green]Found metadata DB access in {path}[/]")
            AFFECTED_PROVIDERS[provider_name].append(rel_path)
+
+
def main():
    """Scan every provider package and print a markdown inventory of DB access."""
    for pyproject_toml in PYPROJECT_TOML_FILES:
        provider_dir = pyproject_toml.parent
        if os.path.exists(provider_dir):
            rich.print(f"Scanning src folder of {provider_dir}...")
            scan_directory(provider_dir)
    print()
    print(f"Found {len(AFFECTED_PROVIDERS)} providers with metadata DB access patterns:")
    print()
    # Emit one markdown section per provider, each file as a checklist item
    # with its individual line matches nested beneath it.
    for provider_name in sorted(AFFECTED_PROVIDERS):
        print(f"## Provider: {provider_name}\n")
        for affected_file in AFFECTED_PROVIDERS[provider_name]:
            print(f" - [ ] [{affected_file.name}](https://github.com/apache/airflow/blob/main/{affected_file})")
            for match_line in MATCHES[affected_file]:
                print(f"   - {match_line}")
        print()


if __name__ == "__main__":
    main()