Lee-W commented on code in PR #66960:
URL: https://github.com/apache/airflow/pull/66960#discussion_r3246629221


##########
scripts/ci/prek/check_trigger_serialize_init.py:
##########
@@ -0,0 +1,284 @@
+#!/usr/bin/env python
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# /// script
+# requires-python = ">=3.10,<3.11"
+# dependencies = [
+#   "rich>=13.6.0",
+# ]
+# ///
+"""
+Check that provider ``Trigger`` classes keep ``__init__`` and ``serialize()`` in sync.
+
+``__init__`` and ``serialize()`` are written as a pair: a trigger is instantiated once when the
+operator defers, then serialized and re-instantiated on whichever triggerer process runs it. Any
+``__init__`` parameter that ``serialize()`` does not return is silently dropped on that round-trip
+-- the reconstructed trigger falls back to the parameter's default. See
+https://github.com/apache/airflow/blob/main/airflow-core/docs/authoring-and-scheduling/deferring.rst
+
+This check parses each provider trigger module with ``ast`` and, for every trigger class whose
+``__init__`` signature *and* ``serialize()`` return dict can both be resolved statically (including
+through in-file base classes), flags any ``__init__`` parameter missing from the ``serialize()``
+return dict.
+
+Classes whose ``serialize()`` is built dynamically (``**spread`` of a non-``super()`` value,
+``.update()``, returning a variable, ...) or that inherit ``__init__``/``serialize()`` from a base
+class defined in another file cannot be resolved statically and are skipped -- the check never
+guesses, so it produces no false positives.
+"""
+
+from __future__ import annotations
+
+import ast
+import sys
+from pathlib import Path
+
+from common_prek_utils import AIRFLOW_PROVIDERS_ROOT_PATH, console
+
+DEFERRING_DOC = (
+    "https://github.com/apache/airflow/blob/main/airflow-core/docs/authoring-and-scheduling/deferring.rst"
+)
+
+# Key format for both sets below: "<path relative to the providers/ directory>::<ClassName>".
+
+# Trigger classes that genuinely violate the __init__/serialize() contract today. They predate
+# the check and are excluded so it can be enabled without a tree-wide fix; each is tracked for a
+# follow-up fix in a separate PR. Do NOT add new entries here -- fix the trigger instead.
+KNOWN_VIOLATIONS: set[str] = {
+    # `caller` is passed straight to DatabricksHook(caller=...) and never stored/serialized, so it
+    # falls back to the class-name default on a triggerer round-trip.
+    "databricks/src/airflow/providers/databricks/triggers/databricks.py::DatabricksExecutionTrigger",
+    "databricks/src/airflow/providers/databricks/triggers/databricks.py::DatabricksSQLStatementExecutionTrigger",
+    # `dataset_id`, `table_id`, `poll_interval` are forwarded to the parent __init__ and used, but
+    # the overridden serialize() omits them.
+    "google/src/airflow/providers/google/cloud/triggers/bigquery.py::BigQueryIntervalCheckTrigger",
+    # `poll_interval` and `impersonation_chain` are stored and used but missing from serialize().
+    "google/src/airflow/providers/google/cloud/triggers/datafusion.py::DataFusionStartPipelineTrigger",
+    # `endpoint_prefix` is stored as self._endpoint_prefix and used in run() but missing from serialize().
+    "apache/livy/src/airflow/providers/apache/livy/triggers/livy.py::LivyTrigger",
+}
+
+# Trigger classes that the check flags but which are correct *by design*: an old/aliased parameter
+# is folded into its replacement at construction time, so it does not need to round-trip. These are
+# permanent exclusions, not tech debt.
+BY_DESIGN_EXCLUSIONS: set[str] = {
+    # `delta` is converted to an absolute `moment` in __init__ and the class deliberately serializes
+    # as a DateTimeTrigger (see its docstring) -- there is no `delta` to reconstruct.
+    "standard/src/airflow/providers/standard/triggers/temporal.py::TimeDeltaTrigger",
+    # `pod_name` is a deprecated alias folded into `pod_names` in __init__; serializing only
+    # `pod_names` preserves the value and avoids re-triggering the deprecation path on restart.
+    "google/src/airflow/providers/google/cloud/triggers/kubernetes_engine.py::GKEJobTrigger",
+    "cncf/kubernetes/src/airflow/providers/cncf/kubernetes/triggers/job.py::KubernetesJobTrigger",
+}
+
+_EXCLUDED = KNOWN_VIOLATIONS | BY_DESIGN_EXCLUSIONS
+
+
+def _init_param_names(func: ast.FunctionDef) -> set[str]:
+    """Return the names of reconstructable __init__ parameters 
(``*args``/``**kwargs`` excluded)."""
+    args = func.args
+    names = {a.arg for a in (*args.posonlyargs, *args.args, *args.kwonlyargs)}
+    names.discard("self")
+    names.discard("cls")
+    return names
+
+
+def _base_simple_names(cls: ast.ClassDef) -> list[str]:
+    """Return the simple (last-component) names of a class's bases."""
+    out: list[str] = []
+    for base in cls.bases:
+        if isinstance(base, ast.Name):
+            out.append(base.id)
+        elif isinstance(base, ast.Attribute):
+            out.append(base.attr)
+    return out
+
+
+def _method(cls: ast.ClassDef, name: str) -> ast.FunctionDef | None:
+    for node in cls.body:
+        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and node.name == name:
+            return node if isinstance(node, ast.FunctionDef) else None
+    return None
+
+
+class ModuleAnalyzer:
+    """Resolves trigger __init__/serialize() pairs within a single module, 
following in-file bases."""
+
+    def __init__(self, path: Path) -> None:
+        self.path = path
+        self.classes: dict[str, ast.ClassDef] = {}
+        tree = ast.parse(path.read_text("utf-8"), str(path))
+        for node in ast.walk(tree):
+            if isinstance(node, ast.ClassDef):
+                self.classes[node.name] = node

Review Comment:
   ```suggestion
           tree = ast.parse(path.read_text("utf-8"), str(path))
           self.classes: dict[str, ast.ClassDef] = {
               node.name: node
               for node in ast.walk(tree)
               if isinstance(node, ast.ClassDef)
           }
   ```
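
For context, here is a minimal sketch of the contract the script enforces, using a hypothetical `ExampleTrigger` (illustrative only, not code from this PR). Every reconstructable `__init__` parameter must appear in the dict returned by `serialize()`; anything omitted silently falls back to its default when the triggerer re-instantiates the class:

```python
import asyncio
from collections.abc import AsyncIterator
from typing import Any

from airflow.triggers.base import BaseTrigger, TriggerEvent


class ExampleTrigger(BaseTrigger):
    def __init__(self, job_id: str, poll_interval: float = 5.0) -> None:
        super().__init__()
        self.job_id = job_id
        self.poll_interval = poll_interval

    def serialize(self) -> tuple[str, dict[str, Any]]:
        # In sync with __init__: both parameters round-trip. Dropping
        # "poll_interval" here is exactly the silent loss the check flags.
        return (
            "example.triggers.ExampleTrigger",
            {"job_id": self.job_id, "poll_interval": self.poll_interval},
        )

    async def run(self) -> AsyncIterator[TriggerEvent]:
        # Trivial body so the sketch is complete; a real trigger would poll.
        await asyncio.sleep(self.poll_interval)
        yield TriggerEvent({"job_id": self.job_id, "status": "done"})
```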



##########
scripts/ci/prek/check_trigger_serialize_init.py:
##########
+def _base_simple_names(cls: ast.ClassDef) -> list[str]:

Review Comment:
   All these function names are nouns, which looks weird to me. Although it's a bit redundant, I might try `_get_...` or `_extract_...` instead.
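
If it helps, a sketch of how the rename could look (the name follows the reviewer's `_get_...` suggestion and is hypothetical, not committed code):

```python
import ast


def _get_base_simple_names(cls: ast.ClassDef) -> list[str]:
    """Return the simple (last-component) names of a class's bases."""
    out: list[str] = []
    for base in cls.bases:
        if isinstance(base, ast.Name):
            out.append(base.id)
        elif isinstance(base, ast.Attribute):
            out.append(base.attr)
    return out
```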



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]
