shahar1 commented on code in PR #66960. URL: https://github.com/apache/airflow/pull/66960#discussion_r3247350623
########## scripts/ci/prek/check_trigger_serialize_init.py: ########## @@ -0,0 +1,284 @@ +#!/usr/bin/env python +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# /// script +# requires-python = ">=3.10,<3.11" +# dependencies = [ +# "rich>=13.6.0", +# ] +# /// +""" +Check that provider ``Trigger`` classes keep ``__init__`` and ``serialize()`` in sync. + +``__init__`` and ``serialize()`` are written as a pair: a trigger is instantiated once when the +operator defers, then serialized and re-instantiated on whichever triggerer process runs it. Any +``__init__`` parameter that ``serialize()`` does not return is silently dropped on that round-trip +-- the reconstructed trigger falls back to the parameter's default. See +https://github.com/apache/airflow/blob/main/airflow-core/docs/authoring-and-scheduling/deferring.rst + +This check parses each provider trigger module with ``ast`` and, for every trigger class whose +``__init__`` signature *and* ``serialize()`` return dict can both be resolved statically (including +through in-file base classes), flags any ``__init__`` parameter missing from the ``serialize()`` +return dict. 
+ +Classes whose ``serialize()`` is built dynamically (``**spread`` of a non-``super()`` value, +``.update()``, returning a variable, ...) or that inherit ``__init__``/``serialize()`` from a base +class defined in another file cannot be resolved statically and are skipped -- the check never +guesses, so it produces no false positives. +""" + +from __future__ import annotations + +import ast +import sys +from pathlib import Path + +from common_prek_utils import AIRFLOW_PROVIDERS_ROOT_PATH, console + +DEFERRING_DOC = ( + "https://github.com/apache/airflow/blob/main/airflow-core/docs/authoring-and-scheduling/deferring.rst" +) + +# Key format for both sets below: "<path relative to the providers/ directory>::<ClassName>". + +# Trigger classes that genuinely violate the __init__/serialize() contract today. They predate +# the check and are excluded so it can be enabled without a tree-wide fix; each is tracked for a +# follow-up fix in a separate PR. Do NOT add new entries here -- fix the trigger instead. +KNOWN_VIOLATIONS: set[str] = { + # `caller` is passed straight to DatabricksHook(caller=...) and never stored/serialized, so it + # falls back to the class-name default on a triggerer round-trip. + "databricks/src/airflow/providers/databricks/triggers/databricks.py::DatabricksExecutionTrigger", + "databricks/src/airflow/providers/databricks/triggers/databricks.py::DatabricksSQLStatementExecutionTrigger", + # `dataset_id`, `table_id`, `poll_interval` are forwarded to the parent __init__ and used, but + # the overridden serialize() omits them. + "google/src/airflow/providers/google/cloud/triggers/bigquery.py::BigQueryIntervalCheckTrigger", + # `poll_interval` and `impersonation_chain` are stored and used but missing from serialize(). + "google/src/airflow/providers/google/cloud/triggers/datafusion.py::DataFusionStartPipelineTrigger", + # `endpoint_prefix` is stored as self._endpoint_prefix and used in run() but missing from serialize(). 
+ "apache/livy/src/airflow/providers/apache/livy/triggers/livy.py::LivyTrigger", +} + +# Trigger classes that the check flags but which are correct *by design*: an old/aliased parameter +# is folded into its replacement at construction time, so it does not need to round-trip. These are +# permanent exclusions, not tech debt. +BY_DESIGN_EXCLUSIONS: set[str] = { + # `delta` is converted to an absolute `moment` in __init__ and the class deliberately serializes + # as a DateTimeTrigger (see its docstring) -- there is no `delta` to reconstruct. + "standard/src/airflow/providers/standard/triggers/temporal.py::TimeDeltaTrigger", + # `pod_name` is a deprecated alias folded into `pod_names` in __init__; serializing only + # `pod_names` preserves the value and avoids re-triggering the deprecation path on restart. + "google/src/airflow/providers/google/cloud/triggers/kubernetes_engine.py::GKEJobTrigger", + "cncf/kubernetes/src/airflow/providers/cncf/kubernetes/triggers/job.py::KubernetesJobTrigger", +} + +_EXCLUDED = KNOWN_VIOLATIONS | BY_DESIGN_EXCLUSIONS + + +def _init_param_names(func: ast.FunctionDef) -> set[str]: + """Return the names of reconstructable __init__ parameters (``*args``/``**kwargs`` excluded).""" + args = func.args + names = {a.arg for a in (*args.posonlyargs, *args.args, *args.kwonlyargs)} + names.discard("self") + names.discard("cls") + return names + + +def _base_simple_names(cls: ast.ClassDef) -> list[str]: Review Comment: > All these function names are nouns, which look weird to me. although kinda redundant, i might try `_get_...` or `_extract_...` Agree, worth adding to agent's instructions -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
