Copilot commented on code in PR #64311:
URL: https://github.com/apache/airflow/pull/64311#discussion_r3032003803
##########
airflow-core/src/airflow/models/serialized_dag.py:
##########
@@ -304,6 +304,8 @@ class SerializedDagModel(Base):
)
dag_hash: Mapped[str] = mapped_column(String(32), nullable=False)
+ __table_args__ = (Index("idx_serialized_dag_dag_id_last_updated", dag_id,
last_updated),)
Review Comment:
The new composite index uses (dag_id, last_updated), but the code paths that
fetch the “latest” serialized DAG for a dag_id appear to order/group by
created_at (e.g., latest_item_select_object(), get_latest_serialized_dags(),
read_all_dags()). With the current index, those queries can use the dag_id
prefix but still need an extra sort/aggregation on created_at. Consider
switching the second column to created_at (or updating the “latest” queries to
use last_updated) so the index matches the actual access pattern you’re trying
to speed up.
```suggestion
__table_args__ = (Index("idx_serialized_dag_dag_id_created_at", dag_id,
created_at),)
```
##########
airflow-core/src/airflow/migrations/versions/0111_3_3_0_add_performance_indexes.py:
##########
@@ -0,0 +1,68 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Add performance indexes on serialized_dag, dag_code, and task_instance.
+
+Revision ID: 76984aa0347c
+Revises: a4c2d171ae18
+Create Date: 2026-03-27 00:00:00.000000
+
+"""
+
+from __future__ import annotations
+
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "76984aa0347c"
+down_revision = "a4c2d171ae18"
+branch_labels = None
+depends_on = None
+airflow_version = "3.3.0"
+
+
+def upgrade():
+ """Add indexes for query performance on serialized_dag, dag_code, and
task_instance."""
+ with op.batch_alter_table("serialized_dag", schema=None) as batch_op:
+ batch_op.create_index(
+ "idx_serialized_dag_dag_id_last_updated", ["dag_id",
"last_updated"], unique=False
+ )
+
+ with op.batch_alter_table("dag_code", schema=None) as batch_op:
+ batch_op.create_index("idx_dag_code_dag_id", ["dag_id"], unique=False)
+
+ with op.batch_alter_table("task_instance", schema=None) as batch_op:
+ batch_op.create_index("idx_task_instance_dag_version_id",
["dag_version_id"], unique=False)
+
+
+def downgrade():
+ """Remove performance indexes."""
+ conn = op.get_bind()
+
+ # MySQL requires an index on FK columns; since dag_version_id has a FK
constraint,
+ # we cannot drop the index on MySQL.
+ if conn.dialect.name != "mysql":
+ with op.batch_alter_table("task_instance", schema=None) as batch_op:
+ batch_op.drop_index("idx_task_instance_dag_version_id")
+
Review Comment:
On MySQL/InnoDB, adding a FOREIGN KEY typically requires an index on the
referencing column and MySQL will create one automatically if it doesn’t exist.
Since task_instance.dag_version_id was introduced with a FK in 0047, this
upgrade may create a second redundant index on MySQL (extra disk/write
overhead). Consider making index creation conditional — create it only when an
index on dag_version_id does not already exist, or only on non-MySQL dialects —
or replace the auto-created FK index with this named one to avoid duplicates.
##########
airflow-core/src/airflow/migrations/versions/0111_3_3_0_add_performance_indexes.py:
##########
@@ -0,0 +1,68 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Add performance indexes on serialized_dag, dag_code, and task_instance.
+
+Revision ID: 76984aa0347c
+Revises: a4c2d171ae18
+Create Date: 2026-03-27 00:00:00.000000
+
+"""
+
+from __future__ import annotations
+
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "76984aa0347c"
+down_revision = "a4c2d171ae18"
+branch_labels = None
+depends_on = None
+airflow_version = "3.3.0"
+
+
+def upgrade():
+ """Add indexes for query performance on serialized_dag, dag_code, and
task_instance."""
+ with op.batch_alter_table("serialized_dag", schema=None) as batch_op:
+ batch_op.create_index(
+ "idx_serialized_dag_dag_id_last_updated", ["dag_id",
"last_updated"], unique=False
+ )
Review Comment:
This migration creates idx_serialized_dag_dag_id_last_updated on (dag_id,
last_updated), but the ORM queries that select the latest serialized DAG per
dag_id use created_at for ordering/aggregation. If the goal is to speed up
those “latest per dag_id” queries, consider indexing (dag_id, created_at)
instead (or adjust the query to use last_updated) so the index supports the
ORDER BY / MAX column directly.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]