This is an automated email from the ASF dual-hosted git repository.
rahulvats pushed a commit to branch v3-1-test
in repository https://gitbox.apache.org/repos/asf/airflow.git
commit 5f57234967bf959732d6c0478126df41e9d79b03
Author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Mon Mar 9 10:55:34 2026 +0100
[v3-1-test] perf: use load_only() in get_dag_runs eager loading to reduce
data fetched per task instance (#62482) (#62996)
* perf: use load_only() in eager_load_dag_run_for_validation to reduce data fetched
The get_dag_runs API endpoint was slow on large deployments because
eager_load_dag_run_for_validation() used selectinload on task_instances and
task_instances_histories without restricting which columns were fetched.
This caused SQLAlchemy to load all heavyweight columns (executor_config with
pickled data, hostname, rendered fields, etc.) for every task instance across
every DAG run in the result page — even though only dag_version_id is needed
to traverse the association proxy to DagVersion.
Add load_only(TaskInstance.dag_version_id) and
load_only(TaskInstanceHistory.dag_version_id) to the selectinload chains so
the SELECT for task instances fetches only the identity columns and the FK
needed to resolve the dag_version relationship, significantly reducing the
volume of data transferred from the database on busy deployments.
Fixes #62025
* Fix static checks
---------
(cherry picked from commit 13af96b80868ef91ca623d35afcd76003bfbda90)
Co-authored-by: Lakshmi Sravya <[email protected]>
Co-authored-by: pierrejeambrun <[email protected]>
---
airflow-core/src/airflow/api_fastapi/common/db/dag_runs.py | 13 ++++++++++++-
1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/airflow-core/src/airflow/api_fastapi/common/db/dag_runs.py b/airflow-core/src/airflow/api_fastapi/common/db/dag_runs.py
index 2a5fb76c8b8..a9ccf6031c7 100644
--- a/airflow-core/src/airflow/api_fastapi/common/db/dag_runs.py
+++ b/airflow-core/src/airflow/api_fastapi/common/db/dag_runs.py
@@ -41,13 +41,24 @@ dagruns_select_with_state_count = (
def eager_load_dag_run_for_validation() -> tuple[LoaderOption, ...]:
- """Construct the eager loading options necessary for a DagRunResponse object."""
+ """
+ Construct the eager loading options necessary for a DagRunResponse object.
+
+ For the list endpoint (get_dag_runs), loading all task instance columns is
+ wasteful because we only need the dag_version_id FK to traverse to DagVersion.
+ Using load_only() on TaskInstance and TaskInstanceHistory restricts the SELECT
+ to just the identity columns and dag_version_id, avoiding large intermediate
+ result sets caused by loading heavyweight columns (executor_config, etc.) for
+ every task instance across every DAG run returned by the query.
+ """
return (
joinedload(DagRun.dag_model),
selectinload(DagRun.task_instances)
+ .load_only(TaskInstance.dag_version_id)
.joinedload(TaskInstance.dag_version)
.joinedload(DagVersion.bundle),
selectinload(DagRun.task_instances_histories)
+ .load_only(TaskInstanceHistory.dag_version_id)
.joinedload(TaskInstanceHistory.dag_version)
.joinedload(DagVersion.bundle),
joinedload(DagRun.dag_run_note),