kaxil commented on a change in pull request #14048:
URL: https://github.com/apache/airflow/pull/14048#discussion_r572011423
##########
File path: airflow/models/taskinstance.py
##########
@@ -172,17 +173,34 @@ def clear_task_instances(
if tr_filter:
# Clear all reschedules related to the ti to clear
- delete_qry = TR.__table__.delete().where(
- or_(
- and_(
- TR.dag_id == dag_id,
- TR.task_id == task_id,
- TR.execution_date == execution_date,
- TR.try_number == try_number,
- )
- for dag_id, task_id, execution_date, try_number in tr_filter
+
+ # This is an optimization for the common case where all tis are for a
small number
+ # of dag_id, execution_date and try_number. Use a nested dict of
dag_id,
+ # execution_date, try_number and task_id to construct the where clause
in a
+ # hierarchical manner. This speeds up the delete statement by more
than 40x for
+ # large number of tis (50k+).
+ task_id_by_key = defaultdict(lambda: defaultdict(lambda:
defaultdict(set)))
+ for dag_id, task_id, execution_date, try_number in tr_filter:
+ task_id_by_key[dag_id][execution_date][try_number].add(task_id)
Review comment:
Can you post the code you used to measure the time taken, so we can see the 40x speedup?
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org