This is an automated email from the ASF dual-hosted git repository. potiuk pushed a commit to branch v2-2-test in repository https://gitbox.apache.org/repos/asf/airflow.git
commit 1b958ca85dc9abafb0ad687c0a32992f59ae6186 Author: Jarek Potiuk <[email protected]> AuthorDate: Fri Dec 3 00:44:59 2021 +0100 Fix race condition when starting DagProcessorAgent (#19935) As described in detail in #19860, there was a race condition in starting and terminating DagProcessorAgent that caused us a lot of headaches with flaky test_scheduler_job failures on our CI and after long investigation, it turned out to be a race condition. Not very likely, but possible to happen in production. The race condition involved starting DagProcessorAgent via multiprocessing, where the first action of the agent was changing the process GID to be the same as PID. If the DagProcessorAgent was terminated quickly (on a busy system) before the process could change the GID, the `reap_process_group` that was supposed to kill the whole group, was failing and the DagProcessorAgent remained running. This problem revealed a wrong behaviour of Airflow in some edge conditions when 'spawn' mode was used for starting the DAG processor. Details are described in #19934, but this problem will have to be solved differently (avoiding ORM reinitialization during DAG processor starting). This change also moves the tests for `spawn` method out from test_scheduler_job.py (it was a remnant of old Airflow and it did not really test what it was supposed to test). Instead tests were added for different spawn modes and killing the processor agent in both spawn and "default" mode. 
(cherry picked from commit 525484388464619832a14d1b28e06e3a097aac97) --- airflow/dag_processing/manager.py | 10 +++++++ airflow/utils/process_utils.py | 55 ++++++++++++++++++++++++---------- tests/dag_processing/test_processor.py | 50 +++++++++++++++++++++++++++++++ 3 files changed, 100 insertions(+), 15 deletions(-) diff --git a/airflow/dag_processing/manager.py b/airflow/dag_processing/manager.py index 19a97ff..00bffc5 100644 --- a/airflow/dag_processing/manager.py +++ b/airflow/dag_processing/manager.py @@ -260,6 +260,16 @@ class DagFileProcessorAgent(LoggingMixin, MultiprocessingStartMethodMixin): os.environ['AIRFLOW__LOGGING__COLORED_CONSOLE_LOG'] = 'False' # Replicating the behavior of how logging module was loaded # in logging_config.py + + # TODO: This reloading should be removed when we fix our logging behaviour + # In case of "spawn" method of starting processes for multiprocessing, reinitializing of the + # SQLAlchemy engine causes extremely unexpected behaviour of messing with objects already loaded + # in a parent process (likely via resources shared in memory by the ORM libraries). 
+ # This caused flaky tests in our CI for many months and has been discovered while + # iterating on https://github.com/apache/airflow/pull/19860 + # The issue that describes the problem and possible remediation is + # at https://github.com/apache/airflow/issues/19934 + importlib.reload(import_module(airflow.settings.LOGGING_CLASS_PATH.rsplit('.', 1)[0])) # type: ignore importlib.reload(airflow.settings) airflow.settings.initialize() diff --git a/airflow/utils/process_utils.py b/airflow/utils/process_utils.py index 3c03917..3b12115 100644 --- a/airflow/utils/process_utils.py +++ b/airflow/utils/process_utils.py @@ -45,7 +45,7 @@ DEFAULT_TIME_TO_WAIT_AFTER_SIGTERM = conf.getint('core', 'KILLED_TASK_CLEANUP_TI def reap_process_group( - pgid: int, + process_group_id: int, logger, sig: 'signal.Signals' = signal.SIGTERM, timeout: int = DEFAULT_TIME_TO_WAIT_AFTER_SIGTERM, @@ -55,7 +55,11 @@ def reap_process_group( sig (SIGTERM) to the process group of pid. If any process is alive after timeout a SIGKILL will be send. - :param pgid: process group id to kill + :param process_group_id: process group id to kill. + The process that wants to create the group should run `os.setpgid(0, 0)` as the first + command it executes which will set group id = process_id. 
Effectively the process that is the + "root" of the group has pid = gid and all other processes in the group have different + pids but the same gid (equal the pid of the root process) :param logger: log handler :param sig: signal type :param timeout: how much time a process has to terminate @@ -68,36 +72,57 @@ def reap_process_group( def signal_procs(sig): try: - os.killpg(pgid, sig) - except OSError as err: + logger.info("Sending the signal %s to group %s", sig, process_group_id) + os.killpg(process_group_id, sig) + except OSError as err_killpg: # If operation not permitted error is thrown due to run_as_user, # use sudo -n(--non-interactive) to kill the process - if err.errno == errno.EPERM: + if err_killpg.errno == errno.EPERM: subprocess.check_call( - ["sudo", "-n", "kill", "-" + str(int(sig))] + [str(p.pid) for p in children] + ["sudo", "-n", "kill", "-" + str(int(sig))] + + [str(p.pid) for p in all_processes_in_the_group] + ) + elif err_killpg.errno == errno.ESRCH: + # There is a rare condition that the process has not managed yet to change it's process + # group. In this case os.killpg fails with ESRCH error + # So we additionally send a kill signal to the process itself. + logger.info( + "Sending the signal %s to process %s as process group is missing.", sig, process_group_id ) + try: + os.kill(process_group_id, sig) + except OSError as err_kill: + if err_kill.errno == errno.EPERM: + subprocess.check_call(["sudo", "-n", "kill", "-" + str(process_group_id)]) + else: + raise else: raise - if pgid == os.getpgid(0): + if process_group_id == os.getpgid(0): raise RuntimeError("I refuse to kill myself") try: - parent = psutil.Process(pgid) + parent = psutil.Process(process_group_id) - children = parent.children(recursive=True) - children.append(parent) + all_processes_in_the_group = parent.children(recursive=True) + all_processes_in_the_group.append(parent) except psutil.NoSuchProcess: # The process already exited, but maybe it's children haven't. 
- children = [] + all_processes_in_the_group = [] for proc in psutil.process_iter(): try: - if os.getpgid(proc.pid) == pgid and proc.pid != 0: - children.append(proc) + if os.getpgid(proc.pid) == process_group_id and proc.pid != 0: + all_processes_in_the_group.append(proc) except OSError: pass - logger.info("Sending %s to GPID %s", sig, pgid) + logger.info( + "Sending %s to group %s. PIDs of all processes in the group: %s", + sig, + process_group_id, + [p.pid for p in all_processes_in_the_group], + ) try: signal_procs(sig) except OSError as err: @@ -106,7 +131,7 @@ def reap_process_group( if err.errno == errno.ESRCH: return returncodes - _, alive = psutil.wait_procs(children, timeout=timeout, callback=on_terminate) + _, alive = psutil.wait_procs(all_processes_in_the_group, timeout=timeout, callback=on_terminate) if alive: for proc in alive: diff --git a/tests/dag_processing/test_processor.py b/tests/dag_processing/test_processor.py index 6bfec8f..c9ecfb0 100644 --- a/tests/dag_processing/test_processor.py +++ b/tests/dag_processing/test_processor.py @@ -27,6 +27,7 @@ import pytest from airflow import settings from airflow.configuration import TEST_DAGS_FOLDER, conf +from airflow.dag_processing.manager import DagFileProcessorAgent from airflow.dag_processing.processor import DagFileProcessor from airflow.models import DagBag, DagModel, SlaMiss, TaskInstance, errors from airflow.models.taskinstance import SimpleTaskInstance @@ -723,3 +724,52 @@ class TestDagFileProcessor: dags = session.query(DagModel).all() assert [dag.dag_id for dag in dags if dag.is_active] == ['test_only_dummy_tasks'] assert [dag.dag_id for dag in dags if not dag.is_active] == ['missing_dag'] + + +class TestProcessorAgent: + @pytest.fixture(autouse=True) + def per_test(self): + self.processor_agent = None + yield + if self.processor_agent: + self.processor_agent.end() + + def test_error_when_waiting_in_async_mode(self, tmp_path): + self.processor_agent = DagFileProcessorAgent( + 
dag_directory=str(tmp_path), + max_runs=1, + processor_timeout=datetime.timedelta(1), + dag_ids=[], + pickle_dags=False, + async_mode=True, + ) + self.processor_agent.start() + with pytest.raises(RuntimeError, match="wait_until_finished should only be called in sync_mode"): + self.processor_agent.wait_until_finished() + + def test_default_multiprocessing_behaviour(self, tmp_path): + self.processor_agent = DagFileProcessorAgent( + dag_directory=str(tmp_path), + max_runs=1, + processor_timeout=datetime.timedelta(1), + dag_ids=[], + pickle_dags=False, + async_mode=False, + ) + self.processor_agent.start() + self.processor_agent.run_single_parsing_loop() + self.processor_agent.wait_until_finished() + + @conf_vars({("core", "mp_start_method"): "spawn"}) + def test_spawn_multiprocessing_behaviour(self, tmp_path): + self.processor_agent = DagFileProcessorAgent( + dag_directory=str(tmp_path), + max_runs=1, + processor_timeout=datetime.timedelta(1), + dag_ids=[], + pickle_dags=False, + async_mode=False, + ) + self.processor_agent.start() + self.processor_agent.run_single_parsing_loop() + self.processor_agent.wait_until_finished()
