This is an automated email from the ASF dual-hosted git repository.

potiuk pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git


The following commit(s) were added to refs/heads/main by this push:
     new 16547dfef2e Validate downloaded paths stay within the destination 
directory in SFTPHook.retrieve_directory (#67985)
16547dfef2e is described below

commit 16547dfef2e09c3f56fa5af978d075ce1ca65fb6
Author: Jarek Potiuk <[email protected]>
AuthorDate: Thu Jun 4 15:33:34 2026 +0200

    Validate downloaded paths stay within the destination directory in 
SFTPHook.retrieve_directory (#67985)
    
    SFTPHook.retrieve_directory and retrieve_directory_concurrently build each
    local destination path by joining the local directory with a path derived
    from directory-entry names returned by the remote SFTP server. Because those
    names can contain ".." components, the recursive download could write 
outside
    the configured local destination directory.
    
    Add a containment check (_validate_within_directory) that resolves each
    computed local path and refuses to write when it falls outside the
    destination directory, applied to both the serial and concurrent retrieval
    paths.
    
    Generated-by: Claude Opus 4.8 (1M context) following the guidelines at
    
https://github.com/apache/airflow/blob/main/contributing-docs/05_pull_requests.rst#gen-ai-assisted-contributions
---
 .../sftp/src/airflow/providers/sftp/hooks/sftp.py  | 37 +++++++++++++++++++---
 providers/sftp/tests/unit/sftp/hooks/test_sftp.py  | 23 ++++++++++++++
 2 files changed, 56 insertions(+), 4 deletions(-)

diff --git a/providers/sftp/src/airflow/providers/sftp/hooks/sftp.py 
b/providers/sftp/src/airflow/providers/sftp/hooks/sftp.py
index 2f31c66d3b2..bb272348974 100644
--- a/providers/sftp/src/airflow/providers/sftp/hooks/sftp.py
+++ b/providers/sftp/src/airflow/providers/sftp/hooks/sftp.py
@@ -384,6 +384,25 @@ class SFTPHook(SSHHook):
         """
         self.conn.remove(path)  # type: ignore[arg-type, union-attr]
 
+    @staticmethod
+    def _validate_within_directory(base_dir: str, candidate: str) -> str:
+        """
+        Ensure ``candidate`` resolves to a path inside ``base_dir``.
+
+        Directory-entry names are returned by the remote SFTP server and may
+        contain ``..`` components; joining them into the local destination path
+        could otherwise write outside it. Containment is verified before any
+        local write or ``mkdir``.
+        """
+        base_real = os.path.realpath(base_dir)
+        candidate_real = os.path.realpath(candidate)
+        if candidate_real != base_real and os.path.commonpath([base_real, 
candidate_real]) != base_real:
+            raise ValueError(
+                f"Refusing to write outside the destination directory: "
+                f"{candidate!r} resolves outside {base_dir!r}"
+            )
+        return candidate
+
     def retrieve_directory(self, remote_full_path: str, local_full_path: str, 
prefetch: bool = True) -> None:
         """
         Transfer the remote directory to a local location.
@@ -400,10 +419,14 @@ class SFTPHook(SSHHook):
         Path(local_full_path).mkdir(parents=True)
         files, dirs, _ = self.get_tree_map(remote_full_path)
         for dir_path in dirs:
-            new_local_path = os.path.join(local_full_path, 
os.path.relpath(dir_path, remote_full_path))
+            new_local_path = self._validate_within_directory(
+                local_full_path, os.path.join(local_full_path, 
os.path.relpath(dir_path, remote_full_path))
+            )
             Path(new_local_path).mkdir(parents=True, exist_ok=True)
         for file_path in files:
-            new_local_path = os.path.join(local_full_path, 
os.path.relpath(file_path, remote_full_path))
+            new_local_path = self._validate_within_directory(
+                local_full_path, os.path.join(local_full_path, 
os.path.relpath(file_path, remote_full_path))
+            )
             self.retrieve_file(file_path, new_local_path, prefetch)
 
     def retrieve_directory_concurrently(
@@ -438,12 +461,18 @@ class SFTPHook(SSHHook):
             new_local_file_paths, remote_file_paths = [], []
             files, dirs, _ = self.get_tree_map(remote_full_path)
             for dir_path in dirs:
-                new_local_path = os.path.join(local_full_path, 
os.path.relpath(dir_path, remote_full_path))
+                new_local_path = self._validate_within_directory(
+                    local_full_path,
+                    os.path.join(local_full_path, os.path.relpath(dir_path, 
remote_full_path)),
+                )
                 Path(new_local_path).mkdir(parents=True, exist_ok=True)
             for file in files:
                 remote_file_paths.append(file)
                 new_local_file_paths.append(
-                    os.path.join(local_full_path, os.path.relpath(file, 
remote_full_path))
+                    self._validate_within_directory(
+                        local_full_path,
+                        os.path.join(local_full_path, os.path.relpath(file, 
remote_full_path)),
+                    )
                 )
         remote_file_chunks = [remote_file_paths[i::workers] for i in 
range(workers)]
         local_file_chunks = [new_local_file_paths[i::workers] for i in 
range(workers)]
diff --git a/providers/sftp/tests/unit/sftp/hooks/test_sftp.py 
b/providers/sftp/tests/unit/sftp/hooks/test_sftp.py
index 902a191041b..41ab0878ec4 100644
--- a/providers/sftp/tests/unit/sftp/hooks/test_sftp.py
+++ b/providers/sftp/tests/unit/sftp/hooks/test_sftp.py
@@ -633,6 +633,29 @@ class TestSFTPHook:
         )
         assert retrieved_dir_name in os.listdir(os.path.join(self.temp_dir, 
TMP_DIR_FOR_TESTS))
 
+    def test_validate_within_directory_rejects_escape(self):
+        base = os.path.join(self.temp_dir, "download")
+        with pytest.raises(ValueError, match="outside the destination 
directory"):
+            SFTPHook._validate_within_directory(base, os.path.join(base, "..", 
"victim"))
+        # An in-bounds candidate is returned unchanged.
+        inside = os.path.join(base, "sub", "file")
+        assert SFTPHook._validate_within_directory(base, inside) == inside
+
+    def test_retrieve_directory_rejects_server_path_traversal(self):
+        # A remote SFTP server can return a directory-entry name containing 
".."
+        # so the recursive download would escape the local destination 
directory.
+        remote = "/srv/export"
+        local = os.path.join(self.temp_dir, "download_traversal")
+        escaping_file = "/srv/export/../victim/payload"
+        with (
+            patch.object(SFTPHook, "get_tree_map", 
return_value=([escaping_file], [], [])),
+            patch.object(SFTPHook, "retrieve_file") as mock_retrieve,
+        ):
+            with pytest.raises(ValueError, match="outside the destination 
directory"):
+                self.hook.retrieve_directory(remote_full_path=remote, 
local_full_path=local)
+            mock_retrieve.assert_not_called()
+        assert not os.path.exists(os.path.join(self.temp_dir, "victim"))
+
     @patch("paramiko.SSHClient")
     @patch("paramiko.ProxyCommand")
     @patch("airflow.providers.sftp.hooks.sftp.SFTPHook.get_connection")

Reply via email to