This is an automated email from the ASF dual-hosted git repository.

shahar1 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git


The following commit(s) were added to refs/heads/main by this push:
     new 0385bae7055 Validate destination paths derived from GCS object names 
(#67667)
0385bae7055 is described below

commit 0385bae70553223677753047b3e779d8b86b15c4
Author: Jarek Potiuk <[email protected]>
AuthorDate: Wed Jun 17 01:11:32 2026 -0400

    Validate destination paths derived from GCS object names (#67667)
    
    GCSToSFTPOperator._resolve_destination_path computes the SFTP destination by
    joining the GCS object name onto destination_path. GCS object names are
    arbitrary UTF-8 strings controlled by whoever can write to the source 
bucket,
    so a name with ".." segments or an absolute prefix could canonicalise 
outside
    the configured destination and land somewhere like ~/.ssh/authorized_keys on
    the SFTP host.
    
    Normalise the joined path and require it to stay within the configured base 
—
    for absolute and relative bases alike, skipping only "." / "" (the SFTP 
login
    directory, where any non-escaping relative path is already in-bounds). On a
    violation raise ValueError identifying the offending object name.
    
    The sibling GCSTimeSpanFileTransformOperator._download path was already
    hardened separately in #67509.
---
 .../google/cloud/transfers/gcs_to_sftp.py          | 23 ++++-
 .../google/cloud/transfers/test_gcs_to_sftp.py     | 99 ++++++++++++++++++++++
 2 files changed, 121 insertions(+), 1 deletion(-)

diff --git 
a/providers/google/src/airflow/providers/google/cloud/transfers/gcs_to_sftp.py 
b/providers/google/src/airflow/providers/google/cloud/transfers/gcs_to_sftp.py
index 664c6aea319..d601d6b7db6 100644
--- 
a/providers/google/src/airflow/providers/google/cloud/transfers/gcs_to_sftp.py
+++ 
b/providers/google/src/airflow/providers/google/cloud/transfers/gcs_to_sftp.py
@@ -176,7 +176,28 @@ class GCSToSFTPOperator(BaseOperator):
                 source_object = os.path.relpath(source_object, start=prefix)
             else:
                 source_object = os.path.basename(source_object)
-        return os.path.join(self.destination_path, source_object)
+        # GCS object names are arbitrary UTF-8 strings controlled by whoever 
can
+        # write to the source bucket, so ``..`` segments or an absolute prefix
+        # could canonicalise outside ``destination_path`` on the SFTP server.
+        # Resolve the join and require it to stay within the configured base.
+        resolved = os.path.normpath(os.path.join(self.destination_path, 
source_object))
+        base = os.path.normpath(self.destination_path)
+        escapes = (
+            resolved == ".."
+            or resolved.startswith(".." + os.sep)
+            # An absolute source_object absorbed a relative base entirely.
+            or (os.path.isabs(resolved) and not os.path.isabs(base))
+            # A configured base directory must remain the prefix of the result.
+            # ``base == "."`` is the SFTP login directory, where any 
non-escaping
+            # relative path is already in-bounds.
+            or (base != "." and resolved != base and not 
resolved.startswith(base.rstrip(os.sep) + os.sep))
+        )
+        if escapes:
+            raise ValueError(
+                f"Refusing to copy GCS object {source_object!r}: resolved 
destination "
+                f"{resolved!r} escapes configured destination_path 
{self.destination_path!r}."
+            )
+        return resolved
 
     def _copy_single_object(
         self,
diff --git 
a/providers/google/tests/unit/google/cloud/transfers/test_gcs_to_sftp.py 
b/providers/google/tests/unit/google/cloud/transfers/test_gcs_to_sftp.py
index 28ca53acdb5..8bc93419338 100644
--- a/providers/google/tests/unit/google/cloud/transfers/test_gcs_to_sftp.py
+++ b/providers/google/tests/unit/google/cloud/transfers/test_gcs_to_sftp.py
@@ -466,3 +466,102 @@ class TestGoogleCloudStorageToSFTPOperator:
         task.execute(None)
 
         sftp_hook_mock.return_value.create_directory.assert_not_called()
+
+    @pytest.mark.parametrize(
+        "source_object",
+        [
+            pytest.param("incoming/../../../../etc/passwd", 
id="dotdot-segments"),
+            pytest.param("/etc/passwd", id="absolute-path"),
+        ],
+    )
+    def test_resolve_destination_path_rejects_escape(self, source_object):
+        # ``_resolve_destination_path`` is the GCSToSFTPOperator method that
+        # joins a GCS object name with the configured ``destination_path``.
+        # When the joined path canonicalises outside the destination — either
+        # via ``..`` segments or an absolute ``source_object`` that absorbs
+        # the prefix — the method must refuse rather than hand the path to
+        # the SFTP server, where the server would resolve it on its own host.
+        task = GCSToSFTPOperator(
+            task_id=TASK_ID,
+            source_bucket=TEST_BUCKET,
+            source_object="incoming/*",
+            destination_path="/srv/sftp/incoming",
+            keep_directory_structure=True,
+            gcp_conn_id=GCP_CONN_ID,
+            sftp_conn_id=SFTP_CONN_ID,
+        )
+        with pytest.raises(ValueError, match="escapes configured 
destination_path"):
+            task._resolve_destination_path(source_object)
+
+    def test_resolve_destination_path_allows_benign_nested(self):
+        # The new validation is post-join normalisation; benign nested paths
+        # under the destination must still resolve cleanly.
+        task = GCSToSFTPOperator(
+            task_id=TASK_ID,
+            source_bucket=TEST_BUCKET,
+            source_object="incoming/*",
+            destination_path="/srv/sftp/incoming",
+            keep_directory_structure=True,
+            gcp_conn_id=GCP_CONN_ID,
+            sftp_conn_id=SFTP_CONN_ID,
+        )
+        assert (
+            task._resolve_destination_path("incoming/sub/dir/file.csv")
+            == "/srv/sftp/incoming/incoming/sub/dir/file.csv"
+        )
+
+    @pytest.mark.parametrize(
+        ("destination_path", "source_object", "expected"),
+        [
+            pytest.param(".", "file.txt", "file.txt", id="dot-base-benign"),
+            pytest.param("", "file.txt", "file.txt", id="empty-base-benign"),
+            pytest.param(".", "sub/dir/file.txt", "sub/dir/file.txt", 
id="dot-base-nested"),
+        ],
+    )
+    def test_resolve_destination_path_allows_relative_base(self, 
destination_path, source_object, expected):
+        # ``destination_path="."`` and ``destination_path=""`` are valid SFTP
+        # destinations — they refer to the SFTP user's login / current
+        # directory. The validation must allow these benign uploads while
+        # still rejecting ``..`` escapes and absolute-path absorption (covered
+        # by the next test).
+        task = GCSToSFTPOperator(
+            task_id=TASK_ID,
+            source_bucket=TEST_BUCKET,
+            source_object="*",
+            destination_path=destination_path,
+            keep_directory_structure=True,
+            gcp_conn_id=GCP_CONN_ID,
+            sftp_conn_id=SFTP_CONN_ID,
+        )
+        assert task._resolve_destination_path(source_object) == expected
+
+    @pytest.mark.parametrize(
+        ("destination_path", "source_object"),
+        [
+            pytest.param(".", "../etc/passwd", 
id="dotdot-escape-from-dot-base"),
+            pytest.param(".", "/etc/passwd", id="absolute-absorbs-dot-base"),
+            # Non-trivial relative bases: the ``..`` segments cancel the base
+            # itself, so ``normpath`` leaves no leading ``..`` and the
+            # leading-``..`` check alone would let them through — containment
+            # against the configured base must still reject them.
+            pytest.param("incoming", "../.ssh/authorized_keys", 
id="dotdot-escape-from-nested-base"),
+            pytest.param("uploads/in", "../../etc/passwd", 
id="dotdot-escape-from-deeper-base"),
+        ],
+    )
+    def test_resolve_destination_path_rejects_escape_from_relative_base(
+        self, destination_path, source_object
+    ):
+        # A relative ``destination_path`` must still contain the resolved path:
+        # ``..`` segments that cancel the base and absolute ``source_object``
+        # values that absorb it entirely are both refused.
+        task = GCSToSFTPOperator(
+            task_id=TASK_ID,
+            source_bucket=TEST_BUCKET,
+            source_object="*",
+            destination_path=destination_path,
+            keep_directory_structure=True,
+            gcp_conn_id=GCP_CONN_ID,
+            sftp_conn_id=SFTP_CONN_ID,
+        )
+        with pytest.raises(ValueError, match="escapes configured 
destination_path"):
+            task._resolve_destination_path(source_object)

Reply via email to