This is an automated email from the ASF dual-hosted git repository.
shahar1 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git
The following commit(s) were added to refs/heads/main by this push:
new 0385bae7055 Validate destination paths derived from GCS object names
(#67667)
0385bae7055 is described below
commit 0385bae70553223677753047b3e779d8b86b15c4
Author: Jarek Potiuk <[email protected]>
AuthorDate: Wed Jun 17 01:11:32 2026 -0400
Validate destination paths derived from GCS object names (#67667)
GCSToSFTPOperator._resolve_destination_path computes the SFTP destination by
joining the GCS object name onto destination_path. GCS object names are
arbitrary UTF-8 strings controlled by whoever can write to the source
bucket,
so a name with ".." segments or an absolute prefix could canonicalise
outside
the configured destination and land somewhere like ~/.ssh/authorized_keys on
the SFTP host.
Normalise the joined path and require it to stay within the configured base
—
for absolute and relative bases alike, skipping only "." / "" (the SFTP
login
directory, where any non-escaping relative path is already in-bounds). On a
violation raise ValueError identifying the offending object name.
The sibling GCSTimeSpanFileTransformOperator._download path was already
hardened separately in #67509.
---
.../google/cloud/transfers/gcs_to_sftp.py | 23 ++++-
.../google/cloud/transfers/test_gcs_to_sftp.py | 99 ++++++++++++++++++++++
2 files changed, 121 insertions(+), 1 deletion(-)
diff --git
a/providers/google/src/airflow/providers/google/cloud/transfers/gcs_to_sftp.py
b/providers/google/src/airflow/providers/google/cloud/transfers/gcs_to_sftp.py
index 664c6aea319..d601d6b7db6 100644
---
a/providers/google/src/airflow/providers/google/cloud/transfers/gcs_to_sftp.py
+++
b/providers/google/src/airflow/providers/google/cloud/transfers/gcs_to_sftp.py
@@ -176,7 +176,28 @@ class GCSToSFTPOperator(BaseOperator):
source_object = os.path.relpath(source_object, start=prefix)
else:
source_object = os.path.basename(source_object)
- return os.path.join(self.destination_path, source_object)
+ # GCS object names are arbitrary UTF-8 strings controlled by whoever
can
+ # write to the source bucket, so ``..`` segments or an absolute prefix
+ # could canonicalise outside ``destination_path`` on the SFTP server.
+ # Resolve the join and require it to stay within the configured base.
+ resolved = os.path.normpath(os.path.join(self.destination_path,
source_object))
+ base = os.path.normpath(self.destination_path)
+ escapes = (
+ resolved == ".."
+ or resolved.startswith(".." + os.sep)
+ # An absolute source_object absorbed a relative base entirely.
+ or (os.path.isabs(resolved) and not os.path.isabs(base))
+ # A configured base directory must remain the prefix of the result.
+ # ``base == "."`` is the SFTP login directory, where any
non-escaping
+ # relative path is already in-bounds.
+ or (base != "." and resolved != base and not
resolved.startswith(base.rstrip(os.sep) + os.sep))
+ )
+ if escapes:
+ raise ValueError(
+ f"Refusing to copy GCS object {source_object!r}: resolved
destination "
+ f"{resolved!r} escapes configured destination_path
{self.destination_path!r}."
+ )
+ return resolved
def _copy_single_object(
self,
diff --git
a/providers/google/tests/unit/google/cloud/transfers/test_gcs_to_sftp.py
b/providers/google/tests/unit/google/cloud/transfers/test_gcs_to_sftp.py
index 28ca53acdb5..8bc93419338 100644
--- a/providers/google/tests/unit/google/cloud/transfers/test_gcs_to_sftp.py
+++ b/providers/google/tests/unit/google/cloud/transfers/test_gcs_to_sftp.py
@@ -466,3 +466,102 @@ class TestGoogleCloudStorageToSFTPOperator:
task.execute(None)
sftp_hook_mock.return_value.create_directory.assert_not_called()
+
+ @pytest.mark.parametrize(
+ "source_object",
+ [
+ pytest.param("incoming/../../../../etc/passwd",
id="dotdot-segments"),
+ pytest.param("/etc/passwd", id="absolute-path"),
+ ],
+ )
+ def test_resolve_destination_path_rejects_escape(self, source_object):
+ # ``_resolve_destination_path`` is the GCSToSFTPOperator method that
+ # joins a GCS object name with the configured ``destination_path``.
+ # When the joined path canonicalises outside the destination — either
+ # via ``..`` segments or an absolute ``source_object`` that absorbs
+ # the prefix — the method must refuse rather than hand the path to
+ # the SFTP server, where the server would resolve it on its own host.
+ task = GCSToSFTPOperator(
+ task_id=TASK_ID,
+ source_bucket=TEST_BUCKET,
+ source_object="incoming/*",
+ destination_path="/srv/sftp/incoming",
+ keep_directory_structure=True,
+ gcp_conn_id=GCP_CONN_ID,
+ sftp_conn_id=SFTP_CONN_ID,
+ )
+ with pytest.raises(ValueError, match="escapes configured
destination_path"):
+ task._resolve_destination_path(source_object)
+
+ def test_resolve_destination_path_allows_benign_nested(self):
+ # The new validation is post-join normalisation; benign nested paths
+ # under the destination must still resolve cleanly.
+ task = GCSToSFTPOperator(
+ task_id=TASK_ID,
+ source_bucket=TEST_BUCKET,
+ source_object="incoming/*",
+ destination_path="/srv/sftp/incoming",
+ keep_directory_structure=True,
+ gcp_conn_id=GCP_CONN_ID,
+ sftp_conn_id=SFTP_CONN_ID,
+ )
+ assert (
+ task._resolve_destination_path("incoming/sub/dir/file.csv")
+ == "/srv/sftp/incoming/incoming/sub/dir/file.csv"
+ )
+
+ @pytest.mark.parametrize(
+ ("destination_path", "source_object", "expected"),
+ [
+ pytest.param(".", "file.txt", "file.txt", id="dot-base-benign"),
+ pytest.param("", "file.txt", "file.txt", id="empty-base-benign"),
+ pytest.param(".", "sub/dir/file.txt", "sub/dir/file.txt",
id="dot-base-nested"),
+ ],
+ )
+ def test_resolve_destination_path_allows_relative_base(self,
destination_path, source_object, expected):
+ # ``destination_path="."`` and ``destination_path=""`` are valid SFTP
+ # destinations — they refer to the SFTP user's login / current
+ # directory. The validation must allow these benign uploads while
+ # still rejecting ``..`` escapes and absolute-path absorption (covered
+ # by the next test).
+ task = GCSToSFTPOperator(
+ task_id=TASK_ID,
+ source_bucket=TEST_BUCKET,
+ source_object="*",
+ destination_path=destination_path,
+ keep_directory_structure=True,
+ gcp_conn_id=GCP_CONN_ID,
+ sftp_conn_id=SFTP_CONN_ID,
+ )
+ assert task._resolve_destination_path(source_object) == expected
+
+ @pytest.mark.parametrize(
+ ("destination_path", "source_object"),
+ [
+ pytest.param(".", "../etc/passwd",
id="dotdot-escape-from-dot-base"),
+ pytest.param(".", "/etc/passwd", id="absolute-absorbs-dot-base"),
+ # Non-trivial relative bases: the ``..`` segments cancel the base
+ # itself, so ``normpath`` leaves no leading ``..`` and the
+ # leading-``..`` check alone would let them through — containment
+ # against the configured base must still reject them.
+ pytest.param("incoming", "../.ssh/authorized_keys",
id="dotdot-escape-from-nested-base"),
+ pytest.param("uploads/in", "../../etc/passwd",
id="dotdot-escape-from-deeper-base"),
+ ],
+ )
+ def test_resolve_destination_path_rejects_escape_from_relative_base(
+ self, destination_path, source_object
+ ):
+ # A relative ``destination_path`` must still contain the resolved path:
+ # ``..`` segments that cancel the base and absolute ``source_object``
+ # values that absorb it entirely are both refused.
+ task = GCSToSFTPOperator(
+ task_id=TASK_ID,
+ source_bucket=TEST_BUCKET,
+ source_object="*",
+ destination_path=destination_path,
+ keep_directory_structure=True,
+ gcp_conn_id=GCP_CONN_ID,
+ sftp_conn_id=SFTP_CONN_ID,
+ )
+ with pytest.raises(ValueError, match="escapes configured
destination_path"):
+ task._resolve_destination_path(source_object)