This is an automated email from the ASF dual-hosted git repository.
eladkal pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git
The following commit(s) were added to refs/heads/main by this push:
new 062fb3a438 Fix treatment of "#" in S3Hook.parse_s3_url() (#41796)
062fb3a438 is described below
commit 062fb3a4380d3e849f5eec8b9b21eb7065c597ce
Author: GlenboLake <[email protected]>
AuthorDate: Thu Aug 29 17:31:43 2024 -0400
Fix treatment of "#" in S3Hook.parse_s3_url() (#41796)
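The current implementation of parse_s3_url truncates a key that contains an
octothorpe ("#") character, because urlsplit treats everything after the "#"
as a URL fragment rather than as part of the path. Passing
allow_fragments=False to urlsplit keeps the "#" and the text after it in the
path, so such keys are parsed correctly.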
---
airflow/providers/amazon/aws/hooks/s3.py | 2 +-
tests/providers/amazon/aws/hooks/test_s3.py | 74 +++++++++++++++++++++--------
2 files changed, 56 insertions(+), 20 deletions(-)
diff --git a/airflow/providers/amazon/aws/hooks/s3.py b/airflow/providers/amazon/aws/hooks/s3.py
index 5f2c136640..76aed19782 100644
--- a/airflow/providers/amazon/aws/hooks/s3.py
+++ b/airflow/providers/amazon/aws/hooks/s3.py
@@ -238,7 +238,7 @@ class S3Hook(AwsBaseHook):
valid_s3_virtual_hosted_format =
"https://bucket-name.s3.region-code.amazonaws.com/key-name"
format = s3url.split("//")
if re.match(r"s3[na]?:", format[0], re.IGNORECASE):
- parsed_url = urlsplit(s3url)
+ parsed_url = urlsplit(s3url, allow_fragments=False)
if not parsed_url.netloc:
raise S3HookUriParseFailure(
"Please provide a bucket name using a valid format of the
form: "
diff --git a/tests/providers/amazon/aws/hooks/test_s3.py b/tests/providers/amazon/aws/hooks/test_s3.py
index 9dade82004..97696c64b6 100644
--- a/tests/providers/amazon/aws/hooks/test_s3.py
+++ b/tests/providers/amazon/aws/hooks/test_s3.py
@@ -81,25 +81,61 @@ class TestAwsS3Hook:
with pytest.raises(TypeError, match="transfer_config_args expected
dict, got .*"):
S3Hook(transfer_config_args=transfer_config_args)
- def test_parse_s3_url(self):
- parsed = S3Hook.parse_s3_url("s3://test/this/is/not/a-real-key.txt")
- assert parsed == ("test", "this/is/not/a-real-key.txt"), "Incorrect
parsing of the s3 url"
-
- def test_parse_s3_url_s3a_style(self):
- parsed = S3Hook.parse_s3_url("s3a://test/this/is/not/a-real-key.txt")
- assert parsed == ("test", "this/is/not/a-real-key.txt"), "Incorrect
parsing of the s3 url"
-
- def test_parse_s3_url_s3n_style(self):
- parsed = S3Hook.parse_s3_url("s3n://test/this/is/not/a-real-key.txt")
- assert parsed == ("test", "this/is/not/a-real-key.txt"), "Incorrect
parsing of the s3 url"
-
- def test_parse_s3_url_path_style(self):
- parsed =
S3Hook.parse_s3_url("https://s3.us-west-2.amazonaws.com/DOC-EXAMPLE-BUCKET1/test.jpg")
- assert parsed == ("DOC-EXAMPLE-BUCKET1", "test.jpg"), "Incorrect
parsing of the s3 url"
-
- def test_parse_s3_url_virtual_hosted_style(self):
- parsed =
S3Hook.parse_s3_url("https://DOC-EXAMPLE-BUCKET1.s3.us-west-2.amazonaws.com/test.png")
- assert parsed == ("DOC-EXAMPLE-BUCKET1", "test.png"), "Incorrect
parsing of the s3 url"
+ @pytest.mark.parametrize(
+ "url, expected",
+ [
+ pytest.param(
+ "s3://test/this/is/not/a-real-key.txt", ("test",
"this/is/not/a-real-key.txt"), id="s3 style"
+ ),
+ pytest.param(
+ "s3a://test/this/is/not/a-real-key.txt",
+ ("test", "this/is/not/a-real-key.txt"),
+ id="s3a style",
+ ),
+ pytest.param(
+ "s3n://test/this/is/not/a-real-key.txt",
+ ("test", "this/is/not/a-real-key.txt"),
+ id="s3n style",
+ ),
+ pytest.param(
+
"https://s3.us-west-2.amazonaws.com/DOC-EXAMPLE-BUCKET1/test.jpg",
+ ("DOC-EXAMPLE-BUCKET1", "test.jpg"),
+ id="path style",
+ ),
+ pytest.param(
+
"https://DOC-EXAMPLE-BUCKET1.s3.us-west-2.amazonaws.com/test.png",
+ ("DOC-EXAMPLE-BUCKET1", "test.png"),
+ id="virtual hosted style",
+ ),
+ pytest.param(
+ "s3://test/this/is/not/a-real-key #2.txt",
+ ("test", "this/is/not/a-real-key #2.txt"),
+ id="s3 style with #",
+ ),
+ pytest.param(
+ "s3a://test/this/is/not/a-real-key #2.txt",
+ ("test", "this/is/not/a-real-key #2.txt"),
+ id="s3a style with #",
+ ),
+ pytest.param(
+ "s3n://test/this/is/not/a-real-key #2.txt",
+ ("test", "this/is/not/a-real-key #2.txt"),
+ id="s3n style with #",
+ ),
+ pytest.param(
+ "https://s3.us-west-2.amazonaws.com/DOC-EXAMPLE-BUCKET1/test
#2.jpg",
+ ("DOC-EXAMPLE-BUCKET1", "test #2.jpg"),
+ id="path style with #",
+ ),
+ pytest.param(
+ "https://DOC-EXAMPLE-BUCKET1.s3.us-west-2.amazonaws.com/test
#2.png",
+ ("DOC-EXAMPLE-BUCKET1", "test #2.png"),
+ id="virtual hosted style with #",
+ ),
+ ],
+ )
+ def test_parse_s3_url(self, url: str, expected: tuple[str, str]):
+ assert S3Hook.parse_s3_url(url) == expected, "Incorrect parsing of the
s3 url"
def test_parse_invalid_s3_url_virtual_hosted_style(self):
with pytest.raises(