This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new 6e7a4c9dad Python: Add S3 proxies to PyIceberg Catalog FileIO (#7958)
6e7a4c9dad is described below
commit 6e7a4c9dadb841cfd944bedddb14d328ac93ec44
Author: Sung Yun <[email protected]>
AuthorDate: Thu Jul 6 03:35:33 2023 -0400
Python: Add S3 proxies to PyIceberg Catalog FileIO (#7958)
* proxies
* proxies config
* typo
* one proxy-uri config
* isort
* lint
* proxy uri for both http and https on s3fs
* lint
* rename property
* s3.proxy-uri
* lint
---
python/mkdocs/docs/configuration.md | 15 ++++++++-------
python/pyiceberg/io/__init__.py | 1 +
python/pyiceberg/io/fsspec.py | 4 ++++
python/pyiceberg/io/pyarrow.py | 5 +++++
4 files changed, 18 insertions(+), 7 deletions(-)
diff --git a/python/mkdocs/docs/configuration.md
b/python/mkdocs/docs/configuration.md
index 65808581f8..f461f249c1 100644
--- a/python/mkdocs/docs/configuration.md
+++ b/python/mkdocs/docs/configuration.md
@@ -60,13 +60,14 @@ For the FileIO there are several configuration options
available:
### S3
-| Key | Example | Description
|
-| -------------------- | ------------------- |
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
-| s3.endpoint | https://10.0.19.25/ | Configure an alternative
endpoint of the S3 service for the FileIO to access. This could be used to use
S3FileIO with any s3-compatible object storage service that has a different
endpoint, or access a private S3 endpoint in a virtual private cloud. |
-| s3.access-key-id | admin | Configure the static secret
access key used to access the FileIO.
|
-| s3.secret-access-key | password | Configure the static session
token used to access the FileIO.
|
-| s3.signer | bearer | Configure the signature version
of the FileIO.
|
-| s3.region | us-west-2 | Sets the region of the bucket
|
+| Key | Example | Description
|
+| -------------------- | ------------------------ |
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
+| s3.endpoint | https://10.0.19.25/ | Configure an alternative
endpoint of the S3 service for the FileIO to access. This could be used to use
S3FileIO with any s3-compatible object storage service that has a different
endpoint, or access a private S3 endpoint in a virtual private cloud. |
+| s3.access-key-id | admin | Configure the static
access key ID used to access the FileIO.
|
+| s3.secret-access-key | password | Configure the static
secret access key used to access the FileIO.
|
+| s3.signer | bearer | Configure the signature
version of the FileIO.
|
+| s3.region | us-west-2 | Sets the region of the
bucket
|
+| s3.proxy-uri | http://my.proxy.com:8080 | Configure the proxy server
to be used by the FileIO.
|
### Azure Data lake
diff --git a/python/pyiceberg/io/__init__.py b/python/pyiceberg/io/__init__.py
index 2c82d890e4..c477c1ac4c 100644
--- a/python/pyiceberg/io/__init__.py
+++ b/python/pyiceberg/io/__init__.py
@@ -50,6 +50,7 @@ S3_ACCESS_KEY_ID = "s3.access-key-id"
S3_SECRET_ACCESS_KEY = "s3.secret-access-key"
S3_SESSION_TOKEN = "s3.session-token"
S3_REGION = "s3.region"
+S3_PROXY_URI = "s3.proxy-uri"
@runtime_checkable
diff --git a/python/pyiceberg/io/fsspec.py b/python/pyiceberg/io/fsspec.py
index c4484130ed..65472904d0 100644
--- a/python/pyiceberg/io/fsspec.py
+++ b/python/pyiceberg/io/fsspec.py
@@ -39,6 +39,7 @@ from pyiceberg.exceptions import SignError
from pyiceberg.io import (
S3_ACCESS_KEY_ID,
S3_ENDPOINT,
+ S3_PROXY_URI,
S3_REGION,
S3_SECRET_ACCESS_KEY,
S3_SESSION_TOKEN,
@@ -112,6 +113,9 @@ def _s3(properties: Properties) -> AbstractFileSystem:
else:
raise ValueError(f"Signer not available: {signer}")
+ if proxy_uri := properties.get(S3_PROXY_URI):
+ config_kwargs["proxies"] = {"http": proxy_uri, "https": proxy_uri}
+
fs = S3FileSystem(client_kwargs=client_kwargs, config_kwargs=config_kwargs)
for event_name, event_function in register_events.items():
diff --git a/python/pyiceberg/io/pyarrow.py b/python/pyiceberg/io/pyarrow.py
index 89ddf805eb..3a22f24b0b 100644
--- a/python/pyiceberg/io/pyarrow.py
+++ b/python/pyiceberg/io/pyarrow.py
@@ -80,6 +80,7 @@ from pyiceberg.expressions.visitors import visit as
boolean_expression_visit
from pyiceberg.io import (
S3_ACCESS_KEY_ID,
S3_ENDPOINT,
+ S3_PROXY_URI,
S3_REGION,
S3_SECRET_ACCESS_KEY,
S3_SESSION_TOKEN,
@@ -294,6 +295,10 @@ class PyArrowFileIO(FileIO):
"session_token": self.properties.get(S3_SESSION_TOKEN),
"region": self.properties.get(S3_REGION),
}
+
+ if proxy_uri := self.properties.get(S3_PROXY_URI):
+ client_kwargs["proxy_options"] = proxy_uri
+
return S3FileSystem(**client_kwargs)
elif scheme == "file":
return LocalFileSystem()