This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git
The following commit(s) were added to refs/heads/main by this push:
new 2cba3b7c fsspec: Support token in ADLS (#2331)
2cba3b7c is described below
commit 2cba3b7c3a514b9dffa31473e902d64358ade848
Author: Kevin Liu <[email protected]>
AuthorDate: Mon Aug 18 00:44:04 2025 -0700
fsspec: Support token in ADLS (#2331)
<!--
Thanks for opening a pull request!
-->
<!-- In the case this PR will resolve an issue, please replace
${GITHUB_ISSUE_ID} below with the actual Github issue id. -->
<!-- Closes #${GITHUB_ISSUE_ID} -->
# Rationale for this change
Closes #2328
Pass the `token` string as the `credential` argument of `AzureBlobFileSystem`; see
https://fsspec.github.io/adlfs/api/
This is a known workaround as described by
https://github.com/Azure/azure-sdk-for-python/issues/9075#issuecomment-564171752
I've also filed a feature request for the Java implementation:
https://github.com/apache/iceberg/issues/13818
Note:
[`pyarrow.fs.AzureFileSystem`](https://arrow.apache.org/docs/python/generated/pyarrow.fs.AzureFileSystem.html)
does not currently expose `credential` as a parameter.
# Are these changes tested?
Yes, manually, since Azurite does not integrate with Microsoft Entra.
Here's a repro script:
```
from azure.identity import DefaultAzureCredential
import pyarrow as pa
credential = DefaultAzureCredential()
token = credential.get_token("https://storage.azure.com/.default").token
warehouse =
"abfss://[email protected]/kevinliu_demo_lh.Lakehouse/Files"
account_name = "daily-onelake"
account_host = f"{account_name}.blob.fabric.microsoft.com"
catalog = load_catalog("default", **{
"type": "in-memory",
"warehouse": warehouse,
"adls.account-name": account_name,
"adls.account-host": account_host,
"adls.token": token,
})
catalog.create_namespace_if_not_exists("default")
try:
catalog.drop_table("default.test")
except:
...
TEST_DATA = {
"id": [1, 2, 3, 1, 1],
"name": ["AB", "CD", "EF", "CD", "EF"],
}
arrow_table = pa.Table.from_pydict(TEST_DATA)
tbl = catalog.create_table_if_not_exists("default.test",
schema=arrow_table.schema)
tbl.append(arrow_table)
tbl.scan().to_arrow()
```
# Are there any user-facing changes?
<!-- In the case of user-facing changes, please add the changelog label.
-->
---
mkdocs/docs/configuration.md | 1 +
pyiceberg/io/__init__.py | 1 +
pyiceberg/io/fsspec.py | 25 ++++++++++++++++++++++++-
3 files changed, 26 insertions(+), 1 deletion(-)
diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md
index fad038e8..9ef4a1f4 100644
--- a/mkdocs/docs/configuration.md
+++ b/mkdocs/docs/configuration.md
@@ -162,6 +162,7 @@ For the FileIO there are several configuration options
available:
| adls.dfs-storage-authority | .dfs.core.windows.net
| The hostname[:port] of the Data
Lake Gen 2 Service. Defaults to `.dfs.core.windows.net`. Useful for connecting
to a local emulator, like [azurite](https://github.com/azure/azurite). See
[AzureFileSystem](https://arrow.apache.org/docs/python/filesystems.html#azure-storage-file-system)
for reference |
| adls.blob-storage-scheme | https
| Either `http` or `https`.
Defaults to `https`. Useful for connecting to a local emulator, like
[azurite](https://github.com/azure/azurite). See
[AzureFileSystem](https://arrow.apache.org/docs/python/filesystems.html#azure-storage-file-system)
for reference |
| adls.dfs-storage-scheme | https
| Either `http` or `https`.
Defaults to `https`. Useful for connecting to a local emulator, like
[azurite](https://github.com/azure/azurite). See
[AzureFileSystem](https://arrow.apache.org/docs/python/filesystems.html#azure-storage-file-system)
for reference |
+| adls.token | eyJ0eXAiOiJKV1QiLCJhbGci...
| Static access token for
authenticating with ADLS. Used for OAuth2 flows.
|
<!-- markdown-link-check-enable-->
diff --git a/pyiceberg/io/__init__.py b/pyiceberg/io/__init__.py
index 91ca6ee4..172dd44b 100644
--- a/pyiceberg/io/__init__.py
+++ b/pyiceberg/io/__init__.py
@@ -87,6 +87,7 @@ ADLS_BLOB_STORAGE_AUTHORITY = "adls.blob-storage-authority"
ADLS_DFS_STORAGE_AUTHORITY = "adls.dfs-storage-authority"
ADLS_BLOB_STORAGE_SCHEME = "adls.blob-storage-scheme"
ADLS_DFS_STORAGE_SCHEME = "adls.dfs-storage-scheme"
+ADLS_TOKEN = "adls.token"
GCS_TOKEN = "gcs.oauth2.token"
GCS_TOKEN_EXPIRES_AT_MS = "gcs.oauth2.token-expires-at"
GCS_PROJECT_ID = "gcs.project-id"
diff --git a/pyiceberg/io/fsspec.py b/pyiceberg/io/fsspec.py
index c1f95b71..8c8f6936 100644
--- a/pyiceberg/io/fsspec.py
+++ b/pyiceberg/io/fsspec.py
@@ -48,6 +48,7 @@ from pyiceberg.io import (
ADLS_CREDENTIAL,
ADLS_SAS_TOKEN,
ADLS_TENANT_ID,
+ ADLS_TOKEN,
AWS_ACCESS_KEY_ID,
AWS_REGION,
AWS_SECRET_ACCESS_KEY,
@@ -197,7 +198,11 @@ def _gs(properties: Properties) -> AbstractFileSystem:
def _adls(properties: Properties) -> AbstractFileSystem:
+ # https://fsspec.github.io/adlfs/api/
+
from adlfs import AzureBlobFileSystem
+ from azure.core.credentials import AccessToken
+ from azure.core.credentials_async import AsyncTokenCredential
for key, sas_token in {
key.replace(f"{ADLS_SAS_TOKEN}.", ""): value for key, value in
properties.items() if key.startswith(ADLS_SAS_TOKEN)
@@ -207,9 +212,27 @@ def _adls(properties: Properties) -> AbstractFileSystem:
if ADLS_SAS_TOKEN not in properties:
properties[ADLS_SAS_TOKEN] = sas_token
+ class StaticTokenCredential(AsyncTokenCredential):
+ _DEFAULT_EXPIRY_SECONDS = 3600
+
+ def __init__(self, token_string: str) -> None:
+ self._token = token_string
+
+ async def get_token(self, *scopes: str, **kwargs: Any) -> AccessToken:
+ import time
+
+ # Set expiration 1 hour from now
+ expires_on = int(time.time()) + self._DEFAULT_EXPIRY_SECONDS
+ return AccessToken(self._token, expires_on)
+
+ if token := properties.get(ADLS_TOKEN):
+ credential = StaticTokenCredential(token)
+ else:
+ credential = properties.get(ADLS_CREDENTIAL) # type: ignore
+
return AzureBlobFileSystem(
connection_string=properties.get(ADLS_CONNECTION_STRING),
- credential=properties.get(ADLS_CREDENTIAL),
+ credential=credential,
account_name=properties.get(ADLS_ACCOUNT_NAME),
account_key=properties.get(ADLS_ACCOUNT_KEY),
sas_token=properties.get(ADLS_SAS_TOKEN),