This is an automated email from the ASF dual-hosted git repository.

fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git


The following commit(s) were added to refs/heads/main by this push:
     new 2cba3b7c fsspec: Support token in ADLS (#2331)
2cba3b7c is described below

commit 2cba3b7c3a514b9dffa31473e902d64358ade848
Author: Kevin Liu <[email protected]>
AuthorDate: Mon Aug 18 00:44:04 2025 -0700

    fsspec: Support token in ADLS (#2331)
    
    <!--
    Thanks for opening a pull request!
    -->
    
    <!-- In the case this PR will resolve an issue, please replace
    ${GITHUB_ISSUE_ID} below with the actual Github issue id. -->
    <!-- Closes #${GITHUB_ISSUE_ID} -->
    
    # Rationale for this change
    Closes #2328
    
    Pass `token` string as `credential` in `AzureBlobFileSystem`,
    https://fsspec.github.io/adlfs/api/
    This is a known workaround as described by
    
https://github.com/Azure/azure-sdk-for-python/issues/9075#issuecomment-564171752
    
    I've also made a feature request for the Java implementation
    https://github.com/apache/iceberg/issues/13818
    
    Note,
    
[`pyarrow.fs.AzureFileSystem`](https://arrow.apache.org/docs/python/generated/pyarrow.fs.AzureFileSystem.html)
    does not currently expose credential as a parameter
    
    # Are these changes tested?
    Yes, manually, since Azurite does not integrate with Entra.
    
    Here's a repro script:
    ```
    from azure.identity import DefaultAzureCredential
    import pyarrow as pa
    from pyiceberg.catalog import load_catalog
    
    credential = DefaultAzureCredential()
    token = credential.get_token("https://storage.azure.com/.default").token
    
    warehouse = "abfss://[email protected]/kevinliu_demo_lh.Lakehouse/Files"
    account_name = "daily-onelake"
    account_host = f"{account_name}.blob.fabric.microsoft.com"
    
    catalog = load_catalog("default", **{
        "type": "in-memory",
        "warehouse": warehouse,
        "adls.account-name": account_name,
        "adls.account-host": account_host,
        "adls.token": token,
    })
    
    catalog.create_namespace_if_not_exists("default")
    try:
        catalog.drop_table("default.test")
    except:
        ...
    
    
    TEST_DATA = {
        "id": [1, 2, 3, 1, 1],
        "name": ["AB", "CD", "EF", "CD", "EF"],
    }
    arrow_table = pa.Table.from_pydict(TEST_DATA)
    tbl = catalog.create_table_if_not_exists("default.test", schema=arrow_table.schema)
    tbl.append(arrow_table)
    tbl.scan().to_arrow()
    ```
    
    
    
    # Are there any user-facing changes?
    
    <!-- In the case of user-facing changes, please add the changelog label.
    -->
---
 mkdocs/docs/configuration.md |  1 +
 pyiceberg/io/__init__.py     |  1 +
 pyiceberg/io/fsspec.py       | 25 ++++++++++++++++++++++++-
 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md
index fad038e8..9ef4a1f4 100644
--- a/mkdocs/docs/configuration.md
+++ b/mkdocs/docs/configuration.md
@@ -162,6 +162,7 @@ For the FileIO there are several configuration options 
available:
 | adls.dfs-storage-authority   | .dfs.core.windows.net                         
                                              | The hostname[:port] of the Data 
Lake Gen 2 Service. Defaults to `.dfs.core.windows.net`. Useful for connecting 
to a local emulator, like [azurite](https://github.com/azure/azurite). See 
[AzureFileSystem](https://arrow.apache.org/docs/python/filesystems.html#azure-storage-file-system)
 for reference                |
 | adls.blob-storage-scheme     | https                                         
                                              | Either `http` or `https`. 
Defaults to `https`. Useful for connecting to a local emulator, like 
[azurite](https://github.com/azure/azurite). See 
[AzureFileSystem](https://arrow.apache.org/docs/python/filesystems.html#azure-storage-file-system)
 for reference                                                      |
 | adls.dfs-storage-scheme      | https                                         
                                              | Either `http` or `https`. 
Defaults to `https`. Useful for connecting to a local emulator, like 
[azurite](https://github.com/azure/azurite). See 
[AzureFileSystem](https://arrow.apache.org/docs/python/filesystems.html#azure-storage-file-system)
 for reference                                                          |
+| adls.token                   | eyJ0eXAiOiJKV1QiLCJhbGci...                   
                                              | Static access token for 
authenticating with ADLS. Used for OAuth2 flows.                                
                                                                                
                                                       |
 
 <!-- markdown-link-check-enable-->
 
diff --git a/pyiceberg/io/__init__.py b/pyiceberg/io/__init__.py
index 91ca6ee4..172dd44b 100644
--- a/pyiceberg/io/__init__.py
+++ b/pyiceberg/io/__init__.py
@@ -87,6 +87,7 @@ ADLS_BLOB_STORAGE_AUTHORITY = "adls.blob-storage-authority"
 ADLS_DFS_STORAGE_AUTHORITY = "adls.dfs-storage-authority"
 ADLS_BLOB_STORAGE_SCHEME = "adls.blob-storage-scheme"
 ADLS_DFS_STORAGE_SCHEME = "adls.dfs-storage-scheme"
+ADLS_TOKEN = "adls.token"
 GCS_TOKEN = "gcs.oauth2.token"
 GCS_TOKEN_EXPIRES_AT_MS = "gcs.oauth2.token-expires-at"
 GCS_PROJECT_ID = "gcs.project-id"
diff --git a/pyiceberg/io/fsspec.py b/pyiceberg/io/fsspec.py
index c1f95b71..8c8f6936 100644
--- a/pyiceberg/io/fsspec.py
+++ b/pyiceberg/io/fsspec.py
@@ -48,6 +48,7 @@ from pyiceberg.io import (
     ADLS_CREDENTIAL,
     ADLS_SAS_TOKEN,
     ADLS_TENANT_ID,
+    ADLS_TOKEN,
     AWS_ACCESS_KEY_ID,
     AWS_REGION,
     AWS_SECRET_ACCESS_KEY,
@@ -197,7 +198,11 @@ def _gs(properties: Properties) -> AbstractFileSystem:
 
 
 def _adls(properties: Properties) -> AbstractFileSystem:
+    # https://fsspec.github.io/adlfs/api/
+
     from adlfs import AzureBlobFileSystem
+    from azure.core.credentials import AccessToken
+    from azure.core.credentials_async import AsyncTokenCredential
 
     for key, sas_token in {
         key.replace(f"{ADLS_SAS_TOKEN}.", ""): value for key, value in 
properties.items() if key.startswith(ADLS_SAS_TOKEN)
@@ -207,9 +212,27 @@ def _adls(properties: Properties) -> AbstractFileSystem:
         if ADLS_SAS_TOKEN not in properties:
             properties[ADLS_SAS_TOKEN] = sas_token
 
+    class StaticTokenCredential(AsyncTokenCredential):
+        _DEFAULT_EXPIRY_SECONDS = 3600
+
+        def __init__(self, token_string: str) -> None:
+            self._token = token_string
+
+        async def get_token(self, *scopes: str, **kwargs: Any) -> AccessToken:
+            import time
+
+            # Set expiration 1 hour from now
+            expires_on = int(time.time()) + self._DEFAULT_EXPIRY_SECONDS
+            return AccessToken(self._token, expires_on)
+
+    if token := properties.get(ADLS_TOKEN):
+        credential = StaticTokenCredential(token)
+    else:
+        credential = properties.get(ADLS_CREDENTIAL)  # type: ignore
+
     return AzureBlobFileSystem(
         connection_string=properties.get(ADLS_CONNECTION_STRING),
-        credential=properties.get(ADLS_CREDENTIAL),
+        credential=credential,
         account_name=properties.get(ADLS_ACCOUNT_NAME),
         account_key=properties.get(ADLS_ACCOUNT_KEY),
         sas_token=properties.get(ADLS_SAS_TOKEN),

Reply via email to