This is an automated email from the ASF dual-hosted git repository.

kevinjqliu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git


The following commit(s) were added to refs/heads/main by this push:
     new 0618b661 Add anon property to fsspec adls file io config to ease usage 
of DefaultCredential pipeline (#2661)
0618b661 is described below

commit 0618b661dc0999936b684343a0a0eae61faff05d
Author: NikitaMatskevich <[email protected]>
AuthorDate: Thu Jan 29 07:50:34 2026 +0300

    Add anon property to fsspec adls file io config to ease usage of 
DefaultCredential pipeline (#2661)
    
    <!--
    Thanks for opening a pull request!
    -->
    
    <!-- In the case this PR will resolve an issue, please replace
    ${GITHUB_ISSUE_ID} below with the actual Github issue id. -->
    <!-- Closes #${GITHUB_ISSUE_ID} -->
    
    # Rationale for this change
    
    We are using the default credential pipeline to get access to Azure (more
    concretely, [managed identities](https://learn.microsoft.com/en-us/entra/identity/managed-identities-azure-resources/overview)).
    We found out that the fsspec Azure backend (adlfs) [only allows it if we set anon=False](https://github.com/fsspec/adlfs/blob/main/adlfs/spec.py#L357-L367)
    and specify the account name.
    
    Thus, the `adls.anon` property is added to the PyIceberg fsspec FileIO
    configuration and forwarded to adlfs as the `anon` argument.
    
    ## Are these changes tested?
    
    We've tested that this works with the following snippet:
    ```
    import os
    from fsspec import AbstractFileSystem
    from pyiceberg.io.fsspec import FsspecFileIO
    from pyiceberg.catalog.rest import RestCatalog
    from typing import Any
    
    # Property keys read by my_adls() below. ADLS_ANON ("adls.anon") is the
    # key introduced by this change; the others are redeclared here so the
    # snippet is self-contained.
    ADLS_ANON = "adls.anon"
    ADLS_CONNECTION_STRING = "adls.connection-string"
    ADLS_ACCOUNT_NAME = "adls.account-name"
    ADLS_ACCOUNT_KEY = "adls.account-key"
    ADLS_SAS_TOKEN = "adls.sas-token"
    ADLS_TENANT_ID = "adls.tenant-id"
    ADLS_CLIENT_ID = "adls.client-id"
    ADLS_CLIENT_SECRET = "adls.client-secret"
    ADLS_ACCOUNT_HOST = "adls.account-host"
    
    # Flat mapping of configuration keys to values, as passed to the FileIO.
    Properties = dict[str, Any]
    
    def my_adls(properties: Properties) -> AbstractFileSystem:
        """Build an adlfs ``AzureBlobFileSystem`` from ``adls.*`` properties.

        Account-scoped SAS tokens (keys of the form
        ``adls.sas-token.<account-host>``) are used to fill in
        ``adls.account-name`` and ``adls.sas-token`` when those keys are not
        already present. ``properties`` is updated in place.

        Args:
            properties: Configuration mapping holding the ``adls.*`` keys.

        Returns:
            The file system created from the resolved properties, including
            the ``adls.anon`` value (``None`` when the key is absent).
        """
        from adlfs import AzureBlobFileSystem

        # Only keys with a suffix after "adls.sas-token." name an account.
        # The bare "adls.sas-token" key is deliberately excluded: it has no
        # account suffix, and splitting it on "." would wrongly produce
        # "adls" as the account name.
        scoped_prefix = f"{ADLS_SAS_TOKEN}."
        scoped_tokens = {
            key.removeprefix(scoped_prefix): value
            for key, value in properties.items()
            if key.startswith(scoped_prefix)
        }
        for account_host, sas_token in scoped_tokens.items():
            if ADLS_ACCOUNT_NAME not in properties:
                # e.g. "testaccount.dfs.core.windows.net" -> "testaccount"
                properties[ADLS_ACCOUNT_NAME] = account_host.split(".")[0]
            if ADLS_SAS_TOKEN not in properties:
                properties[ADLS_SAS_TOKEN] = sas_token

        return AzureBlobFileSystem(
            connection_string=properties.get(ADLS_CONNECTION_STRING),
            anon=properties.get(ADLS_ANON),
            account_name=properties.get(ADLS_ACCOUNT_NAME),
            account_key=properties.get(ADLS_ACCOUNT_KEY),
            sas_token=properties.get(ADLS_SAS_TOKEN),
            tenant_id=properties.get(ADLS_TENANT_ID),
            client_id=properties.get(ADLS_CLIENT_ID),
            client_secret=properties.get(ADLS_CLIENT_SECRET),
            account_host=properties.get(ADLS_ACCOUNT_HOST),
        )
    
    injected_file_io = FsspecFileIO(properties={ADLS_ANON: False, 
ADLS_ACCOUNT_NAME: "usagestorageprod"})
    injected_file_io.get_fs = lambda scheme: 
my_adls(injected_file_io.properties)
    
    CATALOG_URI = "https://lakehouse...";
    
    catalog_config = {
        "uri": CATALOG_URI,
        "properties": {
            "io-impl": "pyiceberg.io.fsspec.FsspecFileIO",
        },
        ...
    }
    
    catalog = RestCatalog("lakehouse", **catalog_config)
    catalog.file_io = injected_file_io
    
    table = catalog.load_table("some_ns.some_table")
    table.io = injected_file_io
    table.scan(snapshot_id=xxx).count()
    ```
    
    ## Are there any user-facing changes?
    
    Zero breaking changes
    
    <!-- In the case of user-facing changes, please add the changelog label.
    -->
---
 pyiceberg/io/__init__.py | 1 +
 pyiceberg/io/fsspec.py   | 2 ++
 tests/io/test_fsspec.py  | 1 +
 3 files changed, 4 insertions(+)

diff --git a/pyiceberg/io/__init__.py b/pyiceberg/io/__init__.py
index 71f763bf..3df22be7 100644
--- a/pyiceberg/io/__init__.py
+++ b/pyiceberg/io/__init__.py
@@ -85,6 +85,7 @@ ADLS_DFS_STORAGE_AUTHORITY = "adls.dfs-storage-authority"
 ADLS_BLOB_STORAGE_SCHEME = "adls.blob-storage-scheme"
 ADLS_DFS_STORAGE_SCHEME = "adls.dfs-storage-scheme"
 ADLS_TOKEN = "adls.token"
+ADLS_ANON = "adls.anon"
 GCS_TOKEN = "gcs.oauth2.token"
 GCS_TOKEN_EXPIRES_AT_MS = "gcs.oauth2.token-expires-at"
 GCS_PROJECT_ID = "gcs.project-id"
diff --git a/pyiceberg/io/fsspec.py b/pyiceberg/io/fsspec.py
index 6f44501e..ac108c80 100644
--- a/pyiceberg/io/fsspec.py
+++ b/pyiceberg/io/fsspec.py
@@ -43,6 +43,7 @@ from pyiceberg.io import (
     ADLS_ACCOUNT_HOST,
     ADLS_ACCOUNT_KEY,
     ADLS_ACCOUNT_NAME,
+    ADLS_ANON,
     ADLS_CLIENT_ID,
     ADLS_CLIENT_SECRET,
     ADLS_CONNECTION_STRING,
@@ -286,6 +287,7 @@ def _adls(properties: Properties) -> AbstractFileSystem:
         client_id=properties.get(ADLS_CLIENT_ID),
         client_secret=properties.get(ADLS_CLIENT_SECRET),
         account_host=properties.get(ADLS_ACCOUNT_HOST),
+        anon=properties.get(ADLS_ANON),
     )
 
 
diff --git a/tests/io/test_fsspec.py b/tests/io/test_fsspec.py
index 94eff1cb..392fa60a 100644
--- a/tests/io/test_fsspec.py
+++ b/tests/io/test_fsspec.py
@@ -602,6 +602,7 @@ def test_adls_account_name_sas_token_extraction() -> None:
             client_id=None,
             client_secret=None,
             account_host="testaccount.dfs.core.windows.net",
+            anon=None,
         )
 
 

Reply via email to