This is an automated email from the ASF dual-hosted git repository.
kevinjqliu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git
The following commit(s) were added to refs/heads/main by this push:
new 0618b661 Add anon property to fsspec adls file io config to ease usage
of DefaultCredential pipeline (#2661)
0618b661 is described below
commit 0618b661dc0999936b684343a0a0eae61faff05d
Author: NikitaMatskevich <[email protected]>
AuthorDate: Thu Jan 29 07:50:34 2026 +0300
Add anon property to fsspec adls file io config to ease usage of
DefaultCredential pipeline (#2661)
<!--
Thanks for opening a pull request!
-->
<!-- In the case this PR will resolve an issue, please replace
${GITHUB_ISSUE_ID} below with the actual Github issue id. -->
<!-- Closes #${GITHUB_ISSUE_ID} -->
# Rationale for this change
We use the default credential pipeline to access Azure (more
specifically, [managed
identities](https://learn.microsoft.com/en-us/entra/identity/managed-identities-azure-resources/overview)).
We found that the fsspec adlfs library [only allows this if we set
anon=False](https://github.com/fsspec/adlfs/blob/main/adlfs/spec.py#L357-L367)
and specify the account name.
Therefore, this change adds the anon property to the PyIceberg file IO configuration.
## Are these changes tested?
We've tested that this works with the following snippet:
```
import os
from fsspec import AbstractFileSystem
from pyiceberg.io.fsspec import FsspecFileIO
from pyiceberg.catalog.rest import RestCatalog
from typing import Any
# Property keys that my_adls reads from the file IO properties.
# ADLS_ANON is the key introduced by this change.
ADLS_ANON = "adls.anon"
ADLS_CONNECTION_STRING = "adls.connection-string"
ADLS_ACCOUNT_NAME = "adls.account-name"
ADLS_ACCOUNT_KEY = "adls.account-key"
ADLS_SAS_TOKEN = "adls.sas-token"
ADLS_TENANT_ID = "adls.tenant-id"
ADLS_CLIENT_ID = "adls.client-id"
ADLS_CLIENT_SECRET = "adls.client-secret"
ADLS_ACCOUNT_HOST = "adls.account-host"
# Alias for the property mapping passed to my_adls.
Properties = dict[str, Any]
def my_adls(properties: Properties) -> AbstractFileSystem:
    """Create an adlfs ``AzureBlobFileSystem`` from ADLS file IO properties.

    Entries whose key starts with ``adls.sas-token`` are inspected first.
    After the ``adls.sas-token.`` prefix is removed from the key, the first
    dot-separated segment of what remains is used as the account name and
    the entry's value as the SAS token, but only when those properties are
    not already present. ``properties`` is updated in place.

    Args:
        properties: Mapping of property keys to values.

    Returns:
        The filesystem built from the connection string, anon flag, account
        name/key/host, SAS token, and tenant/client credentials found in
        ``properties``; keys that are absent are passed as ``None``.
    """
    from adlfs import AzureBlobFileSystem

    # Collect the matching entries up front so that ``properties`` is not
    # modified while it is being iterated.
    sas_tokens_by_suffix = {
        name.replace(f"{ADLS_SAS_TOKEN}.", ""): token
        for name, token in properties.items()
        if name.startswith(ADLS_SAS_TOKEN)
    }
    for suffix, token in sas_tokens_by_suffix.items():
        # Explicitly configured values always win over derived ones.
        properties.setdefault(ADLS_ACCOUNT_NAME, suffix.split(".")[0])
        properties.setdefault(ADLS_SAS_TOKEN, token)

    lookup = properties.get
    return AzureBlobFileSystem(
        connection_string=lookup(ADLS_CONNECTION_STRING),
        anon=lookup(ADLS_ANON),
        account_name=lookup(ADLS_ACCOUNT_NAME),
        account_key=lookup(ADLS_ACCOUNT_KEY),
        sas_token=lookup(ADLS_SAS_TOKEN),
        tenant_id=lookup(ADLS_TENANT_ID),
        client_id=lookup(ADLS_CLIENT_ID),
        client_secret=lookup(ADLS_CLIENT_SECRET),
        account_host=lookup(ADLS_ACCOUNT_HOST),
    )
# File IO configured with anon=False and an explicit account name.
injected_file_io = FsspecFileIO(
    properties={ADLS_ANON: False, ADLS_ACCOUNT_NAME: "usagestorageprod"}
)
# Make the file IO build its filesystem with my_adls, whatever the scheme,
# so that the anon property is forwarded to adlfs.
injected_file_io.get_fs = lambda scheme: my_adls(injected_file_io.properties)
CATALOG_URI = "https://lakehouse..."
catalog_config = {
    "uri": CATALOG_URI,
    "properties": {
        "io-impl": "pyiceberg.io.fsspec.FsspecFileIO",
    },
    # ... (remaining catalog settings elided in the original snippet)
}
catalog = RestCatalog("lakehouse", **catalog_config)
catalog.file_io = injected_file_io
table = catalog.load_table("some_ns.some_table")
table.io = injected_file_io
# ``xxx`` is a placeholder for a real snapshot id.
table.scan(snapshot_id=xxx).count()
```
## Are there any user-facing changes?
A new optional `adls.anon` property is available for the fsspec ADLS file IO. There are no breaking changes.
<!-- In the case of user-facing changes, please add the changelog label.
-->
---
pyiceberg/io/__init__.py | 1 +
pyiceberg/io/fsspec.py | 2 ++
tests/io/test_fsspec.py | 1 +
3 files changed, 4 insertions(+)
diff --git a/pyiceberg/io/__init__.py b/pyiceberg/io/__init__.py
index 71f763bf..3df22be7 100644
--- a/pyiceberg/io/__init__.py
+++ b/pyiceberg/io/__init__.py
@@ -85,6 +85,7 @@ ADLS_DFS_STORAGE_AUTHORITY = "adls.dfs-storage-authority"
ADLS_BLOB_STORAGE_SCHEME = "adls.blob-storage-scheme"
ADLS_DFS_STORAGE_SCHEME = "adls.dfs-storage-scheme"
ADLS_TOKEN = "adls.token"
+ADLS_ANON = "adls.anon"
GCS_TOKEN = "gcs.oauth2.token"
GCS_TOKEN_EXPIRES_AT_MS = "gcs.oauth2.token-expires-at"
GCS_PROJECT_ID = "gcs.project-id"
diff --git a/pyiceberg/io/fsspec.py b/pyiceberg/io/fsspec.py
index 6f44501e..ac108c80 100644
--- a/pyiceberg/io/fsspec.py
+++ b/pyiceberg/io/fsspec.py
@@ -43,6 +43,7 @@ from pyiceberg.io import (
ADLS_ACCOUNT_HOST,
ADLS_ACCOUNT_KEY,
ADLS_ACCOUNT_NAME,
+ ADLS_ANON,
ADLS_CLIENT_ID,
ADLS_CLIENT_SECRET,
ADLS_CONNECTION_STRING,
@@ -286,6 +287,7 @@ def _adls(properties: Properties) -> AbstractFileSystem:
client_id=properties.get(ADLS_CLIENT_ID),
client_secret=properties.get(ADLS_CLIENT_SECRET),
account_host=properties.get(ADLS_ACCOUNT_HOST),
+ anon=properties.get(ADLS_ANON),
)
diff --git a/tests/io/test_fsspec.py b/tests/io/test_fsspec.py
index 94eff1cb..392fa60a 100644
--- a/tests/io/test_fsspec.py
+++ b/tests/io/test_fsspec.py
@@ -602,6 +602,7 @@ def test_adls_account_name_sas_token_extraction() -> None:
client_id=None,
client_secret=None,
account_host="testaccount.dfs.core.windows.net",
+ anon=None,
)