This is an automated email from the ASF dual-hosted git repository.
raulcd pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 953947a781 GH-46833: [Python] Expose
ConfigureManagedIdentityCredential and ConfigureClientSecretCredential to
AzureFileSystem on PyArrow (#46837)
953947a781 is described below
commit 953947a78157090ff94b14bc027cbcf520a5b2a6
Author: Kirill Tsyganov <[email protected]>
AuthorDate: Mon Jun 30 16:56:43 2025 +1000
GH-46833: [Python] Expose ConfigureManagedIdentityCredential and
ConfigureClientSecretCredential to AzureFileSystem on PyArrow (#46837)
### Rationale for this change
`ClientSecretCredential` is another method to authenticate to Azure
resources, using a Service Principal (SP). We work on AzureML, where there are
two main types of compute: ComputeInstance (a personal VM) and ComputeCluster.
In the context of AzureML you can work interactively on a ComputeInstance, where
pyarrow defaults to `DefaultAzureCredential` when reading/writing data over the
`abfss://` protocol. However, when running AzureML "jobs", which execute
non-interactively, the context (networ [...]
### What changes are included in this PR?
Adding `ConfigureClientSecretCredential` to `AzureFileSystem`. It is already
implemented in the C++ library but hasn't been propagated to the Python
library. This pull request just exposes the existing C++ method in pyarrow.
### Are these changes tested?
Yes, tests have been included in the relevant file (`python/pyarrow/tests/test_fs.py`).
### Are there any user-facing changes?
The `AzureFileSystem` docstring has been updated in the code. It would be
great for those changes to surface on the website, which I'm happy to help
with if needed.
* GitHub Issue: #46833
Lead-authored-by: Kirill Tsyganov <[email protected]>
Co-authored-by: Raúl Cumplido <[email protected]>
Signed-off-by: Raúl Cumplido <[email protected]>
---
python/pyarrow/_azurefs.pyx | 63 +++++++++++++++++++++++++++------
python/pyarrow/includes/libarrow_fs.pxd | 4 +++
python/pyarrow/tests/test_fs.py | 57 +++++++++++++++++++++++++++++
3 files changed, 114 insertions(+), 10 deletions(-)
diff --git a/python/pyarrow/_azurefs.pyx b/python/pyarrow/_azurefs.pyx
index 188d18a27a..aa6ad1d90e 100644
--- a/python/pyarrow/_azurefs.pyx
+++ b/python/pyarrow/_azurefs.pyx
@@ -53,19 +53,36 @@ cdef class AzureFileSystem(FileSystem):
blob_storage_authority : str, default None
hostname[:port] of the Blob Service. Defaults to
`.blob.core.windows.net`. Useful
for connecting to a local emulator, like Azurite.
- dfs_storage_authority : str, default None
- hostname[:port] of the Data Lake Gen 2 Service. Defaults to
- `.dfs.core.windows.net`. Useful for connecting to a local emulator,
like Azurite.
blob_storage_scheme : str, default None
Either `http` or `https`. Defaults to `https`. Useful for connecting
to a local
emulator, like Azurite.
+ client_id : str, default None
+ The client ID (Application ID) for Azure Active Directory
authentication.
+ Its interpretation depends on the credential type being used:
+ - For `ClientSecretCredential`: It is the Application (client) ID of
your
+ registered Azure AD application (Service Principal). It must be
provided
+ together with `tenant_id` and `client_secret` to use
ClientSecretCredential.
+ - For `ManagedIdentityCredential`: It is the client ID of a specific
+ user-assigned managed identity. This is only necessary if you are
using a
+ user-assigned managed identity and need to explicitly specify which
one
+ (e.g., if the resource has multiple user-assigned identities). For
+ system-assigned managed identities, this parameter is typically not
required.
+ client_secret : str, default None
+ Client secret for Azure Active Directory authentication. Must be
provided together
+ with `tenant_id` and `client_id` to use ClientSecretCredential.
+ dfs_storage_authority : str, default None
+ hostname[:port] of the Data Lake Gen 2 Service. Defaults to
+ `.dfs.core.windows.net`. Useful for connecting to a local emulator,
like Azurite.
dfs_storage_scheme : str, default None
- Either `http` or `https`. Defaults to `https`. Useful for connecting
to a local
+ Either `http` or `https`. Defaults to `https`. Useful for connecting
to a local
emulator, like Azurite.
sas_token : str, default None
SAS token for the storage account, used as an alternative to
account_key. If sas_token
- and account_key are None the default credential will be used. The
parameters
+ and account_key are None the default credential will be used. The
parameters
account_key and sas_token are mutually exclusive.
+ tenant_id : str, default None
+ Tenant ID for Azure Active Directory authentication. Must be provided
together with
+ `client_id` and `client_secret` to use ClientSecretCredential.
Examples
--------
@@ -86,10 +103,14 @@ cdef class AzureFileSystem(FileSystem):
CAzureFileSystem* azurefs
c_string account_key
c_string sas_token
+ c_string tenant_id
+ c_string client_id
+ c_string client_secret
def __init__(self, account_name, *, account_key=None,
blob_storage_authority=None,
- dfs_storage_authority=None, blob_storage_scheme=None,
- dfs_storage_scheme=None, sas_token=None):
+ blob_storage_scheme=None, client_id=None, client_secret=None,
+ dfs_storage_authority=None, dfs_storage_scheme=None,
+ sas_token=None, tenant_id=None):
cdef:
CAzureOptions options
shared_ptr[CAzureFileSystem] wrapped
@@ -107,7 +128,26 @@ cdef class AzureFileSystem(FileSystem):
if account_key and sas_token:
raise ValueError("Cannot specify both account_key and sas_token.")
- if account_key:
+ if (tenant_id or client_id or client_secret):
+ if not client_id:
+ raise ValueError("client_id must be specified")
+ if not tenant_id and not client_secret:
+ options.ConfigureManagedIdentityCredential(tobytes(client_id))
+ self.client_id = tobytes(client_id)
+ elif tenant_id and client_secret:
+ options.ConfigureClientSecretCredential(
+ tobytes(tenant_id), tobytes(client_id),
tobytes(client_secret)
+ )
+ self.tenant_id = tobytes(tenant_id)
+ self.client_id = tobytes(client_id)
+ self.client_secret = tobytes(client_secret)
+ else:
+ raise ValueError(
+ "Invalid Azure credential configuration: "
+ "For ManagedIdentityCredential, provide only client_id. "
+ "For ClientSecretCredential, provide tenant_id, client_id,
and client_secret."
+ )
+ elif account_key:
options.ConfigureAccountKeyCredential(tobytes(account_key))
self.account_key = tobytes(account_key)
elif sas_token:
@@ -138,8 +178,11 @@ cdef class AzureFileSystem(FileSystem):
account_name=frombytes(opts.account_name),
account_key=frombytes(self.account_key),
blob_storage_authority=frombytes(opts.blob_storage_authority),
- dfs_storage_authority=frombytes(opts.dfs_storage_authority),
blob_storage_scheme=frombytes(opts.blob_storage_scheme),
+ client_id=frombytes(self.client_id),
+ client_secret=frombytes(self.client_secret),
+ dfs_storage_authority=frombytes(opts.dfs_storage_authority),
dfs_storage_scheme=frombytes(opts.dfs_storage_scheme),
- sas_token=frombytes(self.sas_token)
+ sas_token=frombytes(self.sas_token),
+ tenant_id=frombytes(self.tenant_id)
),))
diff --git a/python/pyarrow/includes/libarrow_fs.pxd
b/python/pyarrow/includes/libarrow_fs.pxd
index 736046cfe2..af01c47c8c 100644
--- a/python/pyarrow/includes/libarrow_fs.pxd
+++ b/python/pyarrow/includes/libarrow_fs.pxd
@@ -255,6 +255,10 @@ cdef extern from "arrow/filesystem/api.h" namespace
"arrow::fs" nogil:
CStatus ConfigureDefaultCredential()
CStatus ConfigureAccountKeyCredential(c_string account_key)
CStatus ConfigureSASCredential(c_string sas_token)
+ CStatus ConfigureManagedIdentityCredential(c_string client_id)
+ CStatus ConfigureClientSecretCredential(c_string tenant_id,
+ c_string client_id,
+ c_string client_secret)
cdef cppclass CAzureFileSystem "arrow::fs::AzureFileSystem":
@staticmethod
diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py
index 49b785d2f7..0d61ea6e0c 100644
--- a/python/pyarrow/tests/test_fs.py
+++ b/python/pyarrow/tests/test_fs.py
@@ -1481,6 +1481,63 @@ def test_azurefs_options(pickle_module):
assert pickle_module.loads(pickle_module.dumps(fs4)) == fs4
assert fs4 != fs3
+ fs5 = AzureFileSystem(
+ account_name='fake-account-name',
+ tenant_id='fake-tenant-id',
+ client_id='fake-client-id',
+ client_secret='fake-client-secret'
+ )
+ assert isinstance(fs5, AzureFileSystem)
+ assert pickle_module.loads(pickle_module.dumps(fs5)) == fs5
+ assert fs5 != fs4
+
+ fs6 = AzureFileSystem(
+ account_name='fake-account-name',
+ client_id='fake-client-id'
+ )
+ assert isinstance(fs6, AzureFileSystem)
+ assert pickle_module.loads(pickle_module.dumps(fs6)) == fs6
+ assert fs6 != fs5
+
+ with pytest.raises(ValueError, match="client_id must be specified"):
+ AzureFileSystem(
+ account_name='fake-account-name',
+ tenant_id='fake-tenant-id'
+ )
+
+ with pytest.raises(ValueError, match="client_id must be specified"):
+ AzureFileSystem(
+ account_name='fake-account-name',
+ client_secret='fake-client-secret'
+ )
+
+ invalid_msg = (
+ "Invalid Azure credential configuration: "
+ "For ManagedIdentityCredential, provide only client_id. "
+ "For ClientSecretCredential, provide tenant_id, client_id, and
client_secret."
+ )
+
+ with pytest.raises(ValueError, match=invalid_msg):
+ AzureFileSystem(
+ account_name='fake-account-name',
+ client_id='fake-client-id',
+ client_secret='fake-client-secret'
+ )
+
+ with pytest.raises(ValueError, match="client_id must be specified"):
+ AzureFileSystem(
+ account_name='fake-account-name',
+ tenant_id='fake-tenant-id',
+ client_secret='fake-client-secret'
+ )
+
+ with pytest.raises(ValueError, match=invalid_msg):
+ AzureFileSystem(
+ account_name='fake-account-name',
+ tenant_id='fake-tenant-id',
+ client_id='fake-client-id'
+ )
+
with pytest.raises(ValueError):
AzureFileSystem(account_name='fake-account-name',
account_key='fakeaccount',
sas_token='fakesastoken')