This is an automated email from the ASF dual-hosted git repository.

raulcd pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 953947a781 GH-46833: [Python] Expose 
ConfigureManagedIdentityCredential and ConfigureClientSecretCredential to 
AzureFileSystem on PyArrow (#46837)
953947a781 is described below

commit 953947a78157090ff94b14bc027cbcf520a5b2a6
Author: Kirill Tsyganov <[email protected]>
AuthorDate: Mon Jun 30 16:56:43 2025 +1000

    GH-46833: [Python] Expose ConfigureManagedIdentityCredential and 
ConfigureClientSecretCredential to AzureFileSystem on PyArrow (#46837)
    
    ### Rationale for this change
    
    `ClientSecretCredential` is another method to authenticate to Azure 
resources using a Service Principal (SP). We work on AzureML, where there are 
two main types of compute: ComputeInstance (personal VM) and ComputeCluster. In 
the context of AzureML you can work interactively on a ComputeInstance, where 
pyarrow defaults to `DefaultAzureCredential` when reading/writing data over the 
`abfss://` protocol. However, when running AzureML "jobs", which execute 
non-interactively, the context (networ [...]
    
    ### What changes are included in this PR?
    
    Adding `ConfigureClientSecretCredential` to `AzureFileSystem`, which is 
already implemented in C++ but hasn't been propagated to the Python library. 
This pull request just propagates the C++ method to PyArrow.
    
    ### Are these changes tested?
    
    Yes, tests have been included in the relevant file.
    
    ### Are there any user-facing changes?
    
    Docs of `AzureFileSystem` have been updated in the code. It would be 
amazing for those changes to surface on the website, which I'm happy to try to 
help with if needed
    
    * GitHub Issue: #46833
    
    Lead-authored-by: Kirill Tsyganov <[email protected]>
    Co-authored-by: Raúl Cumplido <[email protected]>
    Signed-off-by: Raúl Cumplido <[email protected]>
---
 python/pyarrow/_azurefs.pyx             | 63 +++++++++++++++++++++++++++------
 python/pyarrow/includes/libarrow_fs.pxd |  4 +++
 python/pyarrow/tests/test_fs.py         | 57 +++++++++++++++++++++++++++++
 3 files changed, 114 insertions(+), 10 deletions(-)

diff --git a/python/pyarrow/_azurefs.pyx b/python/pyarrow/_azurefs.pyx
index 188d18a27a..aa6ad1d90e 100644
--- a/python/pyarrow/_azurefs.pyx
+++ b/python/pyarrow/_azurefs.pyx
@@ -53,19 +53,36 @@ cdef class AzureFileSystem(FileSystem):
     blob_storage_authority : str, default None
         hostname[:port] of the Blob Service. Defaults to 
`.blob.core.windows.net`. Useful
         for connecting to a local emulator, like Azurite.
-    dfs_storage_authority : str, default None
-        hostname[:port] of the Data Lake Gen 2 Service. Defaults to 
-        `.dfs.core.windows.net`. Useful for connecting to a local emulator, 
like Azurite.
     blob_storage_scheme : str, default None
         Either `http` or `https`. Defaults to `https`. Useful for connecting 
to a local 
         emulator, like Azurite.
+    client_id : str, default None
+        The client ID (Application ID) for Azure Active Directory 
authentication.
+        Its interpretation depends on the credential type being used:
+        - For `ClientSecretCredential`: It is the Application (client) ID of 
your
+          registered Azure AD application (Service Principal). It must be 
provided
+          together with `tenant_id` and `client_secret` to use 
ClientSecretCredential.
+        - For `ManagedIdentityCredential`: It is the client ID of a specific
+          user-assigned managed identity. This is only necessary if you are 
using a
+          user-assigned managed identity and need to explicitly specify which 
one
+          (e.g., if the resource has multiple user-assigned identities). For
+          system-assigned managed identities, this parameter is typically not 
required.
+    client_secret : str, default None
+        Client secret for Azure Active Directory authentication. Must be 
provided together
+        with `tenant_id` and `client_id` to use ClientSecretCredential.
+    dfs_storage_authority : str, default None
+        hostname[:port] of the Data Lake Gen 2 Service. Defaults to
+        `.dfs.core.windows.net`. Useful for connecting to a local emulator, 
like Azurite.
     dfs_storage_scheme : str, default None
-        Either `http` or `https`. Defaults to `https`. Useful for connecting 
to a local 
+        Either `http` or `https`. Defaults to `https`. Useful for connecting 
to a local
         emulator, like Azurite.
     sas_token : str, default None
         SAS token for the storage account, used as an alternative to 
account_key. If sas_token
-        and account_key are None the default credential will be used. The 
parameters 
+        and account_key are None the default credential will be used. The 
parameters
         account_key and sas_token are mutually exclusive.
+    tenant_id : str, default None
+        Tenant ID for Azure Active Directory authentication. Must be provided 
together with
+        `client_id` and `client_secret` to use ClientSecretCredential.
 
     Examples
     --------
@@ -86,10 +103,14 @@ cdef class AzureFileSystem(FileSystem):
         CAzureFileSystem* azurefs
         c_string account_key
         c_string sas_token
+        c_string tenant_id
+        c_string client_id
+        c_string client_secret
 
     def __init__(self, account_name, *, account_key=None, 
blob_storage_authority=None,
-                 dfs_storage_authority=None, blob_storage_scheme=None,
-                 dfs_storage_scheme=None, sas_token=None):
+                 blob_storage_scheme=None, client_id=None, client_secret=None,
+                 dfs_storage_authority=None, dfs_storage_scheme=None,
+                 sas_token=None, tenant_id=None):
         cdef:
             CAzureOptions options
             shared_ptr[CAzureFileSystem] wrapped
@@ -107,7 +128,26 @@ cdef class AzureFileSystem(FileSystem):
         if account_key and sas_token:
             raise ValueError("Cannot specify both account_key and sas_token.")
 
-        if account_key:
+        if (tenant_id or client_id or client_secret):
+            if not client_id:
+                raise ValueError("client_id must be specified")
+            if not tenant_id and not client_secret:
+                options.ConfigureManagedIdentityCredential(tobytes(client_id))
+                self.client_id = tobytes(client_id)
+            elif tenant_id and client_secret:
+                options.ConfigureClientSecretCredential(
+                    tobytes(tenant_id), tobytes(client_id), 
tobytes(client_secret)
+                )
+                self.tenant_id = tobytes(tenant_id)
+                self.client_id = tobytes(client_id)
+                self.client_secret = tobytes(client_secret)
+            else:
+                raise ValueError(
+                    "Invalid Azure credential configuration: "
+                    "For ManagedIdentityCredential, provide only client_id. "
+                    "For ClientSecretCredential, provide tenant_id, client_id, 
and client_secret."
+                )
+        elif account_key:
             options.ConfigureAccountKeyCredential(tobytes(account_key))
             self.account_key = tobytes(account_key)
         elif sas_token:
@@ -138,8 +178,11 @@ cdef class AzureFileSystem(FileSystem):
                 account_name=frombytes(opts.account_name),
                 account_key=frombytes(self.account_key),
                 blob_storage_authority=frombytes(opts.blob_storage_authority),
-                dfs_storage_authority=frombytes(opts.dfs_storage_authority),
                 blob_storage_scheme=frombytes(opts.blob_storage_scheme),
+                client_id=frombytes(self.client_id),
+                client_secret=frombytes(self.client_secret),
+                dfs_storage_authority=frombytes(opts.dfs_storage_authority),
                 dfs_storage_scheme=frombytes(opts.dfs_storage_scheme),
-                sas_token=frombytes(self.sas_token)
+                sas_token=frombytes(self.sas_token),
+                tenant_id=frombytes(self.tenant_id)
             ),))
diff --git a/python/pyarrow/includes/libarrow_fs.pxd 
b/python/pyarrow/includes/libarrow_fs.pxd
index 736046cfe2..af01c47c8c 100644
--- a/python/pyarrow/includes/libarrow_fs.pxd
+++ b/python/pyarrow/includes/libarrow_fs.pxd
@@ -255,6 +255,10 @@ cdef extern from "arrow/filesystem/api.h" namespace 
"arrow::fs" nogil:
         CStatus ConfigureDefaultCredential()
         CStatus ConfigureAccountKeyCredential(c_string account_key)
         CStatus ConfigureSASCredential(c_string sas_token)
+        CStatus ConfigureManagedIdentityCredential(c_string client_id)
+        CStatus ConfigureClientSecretCredential(c_string tenant_id,
+                                                c_string client_id,
+                                                c_string client_secret)
 
     cdef cppclass CAzureFileSystem "arrow::fs::AzureFileSystem":
         @staticmethod
diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py
index 49b785d2f7..0d61ea6e0c 100644
--- a/python/pyarrow/tests/test_fs.py
+++ b/python/pyarrow/tests/test_fs.py
@@ -1481,6 +1481,63 @@ def test_azurefs_options(pickle_module):
     assert pickle_module.loads(pickle_module.dumps(fs4)) == fs4
     assert fs4 != fs3
 
+    fs5 = AzureFileSystem(
+        account_name='fake-account-name',
+        tenant_id='fake-tenant-id',
+        client_id='fake-client-id',
+        client_secret='fake-client-secret'
+    )
+    assert isinstance(fs5, AzureFileSystem)
+    assert pickle_module.loads(pickle_module.dumps(fs5)) == fs5
+    assert fs5 != fs4
+
+    fs6 = AzureFileSystem(
+        account_name='fake-account-name',
+        client_id='fake-client-id'
+    )
+    assert isinstance(fs6, AzureFileSystem)
+    assert pickle_module.loads(pickle_module.dumps(fs6)) == fs6
+    assert fs6 != fs5
+
+    with pytest.raises(ValueError, match="client_id must be specified"):
+        AzureFileSystem(
+            account_name='fake-account-name',
+            tenant_id='fake-tenant-id'
+        )
+
+    with pytest.raises(ValueError, match="client_id must be specified"):
+        AzureFileSystem(
+            account_name='fake-account-name',
+            client_secret='fake-client-secret'
+        )
+
+    invalid_msg = (
+        "Invalid Azure credential configuration: "
+        "For ManagedIdentityCredential, provide only client_id. "
+        "For ClientSecretCredential, provide tenant_id, client_id, and 
client_secret."
+    )
+
+    with pytest.raises(ValueError, match=invalid_msg):
+        AzureFileSystem(
+            account_name='fake-account-name',
+            client_id='fake-client-id',
+            client_secret='fake-client-secret'
+        )
+
+    with pytest.raises(ValueError, match="client_id must be specified"):
+        AzureFileSystem(
+            account_name='fake-account-name',
+            tenant_id='fake-tenant-id',
+            client_secret='fake-client-secret'
+        )
+
+    with pytest.raises(ValueError, match=invalid_msg):
+        AzureFileSystem(
+            account_name='fake-account-name',
+            tenant_id='fake-tenant-id',
+            client_id='fake-client-id'
+        )
+
     with pytest.raises(ValueError):
         AzureFileSystem(account_name='fake-account-name', 
account_key='fakeaccount',
                         sas_token='fakesastoken')

Reply via email to