alexwilcoxson-rel commented on issue #461:
URL: 
https://github.com/apache/arrow-rs-object-store/issues/461#issuecomment-3774290382

   Late last year I was trying to use the Microsoft .NET SDK to prove out 
whether this was working, but it looks like they deployed on January 5. Here 
is a thrown-together Python script that hits a directory with 10k blobs. I do 
see the performance difference when requesting only a few blobs at the end of 
the directory: 200ms vs 6sec to list the full thing.
   
   ```python
   import datetime as dt
   import os
   import time
   import xml.etree.ElementTree as ET
   from typing import Iterable, Optional
   from urllib.parse import urlencode
   
   import requests
   from azure.identity import DefaultAzureCredential
   
   
   def list_blobs(
       container_url: str,
       token: str,
       prefix: Optional[str] = None,
       start_from: Optional[str] = None,
       timeout: float = 30.0,
   ) -> Iterable[str]:
       """Yield the names of the blobs in a container, following pagination.

       Sends ``restype=container&comp=list`` requests to ``container_url`` and
       yields the text of every ``Blobs/Blob/Name`` element in each XML
       response. Paging continues for as long as a response carries a
       non-empty ``NextMarker``.

       This is a generator: no request is sent until iteration begins.

       Args:
           container_url: Container URL without a query string; the query
               parameters are appended after a ``?``.
           token: Bearer token placed in the ``Authorization`` header.
           prefix: If non-empty, sent as the ``prefix`` query parameter.
           start_from: If non-empty, sent as ``startFrom`` on the first
               request only; later pages are positioned by ``marker``.
           timeout: Seconds passed to ``requests.get`` as its ``timeout`` so
               that a stalled connection cannot block forever.

       Yields:
           Non-empty blob names, in response order.

       Raises:
           requests.HTTPError: If a response has an error status code.
           requests.Timeout: If a request exceeds ``timeout``.
           xml.etree.ElementTree.ParseError: If a response body is not
               well-formed XML.
       """
       marker: Optional[str] = None
   
       while True:
           params = {
               "restype": "container",
               "comp": "list",
           }
           if prefix:
               params["prefix"] = prefix
           if marker:
               params["marker"] = marker
           elif start_from:
               # ``marker`` is unset only on the first request, which is the
               # only one that should carry ``startFrom``.
               params["startFrom"] = start_from
   
           # Stamp each request with its own send time.
           request_time = dt.datetime.now(dt.UTC)
           headers = {
               "x-ms-date": request_time.strftime("%a, %d %b %Y %H:%M:%S GMT"),
               "x-ms-version": "2026-02-06",
               "Authorization": f"Bearer {token}",
           }
   
           response = requests.get(
               f"{container_url}?{urlencode(params)}",
               headers=headers,
               timeout=timeout,
           )
           response.raise_for_status()
   
           # Parse the raw bytes so the XML parser applies the document's own
           # encoding declaration instead of a guessed text decoding.
           root = ET.fromstring(response.content)
           for blob in root.findall("./Blobs/Blob"):
               name = blob.findtext("Name")
               if name:
                   yield name
   
           # A missing or empty NextMarker means this was the last page.
           marker = root.findtext("NextMarker") or None
           if not marker:
               break
   
   
   def main() -> None:
       """List blobs using settings from the environment and print timings.

       Environment variables read:
           STORAGE_ACCOUNT_NAME: Storage account name (required).
           CONTAINER_NAME: Container name (required).
           BLOB_PREFIX: Listing prefix; defaults to ``folder/subfolder/``.
           START_FROM: Value passed as ``start_from``; defaults to the prefix
               followed by ``testblob-0000000003``.

       Prints the settings, the number of names listed, the first and last
       name, and two timings to standard output.

       Raises:
           SystemExit: If STORAGE_ACCOUNT_NAME or CONTAINER_NAME is unset or
               empty, since the container URL cannot be built without them.
       """
       storage_account = os.getenv("STORAGE_ACCOUNT_NAME", "")
       container_name = os.getenv("CONTAINER_NAME", "")
       if not storage_account or not container_name:
           # Fail before acquiring a token or sending a request to a
           # malformed host name.
           raise SystemExit("STORAGE_ACCOUNT_NAME and CONTAINER_NAME must be set")
       prefix = os.getenv("BLOB_PREFIX", "folder/subfolder/")
       start_from = os.getenv("START_FROM", f"{prefix}testblob-0000000003")
   
       url = f"https://{storage_account}.blob.core.windows.net/{container_name}"
       credential = DefaultAzureCredential()
       token = credential.get_token("https://storage.azure.com/.default").token
   
       # list_blobs is a generator, so this only times creating the generator
       # object; no request has been sent yet.
       init_start = time.perf_counter()
       blobs_iter = list_blobs(url, token, prefix, start_from)
       init_elapsed = time.perf_counter() - init_start
   
       # Draining the generator is what performs (and times) the requests.
       start = time.perf_counter()
       blob_names = list(blobs_iter)
       elapsed = time.perf_counter() - start
   
       print(f"storage_account={storage_account}")
       print(f"container={container_name}")
       print(f"prefix={prefix}")
       print(f"start_from={start_from}")
       print(f"listed_blob_count={len(blob_names)}")
       print(f"first_blob={blob_names[0] if blob_names else 'N/A'}")
       print(f"last_blob={blob_names[-1] if blob_names else 'N/A'}")
       print(f"init_time={init_elapsed:.3f}")
       print(f"list_time={elapsed:.3f}")
   
   
   # Call main() only when this file is run as a script, not when imported.
   if __name__ == "__main__":
       main()
   ```
   
   A couple things to note, differing from start-after in GCP and S3.
   * startFrom is inclusive of the value. GCP and S3 are exclusive of the value 
of start-after.
   * When using startFrom together with a prefix, you include the prefix in 
the startFrom value, i.e. see in the above code how prefix and start_from are 
configured. Not sure if this differs from the other providers.
   
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to