alexwilcoxson-rel commented on issue #461:
URL: https://github.com/apache/arrow-rs-object-store/issues/461#issuecomment-3774290382
Late last year I was trying to use the Microsoft .NET SDK to prove out whether this was working, but it looks like they deployed it on January 5. Here is a thrown-together Python script that hits a directory with 10k blobs. I do see the performance difference when only requesting a few blobs at the end of the directory: roughly 200ms, vs ~6s to list the full thing (a quick comparison sketch follows the script).
```python
import datetime as dt
import os
import time
import xml.etree.ElementTree as ET
from typing import Iterable, Optional
from urllib.parse import urlencode

import requests
from azure.identity import DefaultAzureCredential


def list_blobs(
    container_url: str,
    token: str,
    prefix: Optional[str] = None,
    start_from: Optional[str] = None,
) -> Iterable[str]:
    """Page through a container listing, optionally starting at start_from."""
    marker: Optional[str] = None
    first_request = True
    while True:
        params = {
            "restype": "container",
            "comp": "list",
        }
        if prefix:
            params["prefix"] = prefix
        # startFrom is only sent on the first page; continuation pages use the marker.
        if first_request and start_from:
            params["startFrom"] = start_from
        if marker:
            params["marker"] = marker
        headers = {
            "x-ms-date": dt.datetime.now(dt.UTC).strftime("%a, %d %b %Y %H:%M:%S GMT"),
            "x-ms-version": "2026-02-06",
            "Authorization": f"Bearer {token}",
        }
        response = requests.get(f"{container_url}?{urlencode(params)}", headers=headers)
        response.raise_for_status()

        root = ET.fromstring(response.text)
        for blob in root.findall("./Blobs/Blob"):
            name = blob.findtext("Name")
            if name:
                yield name

        marker = root.findtext("NextMarker") or None
        if not marker:
            break
        first_request = False


def main() -> None:
    storage_account = os.getenv("STORAGE_ACCOUNT_NAME", "")
    container_name = os.getenv("CONTAINER_NAME", "")
    prefix = os.getenv("BLOB_PREFIX", "folder/subfolder/")
    start_from = os.getenv("START_FROM", f"{prefix}testblob-0000000003")

    url = f"https://{storage_account}.blob.core.windows.net/{container_name}"

    credential = DefaultAzureCredential()
    token = credential.get_token("https://storage.azure.com/.default").token

    init_start = time.perf_counter()
    blobs_iter = list_blobs(url, token, prefix, start_from)
    init_elapsed = time.perf_counter() - init_start

    start = time.perf_counter()
    blob_names = list(blobs_iter)
    elapsed = time.perf_counter() - start

    print(f"storage_account={storage_account}")
    print(f"container={container_name}")
    print(f"prefix={prefix}")
    print(f"start_from={start_from}")
    print(f"listed_blob_count={len(blob_names)}")
    print(f"first_blob={blob_names[0] if blob_names else 'N/A'}")
    print(f"last_blob={blob_names[-1] if blob_names else 'N/A'}")
    print(f"init_time={init_elapsed:.3f}")
    print(f"list_time={elapsed:.3f}")


if __name__ == "__main__":
    main()
```
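For reference, a rough sketch of the comparison described above, reusing the `list_blobs()` helper; the `testblob-0000009990` name is a placeholder for a blob near the end of the 10k-blob test prefix, not an actual name from my container.

```python
import os
import time

from azure.identity import DefaultAzureCredential

storage_account = os.getenv("STORAGE_ACCOUNT_NAME", "")
container_name = os.getenv("CONTAINER_NAME", "")
prefix = os.getenv("BLOB_PREFIX", "folder/subfolder/")
url = f"https://{storage_account}.blob.core.windows.net/{container_name}"
token = DefaultAzureCredential().get_token("https://storage.azure.com/.default").token

# Full listing: pages through everything under the prefix (~6s for 10k blobs
# in the test described above).
start = time.perf_counter()
full = list(list_blobs(url, token, prefix))
print(f"full: {len(full)} blobs in {time.perf_counter() - start:.3f}s")

# startFrom near the end: the service skips ahead, so only the trailing blobs
# come back (~200ms in the same test). The blob name here is a placeholder.
start = time.perf_counter()
tail = list(list_blobs(url, token, prefix, start_from=f"{prefix}testblob-0000009990"))
print(f"startFrom: {len(tail)} blobs in {time.perf_counter() - start:.3f}s")
```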
A couple of things to note where this differs from start-after in GCP and S3:
* startFrom is inclusive of the value, whereas GCP and S3 are exclusive of the start-after value (see the sketch after this list).
* When using a prefix, the startFrom value must include the prefix, i.e. see how `prefix` and `start_from` are configured in the code above. Not sure if this differs from the other providers.
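To make the inclusive behaviour concrete, a small sketch that reuses the `url`, `token`, and `prefix` setup from the script above; the blob name is a placeholder for one that exists under the prefix:

```python
# With Azure's startFrom, the named blob is itself the first result, whereas
# an exclusive marker (start-after on S3, and GCP per the note above) would
# begin at the next key.
start_from = f"{prefix}testblob-0000000003"  # placeholder: any existing blob name
names = list(list_blobs(url, token, prefix, start_from))
print(names[0] == start_from)  # True: startFrom is inclusive
```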