This is an automated email from the ASF dual-hosted git repository.
Xuanwo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-rust.git
The following commit(s) were added to refs/heads/main by this push:
new 2d5444b05 feat: HuggingFace Hub storage backend and CDC table
properties (#2375)
2d5444b05 is described below
commit 2d5444b050bbdc1808786c6502fdabc35e49587c
Author: Krisztián Szűcs <[email protected]>
AuthorDate: Thu May 21 10:02:11 2026 +0200
feat: HuggingFace Hub storage backend and CDC table properties (#2375)
## Which issue does this PR close?
- Closes #.
## What changes are included in this PR?
Adds two opt-in capabilities for storing Iceberg tables on HuggingFace
Hub with content-defined chunking for efficient deduplication.
### HuggingFace Hub storage
New `opendal-hf` feature on `iceberg-storage-opendal` (off by default,
included in `opendal-all`) that wires HuggingFace's OpenDAL service into
`FileIO`. Paths use the form:
`hf://<repo_type>/<owner>/<repo>[@<revision>]/<path_in_repo>`
where `repo_type` must be one of `models`, `datasets`, `spaces`, or
`buckets`. The prefix is mandatory. Configuration via `FileIOBuilder`
properties:
- `hf.token` — API token (required for private repos / writes)
- `hf.endpoint` — Hub endpoint, defaults to https://huggingface.co
- `hf.revision` — fallback revision when a path has no `@<revision>`
`OpenDalResolvingStorage` recognises the `hf` scheme and lazily
constructs a per-scheme storage instance. `delete_stream` groups paths
by `<repo_type>/<repo_id>`, so paths that share a repo id but differ in
repo type (e.g. a bucket and a dataset) do not share an operator.
### CDC (content-defined chunking) table properties
New table properties under `write.parquet.content-defined-chunking.*`
(matching PyIceberg convention):
- `write.parquet.content-defined-chunking.enabled` (bool, default false)
- `write.parquet.content-defined-chunking.min-chunk-size` (bytes,
default 256 KiB)
- `write.parquet.content-defined-chunking.max-chunk-size` (bytes,
default 1 MiB)
- `write.parquet.content-defined-chunking.norm-level` (i32, default 0)
CDC activates only when `enabled = "true"` is set explicitly. Defaults
match parquet's own `CdcOptions` defaults. CDC options are applied in
the DataFusion physical write plan.
## Are these changes tested?
- Rust unit tests for `HfUri` parsing and CDC property parsing.
- Rust integration tests in `file_io_hf_test.rs` guarded on
`HF_TOKEN`, `HF_BUCKET`, `HF_DATASET`; tests
skip gracefully when env vars are unset.
- Python tests in `test_huggingface_and_cdc.py` covering CDC property
persistence, PyIceberg writes with CDC, DataFusion read-back, and HF
credentials end-to-end (skipped without `HF_TOKEN` /
`HF_DATASET`).
---
.github/workflows/ci_hf_cdc.yml | 110 ++++++
bindings/python/pyproject.toml | 3 +
bindings/python/tests/test_huggingface_and_cdc.py | 194 +++++++++++
bindings/python/uv.lock | 223 ++++++++++++
crates/iceberg/src/io/storage/config/hf.rs | 104 ++++++
crates/iceberg/src/io/storage/config/mod.rs | 2 +
crates/iceberg/src/spec/table_properties.rs | 189 ++++++++++-
.../datafusion/src/physical_plan/write.rs | 13 +-
crates/storage/opendal/Cargo.toml | 3 +-
crates/storage/opendal/src/hf.rs | 348 +++++++++++++++++++
crates/storage/opendal/src/lib.rs | 57 +++-
crates/storage/opendal/src/resolving.rs | 15 +-
crates/storage/opendal/tests/file_io_hf_test.rs | 376 +++++++++++++++++++++
13 files changed, 1625 insertions(+), 12 deletions(-)
diff --git a/.github/workflows/ci_hf_cdc.yml b/.github/workflows/ci_hf_cdc.yml
new file mode 100644
index 000000000..78cd50d0a
--- /dev/null
+++ b/.github/workflows/ci_hf_cdc.yml
@@ -0,0 +1,110 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+name: HuggingFace and CDC Integration Tests
+
+on:
+ push:
+ branches:
+ - main
+ pull_request:
+ paths:
+ - 'crates/storage/opendal/**'
+ - 'crates/iceberg/src/io/**'
+ - 'crates/iceberg/src/spec/table_properties.rs'
+ - 'crates/integrations/datafusion/**'
+ - 'bindings/python/tests/test_huggingface_and_cdc.py'
+ - '.github/workflows/ci_hf_cdc.yml'
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
+ cancel-in-progress: true
+
+permissions:
+ contents: read
+
+jobs:
+ hf-integration:
+ name: HuggingFace Hub integration tests
+ runs-on: ubuntu-latest
+ # Skip the job entirely when HF secrets are not available (e.g. PRs from
forks).
+ if: ${{ secrets.HF_TOKEN != '' }}
+ steps:
+ - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd #
v6.0.2
+ with:
+ persist-credentials: false
+
+ - name: Setup Rust toolchain
+ uses: ./.github/actions/setup-builder
+
+ - name: Cache Rust artifacts
+ uses: swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2
+
+ - name: Install protoc
+ uses: arduino/setup-protoc@c65c819552d16ad3c9b72d9dfd5ba5237b9c906b #
v3.0.0
+ with:
+ repo-token: ${{ secrets.GITHUB_TOKEN }}
+
+ - name: Run Rust HF integration tests
+ env:
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
+ HF_BUCKET: ${{ secrets.HF_BUCKET }}
+ HF_DATASET: ${{ secrets.HF_DATASET }}
+ run: |
+ cargo test -p iceberg-storage-opendal \
+ --features opendal-hf \
+ --test file_io_hf_test \
+ -- --test-threads=1
+
+ cdc-python:
+ name: CDC and HuggingFace Python tests
+ runs-on: ubuntu-latest
+ # Skip when HF secrets are not available (e.g. PRs from forks).
+ if: ${{ secrets.HF_TOKEN != '' }}
+ steps:
+ - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd #
v6.0.2
+ with:
+ persist-credentials: false
+
+ - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 #
v6.2.0
+ with:
+ python-version: "3.12"
+
+ - uses: PyO3/maturin-action@e83996d129638aa358a18fbd1dfb82f0b0fb5d3b #
v1.51.0
+ with:
+ working-directory: "bindings/python"
+ command: build
+ args: --out dist -i python3.12
+
+ - uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b #
v8.1.0
+ with:
+ version: "0.9.3"
+ enable-cache: true
+
+ - name: Install dependencies
+ working-directory: "bindings/python"
+ run: |
+ make install
+ uv pip install --no-build --reinstall --find-links dist/
pyiceberg-core
+
+ - name: Run CDC and HuggingFace Python tests
+ working-directory: "bindings/python"
+ env:
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
+ HF_DATASET: ${{ secrets.HF_DATASET }}
+ run: |
+ uv run --no-sync pytest tests/test_huggingface_and_cdc.py -v
diff --git a/bindings/python/pyproject.toml b/bindings/python/pyproject.toml
index 0933bdc5e..f23944332 100644
--- a/bindings/python/pyproject.toml
+++ b/bindings/python/pyproject.toml
@@ -54,6 +54,8 @@ ignore = ["F403", "F405"]
[tool.pytest.ini_options]
filterwarnings = [
"error",
+ # huggingface_hub uses hf_xet.upload_files() internally which is deprecated
in hf_xet
+ "ignore::DeprecationWarning:huggingface_hub",
]
[dependency-groups]
@@ -68,4 +70,5 @@ dev = [
"pyiceberg[sql-sqlite]>=0.11",
"pyarrow>=17",
"fastavro>=1.11.1",
+ "huggingface_hub>=0.20",
]
diff --git a/bindings/python/tests/test_huggingface_and_cdc.py
b/bindings/python/tests/test_huggingface_and_cdc.py
new file mode 100644
index 000000000..7e69ffe67
--- /dev/null
+++ b/bindings/python/tests/test_huggingface_and_cdc.py
@@ -0,0 +1,194 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Tests for HuggingFace Hub URI support and CDC (content-defined chunking)
options.
+
+CDC options are standard Iceberg table properties and work in both Rust and
PyIceberg
+automatically — no special API is required beyond setting string properties.
+
+HF credentials are passed as file_io_properties to IcebergDataFusionTable.
+Tests requiring live HF credentials are skipped when HF_TOKEN or HF_DATASET is
not set.
+"""
+
+import os
+import pytest
+import pyarrow as pa
+import datafusion
+from datafusion import SessionContext
+from packaging.version import Version
+from pyiceberg.catalog import load_catalog
+from pyiceberg_core.datafusion import IcebergDataFusionTable
+
+requires_datafusion_53 = pytest.mark.skipif(
+ Version(datafusion.__version__) < Version("53.0.0"),
+ reason="IcebergDataFusionTable requires datafusion>=53 for FFI
compatibility",
+)
+
+
+# ---------------------------------------------------------------------------
+# CDC tests — run without any external credentials
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture(scope="module")
+def local_catalog(tmp_path_factory: pytest.TempPathFactory):
+ warehouse = tmp_path_factory.mktemp("cdc_warehouse")
+ return load_catalog(
+ "default",
+ **{
+ "uri": f"sqlite:///{warehouse}/pyiceberg_catalog.db",
+ "warehouse": f"file://{warehouse}",
+ },
+ )
+
+
+@pytest.fixture(scope="module")
+def sample_table() -> pa.Table:
+ return pa.table(
+ {
+ "id": pa.array(list(range(1000)), type=pa.int32()),
+ "payload": pa.array(
+ [f"row-{i:06d}" for i in range(1000)], type=pa.large_utf8()
+ ),
+ }
+ )
+
+
+def test_cdc_table_properties_are_persisted(local_catalog, sample_table):
+ """Table properties with CDC options are stored and returned as-is."""
+ local_catalog.create_namespace_if_not_exists("cdc_ns")
+
+ # Use values that differ from parquet defaults (256 KiB min, 1 MiB max, 0
norm).
+ tbl = local_catalog.create_table_if_not_exists(
+ "cdc_ns.cdc_persist",
+ schema=sample_table.schema,
+ properties={
+ "write.parquet.content-defined-chunking.min-chunk-size": "65536",
+ "write.parquet.content-defined-chunking.max-chunk-size": "524288",
+ "write.parquet.content-defined-chunking.norm-level": "2",
+ },
+ )
+
+ props = tbl.properties
+ assert props.get("write.parquet.content-defined-chunking.min-chunk-size")
== "65536"
+ assert (
+ props.get("write.parquet.content-defined-chunking.max-chunk-size") ==
"524288"
+ )
+ assert props.get("write.parquet.content-defined-chunking.norm-level") ==
"2"
+
+
+def test_cdc_write_via_pyiceberg(local_catalog, sample_table):
+ """PyIceberg tbl.append() writes parquet with CDC options when properties
are set."""
+ local_catalog.create_namespace_if_not_exists("cdc_ns")
+
+ tbl = local_catalog.create_table_if_not_exists(
+ "cdc_ns.cdc_pyiceberg_write",
+ schema=sample_table.schema,
+ properties={"write.parquet.content-defined-chunking.enabled": "true"},
+ )
+ tbl.append(sample_table)
+
+ result = tbl.scan().to_arrow()
+ assert len(result) == len(sample_table)
+
+
+@requires_datafusion_53
+def test_cdc_write_and_read_via_datafusion(local_catalog, sample_table):
+ """A table with CDC properties can be written and read back via
DataFusion."""
+ local_catalog.create_namespace_if_not_exists("cdc_ns")
+
+ tbl = local_catalog.create_table_if_not_exists(
+ "cdc_ns.cdc_write_read",
+ schema=sample_table.schema,
+ properties={"write.parquet.content-defined-chunking.enabled": "true"},
+ )
+ tbl.append(sample_table)
+
+ provider = IcebergDataFusionTable(
+ identifier=tbl.name(),
+ metadata_location=tbl.metadata_location,
+ file_io_properties=tbl.io.properties,
+ )
+
+ ctx = SessionContext()
+ ctx.register_table("cdc_table", provider)
+ assert ctx.table("cdc_table").count() == len(sample_table)
+
+
+# ---------------------------------------------------------------------------
+# HF + CDC tests — skipped when HF_TOKEN or HF_DATASET is not set
+# ---------------------------------------------------------------------------
+
+requires_hf = pytest.mark.skipif(
+ not os.environ.get("HF_TOKEN") or not os.environ.get("HF_DATASET"),
+ reason="HF_TOKEN or HF_DATASET not set",
+)
+
+
+@pytest.fixture(scope="module")
+def hf_cdc_table(sample_table):
+ """Write a CDC-enabled Iceberg table to HF Hub once; share across HF tests.
+
+ Uses FsspecFileIO backed by huggingface_hub's HfFileSystem (hf:// in
fsspec).
+ HF_TOKEN is read from the environment automatically by HfFileSystem.
+ """
+ token = os.environ["HF_TOKEN"]
+ dataset = os.environ["HF_DATASET"]
+
+ warehouse = f"hf://datasets/{dataset}/iceberg-ci-{os.getpid()}"
+ catalog = load_catalog(
+ "hf_test",
+ **{
+ "uri": "sqlite:///:memory:",
+ "warehouse": warehouse,
+ "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO",
+ },
+ )
+ catalog.create_namespace("ns")
+ tbl = catalog.create_table(
+ "ns.cdc_tbl",
+ schema=sample_table.schema,
+ properties={"write.parquet.content-defined-chunking.enabled": "true"},
+ )
+ tbl.append(sample_table)
+ # HfFileSystem.dircache may reflect the pre-write state; invalidate it so
+ # subsequent reads (info/open) see the files just uploaded via xet.
+ tbl.io.get_fs("hf").invalidate_cache()
+ return tbl, token
+
+
+@requires_hf
+def test_hf_cdc_write_and_read_via_pyarrow(hf_cdc_table, sample_table):
+ """PyIceberg writes CDC parquet to HF Hub; PyArrow scan reads it back."""
+ tbl, _ = hf_cdc_table
+ result = tbl.scan().to_arrow()
+ assert len(result) == len(sample_table)
+
+
+@requires_hf
+@requires_datafusion_53
+def test_hf_cdc_write_and_read_via_datafusion(hf_cdc_table, sample_table):
+ """PyIceberg writes CDC parquet to HF Hub; IcebergDataFusionTable reads it
back via opendal-hf."""
+ tbl, token = hf_cdc_table
+ provider = IcebergDataFusionTable(
+ identifier=tbl.name(),
+ metadata_location=tbl.metadata_location,
+ file_io_properties={"hf.token": token},
+ )
+ ctx = SessionContext()
+ ctx.register_table("hf_table", provider)
+ assert ctx.table("hf_table").count() == len(sample_table)
diff --git a/bindings/python/uv.lock b/bindings/python/uv.lock
index c346e3245..26a9f5df3 100644
--- a/bindings/python/uv.lock
+++ b/bindings/python/uv.lock
@@ -6,6 +6,15 @@ resolution-markers = [
"python_full_version < '3.14'",
]
+[[package]]
+name = "annotated-doc"
+version = "0.0.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url =
"https://files.pythonhosted.org/packages/57/ba/046ceea27344560984e26a590f90bc7f4a75b06701f653222458922b558c/annotated_doc-0.0.4.tar.gz",
hash =
"sha256:fbcda96e87e9c92ad167c2e53839e57503ecfda18804ea28102353485033faa4", size
= 7288, upload-time = "2025-11-10T22:07:42.062Z" }
+wheels = [
+ { url =
"https://files.pythonhosted.org/packages/1e/d3/26bf1008eb3d2daa8ef4cacc7f3bfdc11818d111f7e2d0201bc6e3b49d45/annotated_doc-0.0.4-py3-none-any.whl",
hash =
"sha256:571ac1dc6991c450b25a9c2d84a3705e2ae7a53467b5d111c24fa8baabbed320", size
= 5303, upload-time = "2025-11-10T22:07:40.673Z" },
+]
+
[[package]]
name = "annotated-types"
version = "0.7.0"
@@ -15,6 +24,20 @@ wheels = [
{ url =
"https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl",
hash =
"sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size
= 13643, upload-time = "2024-05-20T21:33:24.1Z" },
]
+[[package]]
+name = "anyio"
+version = "4.13.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "exceptiongroup", marker = "python_full_version < '3.11'" },
+ { name = "idna" },
+ { name = "typing-extensions", marker = "python_full_version < '3.13'" },
+]
+sdist = { url =
"https://files.pythonhosted.org/packages/19/14/2c5dd9f512b66549ae92767a9c7b330ae88e1932ca57876909410251fe13/anyio-4.13.0.tar.gz",
hash =
"sha256:334b70e641fd2221c1505b3890c69882fe4a2df910cba14d97019b90b24439dc", size
= 231622, upload-time = "2026-03-24T12:59:09.671Z" }
+wheels = [
+ { url =
"https://files.pythonhosted.org/packages/da/42/e921fccf5015463e32a3cf6ee7f980a6ed0f395ceeaa45060b61d86486c2/anyio-4.13.0-py3-none-any.whl",
hash =
"sha256:08b310f9e24a9594186fd75b4f73f4a4152069e3853f1ed8bfbf58369f4ad708", size
= 114353, upload-time = "2026-03-24T12:59:08.246Z" },
+]
+
[[package]]
name = "cachetools"
version = "6.2.6"
@@ -240,6 +263,15 @@ wheels = [
{ url =
"https://files.pythonhosted.org/packages/32/f1/f21bd5319113e89ceceed2df840df21e9c5150d181db74b6ba80400f9f48/fastavro-1.12.2-cp314-cp314t-musllinux_1_2_x86_64.whl",
hash =
"sha256:afede7324822800e4f90e96b9514188a237a60f35e8e7a10b2129c10c78f6e4d", size
= 3356664, upload-time = "2026-04-24T14:37:34.231Z" },
]
+[[package]]
+name = "filelock"
+version = "3.29.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url =
"https://files.pythonhosted.org/packages/b5/fe/997687a931ab51049acce6fa1f23e8f01216374ea81374ddee763c493db5/filelock-3.29.0.tar.gz",
hash =
"sha256:69974355e960702e789734cb4871f884ea6fe50bd8404051a3530bc07809cf90", size
= 57571, upload-time = "2026-04-19T15:39:10.068Z" }
+wheels = [
+ { url =
"https://files.pythonhosted.org/packages/81/47/dd9a212ef6e343a6857485ffe25bba537304f1913bdbed446a23f7f592e1/filelock-3.29.0-py3-none-any.whl",
hash =
"sha256:96f5f6344709aa1572bbf631c640e4ebeeb519e08da902c39a001882f30ac258", size
= 39812, upload-time = "2026-04-19T15:39:08.752Z" },
+]
+
[[package]]
name = "fsspec"
version = "2026.4.0"
@@ -303,6 +335,95 @@ wheels = [
{ url =
"https://files.pythonhosted.org/packages/15/32/77ee8a6c1564fc345a491a4e85b3bf360e4cf26eac98c4532d2fdb96e01f/greenlet-3.5.0-cp314-cp314t-win_amd64.whl",
hash =
"sha256:d60097128cb0a1cab9ea541186ea13cd7b847b8449a7787c2e2350da0cb82d86", size
= 245324, upload-time = "2026-04-27T12:24:40.295Z" },
]
+[[package]]
+name = "h11"
+version = "0.16.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url =
"https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz",
hash =
"sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size
= 101250, upload-time = "2025-04-24T03:35:25.427Z" }
+wheels = [
+ { url =
"https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl",
hash =
"sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size
= 37515, upload-time = "2025-04-24T03:35:24.344Z" },
+]
+
+[[package]]
+name = "hf-xet"
+version = "1.5.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url =
"https://files.pythonhosted.org/packages/74/d8/5c06fc76461418326a7decf8367480c35be11a41fd938633929c60a9ec6b/hf_xet-1.5.0.tar.gz",
hash =
"sha256:e0fb0a34d9f406eed88233e829a67ec016bec5af19e480eac65a233ea289a948", size
= 837196, upload-time = "2026-05-06T06:18:15.583Z" }
+wheels = [
+ { url =
"https://files.pythonhosted.org/packages/68/9b/6912c99070915a4f28119e3c5b52a9abd1eec0ad5cb293b8c967a0c6f5a2/hf_xet-1.5.0-cp313-cp313t-macosx_10_12_x86_64.whl",
hash =
"sha256:7d70fe2ce97b9db73b9c9b9c81fe3693640aec83416a966c446afea54acfae3c", size
= 4023383, upload-time = "2026-05-06T06:17:53.947Z" },
+ { url =
"https://files.pythonhosted.org/packages/0f/6d/9563cfde59b5d8128a9c7ec972a087f4c782e4f7bac5a85234edfd5d5e49/hf_xet-1.5.0-cp313-cp313t-macosx_11_0_arm64.whl",
hash =
"sha256:73a0dae8c71de3b0633a45c73f4a4a5ed09e94b43441d82981a781d4f12baa42", size
= 3792751, upload-time = "2026-05-06T06:17:51.791Z" },
+ { url =
"https://files.pythonhosted.org/packages/07/a5/ed5a0cf35b49a0571af5a8f53416dad1877a718c021c9937c3a53cb45781/hf_xet-1.5.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl",
hash =
"sha256:a60290ec57e9b71767fba7c3645ddafdd0759974b540441510c629c6db6db24a", size
= 4456058, upload-time = "2026-05-06T06:17:40.735Z" },
+ { url =
"https://files.pythonhosted.org/packages/60/fb/3ae8bf2a7a37a4197d0195d7247fd25b3952e15cb8a599e285dfaa6f52b3/hf_xet-1.5.0-cp313-cp313t-manylinux_2_28_aarch64.whl",
hash =
"sha256:e5de0f6deada0dada870bb376a11bcd1f08abf3a968a6d118f33e72d1b1eb480", size
= 4250783, upload-time = "2026-05-06T06:17:38.412Z" },
+ { url =
"https://files.pythonhosted.org/packages/a2/9b/8bae40d4d91525085137196e84eb0ed49cf65b5e96e5c3ecdadd8bd0fac2/hf_xet-1.5.0-cp313-cp313t-musllinux_1_2_aarch64.whl",
hash =
"sha256:c799d49f1a5544a0ef7591c0ee75e0d6b93d6f56dc7a4979f59f7518d2872216", size
= 4445594, upload-time = "2026-05-06T06:18:04.219Z" },
+ { url =
"https://files.pythonhosted.org/packages/13/59/c74efbbd4e8728172b2cc72a2bc014d2947a4b7bdced932fbd3f5da1a4e5/hf_xet-1.5.0-cp313-cp313t-musllinux_1_2_x86_64.whl",
hash =
"sha256:2baea1b0b989e5c152fe81425f7745ddc8901280ba3d97c98d8cdece7b706c60", size
= 4663995, upload-time = "2026-05-06T06:18:06.1Z" },
+ { url =
"https://files.pythonhosted.org/packages/73/32/8e1e0410af64cda9b139d1dcebdc993a8ff9c8c7c0e2696ae356d75ccc0d/hf_xet-1.5.0-cp313-cp313t-win_amd64.whl",
hash =
"sha256:526345b3ed45f374f6317349df489167606736c876241ba984105afe7fd4839d", size
= 3966608, upload-time = "2026-05-06T06:18:19.74Z" },
+ { url =
"https://files.pythonhosted.org/packages/fc/34/a8febc8f4edbea8b3e21b02ebc8b628679b84ba7e45cde624a7736b51500/hf_xet-1.5.0-cp313-cp313t-win_arm64.whl",
hash =
"sha256:786d28e2eb8315d5035544b9d137b4a842d600c434bb91bf7d0d953cce906ad4", size
= 3796946, upload-time = "2026-05-06T06:18:17.568Z" },
+ { url =
"https://files.pythonhosted.org/packages/2a/20/8fc8996afe5815fa1a6be8e9e5c02f24500f409d599e905800d498a4e14d/hf_xet-1.5.0-cp314-cp314t-macosx_10_12_x86_64.whl",
hash =
"sha256:872d5601e6deea30d15865ede55d29eac6daf5a534ab417b99b6ef6b076dd96c", size
= 4023495, upload-time = "2026-05-06T06:18:01.94Z" },
+ { url =
"https://files.pythonhosted.org/packages/32/6a/93d84463c00cecb561a7508aa6303e35ee2894294eac14245526924415fe/hf_xet-1.5.0-cp314-cp314t-macosx_11_0_arm64.whl",
hash =
"sha256:9929561f5abf4581c8ea79587881dfef6b8abb2a0d8a51915936fc2a614f4e73", size
= 3792731, upload-time = "2026-05-06T06:18:00.021Z" },
+ { url =
"https://files.pythonhosted.org/packages/9d/5a/8ec8e0c863b382d00b3c2e2af6ded6b06371be617144a625903a6d562f4b/hf_xet-1.5.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl",
hash =
"sha256:f7b7bbae318e583a86fb21e5a4a175d6721d628a2874f4bd022d0e660c32a682", size
= 4456738, upload-time = "2026-05-06T06:17:49.574Z" },
+ { url =
"https://files.pythonhosted.org/packages/c5/ca/f7effa1a67717da2bcc6b6c28f71c6ca648c77acaec4e2c32f40cbe16d85/hf_xet-1.5.0-cp314-cp314t-manylinux_2_28_aarch64.whl",
hash =
"sha256:cf7b2dc6f31a4ea754bb50f74cde482dcf5d366d184076d8530b9872787f3761", size
= 4251622, upload-time = "2026-05-06T06:17:47.096Z" },
+ { url =
"https://files.pythonhosted.org/packages/65/f2/19247dba3e231cf77dec59ddfb878f00057635ff773d099c9b59d37812c3/hf_xet-1.5.0-cp314-cp314t-musllinux_1_2_aarch64.whl",
hash =
"sha256:8dbcbab554c9ef158ef2c991545c3e970ddd8cc7acdcd0a78c5a41095dab4ded", size
= 4445667, upload-time = "2026-05-06T06:18:11.983Z" },
+ { url =
"https://files.pythonhosted.org/packages/7f/64/6f116801a3bcfb6f59f5c251f48cadc47ea54026441c4a385079286a94fa/hf_xet-1.5.0-cp314-cp314t-musllinux_1_2_x86_64.whl",
hash =
"sha256:5906bf7718d3636dc13402914736abe723492cb730f744834f5f5b67d3a12702", size
= 4664619, upload-time = "2026-05-06T06:18:13.771Z" },
+ { url =
"https://files.pythonhosted.org/packages/5c/e8/069542d37946ed08669b127e1496fa99e78196d71de8d41eda5e9f1b7a58/hf_xet-1.5.0-cp314-cp314t-win_amd64.whl",
hash =
"sha256:5f3dc2248fc01cc0a00cd392ab497f1ca373fcbc7e3f2da1f452480b384e839e", size
= 3966802, upload-time = "2026-05-06T06:18:28.162Z" },
+ { url =
"https://files.pythonhosted.org/packages/f9/91/fc6fdec27b14d04e88c386ac0a0129732b53fa23f7c4a78f4b83a039c567/hf_xet-1.5.0-cp314-cp314t-win_arm64.whl",
hash =
"sha256:b285cea1b5bab46b758772716ba8d6854a1a0310fed1c249d678a8b38601e5a0", size
= 3797168, upload-time = "2026-05-06T06:18:26.287Z" },
+ { url =
"https://files.pythonhosted.org/packages/3d/fb/69ff198a82cae7eb1a69fb84d93b3a3e4816564d76817fe541ddc96874eb/hf_xet-1.5.0-cp37-abi3-macosx_10_12_x86_64.whl",
hash =
"sha256:dad0dc84e941b8ba3c860659fe1fdc35c049d47cce293f003287757e971a8f56", size
= 4030814, upload-time = "2026-05-06T06:17:57.933Z" },
+ { url =
"https://files.pythonhosted.org/packages/9b/ff/edcc2b40162bef3ff78e14ab637e5f3b89243d6aee72f5949d3bb6a5af83/hf_xet-1.5.0-cp37-abi3-macosx_11_0_arm64.whl",
hash =
"sha256:fd6e5a9b0fdac4ed03ed45ef79254a655b1aaab514a02202617fbf643f5fdf7a", size
= 3798444, upload-time = "2026-05-06T06:17:55.79Z" },
+ { url =
"https://files.pythonhosted.org/packages/49/4d/103f76b04310e5e57656696cc184690d20c466af0bca3ca88f8c8ea5d4f3/hf_xet-1.5.0-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl",
hash =
"sha256:3531b1823a0e6d77d80f9ed15ca0e00f0d115094f8ac033d5cae88f4564cc949", size
= 4465986, upload-time = "2026-05-06T06:17:44.886Z" },
+ { url =
"https://files.pythonhosted.org/packages/c4/a2/546f47f464737b3edbab6f8ddb57f2599b93d2cbb66f06abb475ccb48651/hf_xet-1.5.0-cp37-abi3-manylinux_2_28_aarch64.whl",
hash =
"sha256:9a0ee58cd18d5ea799f7ed11290bbccbe56bdd8b1d97ca74b9cc49a3945d7a3b", size
= 4259865, upload-time = "2026-05-06T06:17:42.639Z" },
+ { url =
"https://files.pythonhosted.org/packages/95/7f/1be593c1f28613be2e196473481cd81bfc5910795e30a34e8f744f6cac4f/hf_xet-1.5.0-cp37-abi3-musllinux_1_2_aarch64.whl",
hash =
"sha256:1e60df5a42e9bed8628b6416af2cba4cba57ae9f02de226a06b020d98e1aab18", size
= 4459835, upload-time = "2026-05-06T06:18:08.026Z" },
+ { url =
"https://files.pythonhosted.org/packages/aa/b2/703569fc881f3284487e68cda7b42179978480da3c438042a6bbbb4a671c/hf_xet-1.5.0-cp37-abi3-musllinux_1_2_x86_64.whl",
hash =
"sha256:4b35549ce62601b84da4ff9b24d970032ace3d4430f52d91bcbb26c901d6c690", size
= 4672414, upload-time = "2026-05-06T06:18:09.864Z" },
+ { url =
"https://files.pythonhosted.org/packages/af/37/1b6def445c567286b50aa3b33828158e135b1be44938dde59f11382a500c/hf_xet-1.5.0-cp37-abi3-win_amd64.whl",
hash =
"sha256:2806c7c17b4d23f8d88f7c4814f838c3b6150773fe339c20af23e1cfaf2797e4", size
= 3977238, upload-time = "2026-05-06T06:18:23.621Z" },
+ { url =
"https://files.pythonhosted.org/packages/62/94/3b66b148778ee100dcfd69c2ca22b57b41b44d3063ceec934f209e9184ce/hf_xet-1.5.0-cp37-abi3-win_arm64.whl",
hash =
"sha256:b6c9df403040248c76d808d3e047d64db2d923bae593eb244c41e425cf6cd7be", size
= 3806916, upload-time = "2026-05-06T06:18:21.7Z" },
+]
+
+[[package]]
+name = "httpcore"
+version = "1.0.9"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "certifi" },
+ { name = "h11" },
+]
+sdist = { url =
"https://files.pythonhosted.org/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz",
hash =
"sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size
= 85484, upload-time = "2025-04-24T22:06:22.219Z" }
+wheels = [
+ { url =
"https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl",
hash =
"sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size
= 78784, upload-time = "2025-04-24T22:06:20.566Z" },
+]
+
+[[package]]
+name = "httpx"
+version = "0.28.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "anyio" },
+ { name = "certifi" },
+ { name = "httpcore" },
+ { name = "idna" },
+]
+sdist = { url =
"https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz",
hash =
"sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size
= 141406, upload-time = "2024-12-06T15:37:23.222Z" }
+wheels = [
+ { url =
"https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl",
hash =
"sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size
= 73517, upload-time = "2024-12-06T15:37:21.509Z" },
+]
+
+[[package]]
+name = "huggingface-hub"
+version = "1.15.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "filelock" },
+ { name = "fsspec" },
+ { name = "hf-xet", marker = "platform_machine == 'AMD64' or
platform_machine == 'aarch64' or platform_machine == 'amd64' or
platform_machine == 'arm64' or platform_machine == 'x86_64'" },
+ { name = "httpx" },
+ { name = "packaging" },
+ { name = "pyyaml" },
+ { name = "tqdm" },
+ { name = "typer" },
+ { name = "typing-extensions" },
+]
+sdist = { url =
"https://files.pythonhosted.org/packages/bb/b6/e22bd20a25299c34b8c5922c1545a6320825b13906eb0f7298edfd034a0b/huggingface_hub-1.15.0.tar.gz",
hash =
"sha256:28abfdddda3927fd4de6a63cf26ab012498a2c24dae52baf150c5c6edf98a1d5", size
= 784100, upload-time = "2026-05-15T11:42:52.149Z" }
+wheels = [
+ { url =
"https://files.pythonhosted.org/packages/6e/11/0b64cc9024329b76d7547c19a67604a61d21d3ba678a69d1b220c29d5112/huggingface_hub-1.15.0-py3-none-any.whl",
hash =
"sha256:a4a59af04cbc41a3fe3fec429b171ef994ef8c971eda10136746f408dd4e3744", size
= 663602, upload-time = "2026-05-15T11:42:50.487Z" },
+]
+
[[package]]
name = "idna"
version = "3.15"
@@ -758,6 +879,7 @@ source = { editable = "." }
dev = [
{ name = "datafusion" },
{ name = "fastavro" },
+ { name = "huggingface-hub" },
{ name = "maturin" },
{ name = "pyarrow" },
{ name = "pyiceberg", extra = ["sql-sqlite"] },
@@ -770,6 +892,7 @@ dev = [
dev = [
{ name = "datafusion", specifier = "==52.*" },
{ name = "fastavro", specifier = ">=1.11.1" },
+ { name = "huggingface-hub", specifier = ">=0.20" },
{ name = "maturin", specifier = ">=1.0,<2.0" },
{ name = "pyarrow", specifier = ">=17" },
{ name = "pyiceberg", extras = ["sql-sqlite"], specifier = ">=0.11" },
@@ -895,6 +1018,70 @@ wheels = [
{ url =
"https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl",
hash =
"sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size
= 229892, upload-time = "2024-03-01T18:36:18.57Z" },
]
+[[package]]
+name = "pyyaml"
+version = "6.0.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url =
"https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz",
hash =
"sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size
= 130960, upload-time = "2025-09-25T21:33:16.546Z" }
+wheels = [
+ { url =
"https://files.pythonhosted.org/packages/f4/a0/39350dd17dd6d6c6507025c0e53aef67a9293a6d37d3511f23ea510d5800/pyyaml-6.0.3-cp310-cp310-macosx_10_13_x86_64.whl",
hash =
"sha256:214ed4befebe12df36bcc8bc2b64b396ca31be9304b8f59e25c11cf94a4c033b", size
= 184227, upload-time = "2025-09-25T21:31:46.04Z" },
+ { url =
"https://files.pythonhosted.org/packages/05/14/52d505b5c59ce73244f59c7a50ecf47093ce4765f116cdb98286a71eeca2/pyyaml-6.0.3-cp310-cp310-macosx_11_0_arm64.whl",
hash =
"sha256:02ea2dfa234451bbb8772601d7b8e426c2bfa197136796224e50e35a78777956", size
= 174019, upload-time = "2025-09-25T21:31:47.706Z" },
+ { url =
"https://files.pythonhosted.org/packages/43/f7/0e6a5ae5599c838c696adb4e6330a59f463265bfa1e116cfd1fbb0abaaae/pyyaml-6.0.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl",
hash =
"sha256:b30236e45cf30d2b8e7b3e85881719e98507abed1011bf463a8fa23e9c3e98a8", size
= 740646, upload-time = "2025-09-25T21:31:49.21Z" },
+ { url =
"https://files.pythonhosted.org/packages/2f/3a/61b9db1d28f00f8fd0ae760459a5c4bf1b941baf714e207b6eb0657d2578/pyyaml-6.0.3-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl",
hash =
"sha256:66291b10affd76d76f54fad28e22e51719ef9ba22b29e1d7d03d6777a9174198", size
= 840793, upload-time = "2025-09-25T21:31:50.735Z" },
+ { url =
"https://files.pythonhosted.org/packages/7a/1e/7acc4f0e74c4b3d9531e24739e0ab832a5edf40e64fbae1a9c01941cabd7/pyyaml-6.0.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl",
hash =
"sha256:9c7708761fccb9397fe64bbc0395abcae8c4bf7b0eac081e12b809bf47700d0b", size
= 770293, upload-time = "2025-09-25T21:31:51.828Z" },
+ { url =
"https://files.pythonhosted.org/packages/8b/ef/abd085f06853af0cd59fa5f913d61a8eab65d7639ff2a658d18a25d6a89d/pyyaml-6.0.3-cp310-cp310-musllinux_1_2_aarch64.whl",
hash =
"sha256:418cf3f2111bc80e0933b2cd8cd04f286338bb88bdc7bc8e6dd775ebde60b5e0", size
= 732872, upload-time = "2025-09-25T21:31:53.282Z" },
+ { url =
"https://files.pythonhosted.org/packages/1f/15/2bc9c8faf6450a8b3c9fc5448ed869c599c0a74ba2669772b1f3a0040180/pyyaml-6.0.3-cp310-cp310-musllinux_1_2_x86_64.whl",
hash =
"sha256:5e0b74767e5f8c593e8c9b5912019159ed0533c70051e9cce3e8b6aa699fcd69", size
= 758828, upload-time = "2025-09-25T21:31:54.807Z" },
+ { url =
"https://files.pythonhosted.org/packages/a3/00/531e92e88c00f4333ce359e50c19b8d1de9fe8d581b1534e35ccfbc5f393/pyyaml-6.0.3-cp310-cp310-win32.whl",
hash =
"sha256:28c8d926f98f432f88adc23edf2e6d4921ac26fb084b028c733d01868d19007e", size
= 142415, upload-time = "2025-09-25T21:31:55.885Z" },
+ { url =
"https://files.pythonhosted.org/packages/2a/fa/926c003379b19fca39dd4634818b00dec6c62d87faf628d1394e137354d4/pyyaml-6.0.3-cp310-cp310-win_amd64.whl",
hash =
"sha256:bdb2c67c6c1390b63c6ff89f210c8fd09d9a1217a465701eac7316313c915e4c", size
= 158561, upload-time = "2025-09-25T21:31:57.406Z" },
+ { url =
"https://files.pythonhosted.org/packages/6d/16/a95b6757765b7b031c9374925bb718d55e0a9ba8a1b6a12d25962ea44347/pyyaml-6.0.3-cp311-cp311-macosx_10_13_x86_64.whl",
hash =
"sha256:44edc647873928551a01e7a563d7452ccdebee747728c1080d881d68af7b997e", size
= 185826, upload-time = "2025-09-25T21:31:58.655Z" },
+ { url =
"https://files.pythonhosted.org/packages/16/19/13de8e4377ed53079ee996e1ab0a9c33ec2faf808a4647b7b4c0d46dd239/pyyaml-6.0.3-cp311-cp311-macosx_11_0_arm64.whl",
hash =
"sha256:652cb6edd41e718550aad172851962662ff2681490a8a711af6a4d288dd96824", size
= 175577, upload-time = "2025-09-25T21:32:00.088Z" },
+ { url =
"https://files.pythonhosted.org/packages/0c/62/d2eb46264d4b157dae1275b573017abec435397aa59cbcdab6fc978a8af4/pyyaml-6.0.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl",
hash =
"sha256:10892704fc220243f5305762e276552a0395f7beb4dbf9b14ec8fd43b57f126c", size
= 775556, upload-time = "2025-09-25T21:32:01.31Z" },
+ { url =
"https://files.pythonhosted.org/packages/10/cb/16c3f2cf3266edd25aaa00d6c4350381c8b012ed6f5276675b9eba8d9ff4/pyyaml-6.0.3-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl",
hash =
"sha256:850774a7879607d3a6f50d36d04f00ee69e7fc816450e5f7e58d7f17f1ae5c00", size
= 882114, upload-time = "2025-09-25T21:32:03.376Z" },
+ { url =
"https://files.pythonhosted.org/packages/71/60/917329f640924b18ff085ab889a11c763e0b573da888e8404ff486657602/pyyaml-6.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl",
hash =
"sha256:b8bb0864c5a28024fac8a632c443c87c5aa6f215c0b126c449ae1a150412f31d", size
= 806638, upload-time = "2025-09-25T21:32:04.553Z" },
+ { url =
"https://files.pythonhosted.org/packages/dd/6f/529b0f316a9fd167281a6c3826b5583e6192dba792dd55e3203d3f8e655a/pyyaml-6.0.3-cp311-cp311-musllinux_1_2_aarch64.whl",
hash =
"sha256:1d37d57ad971609cf3c53ba6a7e365e40660e3be0e5175fa9f2365a379d6095a", size
= 767463, upload-time = "2025-09-25T21:32:06.152Z" },
+ { url =
"https://files.pythonhosted.org/packages/f2/6a/b627b4e0c1dd03718543519ffb2f1deea4a1e6d42fbab8021936a4d22589/pyyaml-6.0.3-cp311-cp311-musllinux_1_2_x86_64.whl",
hash =
"sha256:37503bfbfc9d2c40b344d06b2199cf0e96e97957ab1c1b546fd4f87e53e5d3e4", size
= 794986, upload-time = "2025-09-25T21:32:07.367Z" },
+ { url =
"https://files.pythonhosted.org/packages/45/91/47a6e1c42d9ee337c4839208f30d9f09caa9f720ec7582917b264defc875/pyyaml-6.0.3-cp311-cp311-win32.whl",
hash =
"sha256:8098f252adfa6c80ab48096053f512f2321f0b998f98150cea9bd23d83e1467b", size
= 142543, upload-time = "2025-09-25T21:32:08.95Z" },
+ { url =
"https://files.pythonhosted.org/packages/da/e3/ea007450a105ae919a72393cb06f122f288ef60bba2dc64b26e2646fa315/pyyaml-6.0.3-cp311-cp311-win_amd64.whl",
hash =
"sha256:9f3bfb4965eb874431221a3ff3fdcddc7e74e3b07799e0e84ca4a0f867d449bf", size
= 158763, upload-time = "2025-09-25T21:32:09.96Z" },
+ { url =
"https://files.pythonhosted.org/packages/d1/33/422b98d2195232ca1826284a76852ad5a86fe23e31b009c9886b2d0fb8b2/pyyaml-6.0.3-cp312-cp312-macosx_10_13_x86_64.whl",
hash =
"sha256:7f047e29dcae44602496db43be01ad42fc6f1cc0d8cd6c83d342306c32270196", size
= 182063, upload-time = "2025-09-25T21:32:11.445Z" },
+ { url =
"https://files.pythonhosted.org/packages/89/a0/6cf41a19a1f2f3feab0e9c0b74134aa2ce6849093d5517a0c550fe37a648/pyyaml-6.0.3-cp312-cp312-macosx_11_0_arm64.whl",
hash =
"sha256:fc09d0aa354569bc501d4e787133afc08552722d3ab34836a80547331bb5d4a0", size
= 173973, upload-time = "2025-09-25T21:32:12.492Z" },
+ { url =
"https://files.pythonhosted.org/packages/ed/23/7a778b6bd0b9a8039df8b1b1d80e2e2ad78aa04171592c8a5c43a56a6af4/pyyaml-6.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl",
hash =
"sha256:9149cad251584d5fb4981be1ecde53a1ca46c891a79788c0df828d2f166bda28", size
= 775116, upload-time = "2025-09-25T21:32:13.652Z" },
+ { url =
"https://files.pythonhosted.org/packages/65/30/d7353c338e12baef4ecc1b09e877c1970bd3382789c159b4f89d6a70dc09/pyyaml-6.0.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl",
hash =
"sha256:5fdec68f91a0c6739b380c83b951e2c72ac0197ace422360e6d5a959d8d97b2c", size
= 844011, upload-time = "2025-09-25T21:32:15.21Z" },
+ { url =
"https://files.pythonhosted.org/packages/8b/9d/b3589d3877982d4f2329302ef98a8026e7f4443c765c46cfecc8858c6b4b/pyyaml-6.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl",
hash =
"sha256:ba1cc08a7ccde2d2ec775841541641e4548226580ab850948cbfda66a1befcdc", size
= 807870, upload-time = "2025-09-25T21:32:16.431Z" },
+ { url =
"https://files.pythonhosted.org/packages/05/c0/b3be26a015601b822b97d9149ff8cb5ead58c66f981e04fedf4e762f4bd4/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_aarch64.whl",
hash =
"sha256:8dc52c23056b9ddd46818a57b78404882310fb473d63f17b07d5c40421e47f8e", size
= 761089, upload-time = "2025-09-25T21:32:17.56Z" },
+ { url =
"https://files.pythonhosted.org/packages/be/8e/98435a21d1d4b46590d5459a22d88128103f8da4c2d4cb8f14f2a96504e1/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_x86_64.whl",
hash =
"sha256:41715c910c881bc081f1e8872880d3c650acf13dfa8214bad49ed4cede7c34ea", size
= 790181, upload-time = "2025-09-25T21:32:18.834Z" },
+ { url =
"https://files.pythonhosted.org/packages/74/93/7baea19427dcfbe1e5a372d81473250b379f04b1bd3c4c5ff825e2327202/pyyaml-6.0.3-cp312-cp312-win32.whl",
hash =
"sha256:96b533f0e99f6579b3d4d4995707cf36df9100d67e0c8303a0c55b27b5f99bc5", size
= 137658, upload-time = "2025-09-25T21:32:20.209Z" },
+ { url =
"https://files.pythonhosted.org/packages/86/bf/899e81e4cce32febab4fb42bb97dcdf66bc135272882d1987881a4b519e9/pyyaml-6.0.3-cp312-cp312-win_amd64.whl",
hash =
"sha256:5fcd34e47f6e0b794d17de1b4ff496c00986e1c83f7ab2fb8fcfe9616ff7477b", size
= 154003, upload-time = "2025-09-25T21:32:21.167Z" },
+ { url =
"https://files.pythonhosted.org/packages/1a/08/67bd04656199bbb51dbed1439b7f27601dfb576fb864099c7ef0c3e55531/pyyaml-6.0.3-cp312-cp312-win_arm64.whl",
hash =
"sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd", size
= 140344, upload-time = "2025-09-25T21:32:22.617Z" },
+ { url =
"https://files.pythonhosted.org/packages/d1/11/0fd08f8192109f7169db964b5707a2f1e8b745d4e239b784a5a1dd80d1db/pyyaml-6.0.3-cp313-cp313-macosx_10_13_x86_64.whl",
hash =
"sha256:8da9669d359f02c0b91ccc01cac4a67f16afec0dac22c2ad09f46bee0697eba8", size
= 181669, upload-time = "2025-09-25T21:32:23.673Z" },
+ { url =
"https://files.pythonhosted.org/packages/b1/16/95309993f1d3748cd644e02e38b75d50cbc0d9561d21f390a76242ce073f/pyyaml-6.0.3-cp313-cp313-macosx_11_0_arm64.whl",
hash =
"sha256:2283a07e2c21a2aa78d9c4442724ec1eb15f5e42a723b99cb3d822d48f5f7ad1", size
= 173252, upload-time = "2025-09-25T21:32:25.149Z" },
+ { url =
"https://files.pythonhosted.org/packages/50/31/b20f376d3f810b9b2371e72ef5adb33879b25edb7a6d072cb7ca0c486398/pyyaml-6.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl",
hash =
"sha256:ee2922902c45ae8ccada2c5b501ab86c36525b883eff4255313a253a3160861c", size
= 767081, upload-time = "2025-09-25T21:32:26.575Z" },
+ { url =
"https://files.pythonhosted.org/packages/49/1e/a55ca81e949270d5d4432fbbd19dfea5321eda7c41a849d443dc92fd1ff7/pyyaml-6.0.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl",
hash =
"sha256:a33284e20b78bd4a18c8c2282d549d10bc8408a2a7ff57653c0cf0b9be0afce5", size
= 841159, upload-time = "2025-09-25T21:32:27.727Z" },
+ { url =
"https://files.pythonhosted.org/packages/74/27/e5b8f34d02d9995b80abcef563ea1f8b56d20134d8f4e5e81733b1feceb2/pyyaml-6.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl",
hash =
"sha256:0f29edc409a6392443abf94b9cf89ce99889a1dd5376d94316ae5145dfedd5d6", size
= 801626, upload-time = "2025-09-25T21:32:28.878Z" },
+ { url =
"https://files.pythonhosted.org/packages/f9/11/ba845c23988798f40e52ba45f34849aa8a1f2d4af4b798588010792ebad6/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_aarch64.whl",
hash =
"sha256:f7057c9a337546edc7973c0d3ba84ddcdf0daa14533c2065749c9075001090e6", size
= 753613, upload-time = "2025-09-25T21:32:30.178Z" },
+ { url =
"https://files.pythonhosted.org/packages/3d/e0/7966e1a7bfc0a45bf0a7fb6b98ea03fc9b8d84fa7f2229e9659680b69ee3/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_x86_64.whl",
hash =
"sha256:eda16858a3cab07b80edaf74336ece1f986ba330fdb8ee0d6c0d68fe82bc96be", size
= 794115, upload-time = "2025-09-25T21:32:31.353Z" },
+ { url =
"https://files.pythonhosted.org/packages/de/94/980b50a6531b3019e45ddeada0626d45fa85cbe22300844a7983285bed3b/pyyaml-6.0.3-cp313-cp313-win32.whl",
hash =
"sha256:d0eae10f8159e8fdad514efdc92d74fd8d682c933a6dd088030f3834bc8e6b26", size
= 137427, upload-time = "2025-09-25T21:32:32.58Z" },
+ { url =
"https://files.pythonhosted.org/packages/97/c9/39d5b874e8b28845e4ec2202b5da735d0199dbe5b8fb85f91398814a9a46/pyyaml-6.0.3-cp313-cp313-win_amd64.whl",
hash =
"sha256:79005a0d97d5ddabfeeea4cf676af11e647e41d81c9a7722a193022accdb6b7c", size
= 154090, upload-time = "2025-09-25T21:32:33.659Z" },
+ { url =
"https://files.pythonhosted.org/packages/73/e8/2bdf3ca2090f68bb3d75b44da7bbc71843b19c9f2b9cb9b0f4ab7a5a4329/pyyaml-6.0.3-cp313-cp313-win_arm64.whl",
hash =
"sha256:5498cd1645aa724a7c71c8f378eb29ebe23da2fc0d7a08071d89469bf1d2defb", size
= 140246, upload-time = "2025-09-25T21:32:34.663Z" },
+ { url =
"https://files.pythonhosted.org/packages/9d/8c/f4bd7f6465179953d3ac9bc44ac1a8a3e6122cf8ada906b4f96c60172d43/pyyaml-6.0.3-cp314-cp314-macosx_10_13_x86_64.whl",
hash =
"sha256:8d1fab6bb153a416f9aeb4b8763bc0f22a5586065f86f7664fc23339fc1c1fac", size
= 181814, upload-time = "2025-09-25T21:32:35.712Z" },
+ { url =
"https://files.pythonhosted.org/packages/bd/9c/4d95bb87eb2063d20db7b60faa3840c1b18025517ae857371c4dd55a6b3a/pyyaml-6.0.3-cp314-cp314-macosx_11_0_arm64.whl",
hash =
"sha256:34d5fcd24b8445fadc33f9cf348c1047101756fd760b4dacb5c3e99755703310", size
= 173809, upload-time = "2025-09-25T21:32:36.789Z" },
+ { url =
"https://files.pythonhosted.org/packages/92/b5/47e807c2623074914e29dabd16cbbdd4bf5e9b2db9f8090fa64411fc5382/pyyaml-6.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl",
hash =
"sha256:501a031947e3a9025ed4405a168e6ef5ae3126c59f90ce0cd6f2bfc477be31b7", size
= 766454, upload-time = "2025-09-25T21:32:37.966Z" },
+ { url =
"https://files.pythonhosted.org/packages/02/9e/e5e9b168be58564121efb3de6859c452fccde0ab093d8438905899a3a483/pyyaml-6.0.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl",
hash =
"sha256:b3bc83488de33889877a0f2543ade9f70c67d66d9ebb4ac959502e12de895788", size
= 836355, upload-time = "2025-09-25T21:32:39.178Z" },
+ { url =
"https://files.pythonhosted.org/packages/88/f9/16491d7ed2a919954993e48aa941b200f38040928474c9e85ea9e64222c3/pyyaml-6.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl",
hash =
"sha256:c458b6d084f9b935061bc36216e8a69a7e293a2f1e68bf956dcd9e6cbcd143f5", size
= 794175, upload-time = "2025-09-25T21:32:40.865Z" },
+ { url =
"https://files.pythonhosted.org/packages/dd/3f/5989debef34dc6397317802b527dbbafb2b4760878a53d4166579111411e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_aarch64.whl",
hash =
"sha256:7c6610def4f163542a622a73fb39f534f8c101d690126992300bf3207eab9764", size
= 755228, upload-time = "2025-09-25T21:32:42.084Z" },
+ { url =
"https://files.pythonhosted.org/packages/d7/ce/af88a49043cd2e265be63d083fc75b27b6ed062f5f9fd6cdc223ad62f03e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_x86_64.whl",
hash =
"sha256:5190d403f121660ce8d1d2c1bb2ef1bd05b5f68533fc5c2ea899bd15f4399b35", size
= 789194, upload-time = "2025-09-25T21:32:43.362Z" },
+ { url =
"https://files.pythonhosted.org/packages/23/20/bb6982b26a40bb43951265ba29d4c246ef0ff59c9fdcdf0ed04e0687de4d/pyyaml-6.0.3-cp314-cp314-win_amd64.whl",
hash =
"sha256:4a2e8cebe2ff6ab7d1050ecd59c25d4c8bd7e6f400f5f82b96557ac0abafd0ac", size
= 156429, upload-time = "2025-09-25T21:32:57.844Z" },
+ { url =
"https://files.pythonhosted.org/packages/f4/f4/a4541072bb9422c8a883ab55255f918fa378ecf083f5b85e87fc2b4eda1b/pyyaml-6.0.3-cp314-cp314-win_arm64.whl",
hash =
"sha256:93dda82c9c22deb0a405ea4dc5f2d0cda384168e466364dec6255b293923b2f3", size
= 143912, upload-time = "2025-09-25T21:32:59.247Z" },
+ { url =
"https://files.pythonhosted.org/packages/7c/f9/07dd09ae774e4616edf6cda684ee78f97777bdd15847253637a6f052a62f/pyyaml-6.0.3-cp314-cp314t-macosx_10_13_x86_64.whl",
hash =
"sha256:02893d100e99e03eda1c8fd5c441d8c60103fd175728e23e431db1b589cf5ab3", size
= 189108, upload-time = "2025-09-25T21:32:44.377Z" },
+ { url =
"https://files.pythonhosted.org/packages/4e/78/8d08c9fb7ce09ad8c38ad533c1191cf27f7ae1effe5bb9400a46d9437fcf/pyyaml-6.0.3-cp314-cp314t-macosx_11_0_arm64.whl",
hash =
"sha256:c1ff362665ae507275af2853520967820d9124984e0f7466736aea23d8611fba", size
= 183641, upload-time = "2025-09-25T21:32:45.407Z" },
+ { url =
"https://files.pythonhosted.org/packages/7b/5b/3babb19104a46945cf816d047db2788bcaf8c94527a805610b0289a01c6b/pyyaml-6.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl",
hash =
"sha256:6adc77889b628398debc7b65c073bcb99c4a0237b248cacaf3fe8a557563ef6c", size
= 831901, upload-time = "2025-09-25T21:32:48.83Z" },
+ { url =
"https://files.pythonhosted.org/packages/8b/cc/dff0684d8dc44da4d22a13f35f073d558c268780ce3c6ba1b87055bb0b87/pyyaml-6.0.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl",
hash =
"sha256:a80cb027f6b349846a3bf6d73b5e95e782175e52f22108cfa17876aaeff93702", size
= 861132, upload-time = "2025-09-25T21:32:50.149Z" },
+ { url =
"https://files.pythonhosted.org/packages/b1/5e/f77dc6b9036943e285ba76b49e118d9ea929885becb0a29ba8a7c75e29fe/pyyaml-6.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl",
hash =
"sha256:00c4bdeba853cc34e7dd471f16b4114f4162dc03e6b7afcc2128711f0eca823c", size
= 839261, upload-time = "2025-09-25T21:32:51.808Z" },
+ { url =
"https://files.pythonhosted.org/packages/ce/88/a9db1376aa2a228197c58b37302f284b5617f56a5d959fd1763fb1675ce6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl",
hash =
"sha256:66e1674c3ef6f541c35191caae2d429b967b99e02040f5ba928632d9a7f0f065", size
= 805272, upload-time = "2025-09-25T21:32:52.941Z" },
+ { url =
"https://files.pythonhosted.org/packages/da/92/1446574745d74df0c92e6aa4a7b0b3130706a4142b2d1a5869f2eaa423c6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl",
hash =
"sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65", size
= 829923, upload-time = "2025-09-25T21:32:54.537Z" },
+ { url =
"https://files.pythonhosted.org/packages/f0/7a/1c7270340330e575b92f397352af856a8c06f230aa3e76f86b39d01b416a/pyyaml-6.0.3-cp314-cp314t-win_amd64.whl",
hash =
"sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9", size
= 174062, upload-time = "2025-09-25T21:32:55.767Z" },
+ { url =
"https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl",
hash =
"sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size
= 149341, upload-time = "2025-09-25T21:32:56.828Z" },
+]
+
[[package]]
name = "requests"
version = "2.33.1"
@@ -923,6 +1110,15 @@ wheels = [
{ url =
"https://files.pythonhosted.org/packages/b3/76/6d163cfac87b632216f71879e6b2cf17163f773ff59c00b5ff4900a80fa3/rich-14.3.4-py3-none-any.whl",
hash =
"sha256:07e7adb4690f68864777b1450859253bed81a99a31ac321ac1817b2313558952", size
= 310480, upload-time = "2026-04-11T02:57:47.484Z" },
]
+[[package]]
+name = "shellingham"
+version = "1.5.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url =
"https://files.pythonhosted.org/packages/58/15/8b3609fd3830ef7b27b655beb4b4e9c62313a4e8da8c676e142cc210d58e/shellingham-1.5.4.tar.gz",
hash =
"sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de", size
= 10310, upload-time = "2023-10-24T04:13:40.426Z" }
+wheels = [
+ { url =
"https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl",
hash =
"sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size
= 9755, upload-time = "2023-10-24T04:13:38.866Z" },
+]
+
[[package]]
name = "six"
version = "1.17.0"
@@ -1067,6 +1263,33 @@ wheels = [
{ url =
"https://files.pythonhosted.org/packages/7b/61/cceae43728b7de99d9b847560c262873a1f6c98202171fd5ed62640b494b/tomli-2.4.1-py3-none-any.whl",
hash =
"sha256:0d85819802132122da43cb86656f8d1f8c6587d54ae7dcaf30e90533028b49fe", size
= 14583, upload-time = "2026-03-25T20:22:03.012Z" },
]
+[[package]]
+name = "tqdm"
+version = "4.67.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "colorama", marker = "sys_platform == 'win32'" },
+]
+sdist = { url =
"https://files.pythonhosted.org/packages/09/a9/6ba95a270c6f1fbcd8dac228323f2777d886cb206987444e4bce66338dd4/tqdm-4.67.3.tar.gz",
hash =
"sha256:7d825f03f89244ef73f1d4ce193cb1774a8179fd96f31d7e1dcde62092b960bb", size
= 169598, upload-time = "2026-02-03T17:35:53.048Z" }
+wheels = [
+ { url =
"https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl",
hash =
"sha256:ee1e4c0e59148062281c49d80b25b67771a127c85fc9676d3be5f243206826bf", size
= 78374, upload-time = "2026-02-03T17:35:50.982Z" },
+]
+
+[[package]]
+name = "typer"
+version = "0.25.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "annotated-doc" },
+ { name = "click" },
+ { name = "rich" },
+ { name = "shellingham" },
+]
+sdist = { url =
"https://files.pythonhosted.org/packages/e4/51/9aed62104cea109b820bbd6c14245af756112017d309da813ef107d42e7e/typer-0.25.1.tar.gz",
hash =
"sha256:9616eb8853a09ffeabab1698952f33c6f29ffdbceb4eaeecf571880e8d7664cc", size
= 122276, upload-time = "2026-04-30T19:32:16.964Z" }
+wheels = [
+ { url =
"https://files.pythonhosted.org/packages/3f/f9/2b3ff4e56e5fa7debfaf9eb135d0da96f3e9a1d5b27222223c7296336e5f/typer-0.25.1-py3-none-any.whl",
hash =
"sha256:75caa44ed46a03fb2dab8808753ffacdbfea88495e74c85a28c5eefcf5f39c89", size
= 58409, upload-time = "2026-04-30T19:32:18.271Z" },
+]
+
[[package]]
name = "typing-extensions"
version = "4.15.0"
diff --git a/crates/iceberg/src/io/storage/config/hf.rs
b/crates/iceberg/src/io/storage/config/hf.rs
new file mode 100644
index 000000000..fdb79a5af
--- /dev/null
+++ b/crates/iceberg/src/io/storage/config/hf.rs
@@ -0,0 +1,104 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! HuggingFace Hub storage configuration.
+
+use serde::{Deserialize, Serialize};
+use typed_builder::TypedBuilder;
+
+use super::StorageConfig;
+use crate::Result;
+
+/// Property key for the HuggingFace Hub authentication token.
+pub const HF_TOKEN: &str = "hf.token";
+/// Property key for the HuggingFace Hub endpoint URL. Defaults to `https://huggingface.co`.
+pub const HF_ENDPOINT: &str = "hf.endpoint";
+/// Property key for the default git revision/branch of paths that don't specify one. Defaults to `main`.
+pub const HF_REVISION: &str = "hf.revision";
+
+/// HuggingFace Hub storage configuration.
+///
+/// Repo type, repo ID, and revision are normally encoded in the file path URI
+/// (`hf://<repo_type>/<owner>/<repo>[@<revision>]/<path>`, where `<repo_type>`
+/// is one of `models`, `datasets`, `spaces`, or `buckets`).
+/// The fields here provide credentials and a default revision fallback.
+///
+/// Every field is optional; the derived `Default` leaves all of them as `None`.
+// NOTE(review): the derived `Debug` impl prints `token` verbatim, so avoid
+// logging this struct where credentials must not leak.
+#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize, TypedBuilder)]
+pub struct HfConfig {
+    /// HuggingFace Hub API token (required for private repos and write access).
+    #[builder(default, setter(strip_option, into))]
+    pub token: Option<String>,
+    /// HuggingFace Hub endpoint. Defaults to `https://huggingface.co`.
+    #[builder(default, setter(strip_option, into))]
+    pub endpoint: Option<String>,
+    /// Default revision to use when a path URI does not specify one.
+    #[builder(default, setter(strip_option, into))]
+    pub revision: Option<String>,
+}
+
+/// Builds an [`HfConfig`] from the `hf.*` entries of a [`StorageConfig`].
+///
+/// Each of `hf.token`, `hf.endpoint` and `hf.revision` is optional; a key
+/// that is absent leaves the matching field as `None`.
+impl TryFrom<&StorageConfig> for HfConfig {
+    type Error = crate::Error;
+
+    fn try_from(config: &StorageConfig) -> Result<Self> {
+        let properties = config.props();
+        // Each lookup yields a borrowed value; `cloned()` turns it into the
+        // owned `String` stored in the config.
+        Ok(Self {
+            token: properties.get(HF_TOKEN).cloned(),
+            endpoint: properties.get(HF_ENDPOINT).cloned(),
+            revision: properties.get(HF_REVISION).cloned(),
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// The typed builder accepts `&str` for every optional field and stores it as `Some`.
+    #[test]
+    fn test_hf_config_builder() {
+        let cfg = HfConfig::builder()
+            .token("hf_mytoken")
+            .endpoint("https://huggingface.co")
+            .revision("dev")
+            .build();
+        assert_eq!(cfg.token.as_deref(), Some("hf_mytoken"));
+        assert_eq!(cfg.endpoint.as_deref(), Some("https://huggingface.co"));
+        assert_eq!(cfg.revision.as_deref(), Some("dev"));
+    }
+
+    /// Every `hf.*` property present in the storage config is copied into the
+    /// matching field, including the revision fallback.
+    #[test]
+    fn test_hf_config_from_storage_config() {
+        let storage_config = StorageConfig::new()
+            .with_prop(HF_TOKEN, "hf_abc123")
+            .with_prop(HF_ENDPOINT, "https://huggingface.co")
+            .with_prop(HF_REVISION, "dev");
+
+        let cfg = HfConfig::try_from(&storage_config).unwrap();
+        assert_eq!(cfg.token.as_deref(), Some("hf_abc123"));
+        assert_eq!(cfg.endpoint.as_deref(), Some("https://huggingface.co"));
+        assert_eq!(cfg.revision.as_deref(), Some("dev"));
+    }
+
+    /// Without any `hf.*` property every field stays `None`.
+    #[test]
+    fn test_hf_config_empty() {
+        let cfg = HfConfig::try_from(&StorageConfig::new()).unwrap();
+        assert_eq!(cfg.token, None);
+        assert_eq!(cfg.endpoint, None);
+        assert_eq!(cfg.revision, None);
+    }
+}
diff --git a/crates/iceberg/src/io/storage/config/mod.rs
b/crates/iceberg/src/io/storage/config/mod.rs
index cbdb53730..2350aab6d 100644
--- a/crates/iceberg/src/io/storage/config/mod.rs
+++ b/crates/iceberg/src/io/storage/config/mod.rs
@@ -32,6 +32,7 @@
mod azdls;
mod gcs;
+mod hf;
mod oss;
mod s3;
@@ -39,6 +40,7 @@ use std::collections::HashMap;
pub use azdls::*;
pub use gcs::*;
+pub use hf::*;
pub use oss::*;
pub use s3::*;
use serde::{Deserialize, Serialize};
diff --git a/crates/iceberg/src/spec/table_properties.rs
b/crates/iceberg/src/spec/table_properties.rs
index a3d4e7fda..dc21da565 100644
--- a/crates/iceberg/src/spec/table_properties.rs
+++ b/crates/iceberg/src/spec/table_properties.rs
@@ -22,8 +22,6 @@ use std::str::FromStr;
use crate::compression::CompressionCodec;
use crate::error::{Error, ErrorKind, Result};
-// Helper function to parse a property from a HashMap
-// If the property is not found, use the default value
fn parse_property<T: FromStr>(
properties: &HashMap<String, String>,
key: &str,
@@ -121,6 +119,15 @@ pub struct TableProperties {
/// Whether garbage collection is enabled on drop.
/// When `false`, data files will not be deleted when a table is dropped.
pub gc_enabled: bool,
+ /// Whether content-defined chunking is enabled.
+ /// `true` only when `write.parquet.content-defined-chunking.enabled = "true"`.
+ pub cdc_enabled: bool,
+ /// Content-defined chunking minimum chunk size in bytes.
+ pub cdc_min_chunk_size: usize,
+ /// Content-defined chunking maximum chunk size in bytes.
+ pub cdc_max_chunk_size: usize,
+ /// Content-defined chunking normalization level (gearhash bit adjustment).
+ pub cdc_norm_level: i32,
}
impl TableProperties {
@@ -226,6 +233,26 @@ impl TableProperties {
pub const PROPERTY_GC_ENABLED: &str = "gc.enabled";
/// Default value for gc.enabled
pub const PROPERTY_GC_ENABLED_DEFAULT: bool = true;
+
+    /// Property key that enables content-defined chunking with parquet defaults
+    /// (or per-property overrides).
+    pub const PROPERTY_PARQUET_CDC_ENABLED: &str = "write.parquet.content-defined-chunking.enabled";
+    /// Default value for content-defined chunking enabled.
+    pub const PROPERTY_PARQUET_CDC_ENABLED_DEFAULT: bool = false;
+    /// Property key for the minimum chunk size in bytes for content-defined chunking.
+    pub const PROPERTY_PARQUET_CDC_MIN_CHUNK_SIZE: &str =
+        "write.parquet.content-defined-chunking.min-chunk-size";
+    /// 256 KiB. Default matches `parquet::file::properties::DEFAULT_CDC_MIN_CHUNK_SIZE`.
+    pub const PROPERTY_PARQUET_CDC_MIN_CHUNK_SIZE_DEFAULT: usize = 256 * 1024;
+    /// Property key for the maximum chunk size in bytes for content-defined chunking.
+    pub const PROPERTY_PARQUET_CDC_MAX_CHUNK_SIZE: &str =
+        "write.parquet.content-defined-chunking.max-chunk-size";
+    /// 1 MiB. Default matches `parquet::file::properties::DEFAULT_CDC_MAX_CHUNK_SIZE`.
+    pub const PROPERTY_PARQUET_CDC_MAX_CHUNK_SIZE_DEFAULT: usize = 1024 * 1024;
+    /// Property key for the normalization level (gearhash bit adjustment) for
+    /// content-defined chunking.
+    pub const PROPERTY_PARQUET_CDC_NORM_LEVEL: &str =
+        "write.parquet.content-defined-chunking.norm-level";
+    /// Default matches `parquet::file::properties::DEFAULT_CDC_NORM_LEVEL`.
+    pub const PROPERTY_PARQUET_CDC_NORM_LEVEL_DEFAULT: i32 = 0;
}
impl TryFrom<&HashMap<String, String>> for TableProperties {
@@ -275,6 +302,26 @@ impl TryFrom<&HashMap<String, String>> for TableProperties
{
TableProperties::PROPERTY_GC_ENABLED,
TableProperties::PROPERTY_GC_ENABLED_DEFAULT,
)?,
+ cdc_enabled: parse_property(
+ props,
+ TableProperties::PROPERTY_PARQUET_CDC_ENABLED,
+ TableProperties::PROPERTY_PARQUET_CDC_ENABLED_DEFAULT,
+ )?,
+ cdc_min_chunk_size: parse_property(
+ props,
+ TableProperties::PROPERTY_PARQUET_CDC_MIN_CHUNK_SIZE,
+ TableProperties::PROPERTY_PARQUET_CDC_MIN_CHUNK_SIZE_DEFAULT,
+ )?,
+ cdc_max_chunk_size: parse_property(
+ props,
+ TableProperties::PROPERTY_PARQUET_CDC_MAX_CHUNK_SIZE,
+ TableProperties::PROPERTY_PARQUET_CDC_MAX_CHUNK_SIZE_DEFAULT,
+ )?,
+ cdc_norm_level: parse_property(
+ props,
+ TableProperties::PROPERTY_PARQUET_CDC_NORM_LEVEL,
+ TableProperties::PROPERTY_PARQUET_CDC_NORM_LEVEL_DEFAULT,
+ )?,
})
}
}
@@ -583,4 +630,142 @@ mod tests {
);
}
}
+
+    /// With no properties at all, content-defined chunking stays off.
+    #[test]
+    fn test_cdc_disabled_by_default() {
+        let empty = HashMap::new();
+        let parsed = TableProperties::try_from(&empty).unwrap();
+        assert!(!parsed.cdc_enabled);
+    }
+
+    /// Setting only the `enabled` flag turns CDC on and keeps the default
+    /// chunk sizes and normalization level.
+    #[test]
+    fn test_cdc_enabled_via_flag() {
+        let mut raw = HashMap::new();
+        raw.insert(
+            TableProperties::PROPERTY_PARQUET_CDC_ENABLED.to_string(),
+            "true".to_string(),
+        );
+        let parsed = TableProperties::try_from(&raw).unwrap();
+        assert!(parsed.cdc_enabled);
+        assert_eq!(parsed.cdc_min_chunk_size, 256 * 1024);
+        assert_eq!(parsed.cdc_max_chunk_size, 1024 * 1024);
+        assert_eq!(parsed.cdc_norm_level, 0);
+    }
+
+    /// A chunk-size override without the `enabled` flag leaves CDC off.
+    #[test]
+    fn test_cdc_size_props_alone_do_not_enable() {
+        let only_min_size: HashMap<String, String> = std::iter::once((
+            TableProperties::PROPERTY_PARQUET_CDC_MIN_CHUNK_SIZE.to_string(),
+            "262144".to_string(),
+        ))
+        .collect();
+        let parsed = TableProperties::try_from(&only_min_size).unwrap();
+        assert!(!parsed.cdc_enabled);
+    }
+
+    /// All four CDC properties supplied explicitly are parsed into their fields.
+    #[test]
+    fn test_cdc_custom_values() {
+        let raw: HashMap<String, String> = [
+            (TableProperties::PROPERTY_PARQUET_CDC_ENABLED, "true"),
+            (TableProperties::PROPERTY_PARQUET_CDC_MIN_CHUNK_SIZE, "200000"),
+            (TableProperties::PROPERTY_PARQUET_CDC_MAX_CHUNK_SIZE, "900000"),
+            (TableProperties::PROPERTY_PARQUET_CDC_NORM_LEVEL, "1"),
+        ]
+        .into_iter()
+        .map(|(key, value)| (key.to_string(), value.to_string()))
+        .collect();
+        let parsed = TableProperties::try_from(&raw).unwrap();
+        assert!(parsed.cdc_enabled);
+        assert_eq!(parsed.cdc_min_chunk_size, 200000);
+        assert_eq!(parsed.cdc_max_chunk_size, 900000);
+        assert_eq!(parsed.cdc_norm_level, 1);
+    }
+
+    /// Overriding only the normalization level keeps the default chunk sizes.
+    #[test]
+    fn test_cdc_partial_override() {
+        let mut raw = HashMap::new();
+        raw.insert(
+            TableProperties::PROPERTY_PARQUET_CDC_ENABLED.to_string(),
+            "true".to_string(),
+        );
+        raw.insert(
+            TableProperties::PROPERTY_PARQUET_CDC_NORM_LEVEL.to_string(),
+            "2".to_string(),
+        );
+        let parsed = TableProperties::try_from(&raw).unwrap();
+        assert!(parsed.cdc_enabled);
+        assert_eq!(parsed.cdc_min_chunk_size, 256 * 1024);
+        assert_eq!(parsed.cdc_max_chunk_size, 1024 * 1024);
+        assert_eq!(parsed.cdc_norm_level, 2);
+    }
+
+    /// The normalization level is signed, so a negative value parses successfully.
+    #[test]
+    fn test_cdc_negative_norm_level() {
+        let raw: HashMap<String, String> = [
+            (TableProperties::PROPERTY_PARQUET_CDC_ENABLED, "true"),
+            (TableProperties::PROPERTY_PARQUET_CDC_NORM_LEVEL, "-2"),
+        ]
+        .into_iter()
+        .map(|(key, value)| (key.to_string(), value.to_string()))
+        .collect();
+        let parsed = TableProperties::try_from(&raw).unwrap();
+        assert_eq!(parsed.cdc_norm_level, -2);
+    }
+
+ #[test]
+ fn test_cdc_invalid_min_chunk_size() {
+ let props = HashMap::from([
+ (
+ TableProperties::PROPERTY_PARQUET_CDC_ENABLED.to_string(),
+ "true".to_string(),
+ ),
+ (
+
TableProperties::PROPERTY_PARQUET_CDC_MIN_CHUNK_SIZE.to_string(),
+ "not_a_number".to_string(),
+ ),
+ ]);
+ let err = TableProperties::try_from(&props).unwrap_err();
+ assert!(
+ err.to_string().contains(
+ "Invalid value for
write.parquet.content-defined-chunking.min-chunk-size"
+ )
+ );
+ }
+
+ #[test]
+ fn test_cdc_invalid_norm_level() {
+ let props = HashMap::from([
+ (
+ TableProperties::PROPERTY_PARQUET_CDC_ENABLED.to_string(),
+ "true".to_string(),
+ ),
+ (
+ TableProperties::PROPERTY_PARQUET_CDC_NORM_LEVEL.to_string(),
+ "not_a_number".to_string(),
+ ),
+ ]);
+ let err = TableProperties::try_from(&props).unwrap_err();
+ assert!(
+ err.to_string()
+ .contains("Invalid value for
write.parquet.content-defined-chunking.norm-level")
+ );
+ }
+
+    /// Properties unrelated to CDC do not switch content-defined chunking on.
+    #[test]
+    fn test_cdc_no_properties() {
+        let mut unrelated = HashMap::new();
+        unrelated.insert("some.other.property".to_string(), "value".to_string());
+        let parsed = TableProperties::try_from(&unrelated).unwrap();
+        assert!(!parsed.cdc_enabled);
+    }
}
diff --git a/crates/integrations/datafusion/src/physical_plan/write.rs
b/crates/integrations/datafusion/src/physical_plan/write.rs
index 3b227e20f..282d1005b 100644
--- a/crates/integrations/datafusion/src/physical_plan/write.rs
+++ b/crates/integrations/datafusion/src/physical_plan/write.rs
@@ -45,7 +45,7 @@ use iceberg::writer::file_writer::location_generator::{
};
use iceberg::writer::file_writer::rolling_writer::RollingFileWriterBuilder;
use iceberg::{Error, ErrorKind};
-use parquet::file::properties::WriterProperties;
+use parquet::file::properties::{CdcOptions, WriterPropertiesBuilder};
use uuid::Uuid;
use crate::physical_plan::DATA_FILES_COL_NAME;
@@ -226,8 +226,17 @@ impl ExecutionPlan for IcebergWriteExec {
}
// Create data file writer builder
+ let cdc_options = table_props.cdc_enabled.then_some(CdcOptions {
+ min_chunk_size: table_props.cdc_min_chunk_size,
+ max_chunk_size: table_props.cdc_max_chunk_size,
+ norm_level: table_props.cdc_norm_level,
+ });
+ let writer_properties = WriterPropertiesBuilder::default()
+ .set_content_defined_chunking(cdc_options)
+ .build();
+
let parquet_file_writer_builder =
ParquetWriterBuilder::new_with_match_mode(
- WriterProperties::default(),
+ writer_properties,
self.table.metadata().current_schema().clone(),
FieldMatchMode::Name,
);
diff --git a/crates/storage/opendal/Cargo.toml b/crates/storage/opendal/Cargo.toml
index 55aa6ac75..549959b53 100644
--- a/crates/storage/opendal/Cargo.toml
+++ b/crates/storage/opendal/Cargo.toml
@@ -28,11 +28,12 @@ keywords = ["iceberg", "opendal", "storage"]
[features]
default = ["opendal-memory", "opendal-fs", "opendal-s3"]
-opendal-all = ["opendal-memory", "opendal-fs", "opendal-s3", "opendal-gcs",
"opendal-oss", "opendal-azdls"]
+opendal-all = ["opendal-memory", "opendal-fs", "opendal-s3", "opendal-gcs",
"opendal-oss", "opendal-azdls", "opendal-hf"]
opendal-azdls = ["opendal/services-azdls"]
opendal-fs = ["opendal/services-fs"]
opendal-gcs = ["opendal/services-gcs"]
+opendal-hf = ["opendal/services-hf"]
opendal-memory = ["opendal/services-memory"]
opendal-oss = ["opendal/services-oss"]
opendal-s3 = ["opendal/services-s3", "reqsign-aws-v4", "reqsign-core"]
diff --git a/crates/storage/opendal/src/hf.rs b/crates/storage/opendal/src/hf.rs
new file mode 100644
index 000000000..a7ca2d884
--- /dev/null
+++ b/crates/storage/opendal/src/hf.rs
@@ -0,0 +1,348 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! HuggingFace Hub storage backend.
+
+use std::collections::HashMap;
+
+use iceberg::io::{HF_ENDPOINT, HF_REVISION, HF_TOKEN};
+use iceberg::{Error, ErrorKind, Result};
+use opendal::{Configurator, Operator, OperatorUri};
+
+use crate::utils::from_opendal_error;
+
+// ---------------------------------------------------------------------------
+// Minimal URI parser — extracts only what the caller needs.
+// TODO: remove once opendal-service-hf exports its URI parser publicly.
+// ---------------------------------------------------------------------------
+
+ /// Repository type of a HuggingFace Hub repository.
+ #[derive(Debug, Clone, Copy, PartialEq, Eq)]
+ pub(crate) enum HfRepoType {
+     /// Model repository (`models/` prefix).
+     Model,
+     /// Dataset repository (`datasets/` prefix).
+     Dataset,
+     /// Spaces application repository (`spaces/` prefix).
+     Space,
+     /// XET-backed object-storage bucket (`buckets/` prefix).
+     Bucket,
+ }
+
+ impl HfRepoType {
+     /// Parse a repo-type keyword (singular or plural) into the corresponding variant.
+     ///
+     /// The keyword is lower-cased and stripped of space characters before matching.
+     /// Returns `None` for any keyword that is not a known repo type.
+     fn parse(s: &str) -> Option<Self> {
+         match s.to_lowercase().replace(' ', "").as_str() {
+             "model" | "models" => Some(Self::Model),
+             "dataset" | "datasets" => Some(Self::Dataset),
+             "space" | "spaces" => Some(Self::Space),
+             "bucket" | "buckets" => Some(Self::Bucket),
+             _ => None,
+         }
+     }
+
+     /// Returns the plural keyword used as the URI prefix for this repo type.
+     fn canonical(self) -> &'static str {
+         match self {
+             Self::Model => "models",
+             Self::Dataset => "datasets",
+             Self::Space => "spaces",
+             Self::Bucket => "buckets",
+         }
+     }
+ }
+
+/// Parsed HuggingFace URI: `hf://<repo_type>/<repo_id>[@<revision>][/<path>]`.
+///
+/// `repo_type` must be explicitly specified — there is no implicit default.
+/// Only the fields required by this crate are stored; revision is consumed
+/// during parsing but not retained.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub(crate) struct HfUri {
+ pub repo_type: HfRepoType,
+ /// e.g. `"user/my-repo"`.
+ pub repo_id: String,
+ /// Path within the repository, e.g. `"train/data.parquet"`. Empty at repo
root.
+ pub path: String,
+}
+
+impl HfUri {
+ /// Parse a full `hf://…` URI or the bare path portion (without scheme).
+ /// Returns `None` if the URI does not begin with a recognized repo-type
prefix
+ /// (`models/`, `datasets/`, `spaces/`, or `buckets/`).
+ pub(crate) fn parse(full_uri: &str) -> Option<Self> {
+ let s = full_uri.strip_prefix("hf://").unwrap_or(full_uri);
+ if s.is_empty() {
+ return None;
+ }
+
+ // Require an explicit repo_type prefix — no implicit default.
+ let (first, rest) = s.split_once('/')?;
+ let repo_type = HfRepoType::parse(first)?;
+ let s = rest;
+
+ // Remaining: `<repo_id>[@<revision>][/<path_in_repo>]`
+ let (repo_id, path) = if s.contains('/') {
+ // Check if `@` appears in the first two slash-segments (the
repo_id portion).
+ // This distinguishes "user/repo@rev/file" from
"user/repo/path/@file".
+ let first_two = s.splitn(3,
'/').take(2).collect::<Vec<_>>().join("/");
+ if first_two.contains('@') {
+ let (repo_id, rev_and_path) = s.split_once('@').unwrap();
+ let rev_and_path = rev_and_path.replace("%2F", "/");
+ (repo_id.to_string(), path_after_revision(&rev_and_path))
+ } else {
+ let segs: Vec<_> = s.splitn(3, '/').collect();
+ let repo_id = format!("{}/{}", segs[0], segs[1]);
+ let path = segs.get(2).copied().unwrap_or("").to_string();
+ (repo_id, path)
+ }
+ } else if let Some((repo_id, _)) = s.split_once('@') {
+ (repo_id.to_string(), String::new())
+ } else {
+ (s.to_string(), String::new())
+ };
+
+ Some(Self {
+ repo_type,
+ repo_id,
+ path,
+ })
+ }
+}
+
+ /// Given the string after `@`, extract the path-in-repo, correctly skipping
+ /// multi-segment special refs (`refs/convert/parquet`, `refs/pr/N`).
+ /// These are the only two multi-segment special ref prefixes in HF's git model.
+ ///
+ /// Returns an empty string when nothing follows the revision.
+ fn path_after_revision(rev_and_path: &str) -> String {
+     const MULTI_SEGMENT_REF_PREFIXES: [&str; 2] = ["refs/convert/", "refs/pr/"];
+
+     // For a special ref, the revision still has one more segment after the
+     // prefix; otherwise the revision is just the first segment.
+     let last_revision_segment_onwards = MULTI_SEGMENT_REF_PREFIXES
+         .iter()
+         .find_map(|prefix| rev_and_path.strip_prefix(prefix))
+         .unwrap_or(rev_and_path);
+
+     last_revision_segment_onwards
+         .split_once('/')
+         .map(|(_, path)| path.to_string())
+         .unwrap_or_default()
+ }
+
+// ---------------------------------------------------------------------------
+// Public helpers used by lib.rs
+// ---------------------------------------------------------------------------
+
+/// Parse iceberg `StorageConfig` properties into an opendal
[`opendal::services::HfConfig`].
+pub(crate) fn hf_config_parse(m: HashMap<String, String>) ->
Result<opendal::services::HfConfig> {
+ let mut cfg = opendal::services::HfConfig::default();
+ if let Some(token) = m.get(HF_TOKEN) {
+ cfg.token = Some(token.clone());
+ }
+ if let Some(endpoint) = m.get(HF_ENDPOINT) {
+ cfg.endpoint = Some(endpoint.clone());
+ }
+ if let Some(revision) = m.get(HF_REVISION) {
+ cfg.revision = Some(revision.clone());
+ }
+ Ok(cfg)
+}
+
+ /// Build an [`Operator`] for the given `hf://…` path and return it together with
+ /// the relative path-in-repo.
+ ///
+ /// URI parsing is delegated to opendal's [`HfConfig::from_uri`]. The base config
+ /// provides fallback values for `revision` and `endpoint`; the `token` is always
+ /// taken from the base config and never from the URI.
+ ///
+ /// # Errors
+ ///
+ /// Returns `ErrorKind::DataInvalid` when `path` is not a valid HuggingFace URI or
+ /// when the parsed path-in-repo is not a verbatim suffix of `path`. Errors from
+ /// constructing the operator are converted with `from_opendal_error`.
+ pub(crate) fn hf_config_build<'a>(
+     cfg: &opendal::services::HfConfig,
+     path: &'a str,
+ ) -> Result<(Operator, &'a str)> {
+     /// Builds the error reported for every malformed-URI case.
+     fn invalid_hf_url(path: &str) -> Error {
+         Error::new(ErrorKind::DataInvalid, format!("Invalid hf url: {path}"))
+     }
+
+     let uri = OperatorUri::new(path, Vec::<(String, String)>::new())
+         .map_err(|e| invalid_hf_url(path).with_source(e))?;
+
+     let mut hf_cfg = opendal::services::HfConfig::from_uri(&uri)
+         .map_err(|e| invalid_hf_url(path).with_source(e))?;
+
+     // Token must come from config only, never from the URI.
+     hf_cfg.token = cfg.token.clone();
+
+     if hf_cfg.endpoint.is_none() {
+         hf_cfg.endpoint = cfg.endpoint.clone();
+     }
+     if hf_cfg.revision.is_none() {
+         hf_cfg.revision = cfg.revision.clone();
+     }
+
+     let parsed = HfUri::parse(path).ok_or_else(|| invalid_hf_url(path))?;
+     // Borrow the path-in-repo from the caller's string. The checked slice plus the
+     // equality test reject the input instead of panicking or returning a wrong
+     // tail if the parsed path ever differs from the text at the end of `path`.
+     let relative_path = path
+         .len()
+         .checked_sub(parsed.path.len())
+         .and_then(|start| path.get(start..))
+         .filter(|tail| *tail == parsed.path)
+         .ok_or_else(|| invalid_hf_url(path))?;
+
+     let op = Operator::from_config(hf_cfg)
+         .map_err(from_opendal_error)?
+         .finish();
+     Ok((op, relative_path))
+ }
+
+/// Returns a stable cache key for `delete_stream` batching:
`"<repo_type>/<repo_id>"`
+/// (e.g. `"buckets/user/my-repo"`), without revision.
+/// Repo type is included so bucket and dataset paths to the same repo use
separate operators.
+/// Falls back to the full path so that unparsable paths never share an
operator accidentally.
+pub(crate) fn hf_batch_key(path: &str) -> String {
+ HfUri::parse(path)
+ .map(|u| format!("{}/{}", u.repo_type.canonical(), u.repo_id))
+ .unwrap_or_else(|| path.to_string())
+}
+
+ #[cfg(test)]
+ mod tests {
+     use super::*;
+
+     /// Parses `uri`, panicking with the offending input when parsing fails.
+     fn parse(uri: &str) -> HfUri {
+         HfUri::parse(uri).unwrap_or_else(|| panic!("parse failed for {uri:?}"))
+     }
+
+     #[test]
+     fn test_model_prefix() {
+         let u = parse("hf://models/user/my-model/path/to/file.parquet");
+         assert_eq!(u.repo_type, HfRepoType::Model);
+         assert_eq!(u.repo_id, "user/my-model");
+         assert_eq!(u.path, "path/to/file.parquet");
+     }
+
+     #[test]
+     fn test_dataset_prefix() {
+         let u = parse("hf://datasets/user/my-dataset/train/data.parquet");
+         assert_eq!(u.repo_type, HfRepoType::Dataset);
+         assert_eq!(u.repo_id, "user/my-dataset");
+         assert_eq!(u.path, "train/data.parquet");
+     }
+
+     #[test]
+     fn test_bucket_prefix() {
+         let u = parse("hf://buckets/myorg/my-bucket/iceberg/metadata/v1.json");
+         assert_eq!(u.repo_type, HfRepoType::Bucket);
+         assert_eq!(u.repo_id, "myorg/my-bucket");
+         assert_eq!(u.path, "iceberg/metadata/v1.json");
+     }
+
+     #[test]
+     fn test_revision() {
+         let u = parse("hf://datasets/user/my-dataset@main/train/data.parquet");
+         assert_eq!(u.repo_type, HfRepoType::Dataset);
+         assert_eq!(u.repo_id, "user/my-dataset");
+         assert_eq!(u.path, "train/data.parquet");
+     }
+
+     #[test]
+     fn test_refs_convert_revision() {
+         let u = parse("hf://datasets/squad@refs/convert/parquet/data.parquet");
+         assert_eq!(u.path, "data.parquet");
+     }
+
+     #[test]
+     fn test_refs_pr_revision() {
+         let u = parse("hf://models/user/repo@refs/pr/10/file.txt");
+         assert_eq!(u.path, "file.txt");
+     }
+
+     #[test]
+     fn test_encoded_revision() {
+         let u = parse("hf://models/user/repo@refs%2Fpr%2F10/file.txt");
+         assert_eq!(u.path, "file.txt");
+     }
+
+     #[test]
+     fn test_no_path() {
+         let u = parse("hf://models/user/my-model");
+         assert_eq!(u.repo_id, "user/my-model");
+         assert_eq!(u.path, "");
+     }
+
+     #[test]
+     fn test_at_in_path_not_revision() {
+         let u = parse("hf://models/user/repo/path/@not-a-revision.txt");
+         assert_eq!(u.path, "path/@not-a-revision.txt");
+     }
+
+     #[test]
+     fn test_single_segment_repo_id() {
+         // Without revision and path: unambiguous.
+         let u = parse("hf://models/gpt2");
+         assert_eq!(u.repo_type, HfRepoType::Model);
+         assert_eq!(u.repo_id, "gpt2");
+         assert_eq!(u.path, "");
+
+         // With explicit revision: single-segment repos with paths are parsed correctly.
+         let u = parse("hf://models/gpt2@main/config.json");
+         assert_eq!(u.repo_type, HfRepoType::Model);
+         assert_eq!(u.repo_id, "gpt2");
+         assert_eq!(u.path, "config.json");
+     }
+
+     #[test]
+     fn test_batch_key() {
+         assert_eq!(
+             hf_batch_key("hf://datasets/user/repo@main/path/file.parquet"),
+             "datasets/user/repo"
+         );
+         assert_eq!(
+             hf_batch_key("hf://buckets/org/bucket/data/file.parquet"),
+             "buckets/org/bucket"
+         );
+         // Same repo_id, different repo_type → different keys.
+         assert_ne!(
+             hf_batch_key("hf://buckets/user/repo/file"),
+             hf_batch_key("hf://datasets/user/repo/file"),
+         );
+     }
+
+     #[test]
+     fn test_invalid_uri() {
+         assert!(HfUri::parse("hf://").is_none());
+         // bare repo-type, no repo_id
+         assert!(HfUri::parse("hf://datasets").is_none());
+         // missing repo-type prefix
+         assert!(HfUri::parse("hf://user/my-model").is_none());
+         assert!(HfUri::parse("hf://gpt2").is_none());
+         // unrecognized repo-type prefix
+         assert!(HfUri::parse("hf://repos/user/repo/file").is_none());
+     }
+
+     #[test]
+     fn test_hf_config_build_relative_path() {
+         let cfg = opendal::services::HfConfig::default();
+
+         let (_, rel) = hf_config_build(
+             &cfg,
+             "hf://datasets/user/my-dataset@main/train/data.parquet",
+         )
+         .unwrap();
+         assert_eq!(rel, "train/data.parquet");
+
+         let (_, rel) = hf_config_build(&cfg, "hf://models/user/my-model/config.json").unwrap();
+         assert_eq!(rel, "config.json");
+
+         let (_, rel) = hf_config_build(&cfg, "hf://models/user/my-model").unwrap();
+         assert_eq!(rel, "");
+     }
+ }
diff --git a/crates/storage/opendal/src/lib.rs b/crates/storage/opendal/src/lib.rs
index 65deaa5f4..67113833f 100644
--- a/crates/storage/opendal/src/lib.rs
+++ b/crates/storage/opendal/src/lib.rs
@@ -51,6 +51,14 @@ cfg_if! {
}
}
+cfg_if! {
+ if #[cfg(feature = "opendal-hf")] {
+ mod hf;
+ use hf::*;
+ use opendal::services::HfConfig;
+ }
+}
+
cfg_if! {
if #[cfg(feature = "opendal-fs")] {
mod fs;
@@ -120,6 +128,9 @@ pub enum OpenDalStorageFactory {
/// Azure Data Lake Storage factory.
#[cfg(feature = "opendal-azdls")]
Azdls,
+ /// HuggingFace Hub storage factory.
+ #[cfg(feature = "opendal-hf")]
+ Hf,
}
#[typetag::serde(name = "OpenDalStorageFactory")]
@@ -152,6 +163,10 @@ impl StorageFactory for OpenDalStorageFactory {
OpenDalStorageFactory::Azdls => Ok(Arc::new(OpenDalStorage::Azdls {
config: azdls_config_parse(config.props().clone())?.into(),
})),
+ #[cfg(feature = "opendal-hf")]
+ OpenDalStorageFactory::Hf => Ok(Arc::new(OpenDalStorage::Hf {
+ config: hf_config_parse(config.props().clone())?.into(),
+ })),
#[cfg(all(
not(feature = "opendal-memory"),
not(feature = "opendal-fs"),
@@ -159,6 +174,7 @@ impl StorageFactory for OpenDalStorageFactory {
not(feature = "opendal-gcs"),
not(feature = "opendal-oss"),
not(feature = "opendal-azdls"),
+ not(feature = "opendal-hf"),
))]
_ => Err(Error::new(
ErrorKind::FeatureUnsupported,
@@ -218,6 +234,16 @@ pub enum OpenDalStorage {
/// Azure DLS configuration.
config: Arc<AzdlsConfig>,
},
+ /// HuggingFace Hub storage variant.
+ ///
+ /// Accepts paths of the form
+ /// `hf://<repo_type>/<owner>/<repo>[@<revision>]/<path_in_repo>`,
+ /// where `<repo_type>` must be one of `models`, `datasets`, `spaces`, or `buckets`.
+ #[cfg(feature = "opendal-hf")]
+ Hf {
+ /// HuggingFace Hub configuration (token + endpoint).
+ config: Arc<HfConfig>,
+ },
}
impl OpenDalStorage {
@@ -311,12 +337,15 @@ impl OpenDalStorage {
}
#[cfg(feature = "opendal-azdls")]
OpenDalStorage::Azdls { config } => azdls_create_operator(path,
config)?,
+ #[cfg(feature = "opendal-hf")]
+ OpenDalStorage::Hf { config } => hf_config_build(config, path)?,
#[cfg(all(
not(feature = "opendal-s3"),
not(feature = "opendal-fs"),
not(feature = "opendal-gcs"),
not(feature = "opendal-oss"),
not(feature = "opendal-azdls"),
+ not(feature = "opendal-hf"),
))]
_ => {
return Err(Error::new(
@@ -332,6 +361,21 @@ impl OpenDalStorage {
Ok((operator, relative_path))
}
+ /// Returns a cache key used by `delete_stream` to group paths by storage operator.
+ ///
+ /// For most backends the URL host (bucket name) is sufficient. For HF the host
+ /// encodes the repo type, not the repo identity, so a more specific key is used.
+ /// A path that does not parse as a URL, or has no host, maps to the empty string.
+ fn batch_key_for_path(&self, path: &str) -> String {
+     match self {
+         #[cfg(feature = "opendal-hf")]
+         OpenDalStorage::Hf { .. } => hf_batch_key(path),
+         _ => url::Url::parse(path)
+             .ok()
+             .and_then(|u| u.host_str().map(|s| s.to_string()))
+             .unwrap_or_default(),
+     }
+ }
+
/// Extracts the relative path from an absolute path without building an operator.
///
/// This is a lightweight alternative to [`create_operator`](Self::create_operator) for cases
@@ -408,12 +452,20 @@ impl OpenDalStorage {
let relative_path_len = azure_path.path.len();
Ok(&path[path.len() - relative_path_len..])
}
+ #[cfg(feature = "opendal-hf")]
+ OpenDalStorage::Hf { .. } => {
+ let parsed = hf::HfUri::parse(path).ok_or_else(|| {
+ Error::new(ErrorKind::DataInvalid, format!("Invalid hf
url: {path}"))
+ })?;
+ Ok(&path[path.len() - parsed.path.len()..])
+ }
#[cfg(all(
not(feature = "opendal-s3"),
not(feature = "opendal-fs"),
not(feature = "opendal-gcs"),
not(feature = "opendal-oss"),
not(feature = "opendal-azdls"),
+ not(feature = "opendal-hf"),
))]
_ => Err(Error::new(
ErrorKind::FeatureUnsupported,
@@ -493,10 +545,7 @@ impl Storage for OpenDalStorage {
let mut deleters: HashMap<String, opendal::Deleter> = HashMap::new();
while let Some(path) = paths.next().await {
- let bucket = url::Url::parse(&path)
- .ok()
- .and_then(|u| u.host_str().map(|s| s.to_string()))
- .unwrap_or_default();
+ let bucket = self.batch_key_for_path(&path);
let (relative_path, deleter) = match deleters.entry(bucket) {
Entry::Occupied(entry) => {
diff --git a/crates/storage/opendal/src/resolving.rs b/crates/storage/opendal/src/resolving.rs
index 621495519..86993220a 100644
--- a/crates/storage/opendal/src/resolving.rs
+++ b/crates/storage/opendal/src/resolving.rs
@@ -50,6 +50,7 @@ pub const SCHEME_ABFSS: &str = "abfss";
pub const SCHEME_ABFS: &str = "abfs";
pub const SCHEME_WASBS: &str = "wasbs";
pub const SCHEME_WASB: &str = "wasb";
+pub const SCHEME_HF: &str = "hf";
/// Parse a URL scheme string.
fn parse_scheme(scheme: &str) -> Result<&'static str> {
@@ -60,6 +61,7 @@ fn parse_scheme(scheme: &str) -> Result<&'static str> {
SCHEME_GS | SCHEME_GCS => Ok("gcs"),
SCHEME_OSS => Ok("oss"),
SCHEME_ABFSS | SCHEME_ABFS | SCHEME_WASBS | SCHEME_WASB => Ok("azdls"),
+ SCHEME_HF => Ok("hf"),
s => Err(Error::new(
ErrorKind::FeatureUnsupported,
format!("Unsupported storage scheme: {s}"),
@@ -118,6 +120,13 @@ fn build_storage_for_scheme(
"file" => Ok(OpenDalStorage::LocalFs),
#[cfg(feature = "opendal-memory")]
"memory" =>
Ok(OpenDalStorage::Memory(crate::memory::memory_config_build()?)),
+ #[cfg(feature = "opendal-hf")]
+ "hf" => {
+ let config = crate::hf::hf_config_parse(props.clone())?;
+ Ok(OpenDalStorage::Hf {
+ config: Arc::new(config),
+ })
+ }
unsupported => Err(Error::new(
ErrorKind::FeatureUnsupported,
format!("Unsupported storage scheme: {unsupported}"),
@@ -196,7 +205,7 @@ impl StorageFactory for OpenDalResolvingStorageFactory {
pub struct OpenDalResolvingStorage {
/// Configuration properties shared across all backends.
props: HashMap<String, String>,
- /// Cache of scheme to storage mappings.
+ /// Cache of canonical scheme to storage mappings.
#[serde(skip, default)]
storages: RwLock<HashMap<&'static str, Arc<OpenDalStorage>>>,
/// Custom AWS credential loader for S3 storage.
@@ -206,7 +215,7 @@ pub struct OpenDalResolvingStorage {
}
impl OpenDalResolvingStorage {
- /// Resolve the storage for the given path by extracting the scheme and
+ /// Resolve the storage for the given path by extracting the canonical scheme and
/// returning the cached or newly-created [`OpenDalStorage`].
fn resolve(&self, path: &str) -> Result<Arc<OpenDalStorage>> {
let scheme = extract_scheme(path)?;
@@ -281,7 +290,7 @@ impl Storage for OpenDalResolvingStorage {
}
async fn delete_stream(&self, mut paths: BoxStream<'static, String>) ->
Result<()> {
- // Group paths by scheme so each resolved storage receives a batch,
+ // Group paths by canonical scheme so each resolved storage receives a batch,
// avoiding repeated operator creation per path.
let mut grouped: HashMap<&'static str, Vec<String>> = HashMap::new();
while let Some(path) = paths.next().await {
diff --git a/crates/storage/opendal/tests/file_io_hf_test.rs b/crates/storage/opendal/tests/file_io_hf_test.rs
new file mode 100644
index 000000000..3c773887f
--- /dev/null
+++ b/crates/storage/opendal/tests/file_io_hf_test.rs
@@ -0,0 +1,376 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Integration tests for FileIO HuggingFace Hub.
+//!
+//! These tests require a real HuggingFace token and are skipped when
+//! `HF_TOKEN` is not set in the environment.
+//!
+//! The following environment variables are used:
+//! - `HF_TOKEN` — HuggingFace API token (required)
+//! - `HF_BUCKET` — `owner/repo` for a bucket-type repo (required when running bucket tests)
+//! - `HF_DATASET` — `owner/repo` for a dataset-type repo (required when running dataset tests)
+
+#[cfg(feature = "opendal-hf")]
+mod tests {
+     use std::sync::Arc;
+
+     use bytes::Bytes;
+     use futures::StreamExt;
+     use iceberg::io::{FileIO, FileIOBuilder, HF_REVISION, HF_TOKEN};
+     use iceberg_storage_opendal::{OpenDalResolvingStorageFactory, OpenDalStorageFactory};
+     use iceberg_test_utils::{normalize_test_name_with_parts, set_up};
+
+     const ENV_HF_TOKEN: &str = "HF_TOKEN";
+     const ENV_HF_BUCKET: &str = "HF_BUCKET";
+     const ENV_HF_DATASET: &str = "HF_DATASET";
+
+     /// Evaluates to the value of the given environment variable, or returns from
+     /// the enclosing test (after logging to stderr) when the variable is not set.
+     macro_rules! require_env {
+         ($var:expr) => {
+             match std::env::var($var) {
+                 Ok(v) => v,
+                 Err(_) => {
+                     eprintln!("Skipping HF test: {} not set", $var);
+                     return;
+                 }
+             }
+         };
+     }
+
+     /// Builds a `FileIO` backed directly by the HuggingFace storage factory.
+     fn get_file_io(token: &str) -> FileIO {
+         set_up();
+         FileIOBuilder::new(Arc::new(OpenDalStorageFactory::Hf))
+             .with_props(vec![(HF_TOKEN, token.to_string())])
+             .build()
+     }
+
+     /// Builds a `FileIO` backed by the scheme-resolving storage factory.
+     fn get_resolving_file_io(token: &str) -> FileIO {
+         set_up();
+         FileIOBuilder::new(Arc::new(OpenDalResolvingStorageFactory::new()))
+             .with_props(vec![(HF_TOKEN, token.to_string())])
+             .build()
+     }
+
+     // --- bucket tests ---
+
+     #[tokio::test]
+     async fn test_hf_bucket_write_read_delete() {
+         let token = require_env!(ENV_HF_TOKEN);
+         let bucket = require_env!(ENV_HF_BUCKET);
+         let file_io = get_file_io(&token);
+         let path = format!(
+             "hf://buckets/{}/{}",
+             bucket,
+             normalize_test_name_with_parts!("test_hf_bucket_write_read_delete")
+         );
+
+         // Best-effort cleanup of leftovers from a previous run.
+         let _ = file_io.delete(&path).await;
+         assert!(!file_io.exists(&path).await.unwrap());
+
+         file_io
+             .new_output(&path)
+             .unwrap()
+             .write(Bytes::from_static(b"iceberg-hf-bucket"))
+             .await
+             .unwrap();
+         assert!(file_io.exists(&path).await.unwrap());
+
+         let data = file_io.new_input(&path).unwrap().read().await.unwrap();
+         assert_eq!(data, Bytes::from_static(b"iceberg-hf-bucket"));
+
+         file_io.delete(&path).await.unwrap();
+         assert!(!file_io.exists(&path).await.unwrap());
+     }
+
+     #[tokio::test]
+     async fn test_hf_bucket_overwrite() {
+         let token = require_env!(ENV_HF_TOKEN);
+         let bucket = require_env!(ENV_HF_BUCKET);
+         let file_io = get_file_io(&token);
+         let path = format!(
+             "hf://buckets/{}/{}",
+             bucket,
+             normalize_test_name_with_parts!("test_hf_bucket_overwrite")
+         );
+
+         let _ = file_io.delete(&path).await;
+
+         file_io
+             .new_output(&path)
+             .unwrap()
+             .write(Bytes::from_static(b"first"))
+             .await
+             .unwrap();
+         file_io
+             .new_output(&path)
+             .unwrap()
+             .write(Bytes::from_static(b"second"))
+             .await
+             .unwrap();
+
+         let data = file_io.new_input(&path).unwrap().read().await.unwrap();
+         assert_eq!(data, Bytes::from_static(b"second"));
+
+         file_io.delete(&path).await.unwrap();
+     }
+
+     #[tokio::test]
+     async fn test_hf_bucket_range_read() {
+         let token = require_env!(ENV_HF_TOKEN);
+         let bucket = require_env!(ENV_HF_BUCKET);
+         let file_io = get_file_io(&token);
+         let path = format!(
+             "hf://buckets/{}/{}",
+             bucket,
+             normalize_test_name_with_parts!("test_hf_bucket_range_read")
+         );
+
+         let _ = file_io.delete(&path).await;
+         file_io
+             .new_output(&path)
+             .unwrap()
+             .write(Bytes::from_static(b"hello world"))
+             .await
+             .unwrap();
+
+         let reader = file_io.new_input(&path).unwrap().reader().await.unwrap();
+         let chunk = reader.read(6..11).await.unwrap();
+         assert_eq!(chunk, Bytes::from_static(b"world"));
+
+         file_io.delete(&path).await.unwrap();
+     }
+
+     #[tokio::test]
+     async fn test_hf_bucket_metadata() {
+         let token = require_env!(ENV_HF_TOKEN);
+         let bucket = require_env!(ENV_HF_BUCKET);
+         let file_io = get_file_io(&token);
+         let path = format!(
+             "hf://buckets/{}/{}",
+             bucket,
+             normalize_test_name_with_parts!("test_hf_bucket_metadata")
+         );
+
+         let _ = file_io.delete(&path).await;
+         file_io
+             .new_output(&path)
+             .unwrap()
+             .write(Bytes::from_static(b"metadata-test"))
+             .await
+             .unwrap();
+
+         let meta = file_io.new_input(&path).unwrap().metadata().await.unwrap();
+         assert_eq!(meta.size, b"metadata-test".len() as u64);
+
+         file_io.delete(&path).await.unwrap();
+     }
+
+     #[tokio::test]
+     async fn test_hf_bucket_delete_stream() {
+         let token = require_env!(ENV_HF_TOKEN);
+         let bucket = require_env!(ENV_HF_BUCKET);
+         let file_io = get_file_io(&token);
+
+         let paths: Vec<String> = (0..3)
+             .map(|i| {
+                 format!(
+                     "hf://buckets/{}/{}/file-{i}",
+                     bucket,
+                     normalize_test_name_with_parts!("test_hf_bucket_delete_stream")
+                 )
+             })
+             .collect();
+
+         for path in &paths {
+             let _ = file_io.delete(path).await;
+             file_io
+                 .new_output(path)
+                 .unwrap()
+                 .write(Bytes::from_static(b"x"))
+                 .await
+                 .unwrap();
+             assert!(file_io.exists(path).await.unwrap());
+         }
+
+         let stream = futures::stream::iter(paths.clone()).boxed();
+         file_io.delete_stream(stream).await.unwrap();
+
+         for path in &paths {
+             assert!(!file_io.exists(path).await.unwrap());
+         }
+     }
+
+     #[tokio::test]
+     async fn test_hf_bucket_delete_stream_empty() {
+         let token = require_env!(ENV_HF_TOKEN);
+         let file_io = get_file_io(&token);
+         file_io
+             .delete_stream(futures::stream::empty().boxed())
+             .await
+             .unwrap();
+     }
+
+     // --- dataset tests ---
+
+     #[tokio::test]
+     async fn test_hf_dataset_write_read_delete() {
+         let token = require_env!(ENV_HF_TOKEN);
+         let dataset = require_env!(ENV_HF_DATASET);
+         let file_io = get_file_io(&token);
+         let path = format!(
+             "hf://datasets/{}/{}",
+             dataset,
+             normalize_test_name_with_parts!("test_hf_dataset_write_read_delete")
+         );
+
+         let _ = file_io.delete(&path).await;
+         assert!(!file_io.exists(&path).await.unwrap());
+
+         file_io
+             .new_output(&path)
+             .unwrap()
+             .write(Bytes::from_static(b"iceberg-hf-dataset"))
+             .await
+             .unwrap();
+         assert!(file_io.exists(&path).await.unwrap());
+
+         let data = file_io.new_input(&path).unwrap().read().await.unwrap();
+         assert_eq!(data, Bytes::from_static(b"iceberg-hf-dataset"));
+
+         file_io.delete(&path).await.unwrap();
+         assert!(!file_io.exists(&path).await.unwrap());
+     }
+
+     // --- revision tests ---
+
+     #[tokio::test]
+     async fn test_hf_explicit_revision_in_uri() {
+         let token = require_env!(ENV_HF_TOKEN);
+         let file_io = get_file_io(&token);
+         let name = normalize_test_name_with_parts!("test_hf_explicit_revision_in_uri");
+
+         let bucket = require_env!(ENV_HF_BUCKET);
+         // Write without revision, read back with explicit @main.
+         let write_path = format!("hf://buckets/{}/{}", bucket, name);
+         let read_path = format!("hf://buckets/{}@main/{}", bucket, name);
+
+         let _ = file_io.delete(&write_path).await;
+         file_io
+             .new_output(&write_path)
+             .unwrap()
+             .write(Bytes::from_static(b"revision-test"))
+             .await
+             .unwrap();
+
+         let data = file_io.new_input(&read_path).unwrap().read().await.unwrap();
+         assert_eq!(data, Bytes::from_static(b"revision-test"));
+
+         file_io.delete(&write_path).await.unwrap();
+     }
+
+     #[tokio::test]
+     async fn test_hf_revision_from_config() {
+         let token = require_env!(ENV_HF_TOKEN);
+         set_up();
+
+         // Build FileIO with HF_REVISION set in config — paths without @revision use it.
+         let file_io = FileIOBuilder::new(Arc::new(OpenDalStorageFactory::Hf))
+             .with_props(vec![
+                 (HF_TOKEN, token.to_string()),
+                 (HF_REVISION, "main".to_string()),
+             ])
+             .build();
+
+         let bucket = require_env!(ENV_HF_BUCKET);
+         let path = format!(
+             "hf://buckets/{}/{}",
+             bucket,
+             normalize_test_name_with_parts!("test_hf_revision_from_config")
+         );
+
+         let _ = file_io.delete(&path).await;
+         file_io
+             .new_output(&path)
+             .unwrap()
+             .write(Bytes::from_static(b"config-revision"))
+             .await
+             .unwrap();
+
+         let data = file_io.new_input(&path).unwrap().read().await.unwrap();
+         assert_eq!(data, Bytes::from_static(b"config-revision"));
+
+         file_io.delete(&path).await.unwrap();
+     }
+
+     // --- resolving storage tests ---
+
+     #[tokio::test]
+     async fn test_hf_resolving_storage() {
+         let token = require_env!(ENV_HF_TOKEN);
+         let file_io = get_resolving_file_io(&token);
+
+         let bucket = require_env!(ENV_HF_BUCKET);
+         let path = format!(
+             "hf://buckets/{}/{}",
+             bucket,
+             normalize_test_name_with_parts!("test_hf_resolving_storage")
+         );
+
+         let _ = file_io.delete(&path).await;
+
+         file_io
+             .new_output(&path)
+             .unwrap()
+             .write(Bytes::from_static(b"resolving"))
+             .await
+             .unwrap();
+
+         let data = file_io.new_input(&path).unwrap().read().await.unwrap();
+         assert_eq!(data, Bytes::from_static(b"resolving"));
+
+         file_io.delete(&path).await.unwrap();
+     }
+
+     #[tokio::test]
+     async fn test_hf_resolving_delete_stream_across_repo_types() {
+         let token = require_env!(ENV_HF_TOKEN);
+         let file_io = get_resolving_file_io(&token);
+
+         let bucket = require_env!(ENV_HF_BUCKET);
+         let dataset = require_env!(ENV_HF_DATASET);
+         let name = normalize_test_name_with_parts!("test_hf_resolving_delete_stream_across");
+         let bucket_path = format!("hf://buckets/{}/{}", bucket, name);
+         let dataset_path = format!("hf://datasets/{}/{}", dataset, name);
+
+         for path in [&bucket_path, &dataset_path] {
+             let _ = file_io.delete(path).await;
+             file_io
+                 .new_output(path)
+                 .unwrap()
+                 .write(Bytes::from_static(b"x"))
+                 .await
+                 .unwrap();
+             assert!(file_io.exists(path).await.unwrap());
+         }
+
+         // One stream carrying paths of two repo types exercises per-repo batching.
+         let stream = futures::stream::iter(vec![bucket_path.clone(), dataset_path.clone()]).boxed();
+         file_io.delete_stream(stream).await.unwrap();
+
+         assert!(!file_io.exists(&bucket_path).await.unwrap());
+         assert!(!file_io.exists(&dataset_path).await.unwrap());
+     }
+}