This is an automated email from the ASF dual-hosted git repository.
xuanwo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opendal.git
The following commit(s) were added to refs/heads/main by this push:
new 4e587a952 fix(services/huggingface): Implement pagination with Link
header for large repos (#6832)
4e587a952 is described below
commit 4e587a9527298ec259052d8ca40b2f5fcb57673d
Author: Aryan Bagade <[email protected]>
AuthorDate: Fri Nov 28 22:51:55 2025 -0800
fix(services/huggingface): Implement pagination with Link header for large
repos (#6832)
* fix(services/huggingface): Implement pagination with Link header for
large repos
This commit fixes list truncation on large HuggingFace repositories by
implementing proper pagination support following the Hub tree API.
Fixes #6830
* refactor: Use split_once for cleaner Link header parsing and remove
trim()
---
core/src/services/huggingface/core.rs | 49 +++++++++++++-
core/src/services/huggingface/lister.rs | 116 +++++++++++++++++++++++++++++++-
2 files changed, 160 insertions(+), 5 deletions(-)
diff --git a/core/src/services/huggingface/core.rs
b/core/src/services/huggingface/core.rs
index 6aa635c59..12f036075 100644
--- a/core/src/services/huggingface/core.rs
+++ b/core/src/services/huggingface/core.rs
@@ -96,7 +96,12 @@ impl HuggingfaceCore {
self.info.http_client().send(req).await
}
- pub async fn hf_list(&self, path: &str, recursive: bool) ->
Result<Response<Buffer>> {
+ pub async fn hf_list(
+ &self,
+ path: &str,
+ recursive: bool,
+ cursor: Option<&str>,
+ ) -> Result<Response<Buffer>> {
let p = build_abs_path(&self.root, path)
.trim_end_matches('/')
.to_string();
@@ -122,6 +127,10 @@ impl HuggingfaceCore {
url.push_str("&recursive=True");
}
+ if let Some(cursor_val) = cursor {
+ url.push_str(&format!("&cursor={}", cursor_val));
+ }
+
let mut req = Request::get(&url);
// Inject operation to the request.
req = req.extension(Operation::List);
@@ -135,6 +144,20 @@ impl HuggingfaceCore {
self.info.http_client().send(req).await
}
+ pub async fn hf_list_with_url(&self, url: &str) ->
Result<Response<Buffer>> {
+ let mut req = Request::get(url);
+ // Inject operation to the request.
+ req = req.extension(Operation::List);
+ if let Some(token) = &self.token {
+ let auth_header_content = format_authorization_by_bearer(token)?;
+ req = req.header(header::AUTHORIZATION, auth_header_content);
+ }
+
+ let req = req.body(Buffer::new()).map_err(new_request_build_error)?;
+
+ self.info.http_client().send(req).await
+ }
+
pub async fn hf_resolve(
&self,
path: &str,
@@ -391,7 +414,7 @@ mod tests {
"https://huggingface.co",
);
- core.hf_list("path1", false).await?;
+ core.hf_list("path1", false, None).await?;
let url = mock_client.get_captured_url();
assert_eq!(
@@ -411,7 +434,7 @@ mod tests {
"https://huggingface.co",
);
- core.hf_list("path2", true).await?;
+ core.hf_list("path2", true, None).await?;
let url = mock_client.get_captured_url();
assert_eq!(
@@ -422,6 +445,26 @@ mod tests {
Ok(())
}
+ #[tokio::test]
+ async fn test_hf_list_url_with_cursor() -> Result<()> {
+ let (core, mock_client) = create_test_core(
+ RepoType::Model,
+ "org/model",
+ "main",
+ "https://huggingface.co",
+ );
+
+ core.hf_list("path3", false, Some("abc123")).await?;
+
+ let url = mock_client.get_captured_url();
+ assert_eq!(
+ url,
+
"https://huggingface.co/api/models/org/model/tree/main/path3?expand=True&cursor=abc123"
+ );
+
+ Ok(())
+ }
+
#[tokio::test]
async fn test_hf_resolve_url_model() -> Result<()> {
let (core, mock_client) = create_test_core(
diff --git a/core/src/services/huggingface/lister.rs
b/core/src/services/huggingface/lister.rs
index b8d768eea..83a60f79c 100644
--- a/core/src/services/huggingface/lister.rs
+++ b/core/src/services/huggingface/lister.rs
@@ -43,7 +43,12 @@ impl HuggingfaceLister {
impl oio::PageList for HuggingfaceLister {
async fn next_page(&self, ctx: &mut oio::PageContext) -> Result<()> {
- let response = self.core.hf_list(&self.path, self.recursive).await?;
+ // Use the next page URL from context if available, otherwise start from beginning
+ let response = if ctx.token.is_empty() {
+ self.core.hf_list(&self.path, self.recursive, None).await?
+ } else {
+ self.core.hf_list_with_url(&ctx.token).await?
+ };
let status_code = response.status();
if !status_code.is_success() {
@@ -51,11 +56,19 @@ impl oio::PageList for HuggingfaceLister {
return Err(error);
}
+ // Parse Link header for pagination
+ let next_link = parse_link_header(response.headers());
+
let bytes = response.into_body();
let decoded_response: Vec<HuggingfaceStatus> =
serde_json::from_reader(bytes.reader()).map_err(new_json_deserialize_error)?;
- ctx.done = true;
+ // Only mark as done if there's no next page
+ if let Some(next_url) = next_link {
+ ctx.token = next_url;
+ } else {
+ ctx.done = true;
+ }
for status in decoded_response {
let entry_type = match status.type_.as_str() {
@@ -89,3 +102,102 @@ impl oio::PageList for HuggingfaceLister {
Ok(())
}
}
+
+/// Parse the Link header to extract the next page URL.
+/// HuggingFace API returns pagination info in the Link header with rel="next".
+/// Example: <https://huggingface.co/api/models/.../tree?cursor=xxx>; rel="next"
+fn parse_link_header(headers: &http::HeaderMap) -> Option<String> {
+ let link_header = headers.get(http::header::LINK)?;
+ let link_str = link_header.to_str().ok()?;
+
+ // Parse Link header format: <url>; rel="next"
+ for link in link_str.split(',') {
+ if link.contains("rel=\"next\"") || link.contains("rel='next'") {
+ // Extract URL from <url> using split_once for cleaner parsing
+ let (_, rest) = link.split_once('<')?;
+ let (inside, _) = rest.split_once('>')?;
+ return Some(inside.to_string());
+ }
+ }
+
+ None
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use http::HeaderMap;
+ use http::HeaderValue;
+
+ #[test]
+ fn test_parse_link_header_with_next() {
+ let mut headers = HeaderMap::new();
+ headers.insert(
+ http::header::LINK,
+ HeaderValue::from_static(
+
r#"<https://huggingface.co/api/models/test/tree/main?cursor=abc123>;
rel="next""#,
+ ),
+ );
+
+ let result = parse_link_header(&headers);
+ assert_eq!(
+ result,
+
Some("https://huggingface.co/api/models/test/tree/main?cursor=abc123".to_string())
+ );
+ }
+
+ #[test]
+ fn test_parse_link_header_with_single_quotes() {
+ let mut headers = HeaderMap::new();
+ headers.insert(
+ http::header::LINK,
+ HeaderValue::from_static(
+
r#"<https://huggingface.co/api/models/test/tree/main?cursor=xyz>; rel='next'"#,
+ ),
+ );
+
+ let result = parse_link_header(&headers);
+ assert_eq!(
+ result,
+
Some("https://huggingface.co/api/models/test/tree/main?cursor=xyz".to_string())
+ );
+ }
+
+ #[test]
+ fn test_parse_link_header_without_next() {
+ let mut headers = HeaderMap::new();
+ headers.insert(
+ http::header::LINK,
+ HeaderValue::from_static(
+ r#"<https://huggingface.co/api/models/test/tree/main>;
rel="prev""#,
+ ),
+ );
+
+ let result = parse_link_header(&headers);
+ assert_eq!(result, None);
+ }
+
+ #[test]
+ fn test_parse_link_header_multiple_links() {
+ let mut headers = HeaderMap::new();
+ headers.insert(
+ http::header::LINK,
+ HeaderValue::from_static(
+ r#"<https://huggingface.co/api/prev>; rel="prev",
<https://huggingface.co/api/next?cursor=456>; rel="next""#,
+ ),
+ );
+
+ let result = parse_link_header(&headers);
+ assert_eq!(
+ result,
+ Some("https://huggingface.co/api/next?cursor=456".to_string())
+ );
+ }
+
+ #[test]
+ fn test_parse_link_header_no_header() {
+ let headers = HeaderMap::new();
+ let result = parse_link_header(&headers);
+ assert_eq!(result, None);
+ }
+}