This is an automated email from the ASF dual-hosted git repository.

xuanwo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opendal.git


The following commit(s) were added to refs/heads/main by this push:
     new 4e587a952 fix(services/huggingface): Implement pagination with Link header for large repos (#6832)
4e587a952 is described below

commit 4e587a9527298ec259052d8ca40b2f5fcb57673d
Author: Aryan Bagade <[email protected]>
AuthorDate: Fri Nov 28 22:51:55 2025 -0800

    fix(services/huggingface): Implement pagination with Link header for large repos (#6832)
    
    * fix(services/huggingface): Implement pagination with Link header for large repos
    
    This commit fixes list truncation on large HuggingFace repositories by implementing proper pagination support following the Hub tree API.
    
    Fixes #6830
    
    * refactor: Use split_once for cleaner Link header parsing and remove trim()
---
 core/src/services/huggingface/core.rs   |  49 +++++++++++++-
 core/src/services/huggingface/lister.rs | 116 +++++++++++++++++++++++++++++++-
 2 files changed, 160 insertions(+), 5 deletions(-)

diff --git a/core/src/services/huggingface/core.rs b/core/src/services/huggingface/core.rs
index 6aa635c59..12f036075 100644
--- a/core/src/services/huggingface/core.rs
+++ b/core/src/services/huggingface/core.rs
@@ -96,7 +96,12 @@ impl HuggingfaceCore {
         self.info.http_client().send(req).await
     }
 
-    pub async fn hf_list(&self, path: &str, recursive: bool) -> Result<Response<Buffer>> {
+    pub async fn hf_list(
+        &self,
+        path: &str,
+        recursive: bool,
+        cursor: Option<&str>,
+    ) -> Result<Response<Buffer>> {
         let p = build_abs_path(&self.root, path)
             .trim_end_matches('/')
             .to_string();
@@ -122,6 +127,10 @@ impl HuggingfaceCore {
             url.push_str("&recursive=True");
         }
 
+        if let Some(cursor_val) = cursor {
+            url.push_str(&format!("&cursor={}", cursor_val));
+        }
+
         let mut req = Request::get(&url);
         // Inject operation to the request.
         req = req.extension(Operation::List);
@@ -135,6 +144,20 @@ impl HuggingfaceCore {
         self.info.http_client().send(req).await
     }
 
+    pub async fn hf_list_with_url(&self, url: &str) -> Result<Response<Buffer>> {
+        let mut req = Request::get(url);
+        // Inject operation to the request.
+        req = req.extension(Operation::List);
+        if let Some(token) = &self.token {
+            let auth_header_content = format_authorization_by_bearer(token)?;
+            req = req.header(header::AUTHORIZATION, auth_header_content);
+        }
+
+        let req = req.body(Buffer::new()).map_err(new_request_build_error)?;
+
+        self.info.http_client().send(req).await
+    }
+
     pub async fn hf_resolve(
         &self,
         path: &str,
@@ -391,7 +414,7 @@ mod tests {
             "https://huggingface.co",
         );
 
-        core.hf_list("path1", false).await?;
+        core.hf_list("path1", false, None).await?;
 
         let url = mock_client.get_captured_url();
         assert_eq!(
@@ -411,7 +434,7 @@ mod tests {
             "https://huggingface.co",
         );
 
-        core.hf_list("path2", true).await?;
+        core.hf_list("path2", true, None).await?;
 
         let url = mock_client.get_captured_url();
         assert_eq!(
@@ -422,6 +445,26 @@ mod tests {
         Ok(())
     }
 
+    #[tokio::test]
+    async fn test_hf_list_url_with_cursor() -> Result<()> {
+        let (core, mock_client) = create_test_core(
+            RepoType::Model,
+            "org/model",
+            "main",
+            "https://huggingface.co",
+        );
+
+        core.hf_list("path3", false, Some("abc123")).await?;
+
+        let url = mock_client.get_captured_url();
+        assert_eq!(
+            url,
+            "https://huggingface.co/api/models/org/model/tree/main/path3?expand=True&cursor=abc123"
+        );
+
+        Ok(())
+    }
+
     #[tokio::test]
     async fn test_hf_resolve_url_model() -> Result<()> {
         let (core, mock_client) = create_test_core(
diff --git a/core/src/services/huggingface/lister.rs b/core/src/services/huggingface/lister.rs
index b8d768eea..83a60f79c 100644
--- a/core/src/services/huggingface/lister.rs
+++ b/core/src/services/huggingface/lister.rs
@@ -43,7 +43,12 @@ impl HuggingfaceLister {
 
 impl oio::PageList for HuggingfaceLister {
     async fn next_page(&self, ctx: &mut oio::PageContext) -> Result<()> {
-        let response = self.core.hf_list(&self.path, self.recursive).await?;
+        // Use the next page URL from context if available, otherwise start from beginning
+        let response = if ctx.token.is_empty() {
+            self.core.hf_list(&self.path, self.recursive, None).await?
+        } else {
+            self.core.hf_list_with_url(&ctx.token).await?
+        };
 
         let status_code = response.status();
         if !status_code.is_success() {
@@ -51,11 +56,19 @@ impl oio::PageList for HuggingfaceLister {
             return Err(error);
         }
 
+        // Parse Link header for pagination
+        let next_link = parse_link_header(response.headers());
+
         let bytes = response.into_body();
         let decoded_response: Vec<HuggingfaceStatus> =
             serde_json::from_reader(bytes.reader()).map_err(new_json_deserialize_error)?;
 
-        ctx.done = true;
+        // Only mark as done if there's no next page
+        if let Some(next_url) = next_link {
+            ctx.token = next_url;
+        } else {
+            ctx.done = true;
+        }
 
         for status in decoded_response {
             let entry_type = match status.type_.as_str() {
@@ -89,3 +102,102 @@ impl oio::PageList for HuggingfaceLister {
         Ok(())
     }
 }
+
+/// Parse the Link header to extract the next page URL.
+/// HuggingFace API returns pagination info in the Link header with rel="next".
+/// Example: <https://huggingface.co/api/models/.../tree?cursor=xxx>; rel="next"
+fn parse_link_header(headers: &http::HeaderMap) -> Option<String> {
+    let link_header = headers.get(http::header::LINK)?;
+    let link_str = link_header.to_str().ok()?;
+
+    // Parse Link header format: <url>; rel="next"
+    for link in link_str.split(',') {
+        if link.contains("rel=\"next\"") || link.contains("rel='next'") {
+            // Extract URL from <url> using split_once for cleaner parsing
+            let (_, rest) = link.split_once('<')?;
+            let (inside, _) = rest.split_once('>')?;
+            return Some(inside.to_string());
+        }
+    }
+
+    None
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use http::HeaderMap;
+    use http::HeaderValue;
+
+    #[test]
+    fn test_parse_link_header_with_next() {
+        let mut headers = HeaderMap::new();
+        headers.insert(
+            http::header::LINK,
+            HeaderValue::from_static(
+                r#"<https://huggingface.co/api/models/test/tree/main?cursor=abc123>; rel="next""#,
+            ),
+        );
+
+        let result = parse_link_header(&headers);
+        assert_eq!(
+            result,
+            Some("https://huggingface.co/api/models/test/tree/main?cursor=abc123".to_string())
+        );
+    }
+
+    #[test]
+    fn test_parse_link_header_with_single_quotes() {
+        let mut headers = HeaderMap::new();
+        headers.insert(
+            http::header::LINK,
+            HeaderValue::from_static(
+                r#"<https://huggingface.co/api/models/test/tree/main?cursor=xyz>; rel='next'"#,
+            ),
+        );
+
+        let result = parse_link_header(&headers);
+        assert_eq!(
+            result,
+            Some("https://huggingface.co/api/models/test/tree/main?cursor=xyz".to_string())
+        );
+    }
+
+    #[test]
+    fn test_parse_link_header_without_next() {
+        let mut headers = HeaderMap::new();
+        headers.insert(
+            http::header::LINK,
+            HeaderValue::from_static(
+                r#"<https://huggingface.co/api/models/test/tree/main>; rel="prev""#,
+            ),
+        );
+
+        let result = parse_link_header(&headers);
+        assert_eq!(result, None);
+    }
+
+    #[test]
+    fn test_parse_link_header_multiple_links() {
+        let mut headers = HeaderMap::new();
+        headers.insert(
+            http::header::LINK,
+            HeaderValue::from_static(
+                r#"<https://huggingface.co/api/prev>; rel="prev", <https://huggingface.co/api/next?cursor=456>; rel="next""#,
+            ),
+        );
+
+        let result = parse_link_header(&headers);
+        assert_eq!(
+            result,
+            Some("https://huggingface.co/api/next?cursor=456".to_string())
+        );
+    }
+
+    #[test]
+    fn test_parse_link_header_no_header() {
+        let headers = HeaderMap::new();
+        let result = parse_link_header(&headers);
+        assert_eq!(result, None);
+    }
+}

Reply via email to