This is an automated email from the ASF dual-hosted git repository.

xuanwo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-opendal.git


The following commit(s) were added to refs/heads/main by this push:
     new 547c23dd2 feat(core): service add HuggingFace file system (#3670)
547c23dd2 is described below

commit 547c23dd2cd96bc2be6c14a83c3d5df74f7f4b6a
Author: Morris Tai <[email protected]>
AuthorDate: Sat Dec 2 05:49:52 2023 -0500

    feat(core): service add HuggingFace file system (#3670)
    
    * feat: service add huggingface file system
    
    * chore: fix typo
    
    * feat: refactor `read_token` with `token`
    
    * feat: add HuggingFaceConfig implementation
    
    * chore: use better syntax implementation
    
    * chore: use format_authorization_by_bearer instead
    
    * chore: use Eq, PartialEq to make testing cleaner
    
    * feat: change scheme from `huggingface` to `hf`
    
    * feat: rename `HuggingFace` to `Huggingface`
    
    * chore: move message.rs to core.rs
    
    * feat: drop dependency serde_urlencoded
    
    * chore: fix for cargo clippy
    
    * feat: rename huggingface scheme
    
    * chore: polish code
    
    * chore: use `http::header` for well-known header names
    
    * feat: fix for PR review
    
    * chore: read `hf_read` to `hf_resolve`
    
    * chore: read `hf_read` to `hf_resolve`
---
 core/Cargo.toml                          |   1 +
 core/src/services/huggingface/backend.rs | 341 +++++++++++++++++++++++++
 core/src/services/huggingface/core.rs    | 415 +++++++++++++++++++++++++++++++
 core/src/services/huggingface/docs.md    |  63 +++++
 core/src/services/huggingface/error.rs   |  93 +++++++
 core/src/services/huggingface/lister.rs  |  89 +++++++
 core/src/services/huggingface/mod.rs     |  24 ++
 core/src/services/mod.rs                 |   7 +
 core/src/types/scheme.rs                 |   6 +
 9 files changed, 1039 insertions(+)

diff --git a/core/Cargo.toml b/core/Cargo.toml
index 5c71da150..55541ee6c 100644
--- a/core/Cargo.toml
+++ b/core/Cargo.toml
@@ -151,6 +151,7 @@ services-ghac = []
 services-gridfs = ["dep:mongodb"]
 services-hdfs = ["dep:hdrs"]
 services-http = []
+services-huggingface = []
 services-ipfs = ["dep:prost"]
 services-ipmfs = []
 services-libsql = ["dep:hrana-client-proto"]
diff --git a/core/src/services/huggingface/backend.rs 
b/core/src/services/huggingface/backend.rs
new file mode 100644
index 000000000..49502680e
--- /dev/null
+++ b/core/src/services/huggingface/backend.rs
@@ -0,0 +1,341 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::collections::HashMap;
+use std::fmt::Debug;
+use std::fmt::Formatter;
+use std::sync::Arc;
+
+use async_trait::async_trait;
+use http::StatusCode;
+use log::debug;
+use serde::Deserialize;
+
+use super::core::HuggingfaceCore;
+use super::core::HuggingfaceStatus;
+use super::error::parse_error;
+use super::lister::HuggingfaceLister;
+use crate::raw::*;
+use crate::*;
+
+/// Configuration for Huggingface service support.
+///
+/// All fields are optional at the type level; thanks to `#[serde(default)]`
+/// any key missing from the input deserializes to `None`.
+#[derive(Default, Deserialize, Clone)]
+#[serde(default)]
+#[non_exhaustive]
+pub struct HuggingfaceConfig {
+    /// Repo type of this backend. Default is model.
+    ///
+    /// Available values:
+    /// - model
+    /// - dataset
+    pub repo_type: Option<String>,
+    /// Repo id of this backend.
+    ///
+    /// This is required.
+    pub repo_id: Option<String>,
+    /// Revision of this backend.
+    ///
+    /// Default is main.
+    pub revision: Option<String>,
+    /// Root of this backend. Can be "/path/to/dir".
+    ///
+    /// Default is "/".
+    pub root: Option<String>,
+    /// Token of this backend.
+    ///
+    /// This is optional. The value is never printed by the `Debug` impl.
+    pub token: Option<String>,
+}
+
+impl Debug for HuggingfaceConfig {
+    /// Prints only the options that are set and never the token itself,
+    /// so the output can be logged safely.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        let mut out = f.debug_struct("HuggingfaceConfig");
+
+        // Non-secret options, in the order they are emitted.
+        let plain_fields = [
+            ("repo_type", &self.repo_type),
+            ("repo_id", &self.repo_id),
+            ("revision", &self.revision),
+            ("root", &self.root),
+        ];
+        for (name, value) in plain_fields {
+            if let Some(value) = value {
+                out.field(name, &value);
+            }
+        }
+
+        // The token is a secret: only reveal that one has been configured.
+        if self.token.is_some() {
+            out.field("token", &"<redacted>");
+        }
+
+        out.finish()
+    }
+}
+
+/// [Huggingface](https://huggingface.co/docs/huggingface_hub/package_reference/hf_api)'s API support.
+#[doc = include_str!("docs.md")]
+#[derive(Default, Clone)]
+pub struct HuggingfaceBuilder {
+    // Options gathered by the setter methods or by `from_map`; read by `build`.
+    config: HuggingfaceConfig,
+}
+
+impl Debug for HuggingfaceBuilder {
+    /// Prints the wrapped config under the struct name `Builder`.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("Builder")
+            .field("config", &self.config)
+            .finish()
+    }
+}
+
+impl HuggingfaceBuilder {
+    /// Choose the repository type. Default is model.
+    ///
+    /// Available values:
+    /// - model
+    /// - dataset
+    ///
+    /// Currently, only models and datasets are supported.
+    /// [Reference](https://huggingface.co/docs/hub/repositories)
+    ///
+    /// An empty string leaves the current setting untouched.
+    pub fn repo_type(&mut self, repo_type: &str) -> &mut Self {
+        if repo_type.is_empty() {
+            return self;
+        }
+        self.config.repo_type = Some(repo_type.to_owned());
+        self
+    }
+
+    /// Choose the repository id. This is required.
+    ///
+    /// A repo id is made of the account name and the repository name.
+    ///
+    /// A model's repo id looks like:
+    /// - meta-llama/Llama-2-7b
+    ///
+    /// A dataset's repo id looks like:
+    /// - databricks/databricks-dolly-15k
+    ///
+    /// An empty string leaves the current setting untouched.
+    pub fn repo_id(&mut self, repo_id: &str) -> &mut Self {
+        if repo_id.is_empty() {
+            return self;
+        }
+        self.config.repo_id = Some(repo_id.to_owned());
+        self
+    }
+
+    /// Choose the revision. Default is main.
+    ///
+    /// A revision can be a branch name or a commit hash, for example:
+    /// - main
+    /// - 1d0c4eb
+    ///
+    /// An empty string leaves the current setting untouched.
+    pub fn revision(&mut self, revision: &str) -> &mut Self {
+        if revision.is_empty() {
+            return self;
+        }
+        self.config.revision = Some(revision.to_owned());
+        self
+    }
+
+    /// Choose the root directory.
+    ///
+    /// All operations will happen under this root.
+    /// An empty string leaves the current setting untouched.
+    pub fn root(&mut self, root: &str) -> &mut Self {
+        if root.is_empty() {
+            return self;
+        }
+        self.config.root = Some(root.to_owned());
+        self
+    }
+
+    /// Provide the access token.
+    ///
+    /// This is optional. An empty string leaves the current setting untouched.
+    pub fn token(&mut self, token: &str) -> &mut Self {
+        if token.is_empty() {
+            return self;
+        }
+        self.config.token = Some(token.to_owned());
+        self
+    }
+}
+
+impl Builder for HuggingfaceBuilder {
+    const SCHEME: Scheme = Scheme::Huggingface;
+    type Accessor = HuggingfaceBackend;
+
+    /// Create a builder from string key/value options.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the map cannot be deserialized into a `HuggingfaceConfig`.
+    fn from_map(map: HashMap<String, String>) -> Self {
+        let config = HuggingfaceConfig::deserialize(ConfigDeserializer::new(map))
+            .expect("config deserialize must succeed");
+
+        HuggingfaceBuilder { config }
+    }
+
+    /// Build a HuggingfaceBackend.
+    ///
+    /// The builder is only read, so it can be reused for another `build` call
+    /// and yields the same configuration again.
+    ///
+    /// # Errors
+    ///
+    /// Returns `ErrorKind::ConfigInvalid` when `repo_type` is neither `model`
+    /// nor `dataset`, or when `repo_id` is not set. Errors from creating the
+    /// HTTP client are passed through.
+    fn build(&mut self) -> Result<Self::Accessor> {
+        debug!("backend build started: {:?}", &self);
+
+        let repo_type = match self.config.repo_type.as_deref() {
+            // An unset repo type falls back to `model`.
+            None | Some("model") => RepoType::Model,
+            Some("dataset") => RepoType::Dataset,
+            Some("space") => {
+                return Err(Error::new(
+                    ErrorKind::ConfigInvalid,
+                    "repo type \"space\" is unsupported",
+                )
+                .with_operation("Builder::build")
+                .with_context("service", Scheme::Huggingface));
+            }
+            Some(repo_type) => {
+                return Err(Error::new(
+                    ErrorKind::ConfigInvalid,
+                    format!("unknown repo_type: {}", repo_type).as_str(),
+                )
+                .with_operation("Builder::build")
+                .with_context("service", Scheme::Huggingface));
+            }
+        };
+        debug!("backend use repo_type: {:?}", &repo_type);
+
+        let repo_id = self.config.repo_id.clone().ok_or_else(|| {
+            Error::new(ErrorKind::ConfigInvalid, "repo_id is empty")
+                .with_operation("Builder::build")
+                .with_context("service", Scheme::Huggingface)
+        })?;
+        debug!("backend use repo_id: {}", &repo_id);
+
+        let revision = self
+            .config
+            .revision
+            .clone()
+            .unwrap_or_else(|| "main".to_string());
+        debug!("backend use revision: {}", &revision);
+
+        // Borrow instead of `take()` so the configured root survives this call
+        // and still shows up in the final debug log.
+        let root = normalize_root(self.config.root.as_deref().unwrap_or_default());
+        debug!("backend use root: {}", &root);
+
+        let token = self.config.token.clone();
+
+        let client = HttpClient::new()?;
+
+        debug!("backend build finished: {:?}", &self);
+        Ok(HuggingfaceBackend {
+            core: Arc::new(HuggingfaceCore {
+                repo_type,
+                repo_id,
+                revision,
+                root,
+                token,
+                client,
+            }),
+        })
+    }
+}
+
+/// Backend for Huggingface service
+///
+/// Cloning is cheap: clones share the same `HuggingfaceCore` through an `Arc`.
+#[derive(Debug, Clone)]
+pub struct HuggingfaceBackend {
+    // Shared request-building state (repo coordinates, token, HTTP client).
+    core: Arc<HuggingfaceCore>,
+}
+
+#[async_trait]
+impl Accessor for HuggingfaceBackend {
+    type Reader = IncomingAsyncBody;
+    type BlockingReader = ();
+    type Writer = ();
+    type BlockingWriter = ();
+    type Lister = oio::PageLister<HuggingfaceLister>;
+    type BlockingLister = ();
+
+    /// Describe this backend: stat, (ranged) read and list are supported.
+    fn info(&self) -> AccessorInfo {
+        let mut am = AccessorInfo::default();
+        am.set_scheme(Scheme::Huggingface)
+            .set_native_capability(Capability {
+                stat: true,
+
+                read: true,
+                read_can_next: true,
+                read_with_range: true,
+
+                list: true,
+                list_without_recursive: true,
+                list_with_recursive: true,
+
+                ..Default::default()
+            });
+        am
+    }
+
+    /// Read the file at `path`, optionally restricted to the range in `args`.
+    ///
+    /// A range that cannot be satisfied yields an empty body instead of an error.
+    async fn read(&self, path: &str, args: OpRead) -> Result<(RpRead, Self::Reader)> {
+        let resp = self.core.hf_resolve(path, args).await?;
+
+        match resp.status() {
+            // Full reads are answered with 200, ranged reads with 206.
+            StatusCode::OK | StatusCode::PARTIAL_CONTENT => {
+                let size = parse_content_length(resp.headers())?;
+                Ok((RpRead::new().with_size(size), resp.into_body()))
+            }
+            StatusCode::RANGE_NOT_SATISFIABLE => Ok((RpRead::new(), IncomingAsyncBody::empty())),
+            _ => Err(parse_error(resp).await?),
+        }
+    }
+
+    /// Fetch metadata for `path` through the paths-info API.
+    ///
+    /// # Errors
+    ///
+    /// Returns `ErrorKind::NotFound` when the API answers with an empty array
+    /// and `ErrorKind::Unexpected` when the entry type is neither `file` nor
+    /// `directory`.
+    async fn stat(&self, path: &str, _: OpStat) -> Result<RpStat> {
+        // Stat root always returns a DIR.
+        if path == "/" {
+            return Ok(RpStat::new(Metadata::new(EntryMode::DIR)));
+        }
+
+        let resp = self.core.hf_path_info(path).await?;
+
+        match resp.status() {
+            StatusCode::OK => {
+                let mut meta = parse_into_metadata(path, resp.headers())?;
+                let bs = resp.into_body().bytes().await?;
+
+                let decoded_response = serde_json::from_slice::<Vec<HuggingfaceStatus>>(&bs)
+                    .map_err(new_json_deserialize_error)?;
+
+                // NOTE: if the file is not found, the server will return 200 with an empty array
+                let Some(status) = decoded_response.first() else {
+                    return Err(Error::new(ErrorKind::NotFound, "path not found"));
+                };
+
+                if let Some(commit_info) = status.last_commit.as_ref() {
+                    meta.set_last_modified(parse_datetime_from_rfc3339(
+                        commit_info.date.as_str(),
+                    )?);
+                }
+
+                // The response headers describe the JSON document itself, so the
+                // entry's size has to come from the decoded payload.
+                meta.set_content_length(status.size);
+
+                match status.type_.as_str() {
+                    "directory" => meta.set_mode(EntryMode::DIR),
+                    "file" => meta.set_mode(EntryMode::FILE),
+                    _ => return Err(Error::new(ErrorKind::Unexpected, "unknown status type")),
+                };
+
+                Ok(RpStat::new(meta))
+            }
+            _ => Err(parse_error(resp).await?),
+        }
+    }
+
+    /// Create a lazily-paged lister for `path`; no request is sent here.
+    async fn list(&self, path: &str, args: OpList) -> Result<(RpList, Self::Lister)> {
+        let l = HuggingfaceLister::new(self.core.clone(), path.to_string(), args.recursive());
+
+        Ok((RpList::default(), oio::PageLister::new(l)))
+    }
+}
+
+/// Repository type of Huggingface. Currently, we only support `model` and `dataset`.
+/// [Reference](https://huggingface.co/docs/hub/repositories)
+#[derive(Debug, Clone, Copy)]
+pub enum RepoType {
+    /// A model repository; selected by `repo_type = "model"` or when unset.
+    Model,
+    /// A dataset repository; selected by `repo_type = "dataset"`.
+    Dataset,
+}
diff --git a/core/src/services/huggingface/core.rs 
b/core/src/services/huggingface/core.rs
new file mode 100644
index 000000000..45ec528ed
--- /dev/null
+++ b/core/src/services/huggingface/core.rs
@@ -0,0 +1,415 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::fmt::Debug;
+
+use bytes::Bytes;
+use http::Request;
+use http::Response;
+use http::{header, StatusCode};
+use serde::Deserialize;
+
+use super::backend::RepoType;
+use super::error::parse_error;
+use crate::raw::*;
+use crate::*;
+
+/// Shared state used to build and send requests to huggingface.co.
+pub struct HuggingfaceCore {
+    /// Selects between the model and dataset URL layouts.
+    pub repo_type: RepoType,
+    /// Repository id, inserted verbatim into request URLs.
+    pub repo_id: String,
+    /// Revision, inserted verbatim into request URLs.
+    pub revision: String,
+    /// Root that every request path is resolved against.
+    pub root: String,
+    /// Optional token; when set it is sent as a bearer `Authorization` header.
+    pub token: Option<String>,
+
+    /// HTTP client used to send the requests.
+    pub client: HttpClient,
+}
+
+impl Debug for HuggingfaceCore {
+    /// Prints the repository coordinates; `token` and `client` are omitted
+    /// and the output is marked as non-exhaustive.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let mut ds = f.debug_struct("HuggingfaceCore");
+        ds.field("repo_type", &self.repo_type);
+        ds.field("repo_id", &self.repo_id);
+        ds.field("revision", &self.revision);
+        ds.field("root", &self.root);
+        ds.finish_non_exhaustive()
+    }
+}
+
+impl HuggingfaceCore {
+    /// Ask the paths-info API about the entry at `path`.
+    ///
+    /// Sends a form-encoded POST carrying the percent-encoded absolute path.
+    /// The response is returned untouched; callers must check its status.
+    pub async fn hf_path_info(&self, path: &str) -> Result<Response<IncomingAsyncBody>> {
+        let p = build_abs_path(&self.root, path)
+            .trim_end_matches('/')
+            .to_string();
+
+        let url = match self.repo_type {
+            RepoType::Model => format!(
+                "https://huggingface.co/api/models/{}/paths-info/{}",
+                &self.repo_id, &self.revision
+            ),
+            RepoType::Dataset => format!(
+                "https://huggingface.co/api/datasets/{}/paths-info/{}",
+                &self.repo_id, &self.revision
+            ),
+        };
+
+        let mut req = Request::post(&url);
+
+        if let Some(token) = &self.token {
+            let auth_header_content = format_authorization_by_bearer(token)?;
+            req = req.header(header::AUTHORIZATION, auth_header_content);
+        }
+
+        req = req.header(header::CONTENT_TYPE, "application/x-www-form-urlencoded");
+
+        let req_body = format!("paths={}&expand=True", percent_encode_path(&p));
+
+        let req = req
+            .body(AsyncBody::Bytes(Bytes::from(req_body)))
+            .map_err(new_request_build_error)?;
+
+        self.client.send(req).await
+    }
+
+    /// List the entries under `path` through the tree API.
+    ///
+    /// When `recursive` is true, `recursive=True` is added to the query.
+    /// The response is returned untouched; callers must check its status.
+    pub async fn hf_list(
+        &self,
+        path: &str,
+        recursive: bool,
+    ) -> Result<Response<IncomingAsyncBody>> {
+        let p = build_abs_path(&self.root, path)
+            .trim_end_matches('/')
+            .to_string();
+
+        let mut url = match self.repo_type {
+            RepoType::Model => format!(
+                "https://huggingface.co/api/models/{}/tree/{}/{}?expand=True",
+                &self.repo_id,
+                &self.revision,
+                percent_encode_path(&p)
+            ),
+            RepoType::Dataset => format!(
+                "https://huggingface.co/api/datasets/{}/tree/{}/{}?expand=True",
+                &self.repo_id,
+                &self.revision,
+                percent_encode_path(&p)
+            ),
+        };
+
+        if recursive {
+            url.push_str("&recursive=True");
+        }
+
+        let mut req = Request::get(&url);
+
+        if let Some(token) = &self.token {
+            let auth_header_content = format_authorization_by_bearer(token)?;
+            req = req.header(header::AUTHORIZATION, auth_header_content);
+        }
+
+        let req = req
+            .body(AsyncBody::Empty)
+            .map_err(new_request_build_error)?;
+
+        self.client.send(req).await
+    }
+
+    /// Download the file at `path` through the resolve endpoint.
+    ///
+    /// A `Range` header is added unless `arg` asks for the full content.
+    ///
+    /// Responses with status 200, 206 (partial content) and 416 (range not
+    /// satisfiable) are handed back for the caller to interpret. Any other
+    /// status is converted into an error.
+    pub async fn hf_resolve(&self, path: &str, arg: OpRead) -> Result<Response<IncomingAsyncBody>> {
+        let p = build_abs_path(&self.root, path)
+            .trim_end_matches('/')
+            .to_string();
+
+        let url = match self.repo_type {
+            RepoType::Model => format!(
+                "https://huggingface.co/{}/resolve/{}/{}",
+                &self.repo_id,
+                &self.revision,
+                percent_encode_path(&p)
+            ),
+            RepoType::Dataset => format!(
+                "https://huggingface.co/datasets/{}/resolve/{}/{}",
+                &self.repo_id,
+                &self.revision,
+                percent_encode_path(&p)
+            ),
+        };
+
+        let mut req = Request::get(&url);
+
+        if let Some(token) = &self.token {
+            let auth_header_content = format_authorization_by_bearer(token)?;
+            req = req.header(header::AUTHORIZATION, auth_header_content);
+        }
+
+        let range = arg.range();
+        if !range.is_full() {
+            req = req.header(header::RANGE, &range.to_header());
+        }
+
+        let req = req
+            .body(AsyncBody::Empty)
+            .map_err(new_request_build_error)?;
+
+        let resp = self.client.send(req).await?;
+
+        match resp.status() {
+            // Range requests are answered with 206 or 416 rather than 200;
+            // treating those as failures would break every ranged read.
+            StatusCode::OK | StatusCode::PARTIAL_CONTENT | StatusCode::RANGE_NOT_SATISFIABLE => {
+                Ok(resp)
+            }
+            _ => Err(parse_error(resp).await?),
+        }
+    }
+}
+
+/// One entry of the JSON array decoded from a paths-info response (the tests
+/// in this file also decode a list response into it).
+///
+/// JSON keys are camelCase; optional sections deserialize to `None` when absent.
+#[derive(Deserialize, Eq, PartialEq, Debug)]
+#[serde(rename_all = "camelCase")]
+// Fields the service never reads would otherwise trigger dead-code warnings.
+#[allow(dead_code)]
+pub(super) struct HuggingfaceStatus {
+    /// Entry kind from the JSON `type` key (`type` is a Rust keyword, hence
+    /// the rename). `stat` recognizes "file" and "directory".
+    #[serde(rename = "type")]
+    pub type_: String,
+    /// Value of the `oid` key.
+    pub oid: String,
+    /// Value of the `size` key.
+    pub size: u64,
+    /// Contents of the `lfs` object, when the entry carries one.
+    pub lfs: Option<HuggingfaceLfs>,
+    /// Value of the `path` key.
+    pub path: String,
+    /// Contents of the `lastCommit` object; `stat` takes the last-modified
+    /// time from it.
+    pub last_commit: Option<HuggingfaceLastCommit>,
+    /// Contents of the `security` object, when present.
+    pub security: Option<HuggingfaceSecurity>,
+}
+
+/// The `lfs` object of an entry, decoded from camelCase JSON keys.
+#[derive(Deserialize, Eq, PartialEq, Debug)]
+#[serde(rename_all = "camelCase")]
+// Fields the service never reads would otherwise trigger dead-code warnings.
+#[allow(dead_code)]
+pub(super) struct HuggingfaceLfs {
+    /// Value of the `oid` key.
+    pub oid: String,
+    /// Value of the `size` key.
+    pub size: u64,
+    /// Value of the `pointerSize` key.
+    pub pointer_size: u64,
+}
+
+/// The `lastCommit` object of an entry, decoded from camelCase JSON keys.
+#[derive(Deserialize, Eq, PartialEq, Debug)]
+#[serde(rename_all = "camelCase")]
+// Fields the service never reads would otherwise trigger dead-code warnings.
+#[allow(dead_code)]
+pub(super) struct HuggingfaceLastCommit {
+    /// Value of the `id` key.
+    pub id: String,
+    /// Value of the `title` key.
+    pub title: String,
+    /// Value of the `date` key; `stat` parses it as an RFC 3339 timestamp.
+    pub date: String,
+}
+
+/// The `security` object of an entry, decoded from camelCase JSON keys.
+#[derive(Deserialize, Eq, PartialEq, Debug)]
+#[serde(rename_all = "camelCase")]
+// Fields the service never reads would otherwise trigger dead-code warnings.
+#[allow(dead_code)]
+pub(super) struct HuggingfaceSecurity {
+    /// Value of the `blobId` key.
+    pub blob_id: String,
+    /// Value of the `name` key.
+    pub name: String,
+    /// Value of the `safe` key.
+    pub safe: bool,
+    /// Contents of the `avScan` object, when present.
+    pub av_scan: Option<HuggingfaceAvScan>,
+    /// Contents of the `pickleImportScan` object, when present.
+    pub pickle_import_scan: Option<HuggingfacePickleImportScan>,
+}
+
+/// The `avScan` object of a security report, decoded from camelCase JSON keys.
+#[derive(Deserialize, Eq, PartialEq, Debug)]
+// Fields the service never reads would otherwise trigger dead-code warnings.
+#[allow(dead_code)]
+#[serde(rename_all = "camelCase")]
+pub(super) struct HuggingfaceAvScan {
+    /// Value of the `virusFound` key.
+    pub virus_found: bool,
+    /// Value of the `virusNames` key; `None` when the key is null or absent.
+    pub virus_names: Option<Vec<String>>,
+}
+
+/// The `pickleImportScan` object of a security report, decoded from camelCase
+/// JSON keys.
+#[derive(Deserialize, Eq, PartialEq, Debug)]
+#[serde(rename_all = "camelCase")]
+// Fields the service never reads would otherwise trigger dead-code warnings.
+#[allow(dead_code)]
+pub(super) struct HuggingfacePickleImportScan {
+    /// Value of the `highestSafetyLevel` key.
+    pub highest_safety_level: String,
+    /// Entries of the `imports` array.
+    pub imports: Vec<HuggingfaceImport>,
+}
+
+/// One element of the `imports` array inside a pickle import scan.
+#[derive(Deserialize, Eq, PartialEq, Debug)]
+// Fields the service never reads would otherwise trigger dead-code warnings.
+#[allow(dead_code)]
+pub(super) struct HuggingfaceImport {
+    /// Value of the `module` key.
+    pub module: String,
+    /// Value of the `name` key.
+    pub name: String,
+    /// Value of the `safety` key.
+    pub safety: String,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::raw::new_json_deserialize_error;
+    use crate::types::Result;
+    use bytes::Bytes;
+
+    /// A list-style response (no `lastCommit`/`security` keys) decodes into
+    /// entries whose optional sections are `None`.
+    #[test]
+    fn parse_list_response_test() -> Result<()> {
+        let resp = Bytes::from(
+            r#"
+            [
+                {
+                    "type": "file",
+                    "oid": "45fa7c3d85ee7dd4139adbc056da25ae136a65f2",
+                    "size": 69512435,
+                    "lfs": {
+                        "oid": 
"b43f4c2ea569da1d66ca74e26ca8ea4430dfc29195e97144b2d0b4f3f6cafa1c",
+                        "size": 69512435,
+                        "pointerSize": 133
+                    },
+                    "path": "maelstrom/lib/maelstrom.jar"
+                },
+                {
+                    "type": "directory",
+                    "oid": 
"b43f4c2ea569da1d66ca74e26ca8ea4430dfc29195e97144b2d0b4f3f6cafa1c",
+                    "size": 69512435,
+                    "path": "maelstrom/lib/plugins"
+                }
+            ]
+            "#,
+        );
+
+        let decoded_response = 
serde_json::from_slice::<Vec<HuggingfaceStatus>>(&resp)
+            .map_err(new_json_deserialize_error)?;
+
+        assert_eq!(decoded_response.len(), 2);
+
+        let file_entry = HuggingfaceStatus {
+            type_: "file".to_string(),
+            oid: "45fa7c3d85ee7dd4139adbc056da25ae136a65f2".to_string(),
+            size: 69512435,
+            lfs: Some(HuggingfaceLfs {
+                oid: 
"b43f4c2ea569da1d66ca74e26ca8ea4430dfc29195e97144b2d0b4f3f6cafa1c".to_string(),
+                size: 69512435,
+                pointer_size: 133,
+            }),
+            path: "maelstrom/lib/maelstrom.jar".to_string(),
+            last_commit: None,
+            security: None,
+        };
+
+        assert_eq!(decoded_response[0], file_entry);
+
+        let dir_entry = HuggingfaceStatus {
+            type_: "directory".to_string(),
+            oid: 
"b43f4c2ea569da1d66ca74e26ca8ea4430dfc29195e97144b2d0b4f3f6cafa1c".to_string(),
+            size: 69512435,
+            lfs: None,
+            path: "maelstrom/lib/plugins".to_string(),
+            last_commit: None,
+            security: None,
+        };
+
+        assert_eq!(decoded_response[1], dir_entry);
+
+        Ok(())
+    }
+
+    /// A fully expanded entry (with `lastCommit` and `security`, including a
+    /// null `virusNames`) decodes into every nested struct.
+    #[test]
+    fn parse_files_info_test() -> Result<()> {
+        let resp = Bytes::from(
+            r#"
+            [
+                {
+                    "type": "file",
+                    "oid": "45fa7c3d85ee7dd4139adbc056da25ae136a65f2",
+                    "size": 69512435,
+                    "lfs": {
+                        "oid": 
"b43f4c2ea569da1d66ca74e26ca8ea4430dfc29195e97144b2d0b4f3f6cafa1c",
+                        "size": 69512435,
+                        "pointerSize": 133
+                    },
+                    "path": "maelstrom/lib/maelstrom.jar",
+                    "lastCommit": {
+                        "id": "bc1ef030bf3743290d5e190695ab94582e51ae2f",
+                        "title": "Upload 141 files",
+                        "date": "2023-11-17T23:50:28.000Z"
+                    },
+                    "security": {
+                        "blobId": "45fa7c3d85ee7dd4139adbc056da25ae136a65f2",
+                        "name": "maelstrom/lib/maelstrom.jar",
+                        "safe": true,
+                        "avScan": {
+                            "virusFound": false,
+                            "virusNames": null
+                        },
+                        "pickleImportScan": {
+                            "highestSafetyLevel": "innocuous",
+                            "imports": [
+                                {"module": "torch", "name": "FloatStorage", 
"safety": "innocuous"},
+                                {"module": "collections", "name": 
"OrderedDict", "safety": "innocuous"},
+                                {"module": "torch", "name": "LongStorage", 
"safety": "innocuous"},
+                                {"module": "torch._utils", "name": 
"_rebuild_tensor_v2", "safety": "innocuous"}
+                            ]
+                        }
+                    }
+                }
+            ]
+            "#,
+        );
+
+        let decoded_response = 
serde_json::from_slice::<Vec<HuggingfaceStatus>>(&resp)
+            .map_err(new_json_deserialize_error)?;
+
+        assert_eq!(decoded_response.len(), 1);
+
+        let file_info = HuggingfaceStatus {
+            type_: "file".to_string(),
+            oid: "45fa7c3d85ee7dd4139adbc056da25ae136a65f2".to_string(),
+            size: 69512435,
+            lfs: Some(HuggingfaceLfs {
+                oid: 
"b43f4c2ea569da1d66ca74e26ca8ea4430dfc29195e97144b2d0b4f3f6cafa1c".to_string(),
+                size: 69512435,
+                pointer_size: 133,
+            }),
+            path: "maelstrom/lib/maelstrom.jar".to_string(),
+            last_commit: Some(HuggingfaceLastCommit {
+                id: "bc1ef030bf3743290d5e190695ab94582e51ae2f".to_string(),
+                title: "Upload 141 files".to_string(),
+                date: "2023-11-17T23:50:28.000Z".to_string(),
+            }),
+            security: Some(HuggingfaceSecurity {
+                blob_id: 
"45fa7c3d85ee7dd4139adbc056da25ae136a65f2".to_string(),
+                name: "maelstrom/lib/maelstrom.jar".to_string(),
+                safe: true,
+                av_scan: Some(HuggingfaceAvScan {
+                    virus_found: false,
+                    virus_names: None,
+                }),
+                pickle_import_scan: Some(HuggingfacePickleImportScan {
+                    highest_safety_level: "innocuous".to_string(),
+                    imports: vec![
+                        HuggingfaceImport {
+                            module: "torch".to_string(),
+                            name: "FloatStorage".to_string(),
+                            safety: "innocuous".to_string(),
+                        },
+                        HuggingfaceImport {
+                            module: "collections".to_string(),
+                            name: "OrderedDict".to_string(),
+                            safety: "innocuous".to_string(),
+                        },
+                        HuggingfaceImport {
+                            module: "torch".to_string(),
+                            name: "LongStorage".to_string(),
+                            safety: "innocuous".to_string(),
+                        },
+                        HuggingfaceImport {
+                            module: "torch._utils".to_string(),
+                            name: "_rebuild_tensor_v2".to_string(),
+                            safety: "innocuous".to_string(),
+                        },
+                    ],
+                }),
+            }),
+        };
+
+        assert_eq!(decoded_response[0], file_info);
+
+        Ok(())
+    }
+}
diff --git a/core/src/services/huggingface/docs.md 
b/core/src/services/huggingface/docs.md
new file mode 100644
index 000000000..951087811
--- /dev/null
+++ b/core/src/services/huggingface/docs.md
@@ -0,0 +1,63 @@
+This service will visit the [Huggingface 
API](https://huggingface.co/docs/huggingface_hub/package_reference/hf_api) to 
access the Huggingface File System.
+Currently, only `model` and `dataset` repositories are supported, and the available operations are limited to `read`, `list`, and `stat`.
+
+Huggingface doesn't host official HTTP API docs. Detailed HTTP request API 
information can be found on the [Huggingface 
Hub](https://github.com/huggingface/huggingface_hub).
+
+## Capabilities
+
+This service can be used to:
+
+- [x] stat
+- [x] read
+- [ ] write
+- [ ] create_dir
+- [ ] delete
+- [ ] copy
+- [ ] rename
+- [x] list
+- [ ] ~~scan~~
+- [ ] ~~presign~~
+- [ ] blocking
+
+## Configurations
+
+- `repo_type`: The type of the repository.
+- `repo_id`: The id of the repository.
+- `revision`: The revision of the repository.
+- `root`: Set the work directory for backend.
+- `token`: The token for accessing the repository.
+
+Refer to [`Builder`]'s public API docs for more information.
+
+## Examples
+
+### Via Builder
+
+```rust
+use std::sync::Arc;
+
+use anyhow::Result;
+use opendal::services::Huggingface;
+use opendal::Operator;
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    // Create Huggingface backend builder
+    let mut builder = Huggingface::default();
+    
+    // set the type of Huggingface repository
+    builder.repo_type("dataset");
+    // set the id of Huggingface repository
+    builder.repo_id("databricks/databricks-dolly-15k");
+    // set the revision of Huggingface repository
+    builder.revision("main");
+    // set the root for Huggingface, all operations will happen under this root
+    builder.root("/path/to/dir");
+    // set the token for accessing the repository
+    builder.token("access_token");
+
+    let op: Operator = Operator::new(builder)?.finish();
+
+    Ok(())
+}
+```
diff --git a/core/src/services/huggingface/error.rs 
b/core/src/services/huggingface/error.rs
new file mode 100644
index 000000000..4e5361ece
--- /dev/null
+++ b/core/src/services/huggingface/error.rs
@@ -0,0 +1,93 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::fmt::Debug;
+
+use http::Response;
+use http::StatusCode;
+use serde::Deserialize;
+
+use crate::raw::*;
+use crate::*;
+
+/// HuggingfaceError is the error returned by Huggingface File System.
+#[derive(Default, Deserialize)]
+struct HuggingfaceError {
+    error: String,
+}
+
+impl Debug for HuggingfaceError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let mut de = f.debug_struct("HuggingfaceError");
+        de.field("message", &self.error.replace('\n', " "));
+
+        de.finish()
+    }
+}
+
+pub async fn parse_error(resp: Response<IncomingAsyncBody>) -> Result<Error> {
+    let (parts, body) = resp.into_parts();
+    let bs = body.bytes().await?;
+
+    let (kind, retryable) = match parts.status {
+        StatusCode::NOT_FOUND => (ErrorKind::NotFound, false),
+        StatusCode::UNAUTHORIZED | StatusCode::FORBIDDEN => 
(ErrorKind::PermissionDenied, false),
+        StatusCode::PRECONDITION_FAILED => (ErrorKind::ConditionNotMatch, 
false),
+        StatusCode::INTERNAL_SERVER_ERROR
+        | StatusCode::BAD_GATEWAY
+        | StatusCode::SERVICE_UNAVAILABLE
+        | StatusCode::GATEWAY_TIMEOUT => (ErrorKind::Unexpected, true),
+        _ => (ErrorKind::Unexpected, false),
+    };
+
+    let message = match serde_json::from_slice::<HuggingfaceError>(&bs) {
+        Ok(hf_error) => format!("{:?}", hf_error.error),
+        Err(_) => String::from_utf8_lossy(&bs).into_owned(),
+    };
+
+    let mut err = Error::new(kind, &message);
+
+    err = with_error_response_context(err, parts);
+
+    if retryable {
+        err = err.set_temporary();
+    }
+
+    Ok(err)
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use crate::raw::new_json_deserialize_error;
+    use crate::types::Result;
+
+    #[test]
+    fn test_parse_error() -> Result<()> {
+        let resp = r#"
+            {
+                "error": "Invalid username or password."
+            }
+            "#;
+        let decoded_response = 
serde_json::from_slice::<HuggingfaceError>(resp.as_bytes())
+            .map_err(new_json_deserialize_error)?;
+
+        assert_eq!(decoded_response.error, "Invalid username or password.");
+
+        Ok(())
+    }
+}
diff --git a/core/src/services/huggingface/lister.rs b/core/src/services/huggingface/lister.rs
new file mode 100644
index 000000000..5cb591f82
--- /dev/null
+++ b/core/src/services/huggingface/lister.rs
@@ -0,0 +1,89 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use async_trait::async_trait;
+
+use super::core::HuggingfaceCore;
+use super::core::HuggingfaceStatus;
+use super::error::parse_error;
+use crate::raw::*;
+use crate::*;
+
+pub struct HuggingfaceLister {
+    core: Arc<HuggingfaceCore>,
+    path: String,
+    recursive: bool,
+}
+
+impl HuggingfaceLister {
+    pub fn new(core: Arc<HuggingfaceCore>, path: String, recursive: bool) -> 
Self {
+        Self {
+            core,
+            path,
+            recursive,
+        }
+    }
+}
+
+#[async_trait]
+impl oio::PageList for HuggingfaceLister {
+    async fn next_page(&self, ctx: &mut oio::PageContext) -> Result<()> {
+        let response = self.core.hf_list(&self.path, self.recursive).await?;
+
+        let status_code = response.status();
+        if !status_code.is_success() {
+            let error = parse_error(response).await?;
+            return Err(error);
+        }
+
+        let bytes = response.into_body().bytes().await?;
+        let decoded_response = 
serde_json::from_slice::<Vec<HuggingfaceStatus>>(&bytes)
+            .map_err(new_json_deserialize_error)?;
+
+        ctx.done = true;
+
+        for status in decoded_response {
+            let entry_type = match status.type_.as_str() {
+                "directory" => EntryMode::DIR,
+                "file" => EntryMode::FILE,
+                _ => EntryMode::Unknown,
+            };
+
+            let mut meta = Metadata::new(entry_type);
+
+            if let Some(commit_info) = status.last_commit.as_ref() {
+                
meta.set_last_modified(parse_datetime_from_rfc3339(commit_info.date.as_str())?);
+            }
+
+            if entry_type == EntryMode::FILE {
+                meta.set_content_length(status.size);
+            }
+
+            let path = if entry_type == EntryMode::DIR {
+                format!("{}/", &status.path)
+            } else {
+                status.path.clone()
+            };
+
+            ctx.entries.push_back(oio::Entry::new(&path, meta));
+        }
+
+        Ok(())
+    }
+}
diff --git a/core/src/services/huggingface/mod.rs b/core/src/services/huggingface/mod.rs
new file mode 100644
index 000000000..3a692fa45
--- /dev/null
+++ b/core/src/services/huggingface/mod.rs
@@ -0,0 +1,24 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+mod backend;
+pub use backend::HuggingfaceBuilder as Huggingface;
+pub use backend::HuggingfaceConfig;
+
+mod core;
+mod error;
+mod lister;
diff --git a/core/src/services/mod.rs b/core/src/services/mod.rs
index 1a2e64e02..edb0444e7 100644
--- a/core/src/services/mod.rs
+++ b/core/src/services/mod.rs
@@ -92,6 +92,13 @@ pub use self::http::Http;
 #[cfg(feature = "services-http")]
 pub use self::http::HttpConfig;
 
+#[cfg(feature = "services-huggingface")]
+mod huggingface;
+#[cfg(feature = "services-huggingface")]
+pub use huggingface::Huggingface;
+#[cfg(feature = "services-huggingface")]
+pub use huggingface::HuggingfaceConfig;
+
 #[cfg(feature = "services-ipfs")]
 mod ipfs;
 #[cfg(feature = "services-ipfs")]
diff --git a/core/src/types/scheme.rs b/core/src/types/scheme.rs
index 57093f510..bdc8630de 100644
--- a/core/src/types/scheme.rs
+++ b/core/src/types/scheme.rs
@@ -68,6 +68,8 @@ pub enum Scheme {
     Hdfs,
     /// [http][crate::services::Http]: HTTP backend.
     Http,
+    /// [huggingface][crate::services::Huggingface]: Huggingface services.
+    Huggingface,
     /// [alluxio][created::services::Alluxio]: Alluxio services.
     Alluxio,
 
@@ -201,6 +203,8 @@ impl Scheme {
             Scheme::Hdfs,
             #[cfg(feature = "services-http")]
             Scheme::Http,
+            #[cfg(feature = "services-huggingface")]
+            Scheme::Huggingface,
             #[cfg(feature = "services-ipfs")]
             Scheme::Ipfs,
             #[cfg(feature = "services-ipmfs")]
@@ -303,6 +307,7 @@ impl FromStr for Scheme {
             "gridfs" => Ok(Scheme::Gridfs),
             "hdfs" => Ok(Scheme::Hdfs),
             "http" | "https" => Ok(Scheme::Http),
+            "huggingface" | "hf" => Ok(Scheme::Huggingface),
             "ftp" | "ftps" => Ok(Scheme::Ftp),
             "ipfs" | "ipns" => Ok(Scheme::Ipfs),
             "ipmfs" => Ok(Scheme::Ipmfs),
@@ -357,6 +362,7 @@ impl From<Scheme> for &'static str {
             Scheme::Gridfs => "gridfs",
             Scheme::Hdfs => "hdfs",
             Scheme::Http => "http",
+            Scheme::Huggingface => "huggingface",
             Scheme::Foundationdb => "foundationdb",
             Scheme::Ftp => "ftp",
             Scheme::Ipfs => "ipfs",


Reply via email to