This is an automated email from the ASF dual-hosted git repository.
xuanwo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-opendal.git
The following commit(s) were added to refs/heads/main by this push:
new 547c23dd2 feat(core): service add HuggingFace file system (#3670)
547c23dd2 is described below
commit 547c23dd2cd96bc2be6c14a83c3d5df74f7f4b6a
Author: Morris Tai <[email protected]>
AuthorDate: Sat Dec 2 05:49:52 2023 -0500
feat(core): service add HuggingFace file system (#3670)
* feat: service add huggingface file system
* chore: fix typo
* feat: refactor `read_token` with `token`
* feat: add HuggingFaceConfig implementation
* chore: use better syntax implementation
* chore: use format_authorization_by_bearer instead
* chore: use Eq, PartialEq to make testing cleaner
* feat: change scheme from `huggingface` to `hf`
* feat: rename `HuggingFace` to `Huggingface`
* chore: move message.rs to core.rs
* feat: drop dependency serde_urlencoded
* chore: fix for cargo clippy
* feat: rename huggingface scheme
* chore: polish code
* chore: use `http::header` for well-known header names
* feat: fix for PR review
* chore: read `hf_read` to `hf_resolve`
* chore: read `hf_read` to `hf_resolve`
---
core/Cargo.toml | 1 +
core/src/services/huggingface/backend.rs | 341 +++++++++++++++++++++++++
core/src/services/huggingface/core.rs | 415 +++++++++++++++++++++++++++++++
core/src/services/huggingface/docs.md | 63 +++++
core/src/services/huggingface/error.rs | 93 +++++++
core/src/services/huggingface/lister.rs | 89 +++++++
core/src/services/huggingface/mod.rs | 24 ++
core/src/services/mod.rs | 7 +
core/src/types/scheme.rs | 6 +
9 files changed, 1039 insertions(+)
diff --git a/core/Cargo.toml b/core/Cargo.toml
index 5c71da150..55541ee6c 100644
--- a/core/Cargo.toml
+++ b/core/Cargo.toml
@@ -151,6 +151,7 @@ services-ghac = []
services-gridfs = ["dep:mongodb"]
services-hdfs = ["dep:hdrs"]
services-http = []
+services-huggingface = []
services-ipfs = ["dep:prost"]
services-ipmfs = []
services-libsql = ["dep:hrana-client-proto"]
diff --git a/core/src/services/huggingface/backend.rs
b/core/src/services/huggingface/backend.rs
new file mode 100644
index 000000000..49502680e
--- /dev/null
+++ b/core/src/services/huggingface/backend.rs
@@ -0,0 +1,341 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::collections::HashMap;
+use std::fmt::Debug;
+use std::fmt::Formatter;
+use std::sync::Arc;
+
+use async_trait::async_trait;
+use http::StatusCode;
+use log::debug;
+use serde::Deserialize;
+
+use super::core::HuggingfaceCore;
+use super::core::HuggingfaceStatus;
+use super::error::parse_error;
+use super::lister::HuggingfaceLister;
+use crate::raw::*;
+use crate::*;
+
+/// Configuration for Huggingface service support.
+///
+/// All fields are optional at the type level; which of them are actually
+/// required is decided when the backend is built.
+#[derive(Clone, Default, Deserialize)]
+#[serde(default)]
+#[non_exhaustive]
+pub struct HuggingfaceConfig {
+    /// Repo type of this backend. Default is model.
+    ///
+    /// Available values:
+    /// - model
+    /// - dataset
+    pub repo_type: Option<String>,
+    /// Repo id of this backend.
+    ///
+    /// This is required.
+    pub repo_id: Option<String>,
+    /// Revision of this backend.
+    ///
+    /// Default is main.
+    pub revision: Option<String>,
+    /// Root of this backend. Can be "/path/to/dir".
+    ///
+    /// Default is "/".
+    pub root: Option<String>,
+    /// Token of this backend.
+    ///
+    /// This is optional.
+    pub token: Option<String>,
+}
+
+impl Debug for HuggingfaceConfig {
+    /// Prints only the fields that are set, and never the token itself.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        let mut out = f.debug_struct("HuggingfaceConfig");
+
+        // Non-secret options, emitted in declaration order when present.
+        let visible = [
+            ("repo_type", &self.repo_type),
+            ("repo_id", &self.repo_id),
+            ("revision", &self.revision),
+            ("root", &self.root),
+        ];
+        for (name, value) in visible {
+            if let Some(value) = value {
+                out.field(name, value);
+            }
+        }
+
+        // The token is a secret: show that it exists, not what it is.
+        if self.token.is_some() {
+            out.field("token", &"<redacted>");
+        }
+
+        out.finish()
+    }
+}
+
+///
[Huggingface](https://huggingface.co/docs/huggingface_hub/package_reference/hf_api)'s
API support.
+#[doc = include_str!("docs.md")]
+#[derive(Default, Clone)]
+pub struct HuggingfaceBuilder {
+ config: HuggingfaceConfig,
+}
+
+impl Debug for HuggingfaceBuilder {
+ fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+ let mut ds = f.debug_struct("Builder");
+
+ ds.field("config", &self.config);
+ ds.finish()
+ }
+}
+
+impl HuggingfaceBuilder {
+    /// Set repo type of this backend. Default is model.
+    ///
+    /// Available values:
+    /// - model
+    /// - dataset
+    ///
+    /// Currently, only models and datasets are supported.
+    /// [Reference](https://huggingface.co/docs/hub/repositories)
+    ///
+    /// An empty string is ignored and leaves the current value untouched.
+    pub fn repo_type(&mut self, repo_type: &str) -> &mut Self {
+        if repo_type.is_empty() {
+            return self;
+        }
+        self.config.repo_type = Some(repo_type.to_owned());
+        self
+    }
+
+    /// Set repo id of this backend. This is required.
+    ///
+    /// Repo id consists of the account name and the repository name.
+    ///
+    /// For example, model's repo id looks like:
+    /// - meta-llama/Llama-2-7b
+    ///
+    /// Dataset's repo id looks like:
+    /// - databricks/databricks-dolly-15k
+    ///
+    /// An empty string is ignored and leaves the current value untouched.
+    pub fn repo_id(&mut self, repo_id: &str) -> &mut Self {
+        if repo_id.is_empty() {
+            return self;
+        }
+        self.config.repo_id = Some(repo_id.to_owned());
+        self
+    }
+
+    /// Set revision of this backend. Default is main.
+    ///
+    /// Revision can be a branch name or a commit hash.
+    ///
+    /// For example, revision can be:
+    /// - main
+    /// - 1d0c4eb
+    ///
+    /// An empty string is ignored and leaves the current value untouched.
+    pub fn revision(&mut self, revision: &str) -> &mut Self {
+        if revision.is_empty() {
+            return self;
+        }
+        self.config.revision = Some(revision.to_owned());
+        self
+    }
+
+    /// Set root of this backend.
+    ///
+    /// All operations will happen under this root.
+    ///
+    /// An empty string is ignored and leaves the current value untouched.
+    pub fn root(&mut self, root: &str) -> &mut Self {
+        if root.is_empty() {
+            return self;
+        }
+        self.config.root = Some(root.to_owned());
+        self
+    }
+
+    /// Set the token of this backend.
+    ///
+    /// This is optional.
+    ///
+    /// An empty string is ignored and leaves the current value untouched.
+    pub fn token(&mut self, token: &str) -> &mut Self {
+        if token.is_empty() {
+            return self;
+        }
+        self.config.token = Some(token.to_owned());
+        self
+    }
+}
+
+impl Builder for HuggingfaceBuilder {
+    const SCHEME: Scheme = Scheme::Huggingface;
+    type Accessor = HuggingfaceBackend;
+
+    /// Build a `HuggingfaceBuilder` from a string map of options.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the map cannot be deserialized into a [`HuggingfaceConfig`].
+    fn from_map(map: HashMap<String, String>) -> Self {
+        let config = HuggingfaceConfig::deserialize(ConfigDeserializer::new(map))
+            .expect("config deserialize must succeed");
+
+        HuggingfaceBuilder { config }
+    }
+
+    /// Build a HuggingfaceBackend.
+    ///
+    /// Missing options fall back to their defaults: `repo_type` to `model`,
+    /// `revision` to `main` and `root` to the normalized empty root.
+    ///
+    /// The builder's configuration is only read, never consumed, so calling
+    /// `build` again yields a backend with the same settings.
+    ///
+    /// # Errors
+    ///
+    /// Returns `ErrorKind::ConfigInvalid` if `repo_type` is neither `model`
+    /// nor `dataset`, or if `repo_id` is not set. Errors from creating the
+    /// HTTP client are propagated unchanged.
+    fn build(&mut self) -> Result<Self::Accessor> {
+        debug!("backend build started: {:?}", &self);
+
+        let repo_type = match self.config.repo_type.as_deref() {
+            Some("model") | None => Ok(RepoType::Model),
+            Some("dataset") => Ok(RepoType::Dataset),
+            // `space` is a real repository type on the hub, so it gets its
+            // own message instead of being reported as unknown.
+            Some("space") => Err(Error::new(
+                ErrorKind::ConfigInvalid,
+                "repo type \"space\" is unsupported",
+            )
+            .with_operation("Builder::build")
+            .with_context("service", Scheme::Huggingface)),
+            Some(repo_type) => Err(Error::new(
+                ErrorKind::ConfigInvalid,
+                format!("unknown repo_type: {}", repo_type).as_str(),
+            )
+            .with_operation("Builder::build")
+            .with_context("service", Scheme::Huggingface)),
+        }?;
+        debug!("backend use repo_type: {:?}", &repo_type);
+
+        let repo_id = match &self.config.repo_id {
+            Some(repo_id) => Ok(repo_id.clone()),
+            None => Err(Error::new(ErrorKind::ConfigInvalid, "repo_id is empty")
+                .with_operation("Builder::build")
+                .with_context("service", Scheme::Huggingface)),
+        }?;
+        debug!("backend use repo_id: {}", &repo_id);
+
+        let revision = self
+            .config
+            .revision
+            .clone()
+            .unwrap_or_else(|| "main".to_string());
+        debug!("backend use revision: {}", &revision);
+
+        // Borrow the root instead of `take()`-ing it: taking would silently
+        // drop the configured root from the builder after the first build.
+        let root = normalize_root(self.config.root.as_deref().unwrap_or_default());
+        debug!("backend use root: {}", &root);
+
+        let token = self.config.token.clone();
+
+        let client = HttpClient::new()?;
+
+        debug!("backend build finished: {:?}", &self);
+        Ok(HuggingfaceBackend {
+            core: Arc::new(HuggingfaceCore {
+                repo_type,
+                repo_id,
+                revision,
+                root,
+                token,
+                client,
+            }),
+        })
+    }
+}
+
+/// Backend for Huggingface service
+///
+/// Cloning is cheap: clones share the same [`HuggingfaceCore`] through an `Arc`.
+#[derive(Debug, Clone)]
+pub struct HuggingfaceBackend {
+    core: Arc<HuggingfaceCore>,
+}
+
+#[async_trait]
+impl Accessor for HuggingfaceBackend {
+    type Reader = IncomingAsyncBody;
+    type BlockingReader = ();
+    type Writer = ();
+    type BlockingWriter = ();
+    type Lister = oio::PageLister<HuggingfaceLister>;
+    type BlockingLister = ();
+
+    /// Advertise the read-only capability set: stat, (ranged) read and list.
+    fn info(&self) -> AccessorInfo {
+        let mut am = AccessorInfo::default();
+        am.set_scheme(Scheme::Huggingface)
+            .set_native_capability(Capability {
+                stat: true,
+
+                read: true,
+                read_can_next: true,
+                read_with_range: true,
+
+                list: true,
+                list_without_recursive: true,
+                list_with_recursive: true,
+
+                ..Default::default()
+            });
+        am
+    }
+
+    /// Read the file at `path`, honouring the byte range carried by `args`.
+    ///
+    /// # Errors
+    ///
+    /// Any status other than 200, 206 or 416 is converted by `parse_error`.
+    async fn read(&self, path: &str, args: OpRead) -> Result<(RpRead, Self::Reader)> {
+        let resp = self.core.hf_resolve(path, args).await?;
+
+        match resp.status() {
+            // A request that carries a `Range` header is answered with
+            // 206 Partial Content, which is as much a success as a plain 200.
+            StatusCode::OK | StatusCode::PARTIAL_CONTENT => {
+                let size = parse_content_length(resp.headers())?;
+                Ok((RpRead::new().with_size(size), resp.into_body()))
+            }
+            // The requested range lies outside the file: return an empty body.
+            StatusCode::RANGE_NOT_SATISFIABLE => Ok((RpRead::new(), IncomingAsyncBody::empty())),
+            _ => Err(parse_error(resp).await?),
+        }
+    }
+
+    /// Stat `path` through the `paths-info` endpoint.
+    ///
+    /// # Errors
+    ///
+    /// Returns `ErrorKind::NotFound` when the server answers with an empty
+    /// list, and `ErrorKind::Unexpected` for an entry type other than
+    /// `file` or `directory`.
+    async fn stat(&self, path: &str, _: OpStat) -> Result<RpStat> {
+        // Stat root always returns a DIR.
+        if path == "/" {
+            return Ok(RpStat::new(Metadata::new(EntryMode::DIR)));
+        }
+
+        let resp = self.core.hf_path_info(path).await?;
+
+        match resp.status() {
+            StatusCode::OK => {
+                let mut meta = parse_into_metadata(path, resp.headers())?;
+                let bs = resp.into_body().bytes().await?;
+
+                let decoded_response = serde_json::from_slice::<Vec<HuggingfaceStatus>>(&bs)
+                    .map_err(new_json_deserialize_error)?;
+
+                // NOTE: if the file is not found, the server will return 200 with an empty array
+                let status = decoded_response
+                    .first()
+                    .ok_or_else(|| Error::new(ErrorKind::NotFound, "path not found"))?;
+
+                if let Some(commit_info) = status.last_commit.as_ref() {
+                    meta.set_last_modified(parse_datetime_from_rfc3339(
+                        commit_info.date.as_str(),
+                    )?);
+                }
+
+                // The response headers describe the JSON document that was
+                // just decoded, not the entry itself, so the length has to
+                // come from the entry's own `size` field.
+                meta.set_content_length(status.size);
+
+                match status.type_.as_str() {
+                    "directory" => meta.set_mode(EntryMode::DIR),
+                    "file" => meta.set_mode(EntryMode::FILE),
+                    _ => return Err(Error::new(ErrorKind::Unexpected, "unknown status type")),
+                };
+
+                Ok(RpStat::new(meta))
+            }
+            _ => Err(parse_error(resp).await?),
+        }
+    }
+
+    /// Create a lister for `path`; no request is sent until the first page is polled.
+    async fn list(&self, path: &str, args: OpList) -> Result<(RpList, Self::Lister)> {
+        let l = HuggingfaceLister::new(self.core.clone(), path.to_string(), args.recursive());
+
+        Ok((RpList::default(), oio::PageLister::new(l)))
+    }
+}
+
+/// Repository type of Huggingface. Currently, we only support `model` and `dataset`.
+/// [Reference](https://huggingface.co/docs/hub/repositories)
+#[derive(Debug, Clone, Copy)]
+pub enum RepoType {
+ /// A model repository; also used when no repo type is configured.
+ Model,
+ /// A dataset repository.
+ Dataset,
+}
diff --git a/core/src/services/huggingface/core.rs
b/core/src/services/huggingface/core.rs
new file mode 100644
index 000000000..45ec528ed
--- /dev/null
+++ b/core/src/services/huggingface/core.rs
@@ -0,0 +1,415 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::fmt::Debug;
+
+use bytes::Bytes;
+use http::Request;
+use http::Response;
+use http::{header, StatusCode};
+use serde::Deserialize;
+
+use super::backend::RepoType;
+use super::error::parse_error;
+use crate::raw::*;
+use crate::*;
+
+/// State shared by every operation of the Huggingface backend: which
+/// repository and revision to talk to, plus the HTTP client used to do so.
+pub struct HuggingfaceCore {
+    pub repo_type: RepoType,
+    pub repo_id: String,
+    pub revision: String,
+    pub root: String,
+    pub token: Option<String>,
+
+    pub client: HttpClient,
+}
+
+impl Debug for HuggingfaceCore {
+    /// Prints the repository coordinates only; `token` and `client` are
+    /// omitted and the output is marked as non-exhaustive.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let mut out = f.debug_struct("HuggingfaceCore");
+        out.field("repo_type", &self.repo_type);
+        out.field("repo_id", &self.repo_id);
+        out.field("revision", &self.revision);
+        out.field("root", &self.root);
+        out.finish_non_exhaustive()
+    }
+}
+
+impl HuggingfaceCore {
+    /// Query the `paths-info` endpoint for a single path.
+    ///
+    /// Sends a form-encoded POST (`paths=<path>&expand=True`) and returns the
+    /// raw response; interpreting the status code is left to the caller.
+    pub async fn hf_path_info(&self, path: &str) -> Result<Response<IncomingAsyncBody>> {
+        let p = build_abs_path(&self.root, path)
+            .trim_end_matches('/')
+            .to_string();
+
+        let url = match self.repo_type {
+            RepoType::Model => format!(
+                "https://huggingface.co/api/models/{}/paths-info/{}",
+                &self.repo_id, &self.revision
+            ),
+            RepoType::Dataset => format!(
+                "https://huggingface.co/api/datasets/{}/paths-info/{}",
+                &self.repo_id, &self.revision
+            ),
+        };
+
+        let mut req = Request::post(&url);
+
+        if let Some(token) = &self.token {
+            let auth_header_content = format_authorization_by_bearer(token)?;
+            req = req.header(header::AUTHORIZATION, auth_header_content);
+        }
+
+        req = req.header(header::CONTENT_TYPE, "application/x-www-form-urlencoded");
+
+        let req_body = format!("paths={}&expand=True", percent_encode_path(&p));
+
+        let req = req
+            .body(AsyncBody::Bytes(Bytes::from(req_body)))
+            .map_err(new_request_build_error)?;
+
+        self.client.send(req).await
+    }
+
+    /// List the entries below `path` through the `tree` endpoint.
+    ///
+    /// `recursive` appends `&recursive=True` to the query. The raw response
+    /// is returned; interpreting the status code is left to the caller.
+    pub async fn hf_list(
+        &self,
+        path: &str,
+        recursive: bool,
+    ) -> Result<Response<IncomingAsyncBody>> {
+        let p = build_abs_path(&self.root, path)
+            .trim_end_matches('/')
+            .to_string();
+
+        let mut url = match self.repo_type {
+            RepoType::Model => format!(
+                "https://huggingface.co/api/models/{}/tree/{}/{}?expand=True",
+                &self.repo_id,
+                &self.revision,
+                percent_encode_path(&p)
+            ),
+            RepoType::Dataset => format!(
+                "https://huggingface.co/api/datasets/{}/tree/{}/{}?expand=True",
+                &self.repo_id,
+                &self.revision,
+                percent_encode_path(&p)
+            ),
+        };
+
+        if recursive {
+            url.push_str("&recursive=True");
+        }
+
+        let mut req = Request::get(&url);
+
+        if let Some(token) = &self.token {
+            let auth_header_content = format_authorization_by_bearer(token)?;
+            req = req.header(header::AUTHORIZATION, auth_header_content);
+        }
+
+        let req = req
+            .body(AsyncBody::Empty)
+            .map_err(new_request_build_error)?;
+
+        self.client.send(req).await
+    }
+
+    /// Fetch the content of `path` through the `resolve` endpoint.
+    ///
+    /// A `Range` header is attached unless `arg` asks for the full file.
+    ///
+    /// # Errors
+    ///
+    /// Responses with status 200, 206 or 416 are handed back to the caller,
+    /// which knows how to turn each of them into a read result. Every other
+    /// status is converted into an error via `parse_error`.
+    pub async fn hf_resolve(&self, path: &str, arg: OpRead) -> Result<Response<IncomingAsyncBody>> {
+        let p = build_abs_path(&self.root, path)
+            .trim_end_matches('/')
+            .to_string();
+
+        let url = match self.repo_type {
+            RepoType::Model => format!(
+                "https://huggingface.co/{}/resolve/{}/{}",
+                &self.repo_id,
+                &self.revision,
+                percent_encode_path(&p)
+            ),
+            RepoType::Dataset => format!(
+                "https://huggingface.co/datasets/{}/resolve/{}/{}",
+                &self.repo_id,
+                &self.revision,
+                percent_encode_path(&p)
+            ),
+        };
+
+        let mut req = Request::get(&url);
+
+        if let Some(token) = &self.token {
+            let auth_header_content = format_authorization_by_bearer(token)?;
+            req = req.header(header::AUTHORIZATION, auth_header_content);
+        }
+
+        let range = arg.range();
+        if !range.is_full() {
+            req = req.header(header::RANGE, &range.to_header());
+        }
+
+        let req = req
+            .body(AsyncBody::Empty)
+            .map_err(new_request_build_error)?;
+
+        let resp = self.client.send(req).await?;
+
+        match resp.status() {
+            // 206 is the normal answer to a ranged request and 416 means the
+            // range lies outside the file; neither is a failure of the
+            // request itself, so they must not be turned into errors here.
+            StatusCode::OK | StatusCode::PARTIAL_CONTENT | StatusCode::RANGE_NOT_SATISFIABLE => {
+                Ok(resp)
+            }
+            _ => Err(parse_error(resp).await?),
+        }
+    }
+}
+
+/// One entry of a `tree` or `paths-info` response.
+///
+/// `lfs`, `last_commit` and `security` are optional in the payload.
+// Not every field is read by the backend; they mirror the JSON payload.
+#[allow(dead_code)]
+#[derive(Debug, Deserialize, PartialEq, Eq)]
+#[serde(rename_all = "camelCase")]
+pub(super) struct HuggingfaceStatus {
+    /// Entry kind as sent by the server; the backend handles `"file"` and `"directory"`.
+    #[serde(rename = "type")]
+    pub type_: String,
+    pub oid: String,
+    pub size: u64,
+    pub lfs: Option<HuggingfaceLfs>,
+    pub path: String,
+    pub last_commit: Option<HuggingfaceLastCommit>,
+    pub security: Option<HuggingfaceSecurity>,
+}
+
+/// The `lfs` object attached to an entry (`oid`, `size`, `pointerSize`).
+#[allow(dead_code)]
+#[derive(Debug, Deserialize, PartialEq, Eq)]
+#[serde(rename_all = "camelCase")]
+pub(super) struct HuggingfaceLfs {
+    pub oid: String,
+    pub size: u64,
+    pub pointer_size: u64,
+}
+
+/// The `lastCommit` object attached to an entry.
+#[allow(dead_code)]
+#[derive(Debug, Deserialize, PartialEq, Eq)]
+#[serde(rename_all = "camelCase")]
+pub(super) struct HuggingfaceLastCommit {
+    pub id: String,
+    pub title: String,
+    /// Commit timestamp, kept as the string sent by the server.
+    pub date: String,
+}
+
+/// The `security` object attached to an entry.
+#[allow(dead_code)]
+#[derive(Debug, Deserialize, PartialEq, Eq)]
+#[serde(rename_all = "camelCase")]
+pub(super) struct HuggingfaceSecurity {
+    pub blob_id: String,
+    pub name: String,
+    pub safe: bool,
+    pub av_scan: Option<HuggingfaceAvScan>,
+    pub pickle_import_scan: Option<HuggingfacePickleImportScan>,
+}
+
+/// The `avScan` object inside `security`.
+#[allow(dead_code)]
+#[derive(Debug, Deserialize, PartialEq, Eq)]
+#[serde(rename_all = "camelCase")]
+pub(super) struct HuggingfaceAvScan {
+    pub virus_found: bool,
+    /// `null` in the payload is decoded as `None`.
+    pub virus_names: Option<Vec<String>>,
+}
+
+/// The `pickleImportScan` object inside `security`.
+#[allow(dead_code)]
+#[derive(Debug, Deserialize, PartialEq, Eq)]
+#[serde(rename_all = "camelCase")]
+pub(super) struct HuggingfacePickleImportScan {
+    pub highest_safety_level: String,
+    pub imports: Vec<HuggingfaceImport>,
+}
+
+/// One element of the `imports` list of a pickle import scan.
+// No `rename_all` here: the field names are decoded exactly as written.
+#[allow(dead_code)]
+#[derive(Debug, Deserialize, PartialEq, Eq)]
+pub(super) struct HuggingfaceImport {
+    pub module: String,
+    pub name: String,
+    pub safety: String,
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use crate::raw::new_json_deserialize_error;
+ use crate::types::Result;
+ use bytes::Bytes;
+
+ // Decodes a two-entry listing (one file with an `lfs` object, one
+ // directory) and checks that the optional `lastCommit` and `security`
+ // objects, absent from the fixture, come back as `None`.
+ #[test]
+ fn parse_list_response_test() -> Result<()> {
+ let resp = Bytes::from(
+ r#"
+ [
+ {
+ "type": "file",
+ "oid": "45fa7c3d85ee7dd4139adbc056da25ae136a65f2",
+ "size": 69512435,
+ "lfs": {
+ "oid":
"b43f4c2ea569da1d66ca74e26ca8ea4430dfc29195e97144b2d0b4f3f6cafa1c",
+ "size": 69512435,
+ "pointerSize": 133
+ },
+ "path": "maelstrom/lib/maelstrom.jar"
+ },
+ {
+ "type": "directory",
+ "oid":
"b43f4c2ea569da1d66ca74e26ca8ea4430dfc29195e97144b2d0b4f3f6cafa1c",
+ "size": 69512435,
+ "path": "maelstrom/lib/plugins"
+ }
+ ]
+ "#,
+ );
+
+ let decoded_response =
serde_json::from_slice::<Vec<HuggingfaceStatus>>(&resp)
+ .map_err(new_json_deserialize_error)?;
+
+ assert_eq!(decoded_response.len(), 2);
+
+ let file_entry = HuggingfaceStatus {
+ type_: "file".to_string(),
+ oid: "45fa7c3d85ee7dd4139adbc056da25ae136a65f2".to_string(),
+ size: 69512435,
+ lfs: Some(HuggingfaceLfs {
+ oid:
"b43f4c2ea569da1d66ca74e26ca8ea4430dfc29195e97144b2d0b4f3f6cafa1c".to_string(),
+ size: 69512435,
+ pointer_size: 133,
+ }),
+ path: "maelstrom/lib/maelstrom.jar".to_string(),
+ last_commit: None,
+ security: None,
+ };
+
+ assert_eq!(decoded_response[0], file_entry);
+
+ let dir_entry = HuggingfaceStatus {
+ type_: "directory".to_string(),
+ oid:
"b43f4c2ea569da1d66ca74e26ca8ea4430dfc29195e97144b2d0b4f3f6cafa1c".to_string(),
+ size: 69512435,
+ lfs: None,
+ path: "maelstrom/lib/plugins".to_string(),
+ last_commit: None,
+ security: None,
+ };
+
+ assert_eq!(decoded_response[1], dir_entry);
+
+ Ok(())
+ }
+
+ // Decodes a single fully expanded entry and checks every nested object:
+ // `lfs`, `lastCommit`, and `security` with its `avScan` (where a JSON
+ // `null` becomes `None`) and `pickleImportScan` parts.
+ #[test]
+ fn parse_files_info_test() -> Result<()> {
+ let resp = Bytes::from(
+ r#"
+ [
+ {
+ "type": "file",
+ "oid": "45fa7c3d85ee7dd4139adbc056da25ae136a65f2",
+ "size": 69512435,
+ "lfs": {
+ "oid":
"b43f4c2ea569da1d66ca74e26ca8ea4430dfc29195e97144b2d0b4f3f6cafa1c",
+ "size": 69512435,
+ "pointerSize": 133
+ },
+ "path": "maelstrom/lib/maelstrom.jar",
+ "lastCommit": {
+ "id": "bc1ef030bf3743290d5e190695ab94582e51ae2f",
+ "title": "Upload 141 files",
+ "date": "2023-11-17T23:50:28.000Z"
+ },
+ "security": {
+ "blobId": "45fa7c3d85ee7dd4139adbc056da25ae136a65f2",
+ "name": "maelstrom/lib/maelstrom.jar",
+ "safe": true,
+ "avScan": {
+ "virusFound": false,
+ "virusNames": null
+ },
+ "pickleImportScan": {
+ "highestSafetyLevel": "innocuous",
+ "imports": [
+ {"module": "torch", "name": "FloatStorage",
"safety": "innocuous"},
+ {"module": "collections", "name":
"OrderedDict", "safety": "innocuous"},
+ {"module": "torch", "name": "LongStorage",
"safety": "innocuous"},
+ {"module": "torch._utils", "name":
"_rebuild_tensor_v2", "safety": "innocuous"}
+ ]
+ }
+ }
+ }
+ ]
+ "#,
+ );
+
+ let decoded_response =
serde_json::from_slice::<Vec<HuggingfaceStatus>>(&resp)
+ .map_err(new_json_deserialize_error)?;
+
+ assert_eq!(decoded_response.len(), 1);
+
+ let file_info = HuggingfaceStatus {
+ type_: "file".to_string(),
+ oid: "45fa7c3d85ee7dd4139adbc056da25ae136a65f2".to_string(),
+ size: 69512435,
+ lfs: Some(HuggingfaceLfs {
+ oid:
"b43f4c2ea569da1d66ca74e26ca8ea4430dfc29195e97144b2d0b4f3f6cafa1c".to_string(),
+ size: 69512435,
+ pointer_size: 133,
+ }),
+ path: "maelstrom/lib/maelstrom.jar".to_string(),
+ last_commit: Some(HuggingfaceLastCommit {
+ id: "bc1ef030bf3743290d5e190695ab94582e51ae2f".to_string(),
+ title: "Upload 141 files".to_string(),
+ date: "2023-11-17T23:50:28.000Z".to_string(),
+ }),
+ security: Some(HuggingfaceSecurity {
+ blob_id:
"45fa7c3d85ee7dd4139adbc056da25ae136a65f2".to_string(),
+ name: "maelstrom/lib/maelstrom.jar".to_string(),
+ safe: true,
+ av_scan: Some(HuggingfaceAvScan {
+ virus_found: false,
+ virus_names: None,
+ }),
+ pickle_import_scan: Some(HuggingfacePickleImportScan {
+ highest_safety_level: "innocuous".to_string(),
+ imports: vec![
+ HuggingfaceImport {
+ module: "torch".to_string(),
+ name: "FloatStorage".to_string(),
+ safety: "innocuous".to_string(),
+ },
+ HuggingfaceImport {
+ module: "collections".to_string(),
+ name: "OrderedDict".to_string(),
+ safety: "innocuous".to_string(),
+ },
+ HuggingfaceImport {
+ module: "torch".to_string(),
+ name: "LongStorage".to_string(),
+ safety: "innocuous".to_string(),
+ },
+ HuggingfaceImport {
+ module: "torch._utils".to_string(),
+ name: "_rebuild_tensor_v2".to_string(),
+ safety: "innocuous".to_string(),
+ },
+ ],
+ }),
+ }),
+ };
+
+ assert_eq!(decoded_response[0], file_info);
+
+ Ok(())
+ }
+}
diff --git a/core/src/services/huggingface/docs.md
b/core/src/services/huggingface/docs.md
new file mode 100644
index 000000000..951087811
--- /dev/null
+++ b/core/src/services/huggingface/docs.md
@@ -0,0 +1,63 @@
+This service will visit the [Huggingface
API](https://huggingface.co/docs/huggingface_hub/package_reference/hf_api) to
access the Huggingface File System.
+Currently, we only support the `model` and `dataset` types of repositories,
and operations are limited to reading and listing/stating.
+
+Huggingface doesn't host official HTTP API docs. Detailed HTTP request API
information can be found on the [Huggingface
Hub](https://github.com/huggingface/huggingface_hub).
+
+## Capabilities
+
+This service can be used to:
+
+- [x] stat
+- [x] read
+- [ ] write
+- [ ] create_dir
+- [ ] delete
+- [ ] copy
+- [ ] rename
+- [x] list
+- [ ] ~~scan~~
+- [ ] ~~presign~~
+- [ ] blocking
+
+## Configurations
+
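+- `repo_type`: The type of the repository, either `model` or `dataset`. Defaults to `model`.
+- `repo_id`: The id of the repository, for example `databricks/databricks-dolly-15k`. Required.
+- `revision`: The revision of the repository, a branch name or a commit hash. Defaults to `main`.
+- `root`: Set the work directory for backend. Defaults to `/`.
+- `token`: The token for accessing the repository. Optional.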
+
+Refer to [`HuggingfaceBuilder`]'s public API docs for more information.
+
+## Examples
+
+### Via Builder
+
+```rust
+use std::sync::Arc;
+
+use anyhow::Result;
+use opendal::services::Huggingface;
+use opendal::Operator;
+
+#[tokio::main]
+async fn main() -> Result<()> {
+ // Create Huggingface backend builder
+ let mut builder = Huggingface::default();
+
+ // set the type of Huggingface repository
+ builder.repo_type("dataset");
+ // set the id of Huggingface repository
+ builder.repo_id("databricks/databricks-dolly-15k");
+ // set the revision of Huggingface repository
+ builder.revision("main");
+ // set the root for Huggingface, all operations will happen under this root
+ builder.root("/path/to/dir");
+ // set the token for accessing the repository
+ builder.token("access_token");
+
+ let op: Operator = Operator::new(builder)?.finish();
+
+ Ok(())
+}
+```
diff --git a/core/src/services/huggingface/error.rs
b/core/src/services/huggingface/error.rs
new file mode 100644
index 000000000..4e5361ece
--- /dev/null
+++ b/core/src/services/huggingface/error.rs
@@ -0,0 +1,93 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::fmt::Debug;
+
+use http::Response;
+use http::StatusCode;
+use serde::Deserialize;
+
+use crate::raw::*;
+use crate::*;
+
+/// HuggingfaceError is the error returned by Huggingface File System.
+///
+/// Decoded from a JSON object with a single `error` string.
+#[derive(Deserialize, Default)]
+struct HuggingfaceError {
+    error: String,
+}
+
+impl Debug for HuggingfaceError {
+    /// Prints the error text under the name `message`, with line breaks
+    /// replaced by spaces so the output stays on one line.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let single_line = self.error.replace('\n', " ");
+
+        f.debug_struct("HuggingfaceError")
+            .field("message", &single_line)
+            .finish()
+    }
+}
+
+pub async fn parse_error(resp: Response<IncomingAsyncBody>) -> Result<Error> {
+ let (parts, body) = resp.into_parts();
+ let bs = body.bytes().await?;
+
+ let (kind, retryable) = match parts.status {
+ StatusCode::NOT_FOUND => (ErrorKind::NotFound, false),
+ StatusCode::UNAUTHORIZED | StatusCode::FORBIDDEN =>
(ErrorKind::PermissionDenied, false),
+ StatusCode::PRECONDITION_FAILED => (ErrorKind::ConditionNotMatch,
false),
+ StatusCode::INTERNAL_SERVER_ERROR
+ | StatusCode::BAD_GATEWAY
+ | StatusCode::SERVICE_UNAVAILABLE
+ | StatusCode::GATEWAY_TIMEOUT => (ErrorKind::Unexpected, true),
+ _ => (ErrorKind::Unexpected, false),
+ };
+
+ let message = match serde_json::from_slice::<HuggingfaceError>(&bs) {
+ Ok(hf_error) => format!("{:?}", hf_error.error),
+ Err(_) => String::from_utf8_lossy(&bs).into_owned(),
+ };
+
+ let mut err = Error::new(kind, &message);
+
+ err = with_error_response_context(err, parts);
+
+ if retryable {
+ err = err.set_temporary();
+ }
+
+ Ok(err)
+}
+
+#[cfg(test)]
+mod test {
+ use super::*;
+ use crate::raw::new_json_deserialize_error;
+ use crate::types::Result;
+
+ // Checks that a JSON body with an `error` string decodes into
+ // `HuggingfaceError` with the text unchanged.
+ #[test]
+ fn test_parse_error() -> Result<()> {
+ let resp = r#"
+ {
+ "error": "Invalid username or password."
+ }
+ "#;
+ let decoded_response =
serde_json::from_slice::<HuggingfaceError>(resp.as_bytes())
+ .map_err(new_json_deserialize_error)?;
+
+ assert_eq!(decoded_response.error, "Invalid username or password.");
+
+ Ok(())
+ }
+}
diff --git a/core/src/services/huggingface/lister.rs
b/core/src/services/huggingface/lister.rs
new file mode 100644
index 000000000..5cb591f82
--- /dev/null
+++ b/core/src/services/huggingface/lister.rs
@@ -0,0 +1,89 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use async_trait::async_trait;
+
+use super::core::HuggingfaceCore;
+use super::core::HuggingfaceStatus;
+use super::error::parse_error;
+use crate::raw::*;
+use crate::*;
+
+/// Page source for listing a directory of a Huggingface repository.
+pub struct HuggingfaceLister {
+    core: Arc<HuggingfaceCore>,
+    path: String,
+    recursive: bool,
+}
+
+impl HuggingfaceLister {
+    /// Create a lister for `path`; `recursive` asks for the whole subtree.
+    ///
+    /// Nothing is requested from the server here.
+    pub fn new(core: Arc<HuggingfaceCore>, path: String, recursive: bool) -> Self {
+        HuggingfaceLister {
+            core,
+            path,
+            recursive,
+        }
+    }
+}
+
+#[async_trait]
+impl oio::PageList for HuggingfaceLister {
+    /// Fetch the listing and push every entry into `ctx`.
+    ///
+    /// The whole listing is taken from a single response, so `ctx.done` is
+    /// set as soon as that response has been decoded.
+    /// NOTE(review): if the server paginates large trees, only the first
+    /// page is returned here — confirm against the API.
+    ///
+    /// # Errors
+    ///
+    /// A non-success status is converted by `parse_error`; a body that is
+    /// not the expected JSON yields a deserialize error.
+    async fn next_page(&self, ctx: &mut oio::PageContext) -> Result<()> {
+        let response = self.core.hf_list(&self.path, self.recursive).await?;
+
+        if !response.status().is_success() {
+            return Err(parse_error(response).await?);
+        }
+
+        let bytes = response.into_body().bytes().await?;
+        let decoded_response = serde_json::from_slice::<Vec<HuggingfaceStatus>>(&bytes)
+            .map_err(new_json_deserialize_error)?;
+
+        ctx.done = true;
+
+        for status in decoded_response {
+            let entry_type = match status.type_.as_str() {
+                "directory" => EntryMode::DIR,
+                "file" => EntryMode::FILE,
+                _ => EntryMode::Unknown,
+            };
+
+            let mut meta = Metadata::new(entry_type);
+
+            if let Some(commit_info) = status.last_commit.as_ref() {
+                meta.set_last_modified(parse_datetime_from_rfc3339(commit_info.date.as_str())?);
+            }
+
+            if entry_type == EntryMode::FILE {
+                meta.set_content_length(status.size);
+            }
+
+            // Directory entries carry a trailing slash.
+            let path = if entry_type == EntryMode::DIR {
+                format!("{}/", &status.path)
+            } else {
+                status.path
+            };
+
+            // The request was built from the absolute path (root + path), so
+            // the reported paths presumably include the root as well. Strip
+            // it so callers get paths relative to the configured root, the
+            // same form they pass in.
+            ctx.entries.push_back(oio::Entry::new(
+                &build_rel_path(&self.core.root, &path),
+                meta,
+            ));
+        }
+
+        Ok(())
+    }
+}
diff --git a/core/src/services/huggingface/mod.rs
b/core/src/services/huggingface/mod.rs
new file mode 100644
index 000000000..3a692fa45
--- /dev/null
+++ b/core/src/services/huggingface/mod.rs
@@ -0,0 +1,24 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+mod backend;
+pub use backend::HuggingfaceBuilder as Huggingface;
+pub use backend::HuggingfaceConfig;
+
+mod core;
+mod error;
+mod lister;
diff --git a/core/src/services/mod.rs b/core/src/services/mod.rs
index 1a2e64e02..edb0444e7 100644
--- a/core/src/services/mod.rs
+++ b/core/src/services/mod.rs
@@ -92,6 +92,13 @@ pub use self::http::Http;
#[cfg(feature = "services-http")]
pub use self::http::HttpConfig;
+#[cfg(feature = "services-huggingface")]
+mod huggingface;
+#[cfg(feature = "services-huggingface")]
+pub use huggingface::Huggingface;
+#[cfg(feature = "services-huggingface")]
+pub use huggingface::HuggingfaceConfig;
+
#[cfg(feature = "services-ipfs")]
mod ipfs;
#[cfg(feature = "services-ipfs")]
diff --git a/core/src/types/scheme.rs b/core/src/types/scheme.rs
index 57093f510..bdc8630de 100644
--- a/core/src/types/scheme.rs
+++ b/core/src/types/scheme.rs
@@ -68,6 +68,8 @@ pub enum Scheme {
Hdfs,
/// [http][crate::services::Http]: HTTP backend.
Http,
+ /// [huggingface][crate::services::Huggingface]: Huggingface services.
+ Huggingface,
/// [alluxio][created::services::Alluxio]: Alluxio services.
Alluxio,
@@ -201,6 +203,8 @@ impl Scheme {
Scheme::Hdfs,
#[cfg(feature = "services-http")]
Scheme::Http,
+ #[cfg(feature = "services-huggingface")]
+ Scheme::Huggingface,
#[cfg(feature = "services-ipfs")]
Scheme::Ipfs,
#[cfg(feature = "services-ipmfs")]
@@ -303,6 +307,7 @@ impl FromStr for Scheme {
"gridfs" => Ok(Scheme::Gridfs),
"hdfs" => Ok(Scheme::Hdfs),
"http" | "https" => Ok(Scheme::Http),
+ "huggingface" | "hf" => Ok(Scheme::Huggingface),
"ftp" | "ftps" => Ok(Scheme::Ftp),
"ipfs" | "ipns" => Ok(Scheme::Ipfs),
"ipmfs" => Ok(Scheme::Ipmfs),
@@ -357,6 +362,7 @@ impl From<Scheme> for &'static str {
Scheme::Gridfs => "gridfs",
Scheme::Hdfs => "hdfs",
Scheme::Http => "http",
+ Scheme::Huggingface => "huggingface",
Scheme::Foundationdb => "foundationdb",
Scheme::Ftp => "ftp",
Scheme::Ipfs => "ipfs",