This is an automated email from the ASF dual-hosted git repository.

jiekaichang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git


The following commit(s) were added to refs/heads/main by this push:
     new b89617b1b [QDP] Add GCS remote URL support (#1174)
b89617b1b is described below

commit b89617b1b590d32a8599487dbab98ac602cf84dd
Author: Jie-Kai Chang <[email protected]>
AuthorDate: Fri Mar 13 10:13:58 2026 +0800

    [QDP] Add GCS remote URL support (#1174)
    
    * [QDP] Add GCS remote URL support
    
    Signed-off-by: 400Ping <[email protected]>
    
    * cargo.lock
    
    Signed-off-by: 400Ping <[email protected]>
    
    ---------
    
    Signed-off-by: 400Ping <[email protected]>
---
 docs/qdp/api.md                                    |  2 +
 docs/qdp/getting-started.md                        | 11 +++
 qdp/Cargo.lock                                     | 10 +++
 qdp/qdp-core/Cargo.toml                            |  2 +-
 qdp/qdp-core/src/remote.rs                         | 99 +++++++++++++++++-----
 qdp/qdp-python/README.md                           |  4 +
 qdp/qdp-python/qumat_qdp/loader.py                 |  7 +-
 qdp/qdp-python/src/engine.rs                       |  6 +-
 qdp/qdp-python/tests/test_quantum_data_loader.py   | 43 +++++++---
 website/versioned_docs/version-0.5/qdp/api.md      |  2 +
 .../version-0.5/qdp/getting-started.md             | 11 +++
 11 files changed, 159 insertions(+), 38 deletions(-)

diff --git a/docs/qdp/api.md b/docs/qdp/api.md
index ceb663e4b..ccbc7d6bc 100644
--- a/docs/qdp/api.md
+++ b/docs/qdp/api.md
@@ -41,6 +41,7 @@ Encode classical input into a quantum state and return a 
DLPack tensor on GPU.
   - `torch.Tensor` (CPU, float64, contiguous)
   - `str` / `pathlib.Path` file path
     - `.parquet`, `.arrow` / `.feather`, `.npy`, `.pt` / `.pth`, `.pb`
+    - remote URL (`s3://bucket/key`, `gs://bucket/key`) when built with 
`remote-io`
 - `num_qubits` (int): Number of qubits, range 1–30.
 - `encoding_method` (str): `"amplitude" | "angle" | "basis" | "iqp" | "iqp-z"` 
(lowercase).
 
@@ -53,6 +54,7 @@ Encode classical input into a quantum state and return a 
DLPack tensor on GPU.
 - Output dtype is `complex64` (`precision="float32"`) or `complex128` 
(`precision="float64"`).
 - Parquet streaming currently supports `"amplitude"` and `"basis"`.
 - PyTorch file inputs (`.pt`, `.pth`) require building with the `pytorch` 
feature.
+- Remote URL query/fragment is not supported (`?versionId=...`, `#...`).
 
 **Raises**
 - `RuntimeError`: Invalid inputs, shapes, dtypes, or unsupported formats.
diff --git a/docs/qdp/getting-started.md b/docs/qdp/getting-started.md
index 83ae588c7..a8747cbb2 100644
--- a/docs/qdp/getting-started.md
+++ b/docs/qdp/getting-started.md
@@ -56,6 +56,17 @@ tensor = torch.from_dlpack(qtensor)  # Note: can only be 
consumed once
 engine.encode("data.parquet", num_qubits=10, encoding_method="amplitude")  # 
also: .arrow, .npy, .pt, .pb
 ```
 
+Remote object storage URLs are supported when QDP is built with `remote-io`:
+
+```python
+engine.encode("s3://my-bucket/path/data.parquet", num_qubits=10, 
encoding_method="amplitude")
+engine.encode("gs://my-bucket/path/data.parquet", num_qubits=10, 
encoding_method="amplitude")
+```
+
+Notes:
+- Remote URL query/fragment is not supported (`?versionId=...`, `#...`).
+- Streaming still requires `.parquet`.
+
 ## Tips
 
 - Use `precision="float64"` for higher precision: `QdpEngine(0, 
precision="float64")`
diff --git a/qdp/Cargo.lock b/qdp/Cargo.lock
index edb08d548..eb080ba93 100644
--- a/qdp/Cargo.lock
+++ b/qdp/Cargo.lock
@@ -1694,6 +1694,7 @@ dependencies = [
  "rand 0.8.5",
  "reqwest",
  "ring",
+ "rustls-pemfile",
  "serde",
  "serde_json",
  "snafu",
@@ -2499,6 +2500,15 @@ dependencies = [
  "security-framework",
 ]
 
+[[package]]
+name = "rustls-pemfile"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50"
+dependencies = [
+ "rustls-pki-types",
+]
+
 [[package]]
 name = "rustls-pki-types"
 version = "1.14.0"
diff --git a/qdp/qdp-core/Cargo.toml b/qdp/qdp-core/Cargo.toml
index e5fafe1f0..d496c7b11 100644
--- a/qdp/qdp-core/Cargo.toml
+++ b/qdp/qdp-core/Cargo.toml
@@ -16,7 +16,7 @@ ndarray-npy = { workspace = true }
 prost = { workspace = true }
 bytes = { workspace = true }
 tch = { workspace = true, optional = true }
-object_store = { workspace = true, features = ["aws"], optional = true }
+object_store = { workspace = true, features = ["aws", "gcp"], optional = true }
 tokio = { workspace = true, features = ["rt"], optional = true }
 tempfile = { workspace = true, optional = true }
 futures = { version = "0.3", default-features = false, optional = true }
diff --git a/qdp/qdp-core/src/remote.rs b/qdp/qdp-core/src/remote.rs
index 42b375637..a2577eb6c 100644
--- a/qdp/qdp-core/src/remote.rs
+++ b/qdp/qdp-core/src/remote.rs
@@ -16,9 +16,9 @@
 
 //! Remote I/O support for cloud object storage.
 //!
-//! When the `remote-io` feature is enabled, cloud URLs (currently `s3://`)
+//! When the `remote-io` feature is enabled, cloud URLs (`s3://`, `gs://`)
 //! are transparently downloaded to a local temp file before being passed to
-//! readers. Adding GCS (`gs://`) or Azure (`az://`) support requires only a
+//! readers. Adding more providers (for example Azure `az://`) requires only a
 //! new match arm in [`build_store`] and the corresponding `object_store`
 //! cargo feature.
 
@@ -33,7 +33,7 @@ use tempfile::NamedTempFile;
 use crate::error::{MahoutError, Result};
 
 /// Recognized cloud URL schemes.
-const REMOTE_SCHEMES: &[&str] = &["s3://"];
+const REMOTE_SCHEMES: &[&str] = &["s3://", "gs://"];
 
 /// Returns true if `path` is a recognized remote URL.
 pub fn is_remote_path(path: &str) -> bool {
@@ -41,7 +41,14 @@ pub fn is_remote_path(path: &str) -> bool {
 }
 
 /// Parse a cloud URL into (scheme, bucket, key).
+/// Query/fragment components are intentionally rejected.
 fn parse_url(url: &str) -> Result<(&str, &str, &str)> {
+    if url.contains('?') || url.contains('#') {
+        return Err(MahoutError::InvalidInput(format!(
+            "Remote URL query/fragment is not supported: {}",
+            url
+        )));
+    }
     let (scheme, rest) = url
         .split_once("://")
         .ok_or_else(|| MahoutError::InvalidInput(format!("Not a remote URL: 
{}", url)))?;
@@ -76,7 +83,18 @@ fn build_store(scheme: &str, bucket: &str) -> Result<Arc<dyn 
ObjectStore>> {
                 })?;
             Ok(Arc::new(store))
         }
-        // To add GCS:  "gs" => { ... GoogleCloudStorageBuilder ... }
+        "gs" => {
+            let store = 
object_store::gcp::GoogleCloudStorageBuilder::from_env()
+                .with_bucket_name(bucket)
+                .build()
+                .map_err(|e| {
+                    MahoutError::Io(format!(
+                        "Failed to build GCS client for bucket '{}': {}",
+                        bucket, e
+                    ))
+                })?;
+            Ok(Arc::new(store))
+        }
         // To add Azure: "az" | "abfs" => { ... MicrosoftAzureBuilder ... }
         _ => Err(MahoutError::InvalidInput(format!(
             "Unsupported remote scheme '{}://'. Supported: {}",
@@ -174,6 +192,7 @@ mod tests {
     #[test]
     fn test_is_remote_path() {
         assert!(is_remote_path("s3://bucket/key.parquet"));
+        assert!(is_remote_path("gs://bucket/key.parquet"));
         assert!(!is_remote_path("/tmp/local.parquet"));
         assert!(!is_remote_path("data.parquet"));
     }
@@ -186,6 +205,14 @@ mod tests {
         assert_eq!(key, "path/to/data.parquet");
     }
 
+    #[test]
+    fn test_parse_url_gs() {
+        let (scheme, bucket, key) = 
parse_url("gs://my-bucket/path/to/data.parquet").unwrap();
+        assert_eq!(scheme, "gs");
+        assert_eq!(bucket, "my-bucket");
+        assert_eq!(key, "path/to/data.parquet");
+    }
+
     #[test]
     fn test_parse_url_no_key() {
         assert!(parse_url("s3://bucket-only").is_err());
@@ -197,6 +224,12 @@ mod tests {
         assert!(parse_url("s3://bucket/").is_err());
     }
 
+    #[test]
+    fn test_parse_url_rejects_query_and_fragment() {
+        assert!(parse_url("s3://bucket/data.parquet?versionId=abc").is_err());
+        assert!(parse_url("gs://bucket/data.parquet#generation").is_err());
+    }
+
     #[test]
     fn test_unsupported_scheme() {
         let err = build_store("gcs", "bucket").unwrap_err();
@@ -209,24 +242,7 @@ mod tests {
         assert_eq!(resolved.path, PathBuf::from("/tmp/local.parquet"));
     }
 
-    /// Integration test: download from a real S3-compatible endpoint (MinIO).
-    ///
-    /// Requires a running MinIO with a test file uploaded. Skipped unless
-    /// `MAHOUT_TEST_S3` env var is set. Run with:
-    ///
-    /// ```sh
-    /// MAHOUT_TEST_S3=1 \
-    /// AWS_ACCESS_KEY_ID=minioadmin AWS_SECRET_ACCESS_KEY=minioadmin \
-    /// AWS_ENDPOINT=http://localhost:9123 AWS_REGION=us-east-1 
AWS_ALLOW_HTTP=true \
-    /// cargo test -p qdp-core --features remote-io -- test_download_from_minio
-    /// ```
-    #[test]
-    fn test_download_from_minio() {
-        if std::env::var("MAHOUT_TEST_S3").is_err() {
-            eprintln!("skipping test_download_from_minio (set MAHOUT_TEST_S3=1 
to run)");
-            return;
-        }
-        let resolved = resolve_path("s3://test-bucket/data.parquet").unwrap();
+    fn assert_downloaded_parquet(resolved: &ResolvedPath) {
         assert!(
             resolved.path.exists(),
             "temp file should exist after download"
@@ -252,4 +268,43 @@ mod tests {
         assert_eq!(sample_size, 4);
         assert_eq!(data.len(), 32);
     }
+
+    /// Integration test: download from a real S3-compatible endpoint (MinIO).
+    ///
+    /// Requires a running MinIO with a test file uploaded. Skipped unless
+    /// `MAHOUT_TEST_S3` env var is set. Run with:
+    ///
+    /// ```sh
+    /// MAHOUT_TEST_S3=1 \
+    /// AWS_ACCESS_KEY_ID=minioadmin AWS_SECRET_ACCESS_KEY=minioadmin \
+    /// AWS_ENDPOINT=http://localhost:9123 AWS_REGION=us-east-1 
AWS_ALLOW_HTTP=true \
+    /// cargo test -p qdp-core --features remote-io -- test_download_from_minio
+    /// ```
+    #[test]
+    fn test_download_from_minio() {
+        if std::env::var("MAHOUT_TEST_S3").is_err() {
+            eprintln!("skipping test_download_from_minio (set MAHOUT_TEST_S3=1 
to run)");
+            return;
+        }
+        let resolved = resolve_path("s3://test-bucket/data.parquet").unwrap();
+        assert_downloaded_parquet(&resolved);
+    }
+
+    /// Integration test: download from a real GCS bucket.
+    ///
+    /// Skipped unless `MAHOUT_TEST_GCS` env var is set. The remote URL can be
+    /// overridden with `MAHOUT_TEST_GCS_URL` (default: 
gs://test-bucket/data.parquet).
+    /// Authentication must be configured via environment variables supported 
by
+    /// `GoogleCloudStorageBuilder::from_env`.
+    #[test]
+    fn test_download_from_gcs() {
+        if std::env::var("MAHOUT_TEST_GCS").is_err() {
+            eprintln!("skipping test_download_from_gcs (set MAHOUT_TEST_GCS=1 
to run)");
+            return;
+        }
+        let url = std::env::var("MAHOUT_TEST_GCS_URL")
+            .unwrap_or_else(|_| "gs://test-bucket/data.parquet".to_string());
+        let resolved = resolve_path(&url).unwrap();
+        assert_downloaded_parquet(&resolved);
+    }
 }
diff --git a/qdp/qdp-python/README.md b/qdp/qdp-python/README.md
index 6f47b9fea..ee70834ad 100644
--- a/qdp/qdp-python/README.md
+++ b/qdp/qdp-python/README.md
@@ -53,6 +53,10 @@ qtensor = engine.encode("data.parquet", 10, "amplitude")
 qtensor = engine.encode("data.arrow", 10, "amplitude")
 qtensor = engine.encode("data.npy", 10, "amplitude")
 qtensor = engine.encode("data.pt", 10, "amplitude")
+
+# Remote object storage URLs (requires building with remote-io feature)
+qtensor = engine.encode("s3://my-bucket/data.parquet", 10, "amplitude")
+qtensor = engine.encode("gs://my-bucket/data.parquet", 10, "amplitude")
 ```
 
 ## Links
diff --git a/qdp/qdp-python/qumat_qdp/loader.py 
b/qdp/qdp-python/qumat_qdp/loader.py
index c6514f094..42a0bea16 100644
--- a/qdp/qdp-python/qumat_qdp/loader.py
+++ b/qdp/qdp-python/qumat_qdp/loader.py
@@ -166,10 +166,15 @@ class QuantumDataLoader:
 
         For streaming=True (Phase 2b), only .parquet is supported; data is 
read in chunks to reduce memory.
         For streaming=False, supports .parquet, .arrow, .feather, .ipc, .npy, 
.pt, .pth, .pb.
-        Remote paths (s3://) are supported when the remote-io feature is 
enabled.
+        Remote paths (s3://, gs://) are supported when the remote-io feature 
is enabled.
+        Remote URL query/fragment (for example ?versionId=... or #...) is not 
supported.
         """
         if not path or not isinstance(path, str):
             raise ValueError(f"path must be a non-empty string, got {path!r}")
+        if "://" in path and ("?" in path or "#" in path):
+            raise ValueError(
+                "Remote URL query/fragment is not supported; use plain 
scheme://bucket/key path."
+            )
         # For remote URLs, extract the key portion for extension checks.
         check_path = path.split("?")[0].rsplit("/", 1)[-1] if "://" in path 
else path
         if streaming and not (check_path.lower().endswith(".parquet")):
diff --git a/qdp/qdp-python/src/engine.rs b/qdp/qdp-python/src/engine.rs
index 2c94899c5..69b857298 100644
--- a/qdp/qdp-python/src/engine.rs
+++ b/qdp/qdp-python/src/engine.rs
@@ -380,7 +380,7 @@ impl QdpEngine {
     }
 
     /// Internal helper to encode from file based on extension.
-    /// When the `remote-io` feature is enabled, `s3://` URLs are supported.
+    /// When the `remote-io` feature is enabled, `s3://` and `gs://` URLs are 
supported.
     fn encode_from_file(
         &self,
         path: &str,
@@ -667,7 +667,7 @@ impl QdpEngine {
             nh,
         );
         let engine = self.engine.clone();
-        // Resolve S3 URLs before detaching from GIL. The _resolved guard 
keeps the
+        // Resolve remote URLs before detaching from GIL. The _resolved guard 
keeps the
         // temp file alive until after the file is fully read inside py.detach.
         #[cfg(feature = "remote-io")]
         let _resolved = 
qdp_core::remote::resolve_path(path_str.as_str()).map_err(|e| {
@@ -715,7 +715,7 @@ impl QdpEngine {
             nh,
         );
         let engine = self.engine.clone();
-        // Resolve S3 URLs before detaching from GIL. The _resolved guard 
keeps the
+        // Resolve remote URLs before detaching from GIL. The _resolved guard 
keeps the
         // temp file alive; the streaming reader's open fd preserves data 
after drop.
         #[cfg(feature = "remote-io")]
         let _resolved = 
qdp_core::remote::resolve_path(path_str.as_str()).map_err(|e| {
diff --git a/qdp/qdp-python/tests/test_quantum_data_loader.py 
b/qdp/qdp-python/tests/test_quantum_data_loader.py
index 8c93c45c5..c80ee3f67 100644
--- a/qdp/qdp-python/tests/test_quantum_data_loader.py
+++ b/qdp/qdp-python/tests/test_quantum_data_loader.py
@@ -187,7 +187,7 @@ def test_null_handling_default_is_none() -> None:
     assert loader._null_handling is None
 
 
-# --- S3 URL (source_file) builder tests ---
+# --- Remote URL (source_file) builder tests ---
 
 
 @pytest.mark.skipif(not _loader_available(), reason="QuantumDataLoader not 
available")
@@ -196,20 +196,22 @@ def test_null_handling_default_is_none() -> None:
     [
         ("s3://my-bucket/data.parquet", False),
         ("s3://bucket/path/to/data.parquet", True),
-        ("s3://bucket/data.parquet?versionId=abc123", False),
-        ("s3://bucket/data.parquet?versionId=abc123", True),
         ("s3://bucket/data.npy", False),
+        ("gs://my-bucket/data.parquet", False),
+        ("gs://bucket/path/to/data.parquet", True),
+        ("gs://bucket/data.npy", False),
     ],
     ids=[
         "parquet-no-stream",
         "parquet-stream",
-        "parquet-query-no-stream",
-        "parquet-query-stream",
         "npy-no-stream",
+        "gcs-parquet-no-stream",
+        "gcs-parquet-stream",
+        "gcs-npy-no-stream",
     ],
 )
-def test_source_file_s3_accepted(path, streaming):
-    """source_file() accepts valid S3 URLs at builder level."""
+def test_source_file_remote_url_accepted(path, streaming):
+    """source_file() accepts valid remote URLs at builder level."""
     loader = (
         QuantumDataLoader(device_id=0)
         .qubits(4)
@@ -226,15 +228,34 @@ def test_source_file_s3_accepted(path, streaming):
     "path",
     [
         "s3://bucket/data.npy",
-        "s3://bucket/data.npy?versionId=abc",
+        "gs://bucket/data.npy",
     ],
-    ids=["npy", "npy-query"],
+    ids=["s3-npy", "gcs-npy"],
 )
-def test_source_file_s3_streaming_non_parquet_raises(path):
-    """source_file(s3://..., streaming=True) with non-.parquet raises 
ValueError."""
+def test_source_file_remote_streaming_non_parquet_raises(path):
+    """source_file(remote://..., streaming=True) with non-.parquet raises 
ValueError."""
     with pytest.raises(ValueError) as exc_info:
         QuantumDataLoader(device_id=0).qubits(4).batches(10, 
size=4).source_file(
             path, streaming=True
         )
     msg = str(exc_info.value).lower()
     assert "parquet" in msg or "streaming" in msg
+
+
[email protected](not _loader_available(), reason="QuantumDataLoader not 
available")
[email protected](
+    "path",
+    [
+        "s3://bucket/data.parquet?versionId=abc",
+        "s3://bucket/data.parquet#v1",
+        "gs://bucket/data.parquet?generation=123",
+        "gs://bucket/data.parquet#v2",
+    ],
+    ids=["s3-query", "s3-fragment", "gcs-query", "gcs-fragment"],
+)
+def test_source_file_remote_query_fragment_raises(path):
+    """source_file(remote://...?... or ...#...) raises ValueError."""
+    with pytest.raises(ValueError) as exc_info:
+        QuantumDataLoader(device_id=0).qubits(4).batches(10, 
size=4).source_file(path)
+    msg = str(exc_info.value).lower()
+    assert "query" in msg or "fragment" in msg or "scheme://bucket/key" in msg
diff --git a/website/versioned_docs/version-0.5/qdp/api.md 
b/website/versioned_docs/version-0.5/qdp/api.md
index ceb663e4b..ccbc7d6bc 100644
--- a/website/versioned_docs/version-0.5/qdp/api.md
+++ b/website/versioned_docs/version-0.5/qdp/api.md
@@ -41,6 +41,7 @@ Encode classical input into a quantum state and return a 
DLPack tensor on GPU.
   - `torch.Tensor` (CPU, float64, contiguous)
   - `str` / `pathlib.Path` file path
     - `.parquet`, `.arrow` / `.feather`, `.npy`, `.pt` / `.pth`, `.pb`
+    - remote URL (`s3://bucket/key`, `gs://bucket/key`) when built with 
`remote-io`
 - `num_qubits` (int): Number of qubits, range 1–30.
 - `encoding_method` (str): `"amplitude" | "angle" | "basis" | "iqp" | "iqp-z"` 
(lowercase).
 
@@ -53,6 +54,7 @@ Encode classical input into a quantum state and return a 
DLPack tensor on GPU.
 - Output dtype is `complex64` (`precision="float32"`) or `complex128` 
(`precision="float64"`).
 - Parquet streaming currently supports `"amplitude"` and `"basis"`.
 - PyTorch file inputs (`.pt`, `.pth`) require building with the `pytorch` 
feature.
+- Remote URL query/fragment is not supported (`?versionId=...`, `#...`).
 
 **Raises**
 - `RuntimeError`: Invalid inputs, shapes, dtypes, or unsupported formats.
diff --git a/website/versioned_docs/version-0.5/qdp/getting-started.md 
b/website/versioned_docs/version-0.5/qdp/getting-started.md
index f05e84f51..d77cfe17c 100644
--- a/website/versioned_docs/version-0.5/qdp/getting-started.md
+++ b/website/versioned_docs/version-0.5/qdp/getting-started.md
@@ -55,6 +55,17 @@ tensor = torch.from_dlpack(qtensor)  # Note: can only be 
consumed once
 engine.encode("data.parquet", num_qubits=10, encoding_method="amplitude")  # 
also: .arrow, .npy, .pt, .pb
 ```
 
+Remote object storage URLs are supported when QDP is built with `remote-io`:
+
+```python
+engine.encode("s3://my-bucket/path/data.parquet", num_qubits=10, 
encoding_method="amplitude")
+engine.encode("gs://my-bucket/path/data.parquet", num_qubits=10, 
encoding_method="amplitude")
+```
+
+Notes:
+- Remote URL query/fragment is not supported (`?versionId=...`, `#...`).
+- Streaming still requires `.parquet`.
+
 ## Tips
 
 - Use `precision="float64"` for higher precision: `QdpEngine(0, 
precision="float64")`

Reply via email to