This is an automated email from the ASF dual-hosted git repository.
jiekaichang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git
The following commit(s) were added to refs/heads/main by this push:
new b89617b1b [QDP] Add GCS remote URL support (#1174)
b89617b1b is described below
commit b89617b1b590d32a8599487dbab98ac602cf84dd
Author: Jie-Kai Chang <[email protected]>
AuthorDate: Fri Mar 13 10:13:58 2026 +0800
[QDP] Add GCS remote URL support (#1174)
* [QDP] Add GCS remote URL support
Signed-off-by: 400Ping <[email protected]>
* cargo.lock
Signed-off-by: 400Ping <[email protected]>
---------
Signed-off-by: 400Ping <[email protected]>
---
docs/qdp/api.md | 2 +
docs/qdp/getting-started.md | 11 +++
qdp/Cargo.lock | 10 +++
qdp/qdp-core/Cargo.toml | 2 +-
qdp/qdp-core/src/remote.rs | 99 +++++++++++++++++-----
qdp/qdp-python/README.md | 4 +
qdp/qdp-python/qumat_qdp/loader.py | 7 +-
qdp/qdp-python/src/engine.rs | 6 +-
qdp/qdp-python/tests/test_quantum_data_loader.py | 43 +++++++---
website/versioned_docs/version-0.5/qdp/api.md | 2 +
.../version-0.5/qdp/getting-started.md | 11 +++
11 files changed, 159 insertions(+), 38 deletions(-)
diff --git a/docs/qdp/api.md b/docs/qdp/api.md
index ceb663e4b..ccbc7d6bc 100644
--- a/docs/qdp/api.md
+++ b/docs/qdp/api.md
@@ -41,6 +41,7 @@ Encode classical input into a quantum state and return a
DLPack tensor on GPU.
- `torch.Tensor` (CPU, float64, contiguous)
- `str` / `pathlib.Path` file path
- `.parquet`, `.arrow` / `.feather`, `.npy`, `.pt` / `.pth`, `.pb`
+ - remote URL (`s3://bucket/key`, `gs://bucket/key`) when built with `remote-io`
- `num_qubits` (int): Number of qubits, range 1–30.
- `encoding_method` (str): `"amplitude" | "angle" | "basis" | "iqp" | "iqp-z"`
(lowercase).
@@ -53,6 +54,7 @@ Encode classical input into a quantum state and return a
DLPack tensor on GPU.
- Output dtype is `complex64` (`precision="float32"`) or `complex128`
(`precision="float64"`).
- Parquet streaming currently supports `"amplitude"` and `"basis"`.
- PyTorch file inputs (`.pt`, `.pth`) require building with the `pytorch`
feature.
+- Remote URL query/fragment is not supported (`?versionId=...`, `#...`).
**Raises**
- `RuntimeError`: Invalid inputs, shapes, dtypes, or unsupported formats.
diff --git a/docs/qdp/getting-started.md b/docs/qdp/getting-started.md
index 83ae588c7..a8747cbb2 100644
--- a/docs/qdp/getting-started.md
+++ b/docs/qdp/getting-started.md
@@ -56,6 +56,17 @@ tensor = torch.from_dlpack(qtensor) # Note: can only be
consumed once
engine.encode("data.parquet", num_qubits=10, encoding_method="amplitude") #
also: .arrow, .npy, .pt, .pb
```
+Remote object storage URLs are supported when QDP is built with `remote-io`:
+
+```python
+engine.encode("s3://my-bucket/path/data.parquet", num_qubits=10, encoding_method="amplitude")
+engine.encode("gs://my-bucket/path/data.parquet", num_qubits=10, encoding_method="amplitude")
+```
+
+Notes:
+- Remote URL query/fragment is not supported (`?versionId=...`, `#...`).
+- Streaming still requires `.parquet`.
+
## Tips
- Use `precision="float64"` for higher precision: `QdpEngine(0,
precision="float64")`
diff --git a/qdp/Cargo.lock b/qdp/Cargo.lock
index edb08d548..eb080ba93 100644
--- a/qdp/Cargo.lock
+++ b/qdp/Cargo.lock
@@ -1694,6 +1694,7 @@ dependencies = [
"rand 0.8.5",
"reqwest",
"ring",
+ "rustls-pemfile",
"serde",
"serde_json",
"snafu",
@@ -2499,6 +2500,15 @@ dependencies = [
"security-framework",
]
+[[package]]
+name = "rustls-pemfile"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50"
+dependencies = [
+ "rustls-pki-types",
+]
+
[[package]]
name = "rustls-pki-types"
version = "1.14.0"
diff --git a/qdp/qdp-core/Cargo.toml b/qdp/qdp-core/Cargo.toml
index e5fafe1f0..d496c7b11 100644
--- a/qdp/qdp-core/Cargo.toml
+++ b/qdp/qdp-core/Cargo.toml
@@ -16,7 +16,7 @@ ndarray-npy = { workspace = true }
prost = { workspace = true }
bytes = { workspace = true }
tch = { workspace = true, optional = true }
-object_store = { workspace = true, features = ["aws"], optional = true }
+object_store = { workspace = true, features = ["aws", "gcp"], optional = true }
tokio = { workspace = true, features = ["rt"], optional = true }
tempfile = { workspace = true, optional = true }
futures = { version = "0.3", default-features = false, optional = true }
diff --git a/qdp/qdp-core/src/remote.rs b/qdp/qdp-core/src/remote.rs
index 42b375637..a2577eb6c 100644
--- a/qdp/qdp-core/src/remote.rs
+++ b/qdp/qdp-core/src/remote.rs
@@ -16,9 +16,9 @@
//! Remote I/O support for cloud object storage.
//!
-//! When the `remote-io` feature is enabled, cloud URLs (currently `s3://`)
+//! When the `remote-io` feature is enabled, cloud URLs (`s3://`, `gs://`)
//! are transparently downloaded to a local temp file before being passed to
-//! readers. Adding GCS (`gs://`) or Azure (`az://`) support requires only a
+//! readers. Adding more providers (for example Azure `az://`) requires only a
//! new match arm in [`build_store`] and the corresponding `object_store`
//! cargo feature.
@@ -33,7 +33,7 @@ use tempfile::NamedTempFile;
use crate::error::{MahoutError, Result};
/// Recognized cloud URL schemes.
-const REMOTE_SCHEMES: &[&str] = &["s3://"];
+const REMOTE_SCHEMES: &[&str] = &["s3://", "gs://"];
/// Returns true if `path` is a recognized remote URL.
pub fn is_remote_path(path: &str) -> bool {
@@ -41,7 +41,14 @@ pub fn is_remote_path(path: &str) -> bool {
}
/// Parse a cloud URL into (scheme, bucket, key).
+/// Query/fragment components are intentionally rejected.
fn parse_url(url: &str) -> Result<(&str, &str, &str)> {
+ if url.contains('?') || url.contains('#') {
+ return Err(MahoutError::InvalidInput(format!(
+ "Remote URL query/fragment is not supported: {}",
+ url
+ )));
+ }
let (scheme, rest) = url
.split_once("://")
.ok_or_else(|| MahoutError::InvalidInput(format!("Not a remote URL: {}", url)))?;
@@ -76,7 +83,18 @@ fn build_store(scheme: &str, bucket: &str) -> Result<Arc<dyn ObjectStore>> {
})?;
Ok(Arc::new(store))
}
- // To add GCS: "gs" => { ... GoogleCloudStorageBuilder ... }
+ "gs" => {
+ let store = object_store::gcp::GoogleCloudStorageBuilder::from_env()
+ .with_bucket_name(bucket)
+ .build()
+ .map_err(|e| {
+ MahoutError::Io(format!(
+ "Failed to build GCS client for bucket '{}': {}",
+ bucket, e
+ ))
+ })?;
+ Ok(Arc::new(store))
+ }
// To add Azure: "az" | "abfs" => { ... MicrosoftAzureBuilder ... }
_ => Err(MahoutError::InvalidInput(format!(
"Unsupported remote scheme '{}://'. Supported: {}",
@@ -174,6 +192,7 @@ mod tests {
#[test]
fn test_is_remote_path() {
assert!(is_remote_path("s3://bucket/key.parquet"));
+ assert!(is_remote_path("gs://bucket/key.parquet"));
assert!(!is_remote_path("/tmp/local.parquet"));
assert!(!is_remote_path("data.parquet"));
}
@@ -186,6 +205,14 @@ mod tests {
assert_eq!(key, "path/to/data.parquet");
}
+ #[test]
+ fn test_parse_url_gs() {
+ let (scheme, bucket, key) = parse_url("gs://my-bucket/path/to/data.parquet").unwrap();
+ assert_eq!(scheme, "gs");
+ assert_eq!(bucket, "my-bucket");
+ assert_eq!(key, "path/to/data.parquet");
+ }
+
#[test]
fn test_parse_url_no_key() {
assert!(parse_url("s3://bucket-only").is_err());
@@ -197,6 +224,12 @@ mod tests {
assert!(parse_url("s3://bucket/").is_err());
}
+ #[test]
+ fn test_parse_url_rejects_query_and_fragment() {
+ assert!(parse_url("s3://bucket/data.parquet?versionId=abc").is_err());
+ assert!(parse_url("gs://bucket/data.parquet#generation").is_err());
+ }
+
#[test]
fn test_unsupported_scheme() {
let err = build_store("gcs", "bucket").unwrap_err();
@@ -209,24 +242,7 @@ mod tests {
assert_eq!(resolved.path, PathBuf::from("/tmp/local.parquet"));
}
- /// Integration test: download from a real S3-compatible endpoint (MinIO).
- ///
- /// Requires a running MinIO with a test file uploaded. Skipped unless
- /// `MAHOUT_TEST_S3` env var is set. Run with:
- ///
- /// ```sh
- /// MAHOUT_TEST_S3=1 \
- /// AWS_ACCESS_KEY_ID=minioadmin AWS_SECRET_ACCESS_KEY=minioadmin \
- /// AWS_ENDPOINT=http://localhost:9123 AWS_REGION=us-east-1 AWS_ALLOW_HTTP=true \
- /// cargo test -p qdp-core --features remote-io -- test_download_from_minio
- /// ```
- #[test]
- fn test_download_from_minio() {
- if std::env::var("MAHOUT_TEST_S3").is_err() {
- eprintln!("skipping test_download_from_minio (set MAHOUT_TEST_S3=1 to run)");
- return;
- }
- let resolved = resolve_path("s3://test-bucket/data.parquet").unwrap();
+ fn assert_downloaded_parquet(resolved: &ResolvedPath) {
assert!(
resolved.path.exists(),
"temp file should exist after download"
@@ -252,4 +268,43 @@ mod tests {
assert_eq!(sample_size, 4);
assert_eq!(data.len(), 32);
}
+
+ /// Integration test: download from a real S3-compatible endpoint (MinIO).
+ ///
+ /// Requires a running MinIO with a test file uploaded. Skipped unless
+ /// `MAHOUT_TEST_S3` env var is set. Run with:
+ ///
+ /// ```sh
+ /// MAHOUT_TEST_S3=1 \
+ /// AWS_ACCESS_KEY_ID=minioadmin AWS_SECRET_ACCESS_KEY=minioadmin \
+ /// AWS_ENDPOINT=http://localhost:9123 AWS_REGION=us-east-1 AWS_ALLOW_HTTP=true \
+ /// cargo test -p qdp-core --features remote-io -- test_download_from_minio
+ /// ```
+ #[test]
+ fn test_download_from_minio() {
+ if std::env::var("MAHOUT_TEST_S3").is_err() {
+ eprintln!("skipping test_download_from_minio (set MAHOUT_TEST_S3=1 to run)");
+ return;
+ }
+ let resolved = resolve_path("s3://test-bucket/data.parquet").unwrap();
+ assert_downloaded_parquet(&resolved);
+ }
+
+ /// Integration test: download from a real GCS bucket.
+ ///
+ /// Skipped unless `MAHOUT_TEST_GCS` env var is set. The remote URL can be
+ /// overridden with `MAHOUT_TEST_GCS_URL` (default: gs://test-bucket/data.parquet).
+ /// Authentication must be configured via environment variables supported by
+ /// `GoogleCloudStorageBuilder::from_env`.
+ #[test]
+ fn test_download_from_gcs() {
+ if std::env::var("MAHOUT_TEST_GCS").is_err() {
+ eprintln!("skipping test_download_from_gcs (set MAHOUT_TEST_GCS=1 to run)");
+ return;
+ }
+ let url = std::env::var("MAHOUT_TEST_GCS_URL")
+ .unwrap_or_else(|_| "gs://test-bucket/data.parquet".to_string());
+ let resolved = resolve_path(&url).unwrap();
+ assert_downloaded_parquet(&resolved);
+ }
}
diff --git a/qdp/qdp-python/README.md b/qdp/qdp-python/README.md
index 6f47b9fea..ee70834ad 100644
--- a/qdp/qdp-python/README.md
+++ b/qdp/qdp-python/README.md
@@ -53,6 +53,10 @@ qtensor = engine.encode("data.parquet", 10, "amplitude")
qtensor = engine.encode("data.arrow", 10, "amplitude")
qtensor = engine.encode("data.npy", 10, "amplitude")
qtensor = engine.encode("data.pt", 10, "amplitude")
+
+# Remote object storage URLs (requires building with remote-io feature)
+qtensor = engine.encode("s3://my-bucket/data.parquet", 10, "amplitude")
+qtensor = engine.encode("gs://my-bucket/data.parquet", 10, "amplitude")
```
## Links
diff --git a/qdp/qdp-python/qumat_qdp/loader.py b/qdp/qdp-python/qumat_qdp/loader.py
index c6514f094..42a0bea16 100644
--- a/qdp/qdp-python/qumat_qdp/loader.py
+++ b/qdp/qdp-python/qumat_qdp/loader.py
@@ -166,10 +166,15 @@ class QuantumDataLoader:
For streaming=True (Phase 2b), only .parquet is supported; data is read in chunks to reduce memory.
For streaming=False, supports .parquet, .arrow, .feather, .ipc, .npy, .pt, .pth, .pb.
- Remote paths (s3://) are supported when the remote-io feature is enabled.
+ Remote paths (s3://, gs://) are supported when the remote-io feature is enabled.
+ Remote URL query/fragment (for example ?versionId=... or #...) is not supported.
"""
if not path or not isinstance(path, str):
raise ValueError(f"path must be a non-empty string, got {path!r}")
+ if "://" in path and ("?" in path or "#" in path):
+ raise ValueError(
+ "Remote URL query/fragment is not supported; use plain scheme://bucket/key path."
+ )
# For remote URLs, extract the key portion for extension checks.
check_path = path.split("?")[0].rsplit("/", 1)[-1] if "://" in path else path
if streaming and not (check_path.lower().endswith(".parquet")):
diff --git a/qdp/qdp-python/src/engine.rs b/qdp/qdp-python/src/engine.rs
index 2c94899c5..69b857298 100644
--- a/qdp/qdp-python/src/engine.rs
+++ b/qdp/qdp-python/src/engine.rs
@@ -380,7 +380,7 @@ impl QdpEngine {
}
/// Internal helper to encode from file based on extension.
- /// When the `remote-io` feature is enabled, `s3://` URLs are supported.
+ /// When the `remote-io` feature is enabled, `s3://` and `gs://` URLs are supported.
fn encode_from_file(
&self,
path: &str,
@@ -667,7 +667,7 @@ impl QdpEngine {
nh,
);
let engine = self.engine.clone();
- // Resolve S3 URLs before detaching from GIL. The _resolved guard keeps the
+ // Resolve remote URLs before detaching from GIL. The _resolved guard keeps the
// temp file alive until after the file is fully read inside py.detach.
#[cfg(feature = "remote-io")]
let _resolved = qdp_core::remote::resolve_path(path_str.as_str()).map_err(|e| {
@@ -715,7 +715,7 @@ impl QdpEngine {
nh,
);
let engine = self.engine.clone();
- // Resolve S3 URLs before detaching from GIL. The _resolved guard keeps the
+ // Resolve remote URLs before detaching from GIL. The _resolved guard keeps the
// temp file alive; the streaming reader's open fd preserves data after drop.
#[cfg(feature = "remote-io")]
let _resolved = qdp_core::remote::resolve_path(path_str.as_str()).map_err(|e| {
diff --git a/qdp/qdp-python/tests/test_quantum_data_loader.py b/qdp/qdp-python/tests/test_quantum_data_loader.py
index 8c93c45c5..c80ee3f67 100644
--- a/qdp/qdp-python/tests/test_quantum_data_loader.py
+++ b/qdp/qdp-python/tests/test_quantum_data_loader.py
@@ -187,7 +187,7 @@ def test_null_handling_default_is_none() -> None:
assert loader._null_handling is None
-# --- S3 URL (source_file) builder tests ---
+# --- Remote URL (source_file) builder tests ---
@pytest.mark.skipif(not _loader_available(), reason="QuantumDataLoader not available")
@@ -196,20 +196,22 @@ def test_null_handling_default_is_none() -> None:
[
("s3://my-bucket/data.parquet", False),
("s3://bucket/path/to/data.parquet", True),
- ("s3://bucket/data.parquet?versionId=abc123", False),
- ("s3://bucket/data.parquet?versionId=abc123", True),
("s3://bucket/data.npy", False),
+ ("gs://my-bucket/data.parquet", False),
+ ("gs://bucket/path/to/data.parquet", True),
+ ("gs://bucket/data.npy", False),
],
ids=[
"parquet-no-stream",
"parquet-stream",
- "parquet-query-no-stream",
- "parquet-query-stream",
"npy-no-stream",
+ "gcs-parquet-no-stream",
+ "gcs-parquet-stream",
+ "gcs-npy-no-stream",
],
)
-def test_source_file_s3_accepted(path, streaming):
- """source_file() accepts valid S3 URLs at builder level."""
+def test_source_file_remote_url_accepted(path, streaming):
+ """source_file() accepts valid remote URLs at builder level."""
loader = (
QuantumDataLoader(device_id=0)
.qubits(4)
@@ -226,15 +228,34 @@ def test_source_file_s3_accepted(path, streaming):
"path",
[
"s3://bucket/data.npy",
- "s3://bucket/data.npy?versionId=abc",
+ "gs://bucket/data.npy",
],
- ids=["npy", "npy-query"],
+ ids=["s3-npy", "gcs-npy"],
)
-def test_source_file_s3_streaming_non_parquet_raises(path):
- """source_file(s3://..., streaming=True) with non-.parquet raises ValueError."""
+def test_source_file_remote_streaming_non_parquet_raises(path):
+ """source_file(remote://..., streaming=True) with non-.parquet raises ValueError."""
with pytest.raises(ValueError) as exc_info:
QuantumDataLoader(device_id=0).qubits(4).batches(10, size=4).source_file(
path, streaming=True
)
msg = str(exc_info.value).lower()
assert "parquet" in msg or "streaming" in msg
+
+
+@pytest.mark.skipif(not _loader_available(), reason="QuantumDataLoader not available")
+@pytest.mark.parametrize(
+ "path",
+ [
+ "s3://bucket/data.parquet?versionId=abc",
+ "s3://bucket/data.parquet#v1",
+ "gs://bucket/data.parquet?generation=123",
+ "gs://bucket/data.parquet#v2",
+ ],
+ ids=["s3-query", "s3-fragment", "gcs-query", "gcs-fragment"],
+)
+def test_source_file_remote_query_fragment_raises(path):
+ """source_file(remote://...?... or ...#...) raises ValueError."""
+ with pytest.raises(ValueError) as exc_info:
+ QuantumDataLoader(device_id=0).qubits(4).batches(10, size=4).source_file(path)
+ msg = str(exc_info.value).lower()
+ assert "query" in msg or "fragment" in msg or "scheme://bucket/key" in msg
diff --git a/website/versioned_docs/version-0.5/qdp/api.md b/website/versioned_docs/version-0.5/qdp/api.md
index ceb663e4b..ccbc7d6bc 100644
--- a/website/versioned_docs/version-0.5/qdp/api.md
+++ b/website/versioned_docs/version-0.5/qdp/api.md
@@ -41,6 +41,7 @@ Encode classical input into a quantum state and return a
DLPack tensor on GPU.
- `torch.Tensor` (CPU, float64, contiguous)
- `str` / `pathlib.Path` file path
- `.parquet`, `.arrow` / `.feather`, `.npy`, `.pt` / `.pth`, `.pb`
+ - remote URL (`s3://bucket/key`, `gs://bucket/key`) when built with `remote-io`
- `num_qubits` (int): Number of qubits, range 1–30.
- `encoding_method` (str): `"amplitude" | "angle" | "basis" | "iqp" | "iqp-z"`
(lowercase).
@@ -53,6 +54,7 @@ Encode classical input into a quantum state and return a
DLPack tensor on GPU.
- Output dtype is `complex64` (`precision="float32"`) or `complex128`
(`precision="float64"`).
- Parquet streaming currently supports `"amplitude"` and `"basis"`.
- PyTorch file inputs (`.pt`, `.pth`) require building with the `pytorch`
feature.
+- Remote URL query/fragment is not supported (`?versionId=...`, `#...`).
**Raises**
- `RuntimeError`: Invalid inputs, shapes, dtypes, or unsupported formats.
diff --git a/website/versioned_docs/version-0.5/qdp/getting-started.md b/website/versioned_docs/version-0.5/qdp/getting-started.md
index f05e84f51..d77cfe17c 100644
--- a/website/versioned_docs/version-0.5/qdp/getting-started.md
+++ b/website/versioned_docs/version-0.5/qdp/getting-started.md
@@ -55,6 +55,17 @@ tensor = torch.from_dlpack(qtensor) # Note: can only be
consumed once
engine.encode("data.parquet", num_qubits=10, encoding_method="amplitude") #
also: .arrow, .npy, .pt, .pb
```
+Remote object storage URLs are supported when QDP is built with `remote-io`:
+
+```python
+engine.encode("s3://my-bucket/path/data.parquet", num_qubits=10, encoding_method="amplitude")
+engine.encode("gs://my-bucket/path/data.parquet", num_qubits=10, encoding_method="amplitude")
+```
+
+Notes:
+- Remote URL query/fragment is not supported (`?versionId=...`, `#...`).
+- Streaming still requires `.parquet`.
+
## Tips
- Use `precision="float64"` for higher precision: `QdpEngine(0,
precision="float64")`