This is an automated email from the ASF dual-hosted git repository.
ryankert01 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git
The following commit(s) were added to refs/heads/main by this push:
new 0076003b0 [QDP] Pr1 phase kernel opt (#1386)
0076003b0 is described below
commit 0076003b0777ade5ae4d2e8ad9b2e8b0c83a3cd3
Author: aloha1357 <[email protected]>
AuthorDate: Tue Jun 9 20:24:47 2026 +0200
[QDP] Pr1 phase kernel opt (#1386)
* feat(qdp): optimize phase kernel divergence and hoist constant mem
computation
* test(qdp): add phase encoding tests and benchmark script
---------
Co-authored-by: Ryan Huang <[email protected]>
---
qdp/qdp-kernels/src/iqp.cu | 8 +--
qdp/qdp-kernels/src/phase.cu | 51 +++++++-------
qdp/qdp-python/benchmark/README.md | 2 +
qdp/qdp-python/benchmark/benchmark_phase.py | 105 ++++++++++++++++++++++++++++
testing/qdp/test_bindings.py | 51 ++++++++++++++
5 files changed, 187 insertions(+), 30 deletions(-)
diff --git a/qdp/qdp-kernels/src/iqp.cu b/qdp/qdp-kernels/src/iqp.cu
index a521255f4..51fd022ff 100644
--- a/qdp/qdp-kernels/src/iqp.cu
+++ b/qdp/qdp-kernels/src/iqp.cu
@@ -43,9 +43,7 @@ __device__ double compute_phase(
// Single-qubit Z terms: sum_i x_i * data[i]
for (unsigned int i = 0; i < num_qubits; ++i) {
- if ((x >> i) & 1U) {
- phase += data[i];
- }
+ phase += data[i] * (double)((x >> i) & 1U);
}
// Two-qubit ZZ terms (if enabled): sum_{i<j} x_i * x_j * data[n +
pair_index]
@@ -53,9 +51,7 @@ __device__ double compute_phase(
unsigned int pair_idx = num_qubits;
for (unsigned int i = 0; i < num_qubits; ++i) {
for (unsigned int j = i + 1; j < num_qubits; ++j) {
- if (((x >> i) & 1U) && ((x >> j) & 1U)) {
- phase += data[pair_idx];
- }
+ phase += data[pair_idx] * (double)(((x >> i) & 1U) & ((x >> j)
& 1U));
pair_idx++;
}
}
diff --git a/qdp/qdp-kernels/src/phase.cu b/qdp/qdp-kernels/src/phase.cu
index 1ee757a04..c87c1adac 100644
--- a/qdp/qdp-kernels/src/phase.cu
+++ b/qdp/qdp-kernels/src/phase.cu
@@ -29,24 +29,19 @@
#include <cuda_runtime.h>
#include <cuComplex.h>
#include <math.h>
-#include "kernel_config.h"
-// Precompute 1/√2^n as a compile-time-friendly inline.
-// For n qubits the norm factor is pow(M_SQRT1_2, n).
-__device__ __forceinline__ double phase_norm(unsigned int num_qubits) {
- // M_SQRT1_2 = 1/√2 ≈ 0.7071067811865476
- double factor = 1.0;
- for (unsigned int k = 0; k < num_qubits; ++k) {
- factor *= M_SQRT1_2;
- }
- return factor;
-}
+#ifndef M_SQRT1_2
+#define M_SQRT1_2 0.70710678118654752440
+#endif
+
+#include "kernel_config.h"
__global__ void phase_encode_kernel(
const double* __restrict__ phases,
cuDoubleComplex* __restrict__ state,
size_t state_len,
- unsigned int num_qubits
+ unsigned int num_qubits,
+ double norm_factor
) {
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= state_len) return;
@@ -55,16 +50,14 @@ __global__ void phase_encode_kernel(
double phi = 0.0;
double norm = 1.0;
for (unsigned int bit = 0; bit < num_qubits; ++bit) {
- if ((idx >> bit) & 1U) {
- phi += phases[bit];
- }
+ phi += phases[bit] * (double)((idx >> bit) & 1U);
norm *= M_SQRT1_2;
}
double re, im;
sincos(phi, &im, &re); // re = cos(φ), im = sin(φ)
- state[idx] = make_cuDoubleComplex(norm * re, norm * im);
+ state[idx] = make_cuDoubleComplex(norm_factor * re, norm_factor * im);
}
__global__ void phase_encode_batch_kernel(
@@ -72,7 +65,8 @@ __global__ void phase_encode_batch_kernel(
cuDoubleComplex* __restrict__ state_batch,
size_t num_samples,
size_t state_len,
- unsigned int num_qubits
+ unsigned int num_qubits,
+ double norm_factor
) {
const size_t total_elements = num_samples * state_len;
const size_t stride = gridDim.x * blockDim.x;
@@ -87,16 +81,13 @@ __global__ void phase_encode_batch_kernel(
double phi = 0.0;
for (unsigned int bit = 0; bit < num_qubits; ++bit) {
- if ((element_idx >> bit) & 1U) {
- phi += phases[bit];
- }
+ phi += phases[bit] * (double)((element_idx >> bit) & 1U);
}
- double norm = phase_norm(num_qubits);
double re, im;
sincos(phi, &im, &re);
- state_batch[global_idx] = make_cuDoubleComplex(norm * re, norm * im);
+ state_batch[global_idx] = make_cuDoubleComplex(norm_factor * re,
norm_factor * im);
}
}
@@ -133,11 +124,17 @@ int launch_phase_encode(
const int blockSize = DEFAULT_BLOCK_SIZE;
const int gridSize = (state_len + blockSize - 1) / blockSize;
+ double norm_factor = 1.0;
+ for (unsigned int k = 0; k < num_qubits; ++k) {
+ norm_factor *= M_SQRT1_2;
+ }
+
phase_encode_kernel<<<gridSize, blockSize, 0, stream>>>(
phases_d,
state_complex_d,
state_len,
- num_qubits
+ num_qubits,
+ norm_factor
);
return (int)cudaGetLastError();
@@ -175,12 +172,18 @@ int launch_phase_encode_batch(
const size_t max_blocks = MAX_GRID_BLOCKS;
const size_t gridSize = (blocks_needed < max_blocks) ? blocks_needed :
max_blocks;
+ double norm_factor = 1.0;
+ for (unsigned int k = 0; k < num_qubits; ++k) {
+ norm_factor *= M_SQRT1_2;
+ }
+
phase_encode_batch_kernel<<<gridSize, blockSize, 0, stream>>>(
phases_batch_d,
state_complex_d,
num_samples,
state_len,
- num_qubits
+ num_qubits,
+ norm_factor
);
return (int)cudaGetLastError();
diff --git a/qdp/qdp-python/benchmark/README.md
b/qdp/qdp-python/benchmark/README.md
index 0dbea81ee..d30715a1c 100644
--- a/qdp/qdp-python/benchmark/README.md
+++ b/qdp/qdp-python/benchmark/README.md
@@ -8,6 +8,7 @@ scripts:
- `benchmark_throughput.py`: DataLoader-style throughput benchmark
that measures vectors/sec across Mahout, PennyLane, and Qiskit.
- `benchmark_latency.py`: Data-to-State latency benchmark (CPU RAM -> GPU
VRAM).
+- `benchmark_phase.py`: GPU phase encoding latency benchmark (batch encode
timing).
## Quick Start
@@ -32,6 +33,7 @@ To run individual benchmarks after setup:
uv run --project qdp/qdp-python python
qdp/qdp-python/benchmark/benchmark_e2e.py
uv run --project qdp/qdp-python python
qdp/qdp-python/benchmark/benchmark_latency.py
uv run --project qdp/qdp-python python
qdp/qdp-python/benchmark/benchmark_throughput.py
+uv run --project qdp/qdp-python python
qdp/qdp-python/benchmark/benchmark_phase.py
```
This keeps all benchmark dependencies in the unified repo root venv
(`mahout/.venv`).
diff --git a/qdp/qdp-python/benchmark/benchmark_phase.py
b/qdp/qdp-python/benchmark/benchmark_phase.py
new file mode 100644
index 000000000..c0a5e9039
--- /dev/null
+++ b/qdp/qdp-python/benchmark/benchmark_phase.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""GPU phase encoding latency benchmark.
+
+Measures batch phase encoding on the current build. For before/after
comparisons,
+checkout the baseline commit, rebuild the extension, run with ``--label
baseline``,
+then repeat on the optimized branch with ``--label optimized``.
+
+Run from repo root::
+
+ uv run --project qdp/qdp-python python
qdp/qdp-python/benchmark/benchmark_phase.py
+"""
+
+from __future__ import annotations
+
+import argparse
+
+import torch
+from qumat_qdp import QdpEngine
+
+
+def benchmark_phase(
+ num_qubits: int,
+ num_samples: int,
+ iters: int = 5,
+) -> tuple[float, float]:
+ """Return (total_us, per_sample_us) for batch phase encoding on GPU."""
+ phases = torch.tensor(
+ [[0.1 * (k + 1) for k in range(num_qubits)]] * num_samples,
+ dtype=torch.float64,
+ device="cuda",
+ )
+ engine = QdpEngine(0)
+
+ for _ in range(3):
+ qtensor = engine.encode(phases, num_qubits, "phase")
+ _ = torch.from_dlpack(qtensor)
+ torch.cuda.synchronize()
+
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ start.record()
+ for _ in range(iters):
+ qtensor = engine.encode(phases, num_qubits, "phase")
+ _ = torch.from_dlpack(qtensor)
+ end.record()
+ torch.cuda.synchronize()
+
+ total_ms = start.elapsed_time(end) / iters
+ total_us = total_ms * 1000.0
+ per_sample_us = total_us / num_samples
+ return total_us, per_sample_us
+
+
+def main() -> None:
+ parser = argparse.ArgumentParser(description="GPU phase encoding
benchmark")
+ parser.add_argument("--qubits", type=int, default=14)
+ parser.add_argument("--batch-size", type=int, default=128)
+ parser.add_argument("--iterations", type=int, default=5)
+ parser.add_argument(
+ "--label",
+ choices=("baseline", "optimized"),
+ default="optimized",
+ help="Tag for the current checkout (baseline or optimized)",
+ )
+ args = parser.parse_args()
+
+ if not torch.cuda.is_available():
+ raise SystemExit("CUDA not available. Cannot benchmark.")
+
+ device_name = torch.cuda.get_device_name(0)
+ total_us, per_sample_us = benchmark_phase(
+ args.qubits, args.batch_size, args.iterations
+ )
+
+ print("=" * 70)
+ print("Phase encoding GPU benchmark")
+ print(f"GPU: {device_name}")
+ print(
+ f"Config: qubits={args.qubits}, batch_size={args.batch_size}, "
+ f"iterations={args.iterations}, label={args.label}"
+ )
+ print("=" * 70)
+ print(f"Total batch time: {total_us:.1f} us")
+ print(f"Per sample: {per_sample_us:.2f} us ({per_sample_us /
1000:.3f} ms)")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/testing/qdp/test_bindings.py b/testing/qdp/test_bindings.py
index b6a2b60f5..eb5d92ce9 100644
--- a/testing/qdp/test_bindings.py
+++ b/testing/qdp/test_bindings.py
@@ -1795,3 +1795,54 @@ def test_phase_encode_shape(data_shape, expected_shape):
torch_tensor = torch.from_dlpack(qtensor)
assert torch_tensor.shape == expected_shape
+
+
+@requires_qdp
[email protected]
+def test_phase_encode_batch_large():
+ """Test larger batch phase encoding (covers batch launch path)."""
+ pytest.importorskip("torch")
+ from _qdp import QdpEngine
+
+ if not torch.cuda.is_available():
+ pytest.skip("GPU required for QdpEngine")
+
+ engine = QdpEngine(0)
+ batch_size = 128
+ n = 8
+ data = torch.rand((batch_size, n), dtype=torch.float64) * 5.9 + 0.1
+ qtensor = engine.encode(data, n, "phase")
+ torch_tensor = torch.from_dlpack(qtensor)
+
+ assert torch_tensor.shape == (batch_size, 1 << n)
+ for i in range(batch_size):
+ sample = torch_tensor[i]
+ norm_sq = torch.sum(torch.abs(sample) ** 2).item()
+ assert abs(norm_sq - 1.0) < 1e-8, f"Sample {i} norm {norm_sq}"
+ assert bool(torch.all(torch.isfinite(sample))), (
+ f"Non-finite values in sample {i}"
+ )
+
+
+@requires_qdp
[email protected]
+def test_phase_encode_large_n_correctness():
+ """Test phase encoding at N=14 for correctness on large state vectors."""
+ pytest.importorskip("torch")
+ from _qdp import QdpEngine
+
+ if not torch.cuda.is_available():
+ pytest.skip("GPU required for QdpEngine")
+
+ engine = QdpEngine(0)
+ n = 14
+ phases = [0.1 * (k + 1) for k in range(n)]
+ qtensor = engine.encode(phases, n, "phase")
+ torch_tensor = torch.from_dlpack(qtensor)
+
+ assert torch_tensor.shape == (1, 1 << n)
+ norm_sq = torch.sum(torch.abs(torch_tensor) ** 2).item()
+ assert abs(norm_sq - 1.0) < 1e-9, f"Expected unit norm at N=14, got
{norm_sq}"
+ assert bool(torch.all(torch.isfinite(torch_tensor))), (
+ "Non-finite values in large-N phase output"
+ )