This is an automated email from the ASF dual-hosted git repository.

hcr pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git


The following commit(s) were added to refs/heads/main by this push:
     new 15d3df9d6 [QDP] Refactor qdp-python `lib.rs` (#1054)
15d3df9d6 is described below

commit 15d3df9d6f9169540804ef5e49423ec5270ff987
Author: Vic Wen <[email protected]>
AuthorDate: Sat Feb 21 20:34:29 2026 +0800

    [QDP] Refactor qdp-python `lib.rs` (#1054)
    
    * refactor: refactor qdp-python `lib.rs`
    
    * feat: add main branch feature
---
 qdp/qdp-python/src/dlpack.rs             |  200 +++++
 qdp/qdp-python/src/{lib.rs => engine.rs} |  847 +++----------------
 qdp/qdp-python/src/lib.rs                | 1302 +-----------------------------
 qdp/qdp-python/src/loader.rs             |  103 +++
 qdp/qdp-python/src/pytorch.rs            |  251 ++++++
 qdp/qdp-python/src/tensor.rs             |  147 ++++
 6 files changed, 807 insertions(+), 2043 deletions(-)

diff --git a/qdp/qdp-python/src/dlpack.rs b/qdp/qdp-python/src/dlpack.rs
new file mode 100644
index 000000000..4a1ad08ec
--- /dev/null
+++ b/qdp/qdp-python/src/dlpack.rs
@@ -0,0 +1,200 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use pyo3::exceptions::PyRuntimeError;
+use pyo3::ffi;
+use pyo3::prelude::*;
+use qdp_core::dlpack::{DL_FLOAT, DLDeviceType, DLManagedTensor};
+use std::ffi::c_void;
+
+/// DLPack tensor information extracted from a PyCapsule
+///
+/// This struct owns the DLManagedTensor pointer and ensures proper cleanup
+/// via the DLPack deleter when dropped (RAII pattern).
+pub struct DLPackTensorInfo {
+    /// Raw DLManagedTensor pointer from PyTorch DLPack capsule
+    /// This is owned by this struct and will be freed via deleter on drop
+    pub managed_ptr: *mut DLManagedTensor,
+    /// Data pointer inside dl_tensor (GPU memory, owned by managed_ptr)
+    pub data_ptr: *const c_void,
+    pub shape: Vec<i64>,
+    /// CUDA device ID from DLPack metadata.
+    /// Used for defensive validation against PyTorch API device ID.
+    pub device_id: i32,
+}
+
+impl Drop for DLPackTensorInfo {
+    fn drop(&mut self) {
+        unsafe {
+            if !self.managed_ptr.is_null() {
+                // Per DLPack protocol: consumer must call deleter exactly once
+                if let Some(deleter) = (*self.managed_ptr).deleter {
+                    deleter(self.managed_ptr);
+                }
+                // Prevent double-free
+                self.managed_ptr = std::ptr::null_mut();
+            }
+        }
+    }
+}
+
+/// Extract GPU pointer from PyTorch tensor's __dlpack__() capsule
+///
+/// Uses the DLPack protocol to obtain a zero-copy view of the tensor's GPU 
memory.
+/// The returned `DLPackTensorInfo` owns the DLManagedTensor and will 
automatically
+/// call the deleter when dropped, ensuring proper resource cleanup.
+///
+/// # Safety
+/// The returned `data_ptr` points to GPU memory owned by the source tensor.
+/// The caller must ensure the source tensor remains alive and unmodified
+/// for the entire duration that `data_ptr` is in use. Python's GIL ensures
+/// the tensor won't be garbage collected during `encode()`, but the caller
+/// must not deallocate or resize the tensor while encoding is in progress.
+pub fn extract_dlpack_tensor(
+    _py: Python<'_>,
+    tensor: &Bound<'_, PyAny>,
+) -> PyResult<DLPackTensorInfo> {
+    // Call tensor.__dlpack__() to get PyCapsule
+    // Note: PyTorch's __dlpack__ uses the default stream when called without 
arguments
+    let capsule = tensor.call_method0("__dlpack__")?;
+
+    const DLTENSOR_NAME: &[u8] = b"dltensor\0";
+
+    // SAFETY: capsule is a valid PyCapsule from tensor.__dlpack__(). 
DLTENSOR_NAME is a
+    // null-terminated C string for the lifetime of the call. We only read the 
capsule
+    // and call PyCapsule_IsValid / PyCapsule_GetPointer; we do not invalidate 
the capsule.
+    let managed_ptr = unsafe {
+        let capsule_ptr = capsule.as_ptr();
+        if ffi::PyCapsule_IsValid(capsule_ptr, DLTENSOR_NAME.as_ptr() as 
*const i8) == 0 {
+            return Err(PyRuntimeError::new_err(
+                "Invalid DLPack capsule (expected 'dltensor')",
+            ));
+        }
+        let ptr = ffi::PyCapsule_GetPointer(capsule_ptr, 
DLTENSOR_NAME.as_ptr() as *const i8)
+            as *mut DLManagedTensor;
+        if ptr.is_null() {
+            return Err(PyRuntimeError::new_err(
+                "Failed to extract DLManagedTensor from PyCapsule",
+            ));
+        }
+        ptr
+    };
+
+    // SAFETY: managed_ptr is non-null and was returned by 
PyCapsule_GetPointer for a valid
+    // "dltensor" capsule, so it points to a valid DLManagedTensor. The 
capsule (and thus
+    // the tensor) is held by the caller for the duration of this function. We 
read fields
+    // and create slices from shape/strides only when non-null and ndim is 
valid.
+    unsafe {
+        let dl_tensor = &(*managed_ptr).dl_tensor;
+
+        if dl_tensor.data.is_null() {
+            return Err(PyRuntimeError::new_err(
+                "DLPack tensor has null data pointer",
+            ));
+        }
+
+        if dl_tensor.device.device_type != DLDeviceType::kDLCUDA {
+            return Err(PyRuntimeError::new_err(
+                "DLPack tensor must be on CUDA device",
+            ));
+        }
+
+        if dl_tensor.dtype.code != DL_FLOAT
+            || dl_tensor.dtype.bits != 64
+            || dl_tensor.dtype.lanes != 1
+        {
+            return Err(PyRuntimeError::new_err(format!(
+                "DLPack tensor must be float64 (code={}, bits={}, lanes={})",
+                dl_tensor.dtype.code, dl_tensor.dtype.bits, 
dl_tensor.dtype.lanes
+            )));
+        }
+
+        if !dl_tensor
+            .byte_offset
+            .is_multiple_of(std::mem::size_of::<f64>() as u64)
+        {
+            return Err(PyRuntimeError::new_err(
+                "DLPack tensor byte_offset is not aligned for float64",
+            ));
+        }
+
+        let data_ptr =
+            (dl_tensor.data as *const u8).add(dl_tensor.byte_offset as usize) 
as *const f64;
+
+        let ndim = dl_tensor.ndim as usize;
+        // SAFETY: shape pointer is valid for ndim elements when non-null 
(DLPack contract).
+        let shape = if ndim > 0 && !dl_tensor.shape.is_null() {
+            std::slice::from_raw_parts(dl_tensor.shape, ndim).to_vec()
+        } else {
+            vec![]
+        };
+
+        if ndim == 0 || shape.is_empty() {
+            return Err(PyRuntimeError::new_err(
+                "DLPack tensor must have at least 1 dimension",
+            ));
+        }
+
+        if !dl_tensor.strides.is_null() {
+            // SAFETY: strides pointer is valid for ndim elements (DLPack 
contract).
+            let strides = std::slice::from_raw_parts(dl_tensor.strides, ndim);
+            match ndim {
+                1 => {
+                    let expected = 1_i64;
+                    if strides[0] != expected {
+                        return Err(PyRuntimeError::new_err(format!(
+                            "DLPack tensor must be contiguous: stride[0]={}, 
expected {}",
+                            strides[0], expected
+                        )));
+                    }
+                }
+                2 => {
+                    if shape.len() < 2 {
+                        return Err(PyRuntimeError::new_err(
+                            "DLPack tensor must be contiguous (shape len < 2)",
+                        ));
+                    }
+                    let expected_stride_1 = 1_i64;
+                    let expected_stride_0 = shape[1];
+                    if strides[1] != expected_stride_1 || strides[0] != 
expected_stride_0 {
+                        return Err(PyRuntimeError::new_err(format!(
+                            "DLPack tensor must be contiguous: strides=[{}, 
{}], expected [{}, {}] (expected[1]=shape[1])",
+                            strides[0], strides[1], expected_stride_0, 
expected_stride_1
+                        )));
+                    }
+                }
+                _ => {
+                    return Err(PyRuntimeError::new_err(
+                        "DLPack tensor must be 1D or 2D for encoding",
+                    ));
+                }
+            }
+        }
+
+        let device_id = dl_tensor.device.device_id;
+
+        const USED_DLTENSOR_NAME: &[u8] = b"used_dltensor\0";
+        // SAFETY: capsule is the same PyCapsule we used above; renaming is 
allowed and does not free it.
+        ffi::PyCapsule_SetName(capsule.as_ptr(), USED_DLTENSOR_NAME.as_ptr() 
as *const i8);
+
+        Ok(DLPackTensorInfo {
+            managed_ptr,
+            data_ptr: data_ptr as *const std::ffi::c_void,
+            shape,
+            device_id,
+        })
+    }
+}
diff --git a/qdp/qdp-python/src/lib.rs b/qdp/qdp-python/src/engine.rs
similarity index 51%
copy from qdp/qdp-python/src/lib.rs
copy to qdp/qdp-python/src/engine.rs
index 12335cb72..29f56909a 100644
--- a/qdp/qdp-python/src/lib.rs
+++ b/qdp/qdp-python/src/engine.rs
@@ -14,555 +14,26 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+use crate::dlpack::extract_dlpack_tensor;
+use crate::pytorch::{
+    extract_cuda_tensor_info, get_tensor_device_id, get_torch_cuda_stream_ptr, 
is_cuda_tensor,
+    is_pytorch_tensor, validate_cuda_tensor_for_encoding, validate_shape, 
validate_tensor,
+};
+use crate::tensor::QuantumTensor;
 use numpy::{PyReadonlyArray1, PyReadonlyArray2, PyUntypedArrayMethods};
 use pyo3::exceptions::PyRuntimeError;
-use pyo3::ffi;
 use pyo3::prelude::*;
-use qdp_core::dlpack::{DL_FLOAT, DLDeviceType, DLManagedTensor};
 use qdp_core::{Precision, QdpEngine as CoreEngine};
-use std::ffi::c_void;
 
-/// Quantum tensor wrapper implementing DLPack protocol
-///
-/// This class wraps a GPU-allocated quantum state vector and implements
-/// the DLPack protocol for zero-copy integration with PyTorch and other
-/// array libraries.
-///
-/// Example:
-///     >>> engine = QdpEngine(device_id=0)
-///     >>> qtensor = engine.encode([1.0, 2.0, 3.0], num_qubits=2, 
encoding_method="amplitude")
-///     >>> torch_tensor = torch.from_dlpack(qtensor)
-#[pyclass]
-struct QuantumTensor {
-    ptr: *mut DLManagedTensor,
-    consumed: bool,
-}
-
-#[pymethods]
-impl QuantumTensor {
-    /// Implements DLPack protocol - returns PyCapsule for PyTorch
-    ///
-    /// This method is called by torch.from_dlpack() to get the GPU memory 
pointer.
-    /// The capsule can only be consumed once to prevent double-free errors.
-    ///
-    /// Args:
-    ///     stream: Optional CUDA stream (DLPack 0.8+; 1=legacy default, 
2=per-thread default)
-    ///
-    /// Returns:
-    ///     PyCapsule containing DLManagedTensor pointer
-    ///
-    /// Raises:
-    ///     RuntimeError: If the tensor has already been consumed
-    #[pyo3(signature = (stream=None))]
-    fn __dlpack__<'py>(&mut self, py: Python<'py>, stream: Option<i64>) -> 
PyResult<Py<PyAny>> {
-        if self.consumed {
-            return Err(PyRuntimeError::new_err(
-                "DLPack tensor already consumed (can only be used once)",
-            ));
-        }
-
-        if self.ptr.is_null() {
-            return Err(PyRuntimeError::new_err("Invalid DLPack tensor 
pointer"));
-        }
-
-        if let Some(stream) = stream
-            && stream > 0
-        {
-            let stream_ptr = qdp_core::dlpack::dlpack_stream_to_cuda(stream);
-            unsafe {
-                qdp_core::dlpack::synchronize_stream(stream_ptr).map_err(|e| {
-                    PyRuntimeError::new_err(format!("CUDA stream sync failed: 
{}", e))
-                })?;
-            }
-        }
-
-        // Mark as consumed to prevent double-free
-        self.consumed = true;
-
-        // Create PyCapsule using FFI
-        // PyTorch will call the deleter stored in DLManagedTensor.deleter
-        // Use a static C string for the capsule name to avoid lifetime issues
-        const DLTENSOR_NAME: &[u8] = b"dltensor\0";
-
-        unsafe {
-            // Create PyCapsule without a destructor
-            // PyTorch will manually call the deleter from DLManagedTensor
-            let capsule_ptr = ffi::PyCapsule_New(
-                self.ptr as *mut std::ffi::c_void,
-                DLTENSOR_NAME.as_ptr() as *const i8,
-                None, // No destructor - PyTorch handles it
-            );
-
-            if capsule_ptr.is_null() {
-                return Err(PyRuntimeError::new_err("Failed to create 
PyCapsule"));
-            }
-
-            Ok(Py::from_owned_ptr(py, capsule_ptr))
-        }
-    }
-
-    /// Returns DLPack device information
-    ///
-    /// Returns:
-    ///     Tuple of (device_type, device_id) where device_type=2 for CUDA
-    fn __dlpack_device__(&self) -> PyResult<(i32, i32)> {
-        if self.ptr.is_null() {
-            return Err(PyRuntimeError::new_err("Invalid DLPack tensor 
pointer"));
-        }
-
-        unsafe {
-            let tensor = &(*self.ptr).dl_tensor;
-            // DLPack device_type: kDLCUDA = 2, kDLCPU = 1
-            let device_type = match tensor.device.device_type {
-                qdp_core::dlpack::DLDeviceType::kDLCUDA => 2,
-                qdp_core::dlpack::DLDeviceType::kDLCPU => 1,
-            };
-            // Read device_id from DLPack tensor metadata
-            Ok((device_type, tensor.device.device_id))
-        }
-    }
-}
-
-impl Drop for QuantumTensor {
-    fn drop(&mut self) {
-        // Only free if not consumed by __dlpack__
-        // If consumed, PyTorch/consumer will call the deleter
-        if !self.consumed && !self.ptr.is_null() {
-            unsafe {
-                // Defensive check: qdp-core always provides a deleter
-                debug_assert!(
-                    (*self.ptr).deleter.is_some(),
-                    "DLManagedTensor from qdp-core should always have a 
deleter"
-                );
-
-                // Call the DLPack deleter to free memory
-                if let Some(deleter) = (*self.ptr).deleter {
-                    deleter(self.ptr);
-                }
-            }
-        }
-    }
-}
-
-// Safety: QuantumTensor can be sent between threads
-// The DLManagedTensor pointer management is thread-safe via Arc in the deleter
-unsafe impl Send for QuantumTensor {}
-unsafe impl Sync for QuantumTensor {}
-
-/// Helper to detect PyTorch tensor
-fn is_pytorch_tensor(obj: &Bound<'_, PyAny>) -> PyResult<bool> {
-    let type_obj = obj.get_type();
-    let name = type_obj.name()?;
-    if name != "Tensor" {
-        return Ok(false);
-    }
-    let module = type_obj.module()?;
-    let module_name = module.to_str()?;
-    Ok(module_name == "torch")
-}
-
-/// Helper to validate CPU tensor
-fn validate_tensor(tensor: &Bound<'_, PyAny>) -> PyResult<()> {
-    if !is_pytorch_tensor(tensor)? {
-        return Err(PyRuntimeError::new_err("Object is not a PyTorch Tensor"));
-    }
-
-    let device = tensor.getattr("device")?;
-    let device_type: String = device.getattr("type")?.extract()?;
-
-    if device_type != "cpu" {
-        return Err(PyRuntimeError::new_err(format!(
-            "Only CPU tensors are currently supported for this path. Got 
device: {}",
-            device_type
-        )));
-    }
-
-    Ok(())
-}
-
-/// Check if a PyTorch tensor is on a CUDA device
-fn is_cuda_tensor(tensor: &Bound<'_, PyAny>) -> PyResult<bool> {
-    let device = tensor.getattr("device")?;
-    let device_type: String = device.getattr("type")?.extract()?;
-    Ok(device_type == "cuda")
-}
-
-/// Validate array/tensor shape (must be 1D or 2D)
-///
-/// Args:
-///     ndim: Number of dimensions
-///     context: Context string for error message (e.g., "array", "tensor", 
"CUDA tensor")
-///
-/// Returns:
-///     Ok(()) if shape is valid (1D or 2D), otherwise returns an error
-fn validate_shape(ndim: usize, context: &str) -> PyResult<()> {
-    match ndim {
-        1 | 2 => Ok(()),
-        _ => {
-            let item_type = if context.contains("array") {
-                "array"
-            } else {
-                "tensor"
-            };
-            Err(PyRuntimeError::new_err(format!(
-                "Unsupported {} shape: {}D. Expected 1D {} for single sample \
-                 encoding or 2D {} (batch_size, features) for batch encoding.",
-                context, ndim, item_type, item_type
-            )))
-        }
-    }
-}
-
-/// Get the CUDA device index from a PyTorch tensor
-fn get_tensor_device_id(tensor: &Bound<'_, PyAny>) -> PyResult<i32> {
-    let device = tensor.getattr("device")?;
-    let device_index: i32 = device.getattr("index")?.extract()?;
-    Ok(device_index)
-}
-
-/// Get the current CUDA stream pointer for the tensor's device.
-fn get_torch_cuda_stream_ptr(tensor: &Bound<'_, PyAny>) -> PyResult<*mut 
c_void> {
-    let py = tensor.py();
-    let torch = PyModule::import(py, "torch")
-        .map_err(|_| PyRuntimeError::new_err("Failed to import torch 
module"))?;
-    let cuda = torch.getattr("cuda")?;
-    let device = tensor.getattr("device")?;
-    let stream = cuda.call_method1("current_stream", (device,))?;
-
-    // Defensive validation: ensure the stream is a CUDA stream on the same 
device
-    let stream_device = stream.getattr("device").map_err(|_| {
-        PyRuntimeError::new_err("CUDA stream object from PyTorch is missing 
'device' attribute")
-    })?;
-    let stream_device_type: String = stream_device
-        .getattr("type")
-        .and_then(|obj| obj.extract())
-        .map_err(|_| {
-            PyRuntimeError::new_err(
-                "Failed to extract CUDA stream device type from PyTorch 
stream.device",
-            )
-        })?;
-    if stream_device_type != "cuda" {
-        return Err(PyRuntimeError::new_err(format!(
-            "Expected CUDA stream device type 'cuda', got '{}'",
-            stream_device_type
-        )));
-    }
-
-    let stream_device_index: i32 = stream_device
-        .getattr("index")
-        .and_then(|obj| obj.extract())
-        .map_err(|_| {
-            PyRuntimeError::new_err(
-                "Failed to extract CUDA stream device index from PyTorch 
stream.device",
-            )
-        })?;
-    let tensor_device_index = get_tensor_device_id(tensor)?;
-    if stream_device_index != tensor_device_index {
-        return Err(PyRuntimeError::new_err(format!(
-            "CUDA stream device index ({}) does not match tensor device index 
({})",
-            stream_device_index, tensor_device_index
-        )));
-    }
-
-    let stream_ptr: u64 = stream.getattr("cuda_stream")?.extract()?;
-    Ok(if stream_ptr == 0 {
-        std::ptr::null_mut()
-    } else {
-        stream_ptr as *mut c_void
-    })
-}
-
-/// Validate a CUDA tensor for direct GPU encoding
-/// Checks: dtype matches encoding method, contiguous, non-empty, device_id 
matches engine
-fn validate_cuda_tensor_for_encoding(
-    tensor: &Bound<'_, PyAny>,
-    expected_device_id: usize,
-    encoding_method: &str,
-) -> PyResult<()> {
-    let method = encoding_method.to_ascii_lowercase();
-
-    // Check encoding method support and dtype (ASCII lowercase for 
case-insensitive match).
-    let dtype = tensor.getattr("dtype")?;
-    let dtype_str: String = dtype.str()?.extract()?;
-    let dtype_str_lower = dtype_str.to_ascii_lowercase();
-    match method.as_str() {
-        "amplitude" => {
-            if !(dtype_str_lower.contains("float64") || 
dtype_str_lower.contains("float32")) {
-                return Err(PyRuntimeError::new_err(format!(
-                    "CUDA tensor must have dtype float64 or float32 for 
amplitude encoding, got {}. \
-                     Use tensor.to(torch.float64) or tensor.to(torch.float32)",
-                    dtype_str
-                )));
-            }
-        }
-        "angle" => {
-            if !dtype_str_lower.contains("float64") {
-                return Err(PyRuntimeError::new_err(format!(
-                    "CUDA tensor must have dtype float64 for {} encoding, got 
{}. \
-                     Use tensor.to(torch.float64)",
-                    method, dtype_str
-                )));
-            }
-        }
-        "basis" => {
-            if !dtype_str_lower.contains("int64") {
-                return Err(PyRuntimeError::new_err(format!(
-                    "CUDA tensor must have dtype int64 for basis encoding, got 
{}. \
-                     Use tensor.to(torch.int64)",
-                    dtype_str
-                )));
-            }
-        }
-        _ => {
-            return Err(PyRuntimeError::new_err(format!(
-                "CUDA tensor encoding currently only supports 'amplitude', 
'angle', or 'basis' methods, got '{}'. \
-                 Use tensor.cpu() to convert to CPU tensor for other encoding 
methods.",
-                encoding_method
-            )));
-        }
-    }
-
-    // Check contiguous
-    let is_contiguous: bool = tensor.call_method0("is_contiguous")?.extract()?;
-    if !is_contiguous {
-        return Err(PyRuntimeError::new_err(
-            "CUDA tensor must be contiguous. Use tensor.contiguous()",
-        ));
-    }
-
-    // Check non-empty
-    let numel: usize = tensor.call_method0("numel")?.extract()?;
-    if numel == 0 {
-        return Err(PyRuntimeError::new_err("CUDA tensor cannot be empty"));
-    }
-
-    // Check device matches engine
-    let tensor_device_id = get_tensor_device_id(tensor)?;
-    if tensor_device_id as usize != expected_device_id {
-        return Err(PyRuntimeError::new_err(format!(
-            "Device mismatch: tensor is on cuda:{}, but engine is on cuda:{}. \
-             Move tensor with tensor.to('cuda:{}')",
-            tensor_device_id, expected_device_id, expected_device_id
-        )));
-    }
-
-    Ok(())
-}
-
-/// Minimal CUDA tensor metadata extracted via PyTorch APIs.
-struct CudaTensorInfo {
-    data_ptr: *const f64,
-    shape: Vec<i64>,
-}
-
-/// Extract GPU pointer and shape directly from a PyTorch CUDA tensor.
-///
-/// # Safety
-/// The returned pointer is borrowed from the source tensor. The caller must
-/// ensure the tensor remains alive and unmodified for the duration of use.
-fn extract_cuda_tensor_info(tensor: &Bound<'_, PyAny>) -> 
PyResult<CudaTensorInfo> {
-    let data_ptr: u64 = tensor.call_method0("data_ptr")?.extract()?;
-    if data_ptr == 0 {
-        return Err(PyRuntimeError::new_err(
-            "PyTorch returned a null data pointer for CUDA tensor",
-        ));
-    }
-
-    let ndim: usize = tensor.call_method0("dim")?.extract()?;
-    let mut shape = Vec::with_capacity(ndim);
-    for axis in 0..ndim {
-        let dim: i64 = tensor.call_method1("size", (axis,))?.extract()?;
-        shape.push(dim);
-    }
-
-    Ok(CudaTensorInfo {
-        data_ptr: data_ptr as *const f64,
-        shape,
-    })
-}
-
-/// DLPack tensor information extracted from a PyCapsule
-///
-/// This struct owns the DLManagedTensor pointer and ensures proper cleanup
-/// via the DLPack deleter when dropped (RAII pattern).
-struct DLPackTensorInfo {
-    /// Raw DLManagedTensor pointer from PyTorch DLPack capsule
-    /// This is owned by this struct and will be freed via deleter on drop
-    managed_ptr: *mut DLManagedTensor,
-    /// Data pointer inside dl_tensor (GPU memory, owned by managed_ptr)
-    data_ptr: *const c_void,
-    shape: Vec<i64>,
-    /// CUDA device ID from DLPack metadata.
-    /// Used for defensive validation against PyTorch API device ID.
-    device_id: i32,
-}
-
-impl Drop for DLPackTensorInfo {
-    fn drop(&mut self) {
-        unsafe {
-            if !self.managed_ptr.is_null() {
-                // Per DLPack protocol: consumer must call deleter exactly once
-                if let Some(deleter) = (*self.managed_ptr).deleter {
-                    deleter(self.managed_ptr);
-                }
-                // Prevent double-free
-                self.managed_ptr = std::ptr::null_mut();
-            }
-        }
-    }
-}
-
-/// Extract GPU pointer from PyTorch tensor's __dlpack__() capsule
-///
-/// Uses the DLPack protocol to obtain a zero-copy view of the tensor's GPU 
memory.
-/// The returned `DLPackTensorInfo` owns the DLManagedTensor and will 
automatically
-/// call the deleter when dropped, ensuring proper resource cleanup.
-///
-/// # Safety
-/// The returned `data_ptr` points to GPU memory owned by the source tensor.
-/// The caller must ensure the source tensor remains alive and unmodified
-/// for the entire duration that `data_ptr` is in use. Python's GIL ensures
-/// the tensor won't be garbage collected during `encode()`, but the caller
-/// must not deallocate or resize the tensor while encoding is in progress.
-fn extract_dlpack_tensor(_py: Python<'_>, tensor: &Bound<'_, PyAny>) -> 
PyResult<DLPackTensorInfo> {
-    // Call tensor.__dlpack__() to get PyCapsule
-    // Note: PyTorch's __dlpack__ uses the default stream when called without 
arguments
-    let capsule = tensor.call_method0("__dlpack__")?;
-
-    const DLTENSOR_NAME: &[u8] = b"dltensor\0";
-
-    // SAFETY: capsule is a valid PyCapsule from tensor.__dlpack__(). 
DLTENSOR_NAME is a
-    // null-terminated C string for the lifetime of the call. We only read the 
capsule
-    // and call PyCapsule_IsValid / PyCapsule_GetPointer; we do not invalidate 
the capsule.
-    let managed_ptr = unsafe {
-        let capsule_ptr = capsule.as_ptr();
-        if ffi::PyCapsule_IsValid(capsule_ptr, DLTENSOR_NAME.as_ptr() as 
*const i8) == 0 {
-            return Err(PyRuntimeError::new_err(
-                "Invalid DLPack capsule (expected 'dltensor')",
-            ));
-        }
-        let ptr = ffi::PyCapsule_GetPointer(capsule_ptr, 
DLTENSOR_NAME.as_ptr() as *const i8)
-            as *mut DLManagedTensor;
-        if ptr.is_null() {
-            return Err(PyRuntimeError::new_err(
-                "Failed to extract DLManagedTensor from PyCapsule",
-            ));
-        }
-        ptr
-    };
-
-    // SAFETY: managed_ptr is non-null and was returned by 
PyCapsule_GetPointer for a valid
-    // "dltensor" capsule, so it points to a valid DLManagedTensor. The 
capsule (and thus
-    // the tensor) is held by the caller for the duration of this function. We 
read fields
-    // and create slices from shape/strides only when non-null and ndim is 
valid.
-    unsafe {
-        let dl_tensor = &(*managed_ptr).dl_tensor;
-
-        if dl_tensor.data.is_null() {
-            return Err(PyRuntimeError::new_err(
-                "DLPack tensor has null data pointer",
-            ));
-        }
-
-        if dl_tensor.device.device_type != DLDeviceType::kDLCUDA {
-            return Err(PyRuntimeError::new_err(
-                "DLPack tensor must be on CUDA device",
-            ));
-        }
-
-        if dl_tensor.dtype.code != DL_FLOAT
-            || dl_tensor.dtype.bits != 64
-            || dl_tensor.dtype.lanes != 1
-        {
-            return Err(PyRuntimeError::new_err(format!(
-                "DLPack tensor must be float64 (code={}, bits={}, lanes={})",
-                dl_tensor.dtype.code, dl_tensor.dtype.bits, 
dl_tensor.dtype.lanes
-            )));
-        }
-
-        if !dl_tensor
-            .byte_offset
-            .is_multiple_of(std::mem::size_of::<f64>() as u64)
-        {
-            return Err(PyRuntimeError::new_err(
-                "DLPack tensor byte_offset is not aligned for float64",
-            ));
-        }
-
-        let data_ptr =
-            (dl_tensor.data as *const u8).add(dl_tensor.byte_offset as usize) 
as *const f64;
-
-        let ndim = dl_tensor.ndim as usize;
-        // SAFETY: shape pointer is valid for ndim elements when non-null 
(DLPack contract).
-        let shape = if ndim > 0 && !dl_tensor.shape.is_null() {
-            std::slice::from_raw_parts(dl_tensor.shape, ndim).to_vec()
-        } else {
-            vec![]
-        };
-
-        if ndim == 0 || shape.is_empty() {
-            return Err(PyRuntimeError::new_err(
-                "DLPack tensor must have at least 1 dimension",
-            ));
-        }
-
-        if !dl_tensor.strides.is_null() {
-            // SAFETY: strides pointer is valid for ndim elements (DLPack 
contract).
-            let strides = std::slice::from_raw_parts(dl_tensor.strides, ndim);
-            match ndim {
-                1 => {
-                    let expected = 1_i64;
-                    if strides[0] != expected {
-                        return Err(PyRuntimeError::new_err(format!(
-                            "DLPack tensor must be contiguous: stride[0]={}, 
expected {}",
-                            strides[0], expected
-                        )));
-                    }
-                }
-                2 => {
-                    if shape.len() < 2 {
-                        return Err(PyRuntimeError::new_err(
-                            "DLPack tensor must be contiguous (shape len < 2)",
-                        ));
-                    }
-                    let expected_stride_1 = 1_i64;
-                    let expected_stride_0 = shape[1];
-                    if strides[1] != expected_stride_1 || strides[0] != 
expected_stride_0 {
-                        return Err(PyRuntimeError::new_err(format!(
-                            "DLPack tensor must be contiguous: strides=[{}, 
{}], expected [{}, {}] (expected[1]=shape[1])",
-                            strides[0], strides[1], expected_stride_0, 
expected_stride_1
-                        )));
-                    }
-                }
-                _ => {
-                    return Err(PyRuntimeError::new_err(
-                        "DLPack tensor must be 1D or 2D for encoding",
-                    ));
-                }
-            }
-        }
-
-        let device_id = dl_tensor.device.device_id;
-
-        const USED_DLTENSOR_NAME: &[u8] = b"used_dltensor\0";
-        // SAFETY: capsule is the same PyCapsule we used above; renaming is 
allowed and does not free it.
-        ffi::PyCapsule_SetName(capsule.as_ptr(), USED_DLTENSOR_NAME.as_ptr() 
as *const i8);
-
-        Ok(DLPackTensorInfo {
-            managed_ptr,
-            data_ptr: data_ptr as *const std::ffi::c_void,
-            shape,
-            device_id,
-        })
-    }
-}
+#[cfg(target_os = "linux")]
+use crate::loader::{PyQuantumLoader, config_from_args, path_from_py};
 
 /// PyO3 wrapper for QdpEngine
 ///
 /// Provides Python bindings for GPU-accelerated quantum state encoding.
 #[pyclass]
-struct QdpEngine {
-    engine: CoreEngine,
+pub struct QdpEngine {
+    pub engine: CoreEngine,
 }
 
 #[pymethods]
@@ -990,143 +461,28 @@ impl QdpEngine {
         })
     }
 
-    // --- Loader factory methods (Linux only) ---
-    #[cfg(target_os = "linux")]
-    /// Create a synthetic-data pipeline iterator (for 
QuantumDataLoader.source_synthetic()).
-    #[pyo3(signature = (total_batches, batch_size, num_qubits, 
encoding_method, seed=None))]
-    fn create_synthetic_loader(
-        &self,
-        total_batches: usize,
-        batch_size: usize,
-        num_qubits: u32,
-        encoding_method: &str,
-        seed: Option<u64>,
-    ) -> PyResult<PyQuantumLoader> {
-        let config = config_from_args(
-            &self.engine,
-            batch_size,
-            num_qubits,
-            encoding_method,
-            total_batches,
-            seed,
-        );
-        let iter = 
qdp_core::PipelineIterator::new_synthetic(self.engine.clone(), config).map_err(
-            |e| PyRuntimeError::new_err(format!("create_synthetic_loader 
failed: {}", e)),
-        )?;
-        Ok(PyQuantumLoader::new(Some(iter)))
-    }
-
-    #[cfg(target_os = "linux")]
-    /// Create a file-backed pipeline iterator (full read then batch; for 
QuantumDataLoader.source_file(path)).
-    #[pyo3(signature = (path, batch_size, num_qubits, encoding_method, 
batch_limit=None))]
-    fn create_file_loader(
-        &self,
-        py: Python<'_>,
-        path: &Bound<'_, PyAny>,
-        batch_size: usize,
-        num_qubits: u32,
-        encoding_method: &str,
-        batch_limit: Option<usize>,
-    ) -> PyResult<PyQuantumLoader> {
-        let path_str = path_from_py(path)?;
-        let batch_limit = batch_limit.unwrap_or(usize::MAX);
-        let config = config_from_args(
-            &self.engine,
-            batch_size,
-            num_qubits,
-            encoding_method,
-            0,
-            None,
-        );
-        let engine = self.engine.clone();
-        let iter = py
-            .detach(|| {
-                qdp_core::PipelineIterator::new_from_file(
-                    engine,
-                    path_str.as_str(),
-                    config,
-                    batch_limit,
-                )
-            })
-            .map_err(|e| PyRuntimeError::new_err(format!("create_file_loader 
failed: {}", e)))?;
-        Ok(PyQuantumLoader::new(Some(iter)))
-    }
-
-    #[cfg(target_os = "linux")]
-    /// Create a streaming Parquet pipeline iterator (for 
QuantumDataLoader.source_file(path, streaming=True)).
-    #[pyo3(signature = (path, batch_size, num_qubits, encoding_method, 
batch_limit=None))]
-    fn create_streaming_file_loader(
-        &self,
-        py: Python<'_>,
-        path: &Bound<'_, PyAny>,
-        batch_size: usize,
-        num_qubits: u32,
-        encoding_method: &str,
-        batch_limit: Option<usize>,
-    ) -> PyResult<PyQuantumLoader> {
-        let path_str = path_from_py(path)?;
-        let batch_limit = batch_limit.unwrap_or(usize::MAX);
-        let config = config_from_args(
-            &self.engine,
-            batch_size,
-            num_qubits,
-            encoding_method,
-            0,
-            None,
-        );
-        let engine = self.engine.clone();
-        let iter = py
-            .detach(|| {
-                qdp_core::PipelineIterator::new_from_file_streaming(
-                    engine,
-                    path_str.as_str(),
-                    config,
-                    batch_limit,
-                )
-            })
-            .map_err(|e| {
-                PyRuntimeError::new_err(format!("create_streaming_file_loader 
failed: {}", e))
-            })?;
-        Ok(PyQuantumLoader::new(Some(iter)))
-    }
-
     /// Encode directly from a PyTorch CUDA tensor. Internal helper.
     ///
     /// Dispatches to the core f32 GPU pointer API for 1D float32 amplitude 
encoding,
     /// or to the float64/basis GPU pointer APIs for other dtypes and batch 
encoding.
-    ///
-    /// Args:
-    ///     data: PyTorch CUDA tensor
-    ///     num_qubits: Number of qubits
-    ///     encoding_method: Encoding strategy (currently only "amplitude")
     fn _encode_from_cuda_tensor(
         &self,
         data: &Bound<'_, PyAny>,
         num_qubits: usize,
         encoding_method: &str,
     ) -> PyResult<QuantumTensor> {
-        // Validate CUDA tensor for direct GPU encoding (shape, contiguity, 
device, dtype)
         validate_cuda_tensor_for_encoding(data, 
self.engine.device().ordinal(), encoding_method)?;
 
-        // Determine dtype for dispatch (float32 vs float64, etc.).
         let dtype = data.getattr("dtype")?;
         let dtype_str: String = dtype.str()?.extract()?;
         let dtype_str_lower = dtype_str.to_ascii_lowercase();
         let is_f32 = dtype_str_lower.contains("float32");
         let method = encoding_method.to_ascii_lowercase();
-
-        // Current f32 CUDA path only supports amplitude encoding for 1D 
tensors.
         let ndim: usize = data.call_method0("dim")?.extract()?;
 
         if method.as_str() == "amplitude" && is_f32 {
-            // NOTE: This f32 fast path intentionally bypasses 
`extract_cuda_tensor_info`/DLPack
-            // and uses PyTorch's `data_ptr()`/`numel()` directly, after
-            // `validate_cuda_tensor_for_encoding` has already enforced 
dtype/shape/contiguity/device.
-            // If additional validation is added to `extract_cuda_tensor_info` 
in the future, it must
-            // be mirrored here to keep behavior consistent.
             match ndim {
                 1 => {
-                    // 1D CUDA tensor, float32 amplitude encoding using core 
f32 GPU pointer API.
                     let input_len: usize = 
data.call_method0("numel")?.extract()?;
                     let stream_ptr = get_torch_cuda_stream_ptr(data)?;
                     let data_ptr_u64: u64 = 
data.call_method0("data_ptr")?.extract()?;
@@ -1161,17 +517,12 @@ impl QdpEngine {
                 ))),
             }
         } else {
-            // Existing float64 (and basis/int64) CUDA path using direct GPU 
pointer.
             let tensor_info = extract_cuda_tensor_info(data)?;
             let stream_ptr = get_torch_cuda_stream_ptr(data)?;
 
             match ndim {
                 1 => {
-                    // 1D CUDA tensor: single sample encoding
                     let input_len = tensor_info.shape[0] as usize;
-                    // SAFETY: tensor_info.data_ptr was obtained via PyTorch's 
data_ptr() from a
-                    // valid CUDA tensor. The tensor remains alive during this 
call
-                    // (held by Python's GIL), and we validated 
dtype/contiguity/device above.
                     let ptr = unsafe {
                         self.engine
                             .encode_from_gpu_ptr_with_stream(
@@ -1191,10 +542,8 @@ impl QdpEngine {
                     })
                 }
                 2 => {
-                    // 2D CUDA tensor: batch encoding
                     let num_samples = tensor_info.shape[0] as usize;
                     let sample_size = tensor_info.shape[1] as usize;
-                    // SAFETY: Same as above - pointer from validated PyTorch 
CUDA tensor
                     let ptr = unsafe {
                         self.engine
                             .encode_batch_from_gpu_ptr_with_stream(
@@ -1222,106 +571,104 @@ impl QdpEngine {
             }
         }
     }
-}
-
-// --- Loader bindings (Linux only; qdp-core pipeline types only built on 
Linux) ---
-#[cfg(target_os = "linux")]
-mod loader_bindings {
-    use super::*;
-    use pyo3::exceptions::PyStopIteration;
-    use qdp_core::{PipelineConfig, PipelineIterator};
-
-    /// Rust-backed iterator yielding one QuantumTensor per batch; used by 
QuantumDataLoader.
-    #[pyclass]
-    pub struct PyQuantumLoader {
-        inner: Option<PipelineIterator>,
-    }
-
-    impl PyQuantumLoader {
-        pub fn new(inner: Option<PipelineIterator>) -> Self {
-            Self { inner }
-        }
-    }
-
-    #[pymethods]
-    impl PyQuantumLoader {
-        fn __iter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> {
-            slf
-        }
-
-        fn __next__(mut slf: PyRefMut<'_, Self>) -> PyResult<QuantumTensor> {
-            let mut iter: PipelineIterator = match slf.inner.take() {
-                Some(i) => i,
-                None => return Err(PyStopIteration::new_err("")),
-            };
-            // Call next_batch without releasing GIL (return type *mut 
DLManagedTensor is !Send).
-            let result = iter.next_batch();
-            match result {
-                Ok(Some(ptr)) => {
-                    slf.inner = Some(iter);
-                    Ok(QuantumTensor {
-                        ptr,
-                        consumed: false,
-                    })
-                }
-                Ok(None) => {
-                    // Exhausted; do not put iterator back
-                    Err(PyStopIteration::new_err(""))
-                }
-                Err(e) => {
-                    slf.inner = Some(iter);
-                    Err(PyRuntimeError::new_err(format!(
-                        "Pipeline next_batch failed: {}",
-                        e
-                    )))
-                }
-            }
-        }
-    }
 
-    /// Build PipelineConfig from Python args. device_id is 0 (engine does not 
expose it); iterator uses engine clone with correct device.
-    pub fn config_from_args(
-        _engine: &CoreEngine,
+    // --- Loader factory methods (Linux only) ---
+    #[cfg(target_os = "linux")]
+    /// Create a synthetic-data pipeline iterator (for 
QuantumDataLoader.source_synthetic()).
+    #[pyo3(signature = (total_batches, batch_size, num_qubits, 
encoding_method, seed=None))]
+    fn create_synthetic_loader(
+        &self,
+        total_batches: usize,
         batch_size: usize,
         num_qubits: u32,
         encoding_method: &str,
-        total_batches: usize,
         seed: Option<u64>,
-    ) -> PipelineConfig {
-        PipelineConfig {
-            device_id: 0,
-            num_qubits,
+    ) -> PyResult<PyQuantumLoader> {
+        let config = config_from_args(
+            &self.engine,
             batch_size,
+            num_qubits,
+            encoding_method,
             total_batches,
-            encoding_method: encoding_method.to_string(),
             seed,
-            warmup_batches: 0,
-        }
+        );
+        let iter = 
qdp_core::PipelineIterator::new_synthetic(self.engine.clone(), config).map_err(
+            |e| PyRuntimeError::new_err(format!("create_synthetic_loader 
failed: {}", e)),
+        )?;
+        Ok(PyQuantumLoader::new(Some(iter)))
     }
 
-    /// Resolve path from Python str or pathlib.Path (__fspath__).
-    pub fn path_from_py(path: &Bound<'_, PyAny>) -> PyResult<String> {
-        path.extract::<String>().or_else(|_| {
-            path.call_method0("__fspath__")
-                .and_then(|m| m.extract::<String>())
-        })
+    #[cfg(target_os = "linux")]
+    /// Create a file-backed pipeline iterator (full read then batch; for 
QuantumDataLoader.source_file(path)).
+    #[pyo3(signature = (path, batch_size, num_qubits, encoding_method, 
batch_limit=None))]
+    fn create_file_loader(
+        &self,
+        py: Python<'_>,
+        path: &Bound<'_, PyAny>,
+        batch_size: usize,
+        num_qubits: u32,
+        encoding_method: &str,
+        batch_limit: Option<usize>,
+    ) -> PyResult<PyQuantumLoader> {
+        let path_str = path_from_py(path)?;
+        let batch_limit = batch_limit.unwrap_or(usize::MAX);
+        let config = config_from_args(
+            &self.engine,
+            batch_size,
+            num_qubits,
+            encoding_method,
+            0,
+            None,
+        );
+        let engine = self.engine.clone();
+        let iter = py
+            .detach(|| {
+                qdp_core::PipelineIterator::new_from_file(
+                    engine,
+                    path_str.as_str(),
+                    config,
+                    batch_limit,
+                )
+            })
+            .map_err(|e| PyRuntimeError::new_err(format!("create_file_loader 
failed: {}", e)))?;
+        Ok(PyQuantumLoader::new(Some(iter)))
     }
-}
-
-#[cfg(target_os = "linux")]
-use loader_bindings::{PyQuantumLoader, config_from_args, path_from_py};
-
-/// Quantum Data Plane (QDP) Python module
-///
-/// GPU-accelerated quantum data encoding with DLPack integration.
-#[pymodule]
-fn _qdp(m: &Bound<'_, PyModule>) -> PyResult<()> {
-    // Respect RUST_LOG for Rust log output; try_init() is no-op if already 
initialized.
-    let _ = env_logger::Builder::from_default_env().try_init();
 
-    m.add_class::<QdpEngine>()?;
-    m.add_class::<QuantumTensor>()?;
     #[cfg(target_os = "linux")]
-    m.add_class::<PyQuantumLoader>()?;
-    Ok(())
+    /// Create a streaming Parquet pipeline iterator (for 
QuantumDataLoader.source_file(path, streaming=True)).
+    #[pyo3(signature = (path, batch_size, num_qubits, encoding_method, 
batch_limit=None))]
+    fn create_streaming_file_loader(
+        &self,
+        py: Python<'_>,
+        path: &Bound<'_, PyAny>,
+        batch_size: usize,
+        num_qubits: u32,
+        encoding_method: &str,
+        batch_limit: Option<usize>,
+    ) -> PyResult<PyQuantumLoader> {
+        let path_str = path_from_py(path)?;
+        let batch_limit = batch_limit.unwrap_or(usize::MAX);
+        let config = config_from_args(
+            &self.engine,
+            batch_size,
+            num_qubits,
+            encoding_method,
+            0,
+            None,
+        );
+        let engine = self.engine.clone();
+        let iter = py
+            .detach(|| {
+                qdp_core::PipelineIterator::new_from_file_streaming(
+                    engine,
+                    path_str.as_str(),
+                    config,
+                    batch_limit,
+                )
+            })
+            .map_err(|e| {
+                PyRuntimeError::new_err(format!("create_streaming_file_loader 
failed: {}", e))
+            })?;
+        Ok(PyQuantumLoader::new(Some(iter)))
+    }
 }
diff --git a/qdp/qdp-python/src/lib.rs b/qdp/qdp-python/src/lib.rs
index 12335cb72..34facaedb 100644
--- a/qdp/qdp-python/src/lib.rs
+++ b/qdp/qdp-python/src/lib.rs
@@ -14,1302 +14,18 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use numpy::{PyReadonlyArray1, PyReadonlyArray2, PyUntypedArrayMethods};
-use pyo3::exceptions::PyRuntimeError;
-use pyo3::ffi;
-use pyo3::prelude::*;
-use qdp_core::dlpack::{DL_FLOAT, DLDeviceType, DLManagedTensor};
-use qdp_core::{Precision, QdpEngine as CoreEngine};
-use std::ffi::c_void;
-
-/// Quantum tensor wrapper implementing DLPack protocol
-///
-/// This class wraps a GPU-allocated quantum state vector and implements
-/// the DLPack protocol for zero-copy integration with PyTorch and other
-/// array libraries.
-///
-/// Example:
-///     >>> engine = QdpEngine(device_id=0)
-///     >>> qtensor = engine.encode([1.0, 2.0, 3.0], num_qubits=2, 
encoding_method="amplitude")
-///     >>> torch_tensor = torch.from_dlpack(qtensor)
-#[pyclass]
-struct QuantumTensor {
-    ptr: *mut DLManagedTensor,
-    consumed: bool,
-}
-
-#[pymethods]
-impl QuantumTensor {
-    /// Implements DLPack protocol - returns PyCapsule for PyTorch
-    ///
-    /// This method is called by torch.from_dlpack() to get the GPU memory 
pointer.
-    /// The capsule can only be consumed once to prevent double-free errors.
-    ///
-    /// Args:
-    ///     stream: Optional CUDA stream (DLPack 0.8+; 1=legacy default, 
2=per-thread default)
-    ///
-    /// Returns:
-    ///     PyCapsule containing DLManagedTensor pointer
-    ///
-    /// Raises:
-    ///     RuntimeError: If the tensor has already been consumed
-    #[pyo3(signature = (stream=None))]
-    fn __dlpack__<'py>(&mut self, py: Python<'py>, stream: Option<i64>) -> 
PyResult<Py<PyAny>> {
-        if self.consumed {
-            return Err(PyRuntimeError::new_err(
-                "DLPack tensor already consumed (can only be used once)",
-            ));
-        }
-
-        if self.ptr.is_null() {
-            return Err(PyRuntimeError::new_err("Invalid DLPack tensor 
pointer"));
-        }
-
-        if let Some(stream) = stream
-            && stream > 0
-        {
-            let stream_ptr = qdp_core::dlpack::dlpack_stream_to_cuda(stream);
-            unsafe {
-                qdp_core::dlpack::synchronize_stream(stream_ptr).map_err(|e| {
-                    PyRuntimeError::new_err(format!("CUDA stream sync failed: 
{}", e))
-                })?;
-            }
-        }
-
-        // Mark as consumed to prevent double-free
-        self.consumed = true;
-
-        // Create PyCapsule using FFI
-        // PyTorch will call the deleter stored in DLManagedTensor.deleter
-        // Use a static C string for the capsule name to avoid lifetime issues
-        const DLTENSOR_NAME: &[u8] = b"dltensor\0";
-
-        unsafe {
-            // Create PyCapsule without a destructor
-            // PyTorch will manually call the deleter from DLManagedTensor
-            let capsule_ptr = ffi::PyCapsule_New(
-                self.ptr as *mut std::ffi::c_void,
-                DLTENSOR_NAME.as_ptr() as *const i8,
-                None, // No destructor - PyTorch handles it
-            );
-
-            if capsule_ptr.is_null() {
-                return Err(PyRuntimeError::new_err("Failed to create 
PyCapsule"));
-            }
-
-            Ok(Py::from_owned_ptr(py, capsule_ptr))
-        }
-    }
-
-    /// Returns DLPack device information
-    ///
-    /// Returns:
-    ///     Tuple of (device_type, device_id) where device_type=2 for CUDA
-    fn __dlpack_device__(&self) -> PyResult<(i32, i32)> {
-        if self.ptr.is_null() {
-            return Err(PyRuntimeError::new_err("Invalid DLPack tensor 
pointer"));
-        }
-
-        unsafe {
-            let tensor = &(*self.ptr).dl_tensor;
-            // DLPack device_type: kDLCUDA = 2, kDLCPU = 1
-            let device_type = match tensor.device.device_type {
-                qdp_core::dlpack::DLDeviceType::kDLCUDA => 2,
-                qdp_core::dlpack::DLDeviceType::kDLCPU => 1,
-            };
-            // Read device_id from DLPack tensor metadata
-            Ok((device_type, tensor.device.device_id))
-        }
-    }
-}
+mod dlpack;
+mod engine;
+mod loader;
+mod pytorch;
+mod tensor;
 
-impl Drop for QuantumTensor {
-    fn drop(&mut self) {
-        // Only free if not consumed by __dlpack__
-        // If consumed, PyTorch/consumer will call the deleter
-        if !self.consumed && !self.ptr.is_null() {
-            unsafe {
-                // Defensive check: qdp-core always provides a deleter
-                debug_assert!(
-                    (*self.ptr).deleter.is_some(),
-                    "DLManagedTensor from qdp-core should always have a 
deleter"
-                );
-
-                // Call the DLPack deleter to free memory
-                if let Some(deleter) = (*self.ptr).deleter {
-                    deleter(self.ptr);
-                }
-            }
-        }
-    }
-}
-
-// Safety: QuantumTensor can be sent between threads
-// The DLManagedTensor pointer management is thread-safe via Arc in the deleter
-unsafe impl Send for QuantumTensor {}
-unsafe impl Sync for QuantumTensor {}
-
-/// Helper to detect PyTorch tensor
-fn is_pytorch_tensor(obj: &Bound<'_, PyAny>) -> PyResult<bool> {
-    let type_obj = obj.get_type();
-    let name = type_obj.name()?;
-    if name != "Tensor" {
-        return Ok(false);
-    }
-    let module = type_obj.module()?;
-    let module_name = module.to_str()?;
-    Ok(module_name == "torch")
-}
-
-/// Helper to validate CPU tensor
-fn validate_tensor(tensor: &Bound<'_, PyAny>) -> PyResult<()> {
-    if !is_pytorch_tensor(tensor)? {
-        return Err(PyRuntimeError::new_err("Object is not a PyTorch Tensor"));
-    }
-
-    let device = tensor.getattr("device")?;
-    let device_type: String = device.getattr("type")?.extract()?;
-
-    if device_type != "cpu" {
-        return Err(PyRuntimeError::new_err(format!(
-            "Only CPU tensors are currently supported for this path. Got 
device: {}",
-            device_type
-        )));
-    }
-
-    Ok(())
-}
-
-/// Check if a PyTorch tensor is on a CUDA device
-fn is_cuda_tensor(tensor: &Bound<'_, PyAny>) -> PyResult<bool> {
-    let device = tensor.getattr("device")?;
-    let device_type: String = device.getattr("type")?.extract()?;
-    Ok(device_type == "cuda")
-}
-
-/// Validate array/tensor shape (must be 1D or 2D)
-///
-/// Args:
-///     ndim: Number of dimensions
-///     context: Context string for error message (e.g., "array", "tensor", 
"CUDA tensor")
-///
-/// Returns:
-///     Ok(()) if shape is valid (1D or 2D), otherwise returns an error
-fn validate_shape(ndim: usize, context: &str) -> PyResult<()> {
-    match ndim {
-        1 | 2 => Ok(()),
-        _ => {
-            let item_type = if context.contains("array") {
-                "array"
-            } else {
-                "tensor"
-            };
-            Err(PyRuntimeError::new_err(format!(
-                "Unsupported {} shape: {}D. Expected 1D {} for single sample \
-                 encoding or 2D {} (batch_size, features) for batch encoding.",
-                context, ndim, item_type, item_type
-            )))
-        }
-    }
-}
-
-/// Get the CUDA device index from a PyTorch tensor
-fn get_tensor_device_id(tensor: &Bound<'_, PyAny>) -> PyResult<i32> {
-    let device = tensor.getattr("device")?;
-    let device_index: i32 = device.getattr("index")?.extract()?;
-    Ok(device_index)
-}
-
-/// Get the current CUDA stream pointer for the tensor's device.
-fn get_torch_cuda_stream_ptr(tensor: &Bound<'_, PyAny>) -> PyResult<*mut 
c_void> {
-    let py = tensor.py();
-    let torch = PyModule::import(py, "torch")
-        .map_err(|_| PyRuntimeError::new_err("Failed to import torch 
module"))?;
-    let cuda = torch.getattr("cuda")?;
-    let device = tensor.getattr("device")?;
-    let stream = cuda.call_method1("current_stream", (device,))?;
-
-    // Defensive validation: ensure the stream is a CUDA stream on the same 
device
-    let stream_device = stream.getattr("device").map_err(|_| {
-        PyRuntimeError::new_err("CUDA stream object from PyTorch is missing 
'device' attribute")
-    })?;
-    let stream_device_type: String = stream_device
-        .getattr("type")
-        .and_then(|obj| obj.extract())
-        .map_err(|_| {
-            PyRuntimeError::new_err(
-                "Failed to extract CUDA stream device type from PyTorch 
stream.device",
-            )
-        })?;
-    if stream_device_type != "cuda" {
-        return Err(PyRuntimeError::new_err(format!(
-            "Expected CUDA stream device type 'cuda', got '{}'",
-            stream_device_type
-        )));
-    }
-
-    let stream_device_index: i32 = stream_device
-        .getattr("index")
-        .and_then(|obj| obj.extract())
-        .map_err(|_| {
-            PyRuntimeError::new_err(
-                "Failed to extract CUDA stream device index from PyTorch 
stream.device",
-            )
-        })?;
-    let tensor_device_index = get_tensor_device_id(tensor)?;
-    if stream_device_index != tensor_device_index {
-        return Err(PyRuntimeError::new_err(format!(
-            "CUDA stream device index ({}) does not match tensor device index 
({})",
-            stream_device_index, tensor_device_index
-        )));
-    }
-
-    let stream_ptr: u64 = stream.getattr("cuda_stream")?.extract()?;
-    Ok(if stream_ptr == 0 {
-        std::ptr::null_mut()
-    } else {
-        stream_ptr as *mut c_void
-    })
-}
-
-/// Validate a CUDA tensor for direct GPU encoding
-/// Checks: dtype matches encoding method, contiguous, non-empty, device_id 
matches engine
-fn validate_cuda_tensor_for_encoding(
-    tensor: &Bound<'_, PyAny>,
-    expected_device_id: usize,
-    encoding_method: &str,
-) -> PyResult<()> {
-    let method = encoding_method.to_ascii_lowercase();
-
-    // Check encoding method support and dtype (ASCII lowercase for 
case-insensitive match).
-    let dtype = tensor.getattr("dtype")?;
-    let dtype_str: String = dtype.str()?.extract()?;
-    let dtype_str_lower = dtype_str.to_ascii_lowercase();
-    match method.as_str() {
-        "amplitude" => {
-            if !(dtype_str_lower.contains("float64") || 
dtype_str_lower.contains("float32")) {
-                return Err(PyRuntimeError::new_err(format!(
-                    "CUDA tensor must have dtype float64 or float32 for 
amplitude encoding, got {}. \
-                     Use tensor.to(torch.float64) or tensor.to(torch.float32)",
-                    dtype_str
-                )));
-            }
-        }
-        "angle" => {
-            if !dtype_str_lower.contains("float64") {
-                return Err(PyRuntimeError::new_err(format!(
-                    "CUDA tensor must have dtype float64 for {} encoding, got 
{}. \
-                     Use tensor.to(torch.float64)",
-                    method, dtype_str
-                )));
-            }
-        }
-        "basis" => {
-            if !dtype_str_lower.contains("int64") {
-                return Err(PyRuntimeError::new_err(format!(
-                    "CUDA tensor must have dtype int64 for basis encoding, got 
{}. \
-                     Use tensor.to(torch.int64)",
-                    dtype_str
-                )));
-            }
-        }
-        _ => {
-            return Err(PyRuntimeError::new_err(format!(
-                "CUDA tensor encoding currently only supports 'amplitude', 
'angle', or 'basis' methods, got '{}'. \
-                 Use tensor.cpu() to convert to CPU tensor for other encoding 
methods.",
-                encoding_method
-            )));
-        }
-    }
-
-    // Check contiguous
-    let is_contiguous: bool = tensor.call_method0("is_contiguous")?.extract()?;
-    if !is_contiguous {
-        return Err(PyRuntimeError::new_err(
-            "CUDA tensor must be contiguous. Use tensor.contiguous()",
-        ));
-    }
-
-    // Check non-empty
-    let numel: usize = tensor.call_method0("numel")?.extract()?;
-    if numel == 0 {
-        return Err(PyRuntimeError::new_err("CUDA tensor cannot be empty"));
-    }
-
-    // Check device matches engine
-    let tensor_device_id = get_tensor_device_id(tensor)?;
-    if tensor_device_id as usize != expected_device_id {
-        return Err(PyRuntimeError::new_err(format!(
-            "Device mismatch: tensor is on cuda:{}, but engine is on cuda:{}. \
-             Move tensor with tensor.to('cuda:{}')",
-            tensor_device_id, expected_device_id, expected_device_id
-        )));
-    }
-
-    Ok(())
-}
-
-/// Minimal CUDA tensor metadata extracted via PyTorch APIs.
-struct CudaTensorInfo {
-    data_ptr: *const f64,
-    shape: Vec<i64>,
-}
-
-/// Extract GPU pointer and shape directly from a PyTorch CUDA tensor.
-///
-/// # Safety
-/// The returned pointer is borrowed from the source tensor. The caller must
-/// ensure the tensor remains alive and unmodified for the duration of use.
-fn extract_cuda_tensor_info(tensor: &Bound<'_, PyAny>) -> 
PyResult<CudaTensorInfo> {
-    let data_ptr: u64 = tensor.call_method0("data_ptr")?.extract()?;
-    if data_ptr == 0 {
-        return Err(PyRuntimeError::new_err(
-            "PyTorch returned a null data pointer for CUDA tensor",
-        ));
-    }
-
-    let ndim: usize = tensor.call_method0("dim")?.extract()?;
-    let mut shape = Vec::with_capacity(ndim);
-    for axis in 0..ndim {
-        let dim: i64 = tensor.call_method1("size", (axis,))?.extract()?;
-        shape.push(dim);
-    }
-
-    Ok(CudaTensorInfo {
-        data_ptr: data_ptr as *const f64,
-        shape,
-    })
-}
-
-/// DLPack tensor information extracted from a PyCapsule
-///
-/// This struct owns the DLManagedTensor pointer and ensures proper cleanup
-/// via the DLPack deleter when dropped (RAII pattern).
-struct DLPackTensorInfo {
-    /// Raw DLManagedTensor pointer from PyTorch DLPack capsule
-    /// This is owned by this struct and will be freed via deleter on drop
-    managed_ptr: *mut DLManagedTensor,
-    /// Data pointer inside dl_tensor (GPU memory, owned by managed_ptr)
-    data_ptr: *const c_void,
-    shape: Vec<i64>,
-    /// CUDA device ID from DLPack metadata.
-    /// Used for defensive validation against PyTorch API device ID.
-    device_id: i32,
-}
-
-impl Drop for DLPackTensorInfo {
-    fn drop(&mut self) {
-        unsafe {
-            if !self.managed_ptr.is_null() {
-                // Per DLPack protocol: consumer must call deleter exactly once
-                if let Some(deleter) = (*self.managed_ptr).deleter {
-                    deleter(self.managed_ptr);
-                }
-                // Prevent double-free
-                self.managed_ptr = std::ptr::null_mut();
-            }
-        }
-    }
-}
-
-/// Extract GPU pointer from PyTorch tensor's __dlpack__() capsule
-///
-/// Uses the DLPack protocol to obtain a zero-copy view of the tensor's GPU 
memory.
-/// The returned `DLPackTensorInfo` owns the DLManagedTensor and will 
automatically
-/// call the deleter when dropped, ensuring proper resource cleanup.
-///
-/// # Safety
-/// The returned `data_ptr` points to GPU memory owned by the source tensor.
-/// The caller must ensure the source tensor remains alive and unmodified
-/// for the entire duration that `data_ptr` is in use. Python's GIL ensures
-/// the tensor won't be garbage collected during `encode()`, but the caller
-/// must not deallocate or resize the tensor while encoding is in progress.
-fn extract_dlpack_tensor(_py: Python<'_>, tensor: &Bound<'_, PyAny>) -> 
PyResult<DLPackTensorInfo> {
-    // Call tensor.__dlpack__() to get PyCapsule
-    // Note: PyTorch's __dlpack__ uses the default stream when called without 
arguments
-    let capsule = tensor.call_method0("__dlpack__")?;
-
-    const DLTENSOR_NAME: &[u8] = b"dltensor\0";
-
-    // SAFETY: capsule is a valid PyCapsule from tensor.__dlpack__(). 
DLTENSOR_NAME is a
-    // null-terminated C string for the lifetime of the call. We only read the 
capsule
-    // and call PyCapsule_IsValid / PyCapsule_GetPointer; we do not invalidate 
the capsule.
-    let managed_ptr = unsafe {
-        let capsule_ptr = capsule.as_ptr();
-        if ffi::PyCapsule_IsValid(capsule_ptr, DLTENSOR_NAME.as_ptr() as 
*const i8) == 0 {
-            return Err(PyRuntimeError::new_err(
-                "Invalid DLPack capsule (expected 'dltensor')",
-            ));
-        }
-        let ptr = ffi::PyCapsule_GetPointer(capsule_ptr, 
DLTENSOR_NAME.as_ptr() as *const i8)
-            as *mut DLManagedTensor;
-        if ptr.is_null() {
-            return Err(PyRuntimeError::new_err(
-                "Failed to extract DLManagedTensor from PyCapsule",
-            ));
-        }
-        ptr
-    };
-
-    // SAFETY: managed_ptr is non-null and was returned by 
PyCapsule_GetPointer for a valid
-    // "dltensor" capsule, so it points to a valid DLManagedTensor. The 
capsule (and thus
-    // the tensor) is held by the caller for the duration of this function. We 
read fields
-    // and create slices from shape/strides only when non-null and ndim is 
valid.
-    unsafe {
-        let dl_tensor = &(*managed_ptr).dl_tensor;
-
-        if dl_tensor.data.is_null() {
-            return Err(PyRuntimeError::new_err(
-                "DLPack tensor has null data pointer",
-            ));
-        }
-
-        if dl_tensor.device.device_type != DLDeviceType::kDLCUDA {
-            return Err(PyRuntimeError::new_err(
-                "DLPack tensor must be on CUDA device",
-            ));
-        }
-
-        if dl_tensor.dtype.code != DL_FLOAT
-            || dl_tensor.dtype.bits != 64
-            || dl_tensor.dtype.lanes != 1
-        {
-            return Err(PyRuntimeError::new_err(format!(
-                "DLPack tensor must be float64 (code={}, bits={}, lanes={})",
-                dl_tensor.dtype.code, dl_tensor.dtype.bits, 
dl_tensor.dtype.lanes
-            )));
-        }
-
-        if !dl_tensor
-            .byte_offset
-            .is_multiple_of(std::mem::size_of::<f64>() as u64)
-        {
-            return Err(PyRuntimeError::new_err(
-                "DLPack tensor byte_offset is not aligned for float64",
-            ));
-        }
-
-        let data_ptr =
-            (dl_tensor.data as *const u8).add(dl_tensor.byte_offset as usize) 
as *const f64;
-
-        let ndim = dl_tensor.ndim as usize;
-        // SAFETY: shape pointer is valid for ndim elements when non-null 
(DLPack contract).
-        let shape = if ndim > 0 && !dl_tensor.shape.is_null() {
-            std::slice::from_raw_parts(dl_tensor.shape, ndim).to_vec()
-        } else {
-            vec![]
-        };
-
-        if ndim == 0 || shape.is_empty() {
-            return Err(PyRuntimeError::new_err(
-                "DLPack tensor must have at least 1 dimension",
-            ));
-        }
-
-        if !dl_tensor.strides.is_null() {
-            // SAFETY: strides pointer is valid for ndim elements (DLPack 
contract).
-            let strides = std::slice::from_raw_parts(dl_tensor.strides, ndim);
-            match ndim {
-                1 => {
-                    let expected = 1_i64;
-                    if strides[0] != expected {
-                        return Err(PyRuntimeError::new_err(format!(
-                            "DLPack tensor must be contiguous: stride[0]={}, 
expected {}",
-                            strides[0], expected
-                        )));
-                    }
-                }
-                2 => {
-                    if shape.len() < 2 {
-                        return Err(PyRuntimeError::new_err(
-                            "DLPack tensor must be contiguous (shape len < 2)",
-                        ));
-                    }
-                    let expected_stride_1 = 1_i64;
-                    let expected_stride_0 = shape[1];
-                    if strides[1] != expected_stride_1 || strides[0] != 
expected_stride_0 {
-                        return Err(PyRuntimeError::new_err(format!(
-                            "DLPack tensor must be contiguous: strides=[{}, 
{}], expected [{}, {}] (expected[1]=shape[1])",
-                            strides[0], strides[1], expected_stride_0, 
expected_stride_1
-                        )));
-                    }
-                }
-                _ => {
-                    return Err(PyRuntimeError::new_err(
-                        "DLPack tensor must be 1D or 2D for encoding",
-                    ));
-                }
-            }
-        }
-
-        let device_id = dl_tensor.device.device_id;
-
-        const USED_DLTENSOR_NAME: &[u8] = b"used_dltensor\0";
-        // SAFETY: capsule is the same PyCapsule we used above; renaming is 
allowed and does not free it.
-        ffi::PyCapsule_SetName(capsule.as_ptr(), USED_DLTENSOR_NAME.as_ptr() 
as *const i8);
-
-        Ok(DLPackTensorInfo {
-            managed_ptr,
-            data_ptr: data_ptr as *const std::ffi::c_void,
-            shape,
-            device_id,
-        })
-    }
-}
-
-/// PyO3 wrapper for QdpEngine
-///
-/// Provides Python bindings for GPU-accelerated quantum state encoding.
-#[pyclass]
-struct QdpEngine {
-    engine: CoreEngine,
-}
-
-#[pymethods]
-impl QdpEngine {
-    /// Initialize QDP engine on specified GPU device
-    ///
-    /// Args:
-    ///     device_id: CUDA device ID (typically 0)
-    ///     precision: Output precision ("float32" default, or "float64")
-    ///
-    /// Returns:
-    ///     QdpEngine instance
-    ///
-    /// Raises:
-    ///     RuntimeError: If CUDA device initialization fails
-    #[new]
-    #[pyo3(signature = (device_id=0, precision="float32"))]
-    fn new(device_id: usize, precision: &str) -> PyResult<Self> {
-        let precision = match precision.to_ascii_lowercase().as_str() {
-            "float32" | "f32" | "float" => Precision::Float32,
-            "float64" | "f64" | "double" => Precision::Float64,
-            other => {
-                return Err(PyRuntimeError::new_err(format!(
-                    "Unsupported precision '{}'. Use 'float32' (default) or 
'float64'.",
-                    other
-                )));
-            }
-        };
-
-        let engine = CoreEngine::new_with_precision(device_id, precision)
-            .map_err(|e| PyRuntimeError::new_err(format!("Failed to 
initialize: {}", e)))?;
-        Ok(Self { engine })
-    }
-
-    /// Encode classical data into quantum state (auto-detects input type)
-    ///
-    /// Args:
-    ///     data: Input data - supports:
-    ///         - Python list: [1.0, 2.0, 3.0, 4.0]
-    ///         - NumPy array: 1D (single sample) or 2D (batch) array
-    ///         - PyTorch tensor: CPU tensor (float64 recommended; will be 
copied to GPU)
-    ///         - String path: .parquet, .arrow, .feather, .npy, .pt, .pth, 
.pb file
-    ///         - pathlib.Path: Path object (converted via os.fspath())
-    ///     num_qubits: Number of qubits for encoding
-    ///     encoding_method: Encoding strategy ("amplitude" default, "angle", 
or "basis")
-    ///
-    /// Returns:
-    ///     QuantumTensor: DLPack-compatible tensor for zero-copy PyTorch 
integration
-    ///         Shape: [batch_size, 2^num_qubits]
-    ///
-    /// Example:
-    ///     >>> engine = QdpEngine(0)
-    ///     >>> # From list
-    ///     >>> tensor = engine.encode([1.0, 2.0, 3.0, 4.0], 2)
-    ///     >>> # From NumPy batch
-    ///     >>> tensor = engine.encode(np.random.randn(64, 4), 2)
-    ///     >>> # From file path string
-    ///     >>> tensor = engine.encode("data.parquet", 10)
-    ///     >>> # From pathlib.Path
-    ///     >>> from pathlib import Path
-    ///     >>> tensor = engine.encode(Path("data.npy"), 10)
-    #[pyo3(signature = (data, num_qubits, encoding_method="amplitude"))]
-    fn encode(
-        &self,
-        data: &Bound<'_, PyAny>,
-        num_qubits: usize,
-        encoding_method: &str,
-    ) -> PyResult<QuantumTensor> {
-        // Check if it's a string path
-        if let Ok(path) = data.extract::<String>() {
-            return self.encode_from_file(&path, num_qubits, encoding_method);
-        }
-
-        // Check if it's a pathlib.Path or os.PathLike object (has __fspath__ 
method)
-        if data.hasattr("__fspath__")? {
-            let path: String = data.call_method0("__fspath__")?.extract()?;
-            return self.encode_from_file(&path, num_qubits, encoding_method);
-        }
-
-        // Check if it's a NumPy array
-        if data.hasattr("__array_interface__")? {
-            return self.encode_from_numpy(data, num_qubits, encoding_method);
-        }
-
-        // Check if it's a PyTorch tensor
-        if is_pytorch_tensor(data)? {
-            // Check if it's a CUDA tensor - use zero-copy GPU encoding
-            if is_cuda_tensor(data)? {
-                return self._encode_from_cuda_tensor(data, num_qubits, 
encoding_method);
-            }
-            // CPU PyTorch tensor path
-            return self.encode_from_pytorch(data, num_qubits, encoding_method);
-        }
-
-        // Fallback: try to extract as Vec<f64> (Python list)
-        self.encode_from_list(data, num_qubits, encoding_method)
-    }
-
-    /// Encode from NumPy array (1D or 2D)
-    fn encode_from_numpy(
-        &self,
-        data: &Bound<'_, PyAny>,
-        num_qubits: usize,
-        encoding_method: &str,
-    ) -> PyResult<QuantumTensor> {
-        let ndim: usize = data.getattr("ndim")?.extract()?;
-        validate_shape(ndim, "array")?;
-
-        match ndim {
-            1 => {
-                // 1D array: single sample encoding (zero-copy if already 
contiguous)
-                let array_1d = 
data.extract::<PyReadonlyArray1<f64>>().map_err(|_| {
-                    PyRuntimeError::new_err(
-                        "Failed to extract 1D NumPy array. Ensure dtype is 
float64.",
-                    )
-                })?;
-                let data_slice = array_1d.as_slice().map_err(|_| {
-                    PyRuntimeError::new_err("NumPy array must be contiguous 
(C-order)")
-                })?;
-                let ptr = self
-                    .engine
-                    .encode(data_slice, num_qubits, encoding_method)
-                    .map_err(|e| PyRuntimeError::new_err(format!("Encoding 
failed: {}", e)))?;
-                Ok(QuantumTensor {
-                    ptr,
-                    consumed: false,
-                })
-            }
-            2 => {
-                // 2D array: batch encoding (zero-copy if already contiguous)
-                let array_2d = 
data.extract::<PyReadonlyArray2<f64>>().map_err(|_| {
-                    PyRuntimeError::new_err(
-                        "Failed to extract 2D NumPy array. Ensure dtype is 
float64.",
-                    )
-                })?;
-                let shape = array_2d.shape();
-                let num_samples = shape[0];
-                let sample_size = shape[1];
-                let data_slice = array_2d.as_slice().map_err(|_| {
-                    PyRuntimeError::new_err("NumPy array must be contiguous 
(C-order)")
-                })?;
-                let ptr = self
-                    .engine
-                    .encode_batch(
-                        data_slice,
-                        num_samples,
-                        sample_size,
-                        num_qubits,
-                        encoding_method,
-                    )
-                    .map_err(|e| PyRuntimeError::new_err(format!("Encoding 
failed: {}", e)))?;
-                Ok(QuantumTensor {
-                    ptr,
-                    consumed: false,
-                })
-            }
-            _ => unreachable!("validate_shape() should have caught invalid 
ndim"),
-        }
-    }
-
-    /// Encode from PyTorch tensor (1D or 2D)
-    fn encode_from_pytorch(
-        &self,
-        data: &Bound<'_, PyAny>,
-        num_qubits: usize,
-        encoding_method: &str,
-    ) -> PyResult<QuantumTensor> {
-        // Check if it's a CUDA tensor - use zero-copy GPU encoding via DLPack
-        if is_cuda_tensor(data)? {
-            // Validate CUDA tensor for direct GPU encoding
-            validate_cuda_tensor_for_encoding(
-                data,
-                self.engine.device().ordinal(),
-                encoding_method,
-            )?;
-
-            // Extract GPU pointer via DLPack (RAII wrapper ensures deleter is 
called)
-            let dlpack_info = extract_dlpack_tensor(data.py(), data)?;
-
-            // ensure PyTorch API and DLPack metadata agree on device ID
-            let pytorch_device_id = get_tensor_device_id(data)?;
-            if dlpack_info.device_id != pytorch_device_id {
-                return Err(PyRuntimeError::new_err(format!(
-                    "Device ID mismatch: PyTorch reports device {}, but DLPack 
metadata reports {}. \
-                     This indicates an inconsistency between PyTorch and 
DLPack device information.",
-                    pytorch_device_id, dlpack_info.device_id
-                )));
-            }
-
-            let ndim: usize = data.call_method0("dim")?.extract()?;
-            validate_shape(ndim, "CUDA tensor")?;
-
-            match ndim {
-                1 => {
-                    // 1D CUDA tensor: single sample encoding
-                    let input_len = dlpack_info.shape[0] as usize;
-                    // SAFETY: dlpack_info.data_ptr was validated via DLPack 
protocol from a
-                    // valid PyTorch CUDA tensor. The tensor remains alive 
during this call
-                    // (held by Python's GIL), and we validated 
dtype/contiguity/device above.
-                    // The DLPackTensorInfo RAII wrapper will call deleter 
when dropped.
-                    let ptr = unsafe {
-                        self.engine
-                            .encode_from_gpu_ptr(
-                                dlpack_info.data_ptr,
-                                input_len,
-                                num_qubits,
-                                encoding_method,
-                            )
-                            .map_err(|e| {
-                                PyRuntimeError::new_err(format!("Encoding 
failed: {}", e))
-                            })?
-                    };
-                    return Ok(QuantumTensor {
-                        ptr,
-                        consumed: false,
-                    });
-                }
-                2 => {
-                    // 2D CUDA tensor: batch encoding
-                    let num_samples = dlpack_info.shape[0] as usize;
-                    let sample_size = dlpack_info.shape[1] as usize;
-                    // SAFETY: Same as above - pointer from validated DLPack 
tensor
-                    let ptr = unsafe {
-                        self.engine
-                            .encode_batch_from_gpu_ptr(
-                                dlpack_info.data_ptr,
-                                num_samples,
-                                sample_size,
-                                num_qubits,
-                                encoding_method,
-                            )
-                            .map_err(|e| {
-                                PyRuntimeError::new_err(format!("Encoding 
failed: {}", e))
-                            })?
-                    };
-                    return Ok(QuantumTensor {
-                        ptr,
-                        consumed: false,
-                    });
-                }
-                _ => unreachable!("validate_shape() should have caught invalid 
ndim"),
-            }
-        }
-
-        // CPU tensor path
-        validate_tensor(data)?;
-        // PERF: Avoid Tensor -> Python list -> Vec deep copies.
-        //
-        // For CPU tensors, `tensor.detach().numpy()` returns a NumPy view 
that shares the same
-        // underlying memory (zero-copy) when the tensor is C-contiguous. We 
can then borrow a
-        // `&[f64]` directly via pyo3-numpy.
-        let ndim: usize = data.call_method0("dim")?.extract()?;
-        validate_shape(ndim, "tensor")?;
-        let numpy_view = data
-            .call_method0("detach")?
-            .call_method0("numpy")
-            .map_err(|_| {
-                PyRuntimeError::new_err(
-                    "Failed to convert torch.Tensor to NumPy view. Ensure the 
tensor is on CPU \
-                     and does not require grad (try: tensor = 
tensor.detach().cpu())",
-                )
-            })?;
-
-        match ndim {
-            1 => {
-                // 1D tensor: single sample encoding
-                let array_1d = 
numpy_view.extract::<PyReadonlyArray1<f64>>().map_err(|_| {
-                    PyRuntimeError::new_err(
-                        "Failed to extract NumPy view as float64 array. Ensure 
dtype is float64 \
-                             (try: tensor = tensor.to(torch.float64))",
-                    )
-                })?;
-                let data_slice = array_1d.as_slice().map_err(|_| {
-                    PyRuntimeError::new_err(
-                        "Tensor must be contiguous (C-order) to get zero-copy 
slice \
-                         (try: tensor = tensor.contiguous())",
-                    )
-                })?;
-                let ptr = self
-                    .engine
-                    .encode(data_slice, num_qubits, encoding_method)
-                    .map_err(|e| PyRuntimeError::new_err(format!("Encoding 
failed: {}", e)))?;
-                Ok(QuantumTensor {
-                    ptr,
-                    consumed: false,
-                })
-            }
-            2 => {
-                // 2D tensor: batch encoding
-                let array_2d = 
numpy_view.extract::<PyReadonlyArray2<f64>>().map_err(|_| {
-                    PyRuntimeError::new_err(
-                        "Failed to extract NumPy view as float64 array. Ensure 
dtype is float64 \
-                             (try: tensor = tensor.to(torch.float64))",
-                    )
-                })?;
-                let shape = array_2d.shape();
-                let num_samples = shape[0];
-                let sample_size = shape[1];
-                let data_slice = array_2d.as_slice().map_err(|_| {
-                    PyRuntimeError::new_err(
-                        "Tensor must be contiguous (C-order) to get zero-copy 
slice \
-                         (try: tensor = tensor.contiguous())",
-                    )
-                })?;
-                let ptr = self
-                    .engine
-                    .encode_batch(
-                        data_slice,
-                        num_samples,
-                        sample_size,
-                        num_qubits,
-                        encoding_method,
-                    )
-                    .map_err(|e| PyRuntimeError::new_err(format!("Encoding 
failed: {}", e)))?;
-                Ok(QuantumTensor {
-                    ptr,
-                    consumed: false,
-                })
-            }
-            _ => unreachable!("validate_shape() should have caught invalid 
ndim"),
-        }
-    }
-
-    /// Encode from Python list
-    fn encode_from_list(
-        &self,
-        data: &Bound<'_, PyAny>,
-        num_qubits: usize,
-        encoding_method: &str,
-    ) -> PyResult<QuantumTensor> {
-        let vec_data = data.extract::<Vec<f64>>().map_err(|_| {
-            PyRuntimeError::new_err(
-                "Unsupported data type. Expected: list, NumPy array, PyTorch 
tensor, or file path",
-            )
-        })?;
-        let ptr = self
-            .engine
-            .encode(&vec_data, num_qubits, encoding_method)
-            .map_err(|e| PyRuntimeError::new_err(format!("Encoding failed: 
{}", e)))?;
-        Ok(QuantumTensor {
-            ptr,
-            consumed: false,
-        })
-    }
-
-    /// Internal helper to encode from file based on extension
-    fn encode_from_file(
-        &self,
-        path: &str,
-        num_qubits: usize,
-        encoding_method: &str,
-    ) -> PyResult<QuantumTensor> {
-        let ptr = if path.ends_with(".parquet") {
-            self.engine
-                .encode_from_parquet(path, num_qubits, encoding_method)
-                .map_err(|e| {
-                    PyRuntimeError::new_err(format!("Encoding from parquet 
failed: {}", e))
-                })?
-        } else if path.ends_with(".arrow") || path.ends_with(".feather") {
-            self.engine
-                .encode_from_arrow_ipc(path, num_qubits, encoding_method)
-                .map_err(|e| {
-                    PyRuntimeError::new_err(format!("Encoding from Arrow IPC 
failed: {}", e))
-                })?
-        } else if path.ends_with(".npy") {
-            self.engine
-                .encode_from_numpy(path, num_qubits, encoding_method)
-                .map_err(|e| {
-                    PyRuntimeError::new_err(format!("Encoding from NumPy 
failed: {}", e))
-                })?
-        } else if path.ends_with(".pt") || path.ends_with(".pth") {
-            self.engine
-                .encode_from_torch(path, num_qubits, encoding_method)
-                .map_err(|e| {
-                    PyRuntimeError::new_err(format!("Encoding from PyTorch 
failed: {}", e))
-                })?
-        } else if path.ends_with(".pb") {
-            self.engine
-                .encode_from_tensorflow(path, num_qubits, encoding_method)
-                .map_err(|e| {
-                    PyRuntimeError::new_err(format!("Encoding from TensorFlow 
failed: {}", e))
-                })?
-        } else {
-            return Err(PyRuntimeError::new_err(format!(
-                "Unsupported file format. Expected .parquet, .arrow, .feather, 
.npy, .pt, .pth, or .pb, got: {}",
-                path
-            )));
-        };
-
-        Ok(QuantumTensor {
-            ptr,
-            consumed: false,
-        })
-    }
-
-    /// Encode from TensorFlow TensorProto file
-    ///
-    /// Args:
-    ///     path: Path to TensorProto file (.pb)
-    ///     num_qubits: Number of qubits for encoding
-    ///     encoding_method: Encoding strategy (currently only "amplitude")
-    ///
-    /// Returns:
-    ///     QuantumTensor: DLPack tensor containing all encoded states
-    ///
-    /// Example:
-    ///     >>> engine = QdpEngine(device_id=0)
-    ///     >>> batched = engine.encode_from_tensorflow("data.pb", 16, 
"amplitude")
-    ///     >>> torch_tensor = torch.from_dlpack(batched)  # Shape: [200, 
65536]
-    fn encode_from_tensorflow(
-        &self,
-        path: &str,
-        num_qubits: usize,
-        encoding_method: &str,
-    ) -> PyResult<QuantumTensor> {
-        let ptr = self
-            .engine
-            .encode_from_tensorflow(path, num_qubits, encoding_method)
-            .map_err(|e| {
-                PyRuntimeError::new_err(format!("Encoding from TensorFlow 
failed: {}", e))
-            })?;
-        Ok(QuantumTensor {
-            ptr,
-            consumed: false,
-        })
-    }
-
-    // --- Loader factory methods (Linux only) ---
-    #[cfg(target_os = "linux")]
-    /// Create a synthetic-data pipeline iterator (for 
QuantumDataLoader.source_synthetic()).
-    #[pyo3(signature = (total_batches, batch_size, num_qubits, 
encoding_method, seed=None))]
-    fn create_synthetic_loader(
-        &self,
-        total_batches: usize,
-        batch_size: usize,
-        num_qubits: u32,
-        encoding_method: &str,
-        seed: Option<u64>,
-    ) -> PyResult<PyQuantumLoader> {
-        let config = config_from_args(
-            &self.engine,
-            batch_size,
-            num_qubits,
-            encoding_method,
-            total_batches,
-            seed,
-        );
-        let iter = 
qdp_core::PipelineIterator::new_synthetic(self.engine.clone(), config).map_err(
-            |e| PyRuntimeError::new_err(format!("create_synthetic_loader 
failed: {}", e)),
-        )?;
-        Ok(PyQuantumLoader::new(Some(iter)))
-    }
-
-    #[cfg(target_os = "linux")]
-    /// Create a file-backed pipeline iterator (full read then batch; for 
QuantumDataLoader.source_file(path)).
-    #[pyo3(signature = (path, batch_size, num_qubits, encoding_method, 
batch_limit=None))]
-    fn create_file_loader(
-        &self,
-        py: Python<'_>,
-        path: &Bound<'_, PyAny>,
-        batch_size: usize,
-        num_qubits: u32,
-        encoding_method: &str,
-        batch_limit: Option<usize>,
-    ) -> PyResult<PyQuantumLoader> {
-        let path_str = path_from_py(path)?;
-        let batch_limit = batch_limit.unwrap_or(usize::MAX);
-        let config = config_from_args(
-            &self.engine,
-            batch_size,
-            num_qubits,
-            encoding_method,
-            0,
-            None,
-        );
-        let engine = self.engine.clone();
-        let iter = py
-            .detach(|| {
-                qdp_core::PipelineIterator::new_from_file(
-                    engine,
-                    path_str.as_str(),
-                    config,
-                    batch_limit,
-                )
-            })
-            .map_err(|e| PyRuntimeError::new_err(format!("create_file_loader 
failed: {}", e)))?;
-        Ok(PyQuantumLoader::new(Some(iter)))
-    }
-
-    #[cfg(target_os = "linux")]
-    /// Create a streaming Parquet pipeline iterator (for 
QuantumDataLoader.source_file(path, streaming=True)).
-    #[pyo3(signature = (path, batch_size, num_qubits, encoding_method, 
batch_limit=None))]
-    fn create_streaming_file_loader(
-        &self,
-        py: Python<'_>,
-        path: &Bound<'_, PyAny>,
-        batch_size: usize,
-        num_qubits: u32,
-        encoding_method: &str,
-        batch_limit: Option<usize>,
-    ) -> PyResult<PyQuantumLoader> {
-        let path_str = path_from_py(path)?;
-        let batch_limit = batch_limit.unwrap_or(usize::MAX);
-        let config = config_from_args(
-            &self.engine,
-            batch_size,
-            num_qubits,
-            encoding_method,
-            0,
-            None,
-        );
-        let engine = self.engine.clone();
-        let iter = py
-            .detach(|| {
-                qdp_core::PipelineIterator::new_from_file_streaming(
-                    engine,
-                    path_str.as_str(),
-                    config,
-                    batch_limit,
-                )
-            })
-            .map_err(|e| {
-                PyRuntimeError::new_err(format!("create_streaming_file_loader 
failed: {}", e))
-            })?;
-        Ok(PyQuantumLoader::new(Some(iter)))
-    }
-
-    /// Encode directly from a PyTorch CUDA tensor. Internal helper.
-    ///
-    /// Dispatches to the core f32 GPU pointer API for 1D float32 amplitude 
encoding,
-    /// or to the float64/basis GPU pointer APIs for other dtypes and batch 
encoding.
-    ///
-    /// Args:
-    ///     data: PyTorch CUDA tensor
-    ///     num_qubits: Number of qubits
-    ///     encoding_method: Encoding strategy (currently only "amplitude")
-    fn _encode_from_cuda_tensor(
-        &self,
-        data: &Bound<'_, PyAny>,
-        num_qubits: usize,
-        encoding_method: &str,
-    ) -> PyResult<QuantumTensor> {
-        // Validate CUDA tensor for direct GPU encoding (shape, contiguity, 
device, dtype)
-        validate_cuda_tensor_for_encoding(data, 
self.engine.device().ordinal(), encoding_method)?;
-
-        // Determine dtype for dispatch (float32 vs float64, etc.).
-        let dtype = data.getattr("dtype")?;
-        let dtype_str: String = dtype.str()?.extract()?;
-        let dtype_str_lower = dtype_str.to_ascii_lowercase();
-        let is_f32 = dtype_str_lower.contains("float32");
-        let method = encoding_method.to_ascii_lowercase();
-
-        // Current f32 CUDA path only supports amplitude encoding for 1D 
tensors.
-        let ndim: usize = data.call_method0("dim")?.extract()?;
-
-        if method.as_str() == "amplitude" && is_f32 {
-            // NOTE: This f32 fast path intentionally bypasses 
`extract_cuda_tensor_info`/DLPack
-            // and uses PyTorch's `data_ptr()`/`numel()` directly, after
-            // `validate_cuda_tensor_for_encoding` has already enforced 
dtype/shape/contiguity/device.
-            // If additional validation is added to `extract_cuda_tensor_info` 
in the future, it must
-            // be mirrored here to keep behavior consistent.
-            match ndim {
-                1 => {
-                    // 1D CUDA tensor, float32 amplitude encoding using core 
f32 GPU pointer API.
-                    let input_len: usize = 
data.call_method0("numel")?.extract()?;
-                    let stream_ptr = get_torch_cuda_stream_ptr(data)?;
-                    let data_ptr_u64: u64 = 
data.call_method0("data_ptr")?.extract()?;
-                    let data_ptr = data_ptr_u64 as *const f32;
-
-                    let ptr = unsafe {
-                        self.engine
-                            .encode_from_gpu_ptr_f32_with_stream(
-                                data_ptr, input_len, num_qubits, stream_ptr,
-                            )
-                            .map_err(|e| {
-                                PyRuntimeError::new_err(format!(
-                                    "Encoding failed (float32 amplitude): {}",
-                                    e
-                                ))
-                            })?
-                    };
-
-                    Ok(QuantumTensor {
-                        ptr,
-                        consumed: false,
-                    })
-                }
-                2 => Err(PyRuntimeError::new_err(
-                    "CUDA float32 batch amplitude encoding is not yet 
supported. \
-                     Use float64 (tensor.to(torch.float64)) or encode samples 
individually.",
-                )),
-                _ => Err(PyRuntimeError::new_err(format!(
-                    "Unsupported CUDA tensor shape: {}D. Expected 1D tensor 
for single \
-                     sample encoding or 2D tensor (batch_size, features) for 
batch encoding.",
-                    ndim
-                ))),
-            }
-        } else {
-            // Existing float64 (and basis/int64) CUDA path using direct GPU 
pointer.
-            let tensor_info = extract_cuda_tensor_info(data)?;
-            let stream_ptr = get_torch_cuda_stream_ptr(data)?;
-
-            match ndim {
-                1 => {
-                    // 1D CUDA tensor: single sample encoding
-                    let input_len = tensor_info.shape[0] as usize;
-                    // SAFETY: tensor_info.data_ptr was obtained via PyTorch's 
data_ptr() from a
-                    // valid CUDA tensor. The tensor remains alive during this 
call
-                    // (held by Python's GIL), and we validated 
dtype/contiguity/device above.
-                    let ptr = unsafe {
-                        self.engine
-                            .encode_from_gpu_ptr_with_stream(
-                                tensor_info.data_ptr as *const 
std::ffi::c_void,
-                                input_len,
-                                num_qubits,
-                                encoding_method,
-                                stream_ptr,
-                            )
-                            .map_err(|e| {
-                                PyRuntimeError::new_err(format!("Encoding 
failed: {}", e))
-                            })?
-                    };
-                    Ok(QuantumTensor {
-                        ptr,
-                        consumed: false,
-                    })
-                }
-                2 => {
-                    // 2D CUDA tensor: batch encoding
-                    let num_samples = tensor_info.shape[0] as usize;
-                    let sample_size = tensor_info.shape[1] as usize;
-                    // SAFETY: Same as above - pointer from validated PyTorch 
CUDA tensor
-                    let ptr = unsafe {
-                        self.engine
-                            .encode_batch_from_gpu_ptr_with_stream(
-                                tensor_info.data_ptr as *const 
std::ffi::c_void,
-                                num_samples,
-                                sample_size,
-                                num_qubits,
-                                encoding_method,
-                                stream_ptr,
-                            )
-                            .map_err(|e| {
-                                PyRuntimeError::new_err(format!("Encoding 
failed: {}", e))
-                            })?
-                    };
-                    Ok(QuantumTensor {
-                        ptr,
-                        consumed: false,
-                    })
-                }
-                _ => Err(PyRuntimeError::new_err(format!(
-                    "Unsupported CUDA tensor shape: {}D. Expected 1D tensor 
for single \
-                     sample encoding or 2D tensor (batch_size, features) for 
batch encoding.",
-                    ndim
-                ))),
-            }
-        }
-    }
-}
-
-// --- Loader bindings (Linux only; qdp-core pipeline types only built on 
Linux) ---
-#[cfg(target_os = "linux")]
-mod loader_bindings {
-    use super::*;
-    use pyo3::exceptions::PyStopIteration;
-    use qdp_core::{PipelineConfig, PipelineIterator};
-
-    /// Rust-backed iterator yielding one QuantumTensor per batch; used by 
QuantumDataLoader.
-    #[pyclass]
-    pub struct PyQuantumLoader {
-        inner: Option<PipelineIterator>,
-    }
-
-    impl PyQuantumLoader {
-        pub fn new(inner: Option<PipelineIterator>) -> Self {
-            Self { inner }
-        }
-    }
-
-    #[pymethods]
-    impl PyQuantumLoader {
-        fn __iter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> {
-            slf
-        }
-
-        fn __next__(mut slf: PyRefMut<'_, Self>) -> PyResult<QuantumTensor> {
-            let mut iter: PipelineIterator = match slf.inner.take() {
-                Some(i) => i,
-                None => return Err(PyStopIteration::new_err("")),
-            };
-            // Call next_batch without releasing GIL (return type *mut 
DLManagedTensor is !Send).
-            let result = iter.next_batch();
-            match result {
-                Ok(Some(ptr)) => {
-                    slf.inner = Some(iter);
-                    Ok(QuantumTensor {
-                        ptr,
-                        consumed: false,
-                    })
-                }
-                Ok(None) => {
-                    // Exhausted; do not put iterator back
-                    Err(PyStopIteration::new_err(""))
-                }
-                Err(e) => {
-                    slf.inner = Some(iter);
-                    Err(PyRuntimeError::new_err(format!(
-                        "Pipeline next_batch failed: {}",
-                        e
-                    )))
-                }
-            }
-        }
-    }
-
-    /// Build PipelineConfig from Python args. device_id is 0 (engine does not 
expose it); iterator uses engine clone with correct device.
-    pub fn config_from_args(
-        _engine: &CoreEngine,
-        batch_size: usize,
-        num_qubits: u32,
-        encoding_method: &str,
-        total_batches: usize,
-        seed: Option<u64>,
-    ) -> PipelineConfig {
-        PipelineConfig {
-            device_id: 0,
-            num_qubits,
-            batch_size,
-            total_batches,
-            encoding_method: encoding_method.to_string(),
-            seed,
-            warmup_batches: 0,
-        }
-    }
-
-    /// Resolve path from Python str or pathlib.Path (__fspath__).
-    pub fn path_from_py(path: &Bound<'_, PyAny>) -> PyResult<String> {
-        path.extract::<String>().or_else(|_| {
-            path.call_method0("__fspath__")
-                .and_then(|m| m.extract::<String>())
-        })
-    }
-}
+use engine::QdpEngine;
+use pyo3::prelude::*;
+use tensor::QuantumTensor;
 
 #[cfg(target_os = "linux")]
-use loader_bindings::{PyQuantumLoader, config_from_args, path_from_py};
+use loader::PyQuantumLoader;
 
 /// Quantum Data Plane (QDP) Python module
 ///
diff --git a/qdp/qdp-python/src/loader.rs b/qdp/qdp-python/src/loader.rs
new file mode 100644
index 000000000..2806b17af
--- /dev/null
+++ b/qdp/qdp-python/src/loader.rs
@@ -0,0 +1,103 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Loader bindings (Linux only; qdp-core pipeline types only built on Linux)
+#[cfg(target_os = "linux")]
+mod loader_impl {
+    use crate::tensor::QuantumTensor;
+    use pyo3::exceptions::PyRuntimeError;
+    use pyo3::prelude::*;
+    use qdp_core::{PipelineConfig, PipelineIterator, QdpEngine as CoreEngine};
+
+    /// Rust-backed iterator yielding one QuantumTensor per batch; used by QuantumDataLoader.
+    #[pyclass]
+    pub struct PyQuantumLoader {
+        pub inner: Option<PipelineIterator>,
+    }
+
+    impl PyQuantumLoader {
+        pub fn new(inner: Option<PipelineIterator>) -> Self {
+            Self { inner }
+        }
+    }
+
+    #[pymethods]
+    impl PyQuantumLoader {
+        fn __iter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> {
+            slf
+        }
+
+        fn __next__(mut slf: PyRefMut<'_, Self>) -> PyResult<QuantumTensor> {
+            let mut iter: PipelineIterator = match slf.inner.take() {
+                Some(i) => i,
+                None => return Err(pyo3::exceptions::PyStopIteration::new_err("")),
+            };
+            // Call next_batch without releasing GIL (return type *mut DLManagedTensor is !Send).
+            let result = iter.next_batch();
+            match result {
+                Ok(Some(ptr)) => {
+                    slf.inner = Some(iter);
+                    Ok(QuantumTensor {
+                        ptr,
+                        consumed: false,
+                    })
+                }
+                Ok(None) => {
+                    // Exhausted; do not put iterator back
+                    Err(pyo3::exceptions::PyStopIteration::new_err(""))
+                }
+                Err(e) => {
+                    slf.inner = Some(iter);
+                    Err(PyRuntimeError::new_err(format!(
+                        "Pipeline next_batch failed: {}",
+                        e
+                    )))
+                }
+            }
+        }
+    }
+
+    /// Build PipelineConfig from Python args. device_id is 0 (engine does not expose it); iterator uses engine clone with correct device.
+    pub fn config_from_args(
+        _engine: &CoreEngine,
+        batch_size: usize,
+        num_qubits: u32,
+        encoding_method: &str,
+        total_batches: usize,
+        seed: Option<u64>,
+    ) -> PipelineConfig {
+        PipelineConfig {
+            device_id: 0,
+            num_qubits,
+            batch_size,
+            total_batches,
+            encoding_method: encoding_method.to_string(),
+            seed,
+            warmup_batches: 0,
+        }
+    }
+
+    /// Resolve path from Python str or pathlib.Path (__fspath__).
+    pub fn path_from_py(path: &Bound<'_, PyAny>) -> PyResult<String> {
+        path.extract::<String>().or_else(|_| {
+            path.call_method0("__fspath__")
+                .and_then(|m| m.extract::<String>())
+        })
+    }
+}
+
+#[cfg(target_os = "linux")]
+pub use loader_impl::{PyQuantumLoader, config_from_args, path_from_py};
diff --git a/qdp/qdp-python/src/pytorch.rs b/qdp/qdp-python/src/pytorch.rs
new file mode 100644
index 000000000..cb5c75247
--- /dev/null
+++ b/qdp/qdp-python/src/pytorch.rs
@@ -0,0 +1,251 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use pyo3::exceptions::PyRuntimeError;
+use pyo3::prelude::*;
+use std::ffi::c_void;
+
+/// Helper to detect PyTorch tensor
+pub fn is_pytorch_tensor(obj: &Bound<'_, PyAny>) -> PyResult<bool> {
+    let type_obj = obj.get_type();
+    let name = type_obj.name()?;
+    if name != "Tensor" {
+        return Ok(false);
+    }
+    let module = type_obj.module()?;
+    let module_name = module.to_str()?;
+    Ok(module_name == "torch")
+}
+
+/// Helper to validate CPU tensor
+pub fn validate_tensor(tensor: &Bound<'_, PyAny>) -> PyResult<()> {
+    if !is_pytorch_tensor(tensor)? {
+        return Err(PyRuntimeError::new_err("Object is not a PyTorch Tensor"));
+    }
+
+    let device = tensor.getattr("device")?;
+    let device_type: String = device.getattr("type")?.extract()?;
+
+    if device_type != "cpu" {
+        return Err(PyRuntimeError::new_err(format!(
+            "Only CPU tensors are currently supported for this path. Got device: {}",
+            device_type
+        )));
+    }
+
+    Ok(())
+}
+
+/// Check if a PyTorch tensor is on a CUDA device
+pub fn is_cuda_tensor(tensor: &Bound<'_, PyAny>) -> PyResult<bool> {
+    let device = tensor.getattr("device")?;
+    let device_type: String = device.getattr("type")?.extract()?;
+    Ok(device_type == "cuda")
+}
+
+/// Validate array/tensor shape (must be 1D or 2D)
+///
+/// Args:
+///     ndim: Number of dimensions
+///     context: Context string for error message (e.g., "array", "tensor", "CUDA tensor")
+///
+/// Returns:
+///     Ok(()) if shape is valid (1D or 2D), otherwise returns an error
+pub fn validate_shape(ndim: usize, context: &str) -> PyResult<()> {
+    match ndim {
+        1 | 2 => Ok(()),
+        _ => {
+            let item_type = if context.contains("array") {
+                "array"
+            } else {
+                "tensor"
+            };
+            Err(PyRuntimeError::new_err(format!(
+                "Unsupported {} shape: {}D. Expected 1D {} for single sample \
+                 encoding or 2D {} (batch_size, features) for batch encoding.",
+                context, ndim, item_type, item_type
+            )))
+        }
+    }
+}
+
+/// Get the CUDA device index from a PyTorch tensor
+pub fn get_tensor_device_id(tensor: &Bound<'_, PyAny>) -> PyResult<i32> {
+    let device = tensor.getattr("device")?;
+    let device_index: i32 = device.getattr("index")?.extract()?;
+    Ok(device_index)
+}
+
+/// Get the current CUDA stream pointer for the tensor's device.
+pub fn get_torch_cuda_stream_ptr(tensor: &Bound<'_, PyAny>) -> PyResult<*mut c_void> {
+    let py = tensor.py();
+    let torch = PyModule::import(py, "torch")
+        .map_err(|_| PyRuntimeError::new_err("Failed to import torch module"))?;
+    let cuda = torch.getattr("cuda")?;
+    let device = tensor.getattr("device")?;
+    let stream = cuda.call_method1("current_stream", (device,))?;
+
+    // Defensive validation: ensure the stream is a CUDA stream on the same device
+    let stream_device = stream.getattr("device").map_err(|_| {
+        PyRuntimeError::new_err("CUDA stream object from PyTorch is missing 'device' attribute")
+    })?;
+    let stream_device_type: String = stream_device
+        .getattr("type")
+        .and_then(|obj| obj.extract())
+        .map_err(|_| {
+            PyRuntimeError::new_err(
+                "Failed to extract CUDA stream device type from PyTorch stream.device",
+            )
+        })?;
+    if stream_device_type != "cuda" {
+        return Err(PyRuntimeError::new_err(format!(
+            "Expected CUDA stream device type 'cuda', got '{}'",
+            stream_device_type
+        )));
+    }
+
+    let stream_device_index: i32 = stream_device
+        .getattr("index")
+        .and_then(|obj| obj.extract())
+        .map_err(|_| {
+            PyRuntimeError::new_err(
+                "Failed to extract CUDA stream device index from PyTorch stream.device",
+            )
+        })?;
+    let tensor_device_index = get_tensor_device_id(tensor)?;
+    if stream_device_index != tensor_device_index {
+        return Err(PyRuntimeError::new_err(format!(
+            "CUDA stream device index ({}) does not match tensor device index ({})",
+            stream_device_index, tensor_device_index
+        )));
+    }
+
+    let stream_ptr: u64 = stream.getattr("cuda_stream")?.extract()?;
+    Ok(if stream_ptr == 0 {
+        std::ptr::null_mut()
+    } else {
+        stream_ptr as *mut c_void
+    })
+}
+
+/// Validate a CUDA tensor for direct GPU encoding
+/// Checks: dtype matches encoding method, contiguous, non-empty, device_id matches engine
+pub fn validate_cuda_tensor_for_encoding(
+    tensor: &Bound<'_, PyAny>,
+    expected_device_id: usize,
+    encoding_method: &str,
+) -> PyResult<()> {
+    let method = encoding_method.to_ascii_lowercase();
+
+    // Check encoding method support and dtype (ASCII lowercase for case-insensitive match).
+    let dtype = tensor.getattr("dtype")?;
+    let dtype_str: String = dtype.str()?.extract()?;
+    let dtype_str_lower = dtype_str.to_ascii_lowercase();
+    match method.as_str() {
+        "amplitude" => {
+            if !(dtype_str_lower.contains("float64") || dtype_str_lower.contains("float32")) {
+                return Err(PyRuntimeError::new_err(format!(
+                    "CUDA tensor must have dtype float64 or float32 for amplitude encoding, got {}. \
+                     Use tensor.to(torch.float64) or tensor.to(torch.float32)",
+                    dtype_str
+                )));
+            }
+        }
+        "angle" => {
+            if !dtype_str_lower.contains("float64") {
+                return Err(PyRuntimeError::new_err(format!(
+                    "CUDA tensor must have dtype float64 for angle encoding, got {}. \
+                     Use tensor.to(torch.float64)",
+                    dtype_str
+                )));
+            }
+        }
+        "basis" => {
+            if !dtype_str_lower.contains("int64") {
+                return Err(PyRuntimeError::new_err(format!(
+                    "CUDA tensor must have dtype int64 for basis encoding, got {}. \
+                     Use tensor.to(torch.int64)",
+                    dtype_str
+                )));
+            }
+        }
+        _ => {
+            return Err(PyRuntimeError::new_err(format!(
+                "CUDA tensor encoding currently only supports 'amplitude', 'angle', or 'basis' methods, got '{}'. \
+                 Use tensor.cpu() to convert to CPU tensor for other encoding methods.",
+                encoding_method
+            )));
+        }
+    }
+
+    // Check contiguous
+    let is_contiguous: bool = tensor.call_method0("is_contiguous")?.extract()?;
+    if !is_contiguous {
+        return Err(PyRuntimeError::new_err(
+            "CUDA tensor must be contiguous. Use tensor.contiguous()",
+        ));
+    }
+
+    // Check non-empty
+    let numel: usize = tensor.call_method0("numel")?.extract()?;
+    if numel == 0 {
+        return Err(PyRuntimeError::new_err("CUDA tensor cannot be empty"));
+    }
+
+    // Check device matches engine
+    let tensor_device_id = get_tensor_device_id(tensor)?;
+    if tensor_device_id as usize != expected_device_id {
+        return Err(PyRuntimeError::new_err(format!(
+            "Device mismatch: tensor is on cuda:{}, but engine is on cuda:{}. \
+             Move tensor with tensor.to('cuda:{}')",
+            tensor_device_id, expected_device_id, expected_device_id
+        )));
+    }
+
+    Ok(())
+}
+
+/// Minimal CUDA tensor metadata extracted via PyTorch APIs.
+pub struct CudaTensorInfo {
+    pub data_ptr: *const f64,
+    pub shape: Vec<i64>,
+}
+
+/// Extract GPU pointer and shape directly from a PyTorch CUDA tensor.
+///
+/// # Safety
+/// The returned pointer is borrowed from the source tensor. The caller must
+/// ensure the tensor remains alive and unmodified for the duration of use.
+pub fn extract_cuda_tensor_info(tensor: &Bound<'_, PyAny>) -> PyResult<CudaTensorInfo> {
+    let data_ptr: u64 = tensor.call_method0("data_ptr")?.extract()?;
+    if data_ptr == 0 {
+        return Err(PyRuntimeError::new_err(
+            "PyTorch returned a null data pointer for CUDA tensor",
+        ));
+    }
+
+    let ndim: usize = tensor.call_method0("dim")?.extract()?;
+    let mut shape = Vec::with_capacity(ndim);
+    for axis in 0..ndim {
+        let dim: i64 = tensor.call_method1("size", (axis,))?.extract()?;
+        shape.push(dim);
+    }
+
+    Ok(CudaTensorInfo {
+        data_ptr: data_ptr as *const f64,
+        shape,
+    })
+}
diff --git a/qdp/qdp-python/src/tensor.rs b/qdp/qdp-python/src/tensor.rs
new file mode 100644
index 000000000..ab341d1ac
--- /dev/null
+++ b/qdp/qdp-python/src/tensor.rs
@@ -0,0 +1,147 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use pyo3::exceptions::PyRuntimeError;
+use pyo3::ffi;
+use pyo3::prelude::*;
+use qdp_core::dlpack::DLManagedTensor;
+
+/// Quantum tensor wrapper implementing DLPack protocol
+///
+/// This class wraps a GPU-allocated quantum state vector and implements
+/// the DLPack protocol for zero-copy integration with PyTorch and other
+/// array libraries.
+///
+/// Example:
+///     >>> engine = QdpEngine(device_id=0)
+///     >>> qtensor = engine.encode([1.0, 2.0, 3.0], num_qubits=2, encoding_method="amplitude")
+///     >>> torch_tensor = torch.from_dlpack(qtensor)
+#[pyclass]
+pub struct QuantumTensor {
+    pub ptr: *mut DLManagedTensor,
+    pub consumed: bool,
+}
+
+#[pymethods]
+impl QuantumTensor {
+    /// Implements DLPack protocol - returns PyCapsule for PyTorch
+    ///
+    /// This method is called by torch.from_dlpack() to get the GPU memory pointer.
+    /// The capsule can only be consumed once to prevent double-free errors.
+    ///
+    /// Args:
+    ///     stream: Optional CUDA stream (DLPack 0.8+; 1=legacy default, 2=per-thread default)
+    ///
+    /// Returns:
+    ///     PyCapsule containing DLManagedTensor pointer
+    ///
+    /// Raises:
+    ///     RuntimeError: If the tensor has already been consumed
+    #[pyo3(signature = (stream=None))]
+    fn __dlpack__<'py>(&mut self, py: Python<'py>, stream: Option<i64>) -> PyResult<Py<PyAny>> {
+        if self.consumed {
+            return Err(PyRuntimeError::new_err(
+                "DLPack tensor already consumed (can only be used once)",
+            ));
+        }
+
+        if self.ptr.is_null() {
+            return Err(PyRuntimeError::new_err("Invalid DLPack tensor pointer"));
+        }
+
+        if let Some(stream) = stream
+            && stream > 0
+        {
+            let stream_ptr = qdp_core::dlpack::dlpack_stream_to_cuda(stream);
+            unsafe {
+                qdp_core::dlpack::synchronize_stream(stream_ptr).map_err(|e| {
+                    PyRuntimeError::new_err(format!("CUDA stream sync failed: {}", e))
+                })?;
+            }
+        }
+
+        // Mark as consumed to prevent double-free
+        self.consumed = true;
+
+        // Create PyCapsule using FFI
+        // PyTorch will call the deleter stored in DLManagedTensor.deleter
+        // Use a static C string for the capsule name to avoid lifetime issues
+        const DLTENSOR_NAME: &[u8] = b"dltensor\0";
+
+        unsafe {
+            // Create PyCapsule without a destructor
+            // PyTorch will manually call the deleter from DLManagedTensor
+            let capsule_ptr = ffi::PyCapsule_New(
+                self.ptr as *mut std::ffi::c_void,
+                DLTENSOR_NAME.as_ptr() as *const i8,
+                None, // No destructor - PyTorch handles it
+            );
+
+            if capsule_ptr.is_null() {
+                return Err(PyRuntimeError::new_err("Failed to create PyCapsule"));
+            }
+
+            Ok(Py::from_owned_ptr(py, capsule_ptr))
+        }
+    }
+
+    /// Returns DLPack device information
+    ///
+    /// Returns:
+    ///     Tuple of (device_type, device_id) where device_type=2 for CUDA
+    fn __dlpack_device__(&self) -> PyResult<(i32, i32)> {
+        if self.ptr.is_null() {
+            return Err(PyRuntimeError::new_err("Invalid DLPack tensor pointer"));
+        }
+
+        unsafe {
+            let tensor = &(*self.ptr).dl_tensor;
+            // DLPack device_type: kDLCUDA = 2, kDLCPU = 1
+            let device_type = match tensor.device.device_type {
+                qdp_core::dlpack::DLDeviceType::kDLCUDA => 2,
+                qdp_core::dlpack::DLDeviceType::kDLCPU => 1,
+            };
+            // Read device_id from DLPack tensor metadata
+            Ok((device_type, tensor.device.device_id))
+        }
+    }
+}
+
+impl Drop for QuantumTensor {
+    fn drop(&mut self) {
+        // Only free if not consumed by __dlpack__
+        // If consumed, PyTorch/consumer will call the deleter
+        if !self.consumed && !self.ptr.is_null() {
+            unsafe {
+                // Defensive check: qdp-core always provides a deleter
+                debug_assert!(
+                    (*self.ptr).deleter.is_some(),
+                    "DLManagedTensor from qdp-core should always have a deleter"
+                );
+
+                // Call the DLPack deleter to free memory
+                if let Some(deleter) = (*self.ptr).deleter {
+                    deleter(self.ptr);
+                }
+            }
+        }
+    }
+}
+
+// Safety: QuantumTensor can be sent between threads
+// The DLManagedTensor pointer management is thread-safe via Arc in the deleter
+unsafe impl Send for QuantumTensor {}
+unsafe impl Sync for QuantumTensor {}

Reply via email to