Hi wonghang,
I am trying to develop batched versions of both GpuCholesky and
GpuCublasTriangularSolve().
I was confused about what to do in L_op with the extra batch dimension, but
after carefully reading the docs again, I now understand it. L_op is just a
linearized and transposed version of the Op, so the input of L_op has the same
shape and form as the output of the Op, and vice versa (for the trivial case
y = A * x, L_op is simply x_bar = A.T * y_bar). The attached functions are
tested and work on batches. Let 'nb' be the batch size, so Cll is dimensioned
(nb, n, n) and the right-hand side er (and the solution dl) is dimensioned
(nb, n). Then, to solve er = Cll * dl for dl, I use:
R = theano.gpuarray.linalg.GpuCholeskyBatch(lower=True)(Cll)
lam1 = theano.gpuarray.linalg.GpuCublasTriangularSolveBatch(lower=True)(R, er)
R_T = R.dimshuffle(0, 2, 1)
dl = theano.gpuarray.linalg.GpuCublasTriangularSolveBatch(lower=False)(R_T, lam1)
This does the classical Cholesky factorization followed by forward and back
substitution, and it runs in 4.0 seconds.
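For comparison, the scan-based version I benchmarked against looks roughly like
this (just a sketch of the graph, built from the existing non-batched ops; the
exact code I timed may differ slightly):

def solve_one(C, e):
    R1 = theano.gpuarray.linalg.GpuCholesky(lower=True)(C)
    lam = theano.gpuarray.linalg.GpuCublasTriangularSolve(lower=True)(R1, e)
    return theano.gpuarray.linalg.GpuCublasTriangularSolve(lower=False)(R1.T, lam)

dl_scan, _ = theano.scan(fn=solve_one, sequences=[Cll, er])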
When I use theano.scan over the non-batched versions, it runs in 10.3 seconds,
so the batched version is about 2.5 times faster!
That makes a big difference to me, as I can get a result before I go home.
Paul
On Tuesday, February 11, 2020 at 4:22:20 AM UTC+1, Wong Hang wrote:
>
> Hi Paul,
>
> I am not quite sure what you are going to do.
>
> If you want a batch version of Cholesky and TriangularSolve, then you will
> end up with 'nb' copies of the gradients.
> What are you going to do with those 'nb' copies of gradients?
> AFAIK, theano.grad only accepts a scalar cost. If you need a Jacobian, you can
> only implement it with theano.scan, and I know theano.scan is inefficient.
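> For example, theano's own Jacobian helper is itself just a scan over the
> output elements (a minimal sketch, nothing GPU-specific assumed):
>
> import theano
> import theano.tensor as tt
> x = tt.vector('x')
> y = x ** 2
> J = theano.gradient.jacobian(y, x)  # builds a scan that calls grad once per output element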
>
> You may be interested in this thread:
> https://groups.google.com/d/msg/theano-users/Rg8ZIru-pgo/DgvwY57RBwAJ
>
> Best,
> wonghang
>
> Paul Baggenstoss <[email protected]> wrote on Monday, February 10, 2020 at 8:46 PM:
>
>> Hi Wonghang,
>> So I am working toward making the Cholesky problem faster, and by that I
>> also mean triangular solvers like GpuCublasTriangularSolve(). We typically do
>> the Cholesky decomposition, then solve linear systems involving the Cholesky
>> factor, which has upper or lower triangular form.
>>
>> So I have started with GpuCublasTriangularSolve(). It has a lot of
>> overhead from
>> all the gpu array matrix copying and creation, which adds to the overhead
>> of
>> theano.scan, which is necessary to work with batches. So I thought it
>> would be much
>> better to have a batched version of Cholesky() and
>> GpuCublasTriangularSolve(). I have
>> created working versions of these batched routines (attached). It's
>> actually pretty simple: I just index into the data by batch and call the
>> GPU routines directly, potrf() for the Cholesky and trsv() for the
>> triangular solve. I loop over the batch like this:
>>
>> if theano.config.floatX == 'float32':
>>     wordlen = 4
>> else:
>>     wordlen = 8
>> for ib in range(nb):
>>     trsv(ctx.cublas_handle, uplo, trans, diag, n,
>>          A_ptr + ib*n*n*wordlen, lda, b_ptr + ib*n*m*wordlen, 1)
>>
>>
>> There are a few small gotchas, such as the fact that the GPU routines expect
>> F-ordered data, but to index by batch like I did above, the array has to be
>> C-contiguous. So the data has to be C-ordered across the batch, but F-ordered
>> within each matrix!
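>> As a sketch of what that layout means (plain numpy, assuming A has shape
>> (nb, n, n) and is C-ordered):
>>
>> import numpy as np
>> A_batched = np.ascontiguousarray(np.swapaxes(A, 1, 2))
>> # each A_batched[ib] now stores A[ib].T contiguously, i.e. A[ib] in F order,
>> # while the batch index is still the slowest-varying (C-ordered) dimension.
>> # For symmetric input like Cll the transpose changes nothing anyway.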
>>
>> The problem I have, where I will need help, is in the gradients.
>> Although I can compute the gradient in L_op correctly (I think),
>> theano.gradient is not
>> happy with the shape of the matrices that L_op returns. This is probably
>> because gradient does not understand that the data is batched.
>>
>> Do you think you can help in this matter? I think this is over my head.
>> Paul
>>
>>
>
from __future__ import absolute_import, division, print_function
import warnings
import pkg_resources
import numpy as np
from numpy.linalg.linalg import LinAlgError
import theano
from theano import Op, config, tensor
from theano.scalar import bool as bool_t
from theano.gof import COp, ParamsType
from theano.gpuarray import GpuArrayType
from .basic_ops import (CGpuKernelBase, as_gpuarray_variable, gpu_contiguous, gpuarray_helper_inc_dir,
infer_context_name)
from .type import gpu_context_type
try:
import pygpu
from pygpu.basic import triu, tril
pygpu_available = True
except ImportError:
pygpu_available = False
cusolver_available = False
try:
import skcuda
from skcuda import cusolver
cusolver_available = True
except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound):
pass
cublas_available = False
try:
from skcuda import cublas
cublas_available = True
except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound):
pass
if cusolver_available:
# Add cusolver call as it is missing in skcuda
# SPOTRS
cusolver._libcusolver.cusolverDnSpotrs.restype = int
cusolver._libcusolver.cusolverDnSpotrs.argtypes = [cusolver.ctypes.c_void_p,
cusolver.ctypes.c_int,
cusolver.ctypes.c_int,
cusolver.ctypes.c_int,
cusolver.ctypes.c_void_p,
cusolver.ctypes.c_int,
cusolver.ctypes.c_void_p,
cusolver.ctypes.c_int,
cusolver.ctypes.c_void_p]
def cusolverDnSpotrs(handle, uplo, n, nrhs, A, lda,
B, ldb, devInfo):
"""
Solve real single precision linear system for hermitian matrices.
References
----------
`cusolverDn<t>potrs <http://docs.nvidia.com/cuda/cusolver/index.html#cuds-lt-t-gt-potrs>`_
"""
status = cusolver._libcusolver.cusolverDnSpotrs(handle, uplo, n, nrhs,
int(A), lda, int(B),
ldb, int(devInfo))
cusolver.cusolverCheckStatus(status)
# DPOTRS
# TODO: Are they still missing in skcuda?
cusolver._libcusolver.cusolverDnDpotrs.restype = int
cusolver._libcusolver.cusolverDnDpotrs.argtypes = [cusolver.ctypes.c_void_p,
cusolver.ctypes.c_int,
cusolver.ctypes.c_int,
cusolver.ctypes.c_int,
cusolver.ctypes.c_void_p,
cusolver.ctypes.c_int,
cusolver.ctypes.c_void_p,
cusolver.ctypes.c_int,
cusolver.ctypes.c_void_p]
def cusolverDnDpotrs(handle, uplo, n, nrhs, A, lda,
B, ldb, devInfo):
status = cusolver._libcusolver.cusolverDnDpotrs(handle, uplo, n, nrhs,
int(A), lda, int(B),
ldb, int(devInfo))
cusolver.cusolverCheckStatus(status)
def attach_cusolver_handle_to_context(ctx):
handle = getattr(ctx, 'cusolver_handle', None)
if handle is None:
with ctx:
ctx.cusolver_handle = cusolver.cusolverDnCreate()
def attach_cublas_handle_to_context(ctx):
handle = getattr(ctx, 'cublas_handle', None)
if handle is None:
with ctx:
ctx.cublas_handle = cublas.cublasCreate()
# it is a subset of all cases available in slinalg's MATRIX_STRUCTURE
MATRIX_STRUCTURES_SOLVE = (
'general',
'symmetric',
'lower_triangular',
'upper_triangular')
class GpuCusolverSolve(Op):
"""
CUSOLVER GPU solver OP.
Parameters
----------
trans
Whether to take the transpose of the input matrix or not.
"""
__props__ = ('A_structure', 'trans', 'inplace')
def __init__(self, A_structure='general', trans='N', inplace=False):
self.trans = trans
self.inplace = inplace
self.A_structure = A_structure
if self.inplace:
self.destroy_map = {0: [0]}
assert A_structure in MATRIX_STRUCTURES_SOLVE
super(GpuCusolverSolve, self).__init__()
def make_node(self, inp1, inp2):
if not cusolver_available:
raise RuntimeError('CUSOLVER is not available and '
'GpuCusolverSolve Op can not be constructed.')
if skcuda.__version__ <= '0.5.1':
warnings.warn('The GpuSolve op requires scikit-cuda > 0.5.1 to work with CUDA 8')
context_name = infer_context_name(inp1, inp2)
inp1 = as_gpuarray_variable(inp1, context_name)
inp2 = as_gpuarray_variable(inp2, context_name)
inp1 = gpu_contiguous(inp1)
inp2 = gpu_contiguous(inp2)
assert inp1.ndim == 2
assert inp2.ndim == 2
assert inp1.dtype == inp2.dtype
return theano.Apply(
self, [inp1, inp2],
[GpuArrayType(inp1.dtype,
broadcastable=inp1.broadcastable,
context_name=context_name)()])
def prepare_node(self, node, storage_map, compute_map, impl):
ctx = node.inputs[0].type.context
attach_cusolver_handle_to_context(ctx)
def check_dev_info(self, dev_info):
val = np.asarray(dev_info)[0]
if val > 0:
raise LinAlgError('A is singular')
def perform(self, node, inputs, outputs):
context = inputs[0][0].context
# Size of the matrices to invert.
z = outputs[0]
# Matrix.
A = inputs[0]
# Solution vectors.
b = inputs[1]
assert(len(A.shape) == 2)
assert(len(b.shape) == 2)
if self.trans in ['T', 'C']:
trans = 1
l, n = A.shape
k, m = b.shape
elif self.trans == 'N':
trans = 0
n, l = A.shape
k, m = b.shape
else:
raise ValueError('Invalid value for trans')
if l != n:
raise ValueError('A must be a square matrix')
if n != k:
raise ValueError('A and b must be aligned.')
lda = max(1, n)
ldb = max(1, k)
# We copy A and b as cusolver operates inplace
b = pygpu.array(b, copy=True, order='F')
if not self.inplace:
A = pygpu.array(A, copy=True)
A_ptr = A.gpudata
b_ptr = b.gpudata
# cusolver expects a F ordered matrix, but A is not explicitly
# converted between C and F order, instead we switch the
# "transpose" flag.
if A.flags['C_CONTIGUOUS']:
trans = 1 - trans
if A.dtype == 'float32':
potrf_bufferSize = cusolver.cusolverDnSpotrf_bufferSize
potrf = cusolver.cusolverDnSpotrf
potrs = cusolverDnSpotrs
getrf_bufferSize = cusolver.cusolverDnSgetrf_bufferSize
getrf = cusolver.cusolverDnSgetrf
getrs = cusolver.cusolverDnSgetrs
elif A.dtype == 'float64':
potrf_bufferSize = cusolver.cusolverDnDpotrf_bufferSize
potrf = cusolver.cusolverDnDpotrf
potrs = cusolverDnDpotrs
getrf_bufferSize = cusolver.cusolverDnDgetrf_bufferSize
getrf = cusolver.cusolverDnDgetrf
getrs = cusolver.cusolverDnDgetrs
else:
raise ValueError("Unsupported dtype")
if self.A_structure == 'symmetric':
with context:
workspace_size = potrf_bufferSize(
context.cusolver_handle, 0, n, A_ptr, lda)
workspace = pygpu.zeros(workspace_size, dtype=A.dtype,
context=context)
dev_info = pygpu.zeros((1,), dtype='int32', context=context)
workspace_ptr = workspace.gpudata
dev_info_ptr = dev_info.gpudata
with context:
potrf(
context.cusolver_handle, 0, n, A_ptr, lda, workspace_ptr,
workspace_size, dev_info_ptr)
self.check_dev_info(dev_info)
potrs(
context.cusolver_handle, 0, n, m, A_ptr, lda,
b_ptr, ldb, dev_info_ptr)
else:
# general case for A
with context:
workspace_size = getrf_bufferSize(
context.cusolver_handle, n, n, A_ptr, lda)
workspace = pygpu.zeros(workspace_size, dtype=A.dtype,
context=context)
pivots = pygpu.zeros(n, dtype='int32', context=context)
dev_info = pygpu.zeros((1,), dtype='int32', context=context)
workspace_ptr = workspace.gpudata
pivots_ptr = pivots.gpudata
dev_info_ptr = dev_info.gpudata
with context:
getrf(
context.cusolver_handle, n, n, A_ptr, lda, workspace_ptr,
pivots_ptr, dev_info_ptr)
self.check_dev_info(dev_info)
getrs(
context.cusolver_handle, trans, n, m, A_ptr, lda,
pivots_ptr, b_ptr, ldb, dev_info_ptr)
z[0] = b
def L_op(self, inputs, outputs, output_gradients):
# Modified from theano/tensor/slinalg.py
A, b = inputs
c = outputs[0]
c_bar = output_gradients[0]
# FIXME: triangular structure would use GpuCublasTriangularsolve?
# no need to handle A_structure like slinalg.py?
trans_solve_op = GpuCusolverSolve('general')
b_bar = trans_solve_op(A.T, c_bar)
A_bar = -tensor.outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T)
return [A_bar, b_bar]
class GpuCublasTriangularSolve(Op):
"""
CUBLAS GPU Triangular Solve Op.
Parameters
----------
lower
Whether system is lower-triangular (True) or upper-triangular (False).
trans
Whether to take the transpose of the input matrix or not.
"""
__props__ = ('trans', 'lower')
def __init__(self, lower=True, trans='N'):
self.trans = trans
self.lower = lower
super(GpuCublasTriangularSolve, self).__init__()
def make_node(self, inp1, inp2):
if not cublas_available:
raise RuntimeError('CUBLAS is not available and '
'GpuCublasTriangularSolve Op '
'can not be constructed.')
context_name = infer_context_name(inp1, inp2)
inp1 = as_gpuarray_variable(inp1, context_name)
inp2 = as_gpuarray_variable(inp2, context_name)
inp1 = gpu_contiguous(inp1)
inp2 = gpu_contiguous(inp2)
assert inp1.ndim == 2
assert inp2.ndim in [1, 2]
assert inp1.dtype == inp2.dtype
return theano.Apply(self, [inp1, inp2],
[GpuArrayType(inp1.dtype,
broadcastable=inp2.broadcastable,
context_name=context_name)()])
def prepare_node(self, node, storage_map, compute_map, impl):
ctx = node.inputs[0].type.context
attach_cublas_handle_to_context(ctx)
def perform(self, node, inputs, outputs):
ctx = node.inputs[0].type.context
# Solution set
x = outputs[0]
# Matrix.
A = inputs[0]
# right hand side
b = inputs[1]
assert(len(A.shape) == 2)
assert(len(b.shape) in [1, 2])
# implicitly deal with the difference between C order
# and fortran order by flipping the trans and lower flags
lower = not self.lower
trans = self.trans
if trans in ['T', 'C']:
trans = 'N'
l, n = A.shape
elif trans == 'N':
trans = 'T'
n, l = A.shape
else:
raise ValueError('Invalid value for trans')
if b.ndim == 2:
k, m = b.shape
else:
k, = b.shape
m = 1
if l != n:
raise ValueError('A must be a square matrix')
if n != k:
raise ValueError('A and b must be aligned.')
lda = max(1, n)
ldb = max(1, k)
# solution overwrites right hand side on exit
b = pygpu.array(b, copy=True, order='F')
A_ptr = A.gpudata
b_ptr = b.gpudata
# unit scalar used for multiplication
alpha = 1.0
# indicates matrix A is on left of B
side = 'l'
# set whether upper or lower part of matrix A stored
uplo = 'l' if lower else 'u'
# indicates elements on diagonal of matrix A may not be unity
diag = 'n'
if A.dtype == 'float32':
trsv = cublas.cublasStrsv
trsm = cublas.cublasStrsm
elif A.dtype == 'float64':
trsv = cublas.cublasDtrsv
trsm = cublas.cublasDtrsm
else:
raise ValueError("Unsupported dtype")
with ctx:
if b.ndim == 1:
# matrix vector solve
trsv(ctx.cublas_handle, uplo, trans, diag, n,
A_ptr, lda, b_ptr, 1)
else:
trsm(ctx.cublas_handle, side, uplo, trans, diag,
n, m, alpha, A_ptr, lda, b_ptr, ldb)
x[0] = b
def L_op(self, inputs, outputs, output_gradients):
# Modified from theano/tensor/slinalg.py
A, b = inputs
c = outputs[0]
c_bar = output_gradients[0]
trans_solve_op = GpuCublasTriangularSolve(not self.lower)
b_bar = trans_solve_op(A.T, c_bar)
A_bar = -tensor.outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T)
if self.lower:
A_bar = tensor.tril(A_bar)
else:
A_bar = tensor.triu(A_bar)
return [A_bar, b_bar]
def gpu_solve(A, b, A_structure='general', trans='N'):
if A_structure == 'lower':
return GpuCublasTriangularSolve(True, trans)(A, b)
elif A_structure == 'upper':
return GpuCublasTriangularSolve(False, trans)(A, b)
return GpuCusolverSolve(A_structure, trans)(A, b)
def gpu_solve_lower_triangular(A, b, trans='N'):
return GpuCublasTriangularSolve(True, trans)(A, b)
def gpu_solve_upper_triangular(A, b, trans='N'):
return GpuCublasTriangularSolve(False, trans)(A, b)
class GpuCholesky(Op):
"""
CUSOLVER GPU Cholesky Op.
Given a real positive definite matrix `A` returns either a lower
triangular matrix `L` such that `A == dot(L, L.T)` if `lower == True`
else returns an upper triangular matrix `U` such that `A == dot(U.T, U)`
if `lower == False`.
Parameters
----------
lower
Whether to return a lower rather than upper triangular decomposition.
"""
__props__ = ('lower', 'inplace')
def __init__(self, lower=True, inplace=False):
self.lower = lower
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [0]}
super(GpuCholesky, self).__init__()
def clone_inplace(self):
return self.__class__(lower=self.lower, inplace=True)
def make_node(self, inp):
if not cusolver_available:
raise RuntimeError('CUSOLVER is not available and '
'GpuCholesky Op can not be constructed.')
if skcuda.__version__ <= '0.5.1':
warnings.warn('The GpuCholesky op requires scikit-cuda > '
'0.5.1 to work with CUDA 8')
if not pygpu_available:
raise RuntimeError('Missing pygpu or triu/tril functions.'
'Install or update libgpuarray.')
context_name = infer_context_name(inp)
inp = as_gpuarray_variable(inp, context_name)
inp = gpu_contiguous(inp)
assert inp.ndim == 2
return theano.Apply(self, [inp], [inp.type()])
def prepare_node(self, node, storage_map, compute_map, impl):
ctx = node.inputs[0].type.context
attach_cusolver_handle_to_context(ctx)
def perform(self, node, inputs, outputs):
context = inputs[0][0].context
# Input matrix.
A = inputs[0]
l, n = A.shape
if l != n:
raise ValueError('A must be a square matrix')
lda = max(1, n)
# cusolver operates on F ordered matrices, but A is expected
# to be symmetric so it does not matter.
# We copy A if needed
if self.inplace:
L = A
else:
L = pygpu.array(A, copy=True)
# The output matrix will contain only the upper or lower
# triangular factorization of A. If L is C ordered (it
# probably is as it is the default in Theano) we just switch
# the fill mode parameter of cusolver
l_parameter = 0 if self.lower else 1
if L.flags['C_CONTIGUOUS']:
l_parameter = 1 - l_parameter
L_ptr = L.gpudata
if A.dtype == 'float32':
potrf_bufferSize = cusolver.cusolverDnSpotrf_bufferSize
potrf = cusolver.cusolverDnSpotrf
elif A.dtype == 'float64':
potrf_bufferSize = cusolver.cusolverDnDpotrf_bufferSize
potrf = cusolver.cusolverDnDpotrf
else:
raise ValueError("Unsupported dtype")
with context:
workspace_size = potrf_bufferSize(
context.cusolver_handle, l_parameter, n, L_ptr, lda)
workspace = pygpu.zeros(workspace_size, dtype=A.dtype,
context=context)
dev_info = pygpu.zeros((1,), dtype='int32', context=context)
workspace_ptr = workspace.gpudata
dev_info_ptr = dev_info.gpudata
potrf(context.cusolver_handle, l_parameter, n, L_ptr,
lda, workspace_ptr, workspace_size, dev_info_ptr)
val_dev_info = np.asarray(dev_info)[0]
if val_dev_info > 0:
raise LinAlgError('Cholesky decomposition failed (is A SPD?)')
# cusolver leaves the elements in the matrix outside the considered
# upper or lower triangle unchanged, so we need to put zeros outside
# the triangle
if self.lower:
tril(L)
else:
triu(L)
outputs[0][0] = L
def L_op(self, inputs, outputs, gradients):
# Modified from theano/tensor/slinalg.py
# No handling for on_error = 'nan'
dz = gradients[0]
chol_x = outputs[0]
# this is for nan mode
#
# ok = ~tensor.any(tensor.isnan(chol_x))
# chol_x = tensor.switch(ok, chol_x, 1)
# dz = tensor.switch(ok, dz, 1)
# deal with upper triangular by converting to lower triangular
if not self.lower:
chol_x = chol_x.T
dz = dz.T
def tril_and_halve_diagonal(mtx):
"""Extracts lower triangle of square matrix and halves diagonal."""
return tensor.tril(mtx) - tensor.diag(tensor.diagonal(mtx) / 2.)
def conjugate_solve_triangular(outer, inner):
"""Computes L^{-T} P L^{-1} for lower-triangular L."""
return gpu_solve_upper_triangular(
outer.T, gpu_solve_upper_triangular(outer.T, inner.T).T)
s = conjugate_solve_triangular(
chol_x, tril_and_halve_diagonal(chol_x.T.dot(dz)))
if self.lower:
grad = tensor.tril(s + s.T) - tensor.diag(tensor.diagonal(s))
else:
grad = tensor.triu(s + s.T) - tensor.diag(tensor.diagonal(s))
return [grad]
def gpu_cholesky(A, lower=True):
return GpuCholesky(lower)(A)
# TODO: add support for float64
class GpuMagmaBase(COp):
"""Base class for magma related operations. Add the necessary headers,
libraries and optionally the location of headers and library.
"""
def c_headers(self):
return ['gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/ext_cuda.h',
'gpuarray_helper.h', 'magma.h']
def c_header_dirs(self):
dirs = [gpuarray_helper_inc_dir(), pygpu.get_include(),
config.cuda.include_path]
if config.magma.include_path:
dirs.append(config.magma.include_path)
return dirs
def c_libraries(self):
return ['magma']
def c_lib_dirs(self):
if config.magma.library_path:
return [config.magma.library_path]
return []
def prepare_node(self, node, storage_map, compute_map, impl):
from skcuda.magma import magma_init
ctx = node.inputs[0].type.context
if not getattr(ctx, 'is_magma_initialized', False):
with ctx:
magma_init()
ctx.is_magma_initialized = True
class GpuMagmaSVD(GpuMagmaBase):
"""Computes the svd of a matrix :math:`A` using magma library.
.. warning::
Because of implementation constraints, this Op returns outputs
in order ``S, U, VT``. Use :func:`theano.gpuarray.linalg.gpu_svd`
to get them in expected order ``U, S, VT``.
"""
__props__ = ('full_matrices', 'compute_uv')
_cop_num_inputs = 1
_cop_num_outputs = 3
check_input = False
params_type = ParamsType(full_matrices=bool_t, context=gpu_context_type)
def __init__(self, full_matrices=True, compute_uv=True):
self.full_matrices = full_matrices
self.compute_uv = compute_uv
COp.__init__(self, ['c_code/magma_svd.c'], 'APPLY_SPECIFIC(magma_svd)')
def make_node(self, A):
ctx_name = infer_context_name(A)
A = as_gpuarray_variable(A, ctx_name)
A = gpu_contiguous(A)
if A.ndim != 2:
raise LinAlgError("Matrix rank error")
if A.dtype != 'float32':
raise TypeError("only `float32` is supported for now")
if self.compute_uv:
return theano.Apply(self, [A],
# return S, U, VT
[GpuArrayType(A.dtype, broadcastable=[False],
context_name=ctx_name)(),
A.type(),
A.type()])
else:
return theano.Apply(self, [A],
# return only S
[GpuArrayType(A.dtype, broadcastable=[False],
context_name=ctx_name)()])
def prepare_node(self, node, storage_map, compute_map, impl):
super(GpuMagmaSVD, self).prepare_node(node, storage_map, compute_map, impl)
# Check node to prevent eventual errors with old pickled nodes.
if self.compute_uv:
A, B, C = node.outputs
# We expect order: S (vector), U (matrix), VT (matrix)
assert A.type.ndim == 1 and B.type.ndim == C.type.ndim == 2, \
"Due to implementation constraints, GpuMagmaSVD interface has changed and now returns (S, U, VT) " \
"instead of (U, S, VT). Either update your code, or use gpu_svd() to get the expected (U, S, VT) order."
def get_params(self, node):
return self.params_type.get_params(self, context=node.inputs[0].type.context)
def infer_shape(self, node, shapes):
x_shape, = shapes
M, N = x_shape
K = tensor.minimum(M, N)
s_shape = (K, )
if self.compute_uv:
u_shape = (M, M) if self.full_matrices else (M, K)
vt_shape = (N, N) if self.full_matrices else (K, N)
return [s_shape, u_shape, vt_shape]
else:
return [s_shape]
def gpu_svd(a, full_matrices=1, compute_uv=1):
"""
This function performs the SVD on GPU.
Parameters
----------
full_matrices : bool, optional
If True (default), u and v have the shapes (M, M) and (N, N),
respectively.
Otherwise, the shapes are (M, K) and (K, N), respectively,
where K = min(M, N).
compute_uv : bool, optional
Whether or not to compute u and v in addition to s.
True by default.
Returns
-------
U, V, D : matrices
"""
out = GpuMagmaSVD(full_matrices, compute_uv)(a)
if compute_uv:
S, U, VT = out
out = [U, S, VT]
return out
class GpuMagmaMatrixInverse(GpuMagmaBase):
"""Computes the inverse of a matrix :math:`A` using magma library.
"""
__props__ = ('inplace', )
check_input = False
params_type = ParamsType(inplace=bool_t, context=gpu_context_type)
def __init__(self, inplace=False):
COp.__init__(self, ['c_code/magma_inv.c'], 'APPLY_SPECIFIC(magma_inv)')
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [0]}
def clone_inplace(self):
return self.__class__(inplace=True)
def make_node(self, A):
ctx_name = infer_context_name(A)
A = as_gpuarray_variable(A, ctx_name)
A = gpu_contiguous(A)
if A.ndim != 2:
raise LinAlgError("Matrix rank error")
if A.dtype != 'float32':
raise TypeError("only `float32` is supported for now")
return theano.Apply(self, [A], [A.type()])
def get_params(self, node):
return self.params_type.get_params(self, context=node.inputs[0].type.context)
def infer_shape(self, node, shapes):
return shapes
def gpu_matrix_inverse(a):
"""
This function performs the matrix inverse on GPU.
Returns
-------
a_inv: matrix
"""
return GpuMagmaMatrixInverse()(a)
class GpuMagmaCholesky(GpuMagmaBase, CGpuKernelBase):
"""Computes the cholesky decomposition of a matrix :math:`A` using magma
library.
"""
__props__ = ('lower', 'inplace')
check_input = False
params_type = ParamsType(lower=bool_t, inplace=bool_t, context=gpu_context_type)
def __init__(self, lower=True, inplace=False):
self.lower = lower
COp.__init__(self, ['c_code/magma_cholesky.c'], 'APPLY_SPECIFIC(magma_cholesky)')
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [0]}
def clone_inplace(self):
return self.__class__(lower=self.lower, inplace=True)
def make_node(self, A):
ctx_name = infer_context_name(A)
A = as_gpuarray_variable(A, ctx_name)
A = gpu_contiguous(A)
if A.ndim != 2:
raise LinAlgError("Matrix rank error")
#if A.dtype != 'float32': # PMB removed this check
# raise TypeError("only `float32` is supported for now")
return theano.Apply(self, [A], [A.type()])
def get_params(self, node):
return self.params_type.get_params(self, context=node.inputs[0].type.context)
def infer_shape(self, node, shapes):
return [shapes[0]]
# Note to self - L_op is just the linearized and transposed Op.
# trivial case when Op is y = A * x, then L_op is x_bar = A.T * y_bar
# y_bar = "output_gradients" and is same shape and form as y,
# x_bar = "input_gradients" (output of L_op) and is same shape and form as x,
def L_op(self, inputs, outputs, output_gradients):
# Modified from theano/tensor/slinalg.py
# No handling for on_error = 'nan'
dz = output_gradients[0]
nb,n,l = dz.shape
chol_x = outputs[0]
# deal with upper triangular by converting to lower triangular
if not self.lower:
chol_x=chol_x.dimshuffle(0,2,1)
dz=dz.dimshuffle(0,2,1)
def tril_and_halve_diagonal(mtx):
"""Extracts lower triangle of square matrix and halves diagonal."""
return tensor.tril(mtx) - tensor.diag(tensor.diagonal(mtx) / 2.)
def tril_and_halve_diagonal_batch(mtx):
"""Extracts lower triangle of square matrix and halves diagonal."""
T=tensor.tril(tensor.ones((n,n)))
I=tensor.eye(n)
out = mtx*T-I*mtx/2
return out
def conjugate_solve_triangular(outer, inner):
"""Computes L^{-T} P L^{-1} for lower-triangular L."""
return gpu_solve_upper_triangular(
outer.T, gpu_solve_upper_triangular(outer.T, inner.T).T)
def conjugate_solve_triangular_batch(outer, inner):
"""Computes L^{-T} P L^{-1} for lower-triangular L."""
if True:
out, _ = theano.scan(fn=lambda outer, inner: \
gpu_solve_upper_triangular(outer.T, gpu_solve_upper_triangular(outer.T, inner.T).T), \
sequences=[outer, inner], non_sequences=[])
return out
else:
outer_T = outer.dimshuffle(0,2,1)
inner_T = inner.dimshuffle(0,2,1)
# important note: gpu_solve_upper_triangular_batch works for
# matrix RHS, but matrix RHS is transpose RHS!
# for this reason, we don't need to transpose 'inner'
tmp = gpu_solve_upper_triangular_batch(outer_T, inner_T)
tmp_T = tmp.dimshuffle(0,2,1)
return gpu_solve_upper_triangular_batch( outer_T, tmp_T)
if False:
#tmp1 = chol_x.T.dot(dz)
tmp1 = tensor.sum( chol_x.dimshuffle(0,2,1,'x') * dz.dimshuffle(0,'x',1,2), axis=2).reshape((nb,n,n))
tmp2 = tril_and_halve_diagonal_batch(tmp1)
s = conjugate_solve_triangular_batch( chol_x, tmp2)
else:
s,_ = theano.scan(fn=lambda chol_x,dz : \
conjugate_solve_triangular(chol_x,tril_and_halve_diagonal(chol_x.T.dot(dz))), \
sequences=[chol_x,dz], non_sequences=[])
if True:
if self.lower:
grad,_ = theano.scan(fn=lambda s : tensor.tril(s + s.T) - tensor.diag(tensor.diagonal(s)), \
sequences=[s], non_sequences=[])
else:
grad,_ = theano.scan(fn=lambda s : tensor.triu(s + s.T) - tensor.diag(tensor.diagonal(s)), \
sequences=[s], non_sequences=[])
else:
if self.lower:
T=tensor.tril(tensor.ones((n,n)))
else:
T=tensor.triu(tensor.ones((n,n)))
I=tensor.eye(n)
grad=(s+s.dimshuffle(0,2,1))*T-s*I.dimshuffle('x',0,1)
return [grad]
def L_op_(self, inputs, outputs, gradients):
# Modified from theano/tensor/slinalg.py
# No handling for on_error = 'nan'
dz = gradients[0]
chol_x = outputs[0]
# this is for nan mode
#
# ok = ~tensor.any(tensor.isnan(chol_x))
# chol_x = tensor.switch(ok, chol_x, 1)
# dz = tensor.switch(ok, dz, 1)
# deal with upper triangular by converting to lower triangular
if not self.lower:
chol_x = chol_x.T
dz = dz.T
def tril_and_halve_diagonal(mtx):
"""Extracts lower triangle of square matrix and halves diagonal."""
return tensor.tril(mtx) - tensor.diag(tensor.diagonal(mtx) / 2.)
def conjugate_solve_triangular(outer, inner):
"""Computes L^{-T} P L^{-1} for lower-triangular L."""
return gpu_solve_upper_triangular(
outer.T, gpu_solve_upper_triangular(outer.T, inner.T).T)
s = conjugate_solve_triangular(
chol_x, tril_and_halve_diagonal(chol_x.T.dot(dz)))
if self.lower:
grad = tensor.tril(s + s.T) - tensor.diag(tensor.diagonal(s))
else:
grad = tensor.triu(s + s.T) - tensor.diag(tensor.diagonal(s))
return [grad]
class GpuMagmaQR(GpuMagmaBase, CGpuKernelBase):
"""Computes the qr decomposition of a matrix :math:`A` using magma
library.
Parameters
----------
complete : If False, returns only ``R``.
.. warning::
Because of implementation constraints, this Op returns outputs
in order ``R, Q``. Use :func:`theano.gpuarray.linalg.gpu_qr`
to get them in expected order ``Q, R``.
"""
__props__ = ('complete', )
_cop_num_inputs = 1
_cop_num_outputs = 2
check_input = False
params_type = ParamsType(complete=bool_t, context=gpu_context_type)
def __init__(self, complete=True):
self.complete = complete
COp.__init__(self, ['c_code/magma_qr.c'], 'APPLY_SPECIFIC(magma_qr)')
def make_node(self, A):
ctx_name = infer_context_name(A)
A = as_gpuarray_variable(A, ctx_name)
A = gpu_contiguous(A)
if A.ndim != 2:
raise LinAlgError("Matrix rank error")
if A.dtype != 'float32':
raise TypeError("only `float32` is supported for now")
if self.complete:
return theano.Apply(self, [A],
# return R, Q
[A.type(), A.type()])
else:
return theano.Apply(self, [A],
# return R
[A.type()])
def get_params(self, node):
return self.params_type.get_params(self, context=node.inputs[0].type.context)
def gpu_qr(a, complete=True):
"""
This function performs the QR on GPU.
Parameters
----------
complete : bool, optional
If `False`, returns only r.
Returns
-------
Q, R : matrices
"""
out = GpuMagmaQR(complete)(a)
if complete:
R, Q = out
out = [Q, R]
return out
class GpuMagmaEigh(GpuMagmaBase):
"""Computes the eigen decomposition of a symmetric matrix :math:`A` using magma
library.
Parameters
----------
UPLO : Specifies whether the calculation is done with the lower triangular
part of matrix (`L`, default) or the upper triangular part (`U`).
compute_v : If `True`, computes eigenvalues and eigenvectors (`True`,
default). If `False`, computes only eigenvalues of matrix.
"""
__props__ = ('lower', 'compute_v')
_cop_num_inputs = 1
_cop_num_outputs = 2
check_input = False
params_type = ParamsType(lower=bool_t, compute_v=bool_t,
context=gpu_context_type)
def __init__(self, UPLO='L', compute_v=True):
assert UPLO in ['L', 'U']
self.lower = UPLO == 'L'
self.compute_v = compute_v
COp.__init__(self, ['c_code/magma_eigh.c'], 'APPLY_SPECIFIC(magma_eigh)')
def make_node(self, A):
ctx_name = infer_context_name(A)
A = as_gpuarray_variable(A, ctx_name)
A = gpu_contiguous(A)
if A.ndim != 2:
raise LinAlgError("Matrix rank error")
if A.dtype != 'float32':
raise TypeError("only `float32` is supported for now")
if self.compute_v:
return theano.Apply(self, [A],
# return D, V
[GpuArrayType(A.dtype, broadcastable=[False],
context_name=ctx_name)(),
A.type()])
else:
return theano.Apply(self, [A],
# return D
[GpuArrayType(A.dtype, broadcastable=[False],
context_name=ctx_name)()])
def get_params(self, node):
return self.params_type.get_params(self, context=node.inputs[0].type.context)
class GpuCholeskyBatch(Op):
"""
CUSOLVER GPU Cholesky Op.
Given a real positive definite matrix `A` returns either a lower
triangular matrix `L` such that `A == dot(L, L.T)` if `lower == True`
else returns an upper triangular matrix `U` such that `A == dot(U.T, U)`
if `lower == False`.
Parameters
----------
lower
Whether to return a lower rather than upper triangular decomposition.
"""
__props__ = ('lower', 'inplace')
def __init__(self, lower=True, inplace=False):
self.lower = lower
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [0]}
super(GpuCholeskyBatch, self).__init__()
def clone_inplace(self):
return self.__class__(lower=self.lower, inplace=True)
def make_node(self, inp):
if not cusolver_available:
raise RuntimeError('CUSOLVER is not available and '
'GpuCholesky Op can not be constructed.')
if skcuda.__version__ <= '0.5.1':
warnings.warn('The GpuCholesky op requires scikit-cuda > '
'0.5.1 to work with CUDA 8')
if not pygpu_available:
raise RuntimeError('Missing pygpu or triu/tril functions.'
'Install or update libgpuarray.')
context_name = infer_context_name(inp)
inp = as_gpuarray_variable(inp, context_name)
inp = gpu_contiguous(inp)
assert inp.ndim == 3
return theano.Apply(self, [inp], [inp.type()])
def prepare_node(self, node, storage_map, compute_map, impl):
ctx = node.inputs[0].type.context
attach_cusolver_handle_to_context(ctx)
def perform(self, node, inputs, outputs):
context = inputs[0][0].context
# Input matrix.
A = inputs[0]
nb, l, n = A.shape
if l != n:
raise ValueError('A must be a square matrix')
lda = max(1, n)
# cusolver operates on F ordered matrices, but A is expected
# to be symmetric so it does not matter.
# We copy A if needed
if self.inplace:
L = A
else:
L = pygpu.array(A, copy=True)
# The output matrix will contain only the upper or lower
# triangular factorization of A. If L is C ordered (it
# probably is as it is the default in Theano) we just switch
# the fill mode parameter of cusolver
l_parameter = 0 if self.lower else 1
if L.flags['C_CONTIGUOUS']:
l_parameter = 1 - l_parameter
L_ptr = L.gpudata
if A.dtype == 'float32':
potrf_bufferSize = cusolver.cusolverDnSpotrf_bufferSize
potrf = cusolver.cusolverDnSpotrf
wordlen = 4
elif A.dtype == 'float64':
potrf_bufferSize = cusolver.cusolverDnDpotrf_bufferSize
potrf = cusolver.cusolverDnDpotrf
wordlen = 8
else:
raise ValueError("Unsupported dtype")
with context:
workspace_size = potrf_bufferSize(
context.cusolver_handle, l_parameter, n, L_ptr, lda)
workspace = pygpu.zeros(workspace_size, dtype=A.dtype,
context=context)
dev_info = pygpu.zeros((1,), dtype='int32', context=context)
workspace_ptr = workspace.gpudata
dev_info_ptr = dev_info.gpudata
for ibatch in range(nb):
L_ptr_batch = L.gpudata+ibatch*n*n*wordlen
potrf(context.cusolver_handle, l_parameter, n, L_ptr_batch,
lda, workspace_ptr, workspace_size, dev_info_ptr)
val_dev_info = np.asarray(dev_info)[0]
if val_dev_info > 0:
raise LinAlgError('Cholesky decomposition failed (is A SPD?)')
# cusolver leaves the elements in the matrix outside the considered
# upper or lower triangle unchanged, so we need to put zeros outside
# the triangle
if self.lower:
tril(L[ibatch])
else:
triu(L[ibatch])
outputs[0][0] = L
# Note to self - L_op is just the linearized and transposed Op.
# trivial case when Op is y = A * x, then L_op is x_bar = A.T * y_bar
# y_bar = "output_gradients" and is same shape and form as y,
# x_bar = "input_gradients" (output of L_op) and is same shape and form as x,
def L_op(self, inputs, outputs, output_gradients):
# Modified from theano/tensor/slinalg.py
# No handling for on_error = 'nan'
dz = output_gradients[0]
nb,n,l = dz.shape
chol_x = outputs[0]
# deal with upper triangular by converting to lower triangular
if not self.lower:
chol_x=chol_x.dimshuffle(0,2,1)
dz=dz.dimshuffle(0,2,1)
def tril_and_halve_diagonal_batch(mtx):
"""Extracts lower triangle of square matrix and halves diagonal."""
T=tensor.tril(tensor.ones((n,n)))
I=tensor.eye(n)
out = mtx*T-I*mtx/2
return out
def conjugate_solve_triangular_batch(outer, inner):
"""Computes L^{-T} P L^{-1} for lower-triangular L."""
outer_T = outer.dimshuffle(0,2,1)
#inner_T = inner.dimshuffle(0,2,1)
#
# gpu_solve_upper_triangular_batch works for matrix RHS,
# but matrix RHS should be transpose due to using C ordering
# for this reason, we don't need to transpose 'tmp' or 'inner',
# even though it was done in slinalg.Cholesky.L_op
#
tmp = gpu_solve_upper_triangular_batch(outer_T, inner)
#tmp_T = tmp.dimshuffle(0,2,1)
return gpu_solve_upper_triangular_batch( outer_T, tmp)
#tmp1 = chol_x.T.dot(dz)
tmp1 = tensor.sum( chol_x.dimshuffle(0,2,1,'x') * dz.dimshuffle(0,'x',1,2), axis=2).reshape((nb,n,n))
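# Note: tmp1 is the batched analogue of chol_x.T.dot(dz); an equivalent (and
# arguably clearer) formulation would be tensor.batched_dot(chol_x.dimshuffle(0, 2, 1), dz).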
tmp2 = tril_and_halve_diagonal_batch(tmp1)
s = conjugate_solve_triangular_batch( chol_x, tmp2)
if self.lower:
T=tensor.tril(tensor.ones((n,n)))
else:
T=tensor.triu(tensor.ones((n,n)))
I=tensor.eye(n)
grad=(s+s.dimshuffle(0,2,1))*T-s*I.dimshuffle('x',0,1)
return [grad]
# Batched version of GpuCublasTriangularSolve. Note: this Op works when the
# RHS is a matrix, but this is intended for use within an L_op and it
# works only when b is (batchsize, n, n) and is transposed!
class GpuCublasTriangularSolveBatch(Op):
"""
CUBLAS GPU Triangular Solve Op - Batched.
Parameters
----------
lower
Whether system is lower-triangular (True) or upper-triangular (False).
trans
Whether to take the transpose of the input matrix or not.
"""
__props__ = ('trans', 'lower')
def __init__(self, lower=True, trans='N'):
self.trans = trans
self.lower = lower
super(GpuCublasTriangularSolveBatch, self).__init__()
def make_node(self, inp1, inp2):
if not cublas_available:
raise RuntimeError('CUBLAS is not available and '
'GpuCublasTriangularSolve Op '
'can not be constructed.')
context_name = infer_context_name(inp1, inp2)
inp1 = as_gpuarray_variable(inp1, context_name)
inp2 = as_gpuarray_variable(inp2, context_name)
inp1 = gpu_contiguous(inp1)
inp2 = gpu_contiguous(inp2)
assert inp1.ndim == 3
assert inp2.ndim in [2,3]
assert inp1.dtype == inp2.dtype
return theano.Apply(self, [inp1, inp2],
[GpuArrayType(inp1.dtype,
broadcastable=inp2.broadcastable,
context_name=context_name)()])
def prepare_node(self, node, storage_map, compute_map, impl):
ctx = node.inputs[0].type.context
attach_cublas_handle_to_context(ctx)
def perform(self, node, inputs, outputs):
ctx = node.inputs[0].type.context
# Solution set
x = outputs[0]
# Matrix.
A = inputs[0]
# right hand side
b = inputs[1]
assert(len(A.shape) == 3) # nb, n, n
assert(len(b.shape) in [2,3]) # (nb, n) or (nb,n,m)
# implicitly deal with the difference between C order
# and fortran order in A by flipping the trans and lower flags
lower = not self.lower
trans = self.trans
if trans in ['T', 'C']:
trans = 'N'
nb, l, n = A.shape
elif trans == 'N':
trans = 'T'
nb, n, l = A.shape
else:
raise ValueError('Invalid value for trans')
if len(b.shape) == 3:
nb2, k , m = b.shape
else:
nb2, k = b.shape
m=1
if nb != nb2:
raise ValueError('A and b must be aligned.')
if l != n:
raise ValueError('A must be a square matrix')
if n != k:
raise ValueError('A and b must be aligned.')
lda = max(1, n)
ldb = max(1, k)
# solution overwrites right hand side on exit
# PMB: GPU wants F ordering, but input from Python is C ordered
# and we need C ordering to index into the batch.
# This only works if b is a vector (times batch length);
# the only time m > 1 is when this Op is called from within an L_op,
# and there, the data must be transposed.
b = pygpu.array(b, copy=True, order='C') # PMB changed to C ordered
A_ptr = A.gpudata
b_ptr = b.gpudata
# unit scalar used for multiplication
alpha = 1.0
# indicates matrix A is on left of B
side = 'l'
# set whether upper or lower part of matrix A stored
uplo = 'l' if lower else 'u'
# indicates elements on diagonal of matrix A may not be unity
diag = 'n'
if A.dtype == 'float32':
trsv = cublas.cublasStrsv
trsm = cublas.cublasStrsm
wordlen=4
elif A.dtype == 'float64':
trsv = cublas.cublasDtrsv
trsm = cublas.cublasDtrsm
wordlen=8
else:
raise ValueError("Unsupported dtype")
with ctx:
for ib in range(nb):
# matrix vector solve
trsv(ctx.cublas_handle, uplo, trans, diag, n,
A_ptr+ib*n*n*wordlen, lda, b_ptr+ib*n*m*wordlen, 1)
if m>1:
x[0] = b.reshape((nb,n,m))
else:
x[0] = b.reshape((nb,n,))
# Note to self - L_op is just the linearized and transposed Op.
# trivial case when Op is y = A * x, then L_op is x_bar = A.T * y_bar
# y_bar = "output_gradients" and is same shape and form as y,
# x_bar = "input_gradients" (outpit of L_op) and is same shape and form as x,
#
# Note: L_op will only work for m=1 (b is a batched vector)
def L_op(self, inputs, outputs, output_gradients):
# Modified from theano/tensor/slinalg.py
A, b = inputs
nb, n, l = A.shape
nb2, k = b.shape
# reshape variables with batch as first dimension
c = outputs[0] # c is of shape (nb,n)
# get the dimension of the gradients
c_bar = output_gradients[0] # c_bar is of shape (nb,n)
trans_solve_op = GpuCublasTriangularSolveBatch(not self.lower)
# Operation for non-batched version: b_bar = trans_solve_op(A.T, c_bar)
A_T = A.dimshuffle(0,2,1)
b_bar = trans_solve_op(A_T, c_bar) # b_bar is shape (nb,n)
# Operation for non-batched version: A_bar = -tensor.outer(b_bar, c)
A_bar = - b_bar.dimshuffle(0,1,'x') * c.dimshuffle(0,'x',1)
A_bar=A_bar.reshape((nb,n,n))
# non-batched operation:
#if self.lower:
# A_bar=: tensor.tril(A_bar)
#else:
# A_bar= tensor.triu(A_bar)
# batched operation:
if self.lower:
I=tensor.tril(tensor.ones((n,n)))
else:
I=tensor.triu(tensor.ones((n,n)))
A_bar = A_bar * I.dimshuffle('x',0,1)
return [A_bar, b_bar]
def gpu_solve_upper_triangular_batch(A, b, trans='N'):
return GpuCublasTriangularSolveBatch(False, trans)(A, b)
class GpuCusolverSolveBatch(Op):
"""
CUSOLVER GPU solver OP.
Parameters
----------
trans
Whether to take the transpose of the input matrix or not.
"""
__props__ = ('A_structure', 'trans', 'inplace')
def __init__(self, A_structure='general', trans='N', inplace=False):
self.trans = trans
self.inplace = inplace
self.A_structure = A_structure
if self.inplace:
self.destroy_map = {0: [0]}
assert A_structure in MATRIX_STRUCTURES_SOLVE
super(GpuCusolverSolveBatch, self).__init__()
def make_node(self, inp1, inp2):
if not cusolver_available:
raise RuntimeError('CUSOLVER is not available and '
'GpuCusolverSolve Op can not be constructed.')
if skcuda.__version__ <= '0.5.1':
warnings.warn('The GpuSolve op requires scikit-cuda > 0.5.1 to work with CUDA 8')
context_name = infer_context_name(inp1, inp2)
inp1 = as_gpuarray_variable(inp1, context_name)
inp2 = as_gpuarray_variable(inp2, context_name)
inp1 = gpu_contiguous(inp1)
inp2 = gpu_contiguous(inp2)
assert inp1.ndim == 3
assert inp2.ndim == 3
assert inp1.dtype == inp2.dtype
return theano.Apply(
self, [inp1, inp2],
[GpuArrayType(inp1.dtype,
broadcastable=inp1.broadcastable,
context_name=context_name)()])
def prepare_node(self, node, storage_map, compute_map, impl):
ctx = node.inputs[0].type.context
attach_cusolver_handle_to_context(ctx)
def check_dev_info(self, dev_info):
val = np.asarray(dev_info)[0]
if val > 0:
raise LinAlgError('A is singular')
def perform(self, node, inputs, outputs):
context = inputs[0][0].context
# Size of the matrices to invert.
z = outputs[0]
# Matrix.
A = inputs[0]
# Solution vectors.
b = inputs[1]
assert(len(A.shape) == 3)
assert(len(b.shape) == 3)
if self.trans in ['T', 'C']:
trans = 1
nb,l, n = A.shape
nb2, k, m = b.shape
elif self.trans == 'N':
trans = 0
nb, n, l = A.shape
nb2, k, m = b.shape
else:
raise ValueError('Invalid value for trans')
if nb != nb2:
raise ValueError('A and b must have same batchlen')
if l != n:
raise ValueError('A must be a square matrix')
if n != k:
raise ValueError('A and b must be aligned.')
lda = max(1, n)
ldb = max(1, k)
# We copy A and b as cusolver operates inplace
b = pygpu.array(b, copy=True, order='C') # PMB: changed to C ordering to index by batch
print('PMB: inplace=', self.inplace)
if not self.inplace:
A = pygpu.array(A, copy=True)
trans = 1 - trans
A_ptr = A.gpudata
b_ptr = b.gpudata
# cusolver expects a F ordered matrix, but A is not explicitly
# converted between C and F order, instead we switch the
# "transpose" flag.
print('PMB: trans=', trans)
if A.flags['C_CONTIGUOUS']:
trans = 1 - trans
if A.dtype == 'float32':
potrf_bufferSize = cusolver.cusolverDnSpotrf_bufferSize
potrf = cusolver.cusolverDnSpotrf
potrs = cusolverDnSpotrs
getrf_bufferSize = cusolver.cusolverDnSgetrf_bufferSize
getrf = cusolver.cusolverDnSgetrf
getrs = cusolver.cusolverDnSgetrs
wordlen=4
elif A.dtype == 'float64':
potrf_bufferSize = cusolver.cusolverDnDpotrf_bufferSize
potrf = cusolver.cusolverDnDpotrf
potrs = cusolverDnDpotrs
getrf_bufferSize = cusolver.cusolverDnDgetrf_bufferSize
getrf = cusolver.cusolverDnDgetrf
getrs = cusolver.cusolverDnDgetrs
wordlen=8
else:
raise ValueError("Unsupported dtype")
print('PMB: struct=', self.A_structure)
if self.A_structure == 'symmetric':
with context:
workspace_size = potrf_bufferSize(
context.cusolver_handle, 0, n, A_ptr, lda)
workspace = pygpu.zeros(workspace_size, dtype=A.dtype,
context=context)
dev_info = pygpu.zeros((1,), dtype='int32', context=context)
workspace_ptr = workspace.gpudata
dev_info_ptr = dev_info.gpudata
with context:
potrf(
context.cusolver_handle, 0, n, A_ptr, lda, workspace_ptr,
workspace_size, dev_info_ptr)
self.check_dev_info(dev_info)
potrs(
context.cusolver_handle, 0, n, m, A_ptr, lda,
b_ptr, ldb, dev_info_ptr)
else:
print('PMB: general case')
# general case for A
with context:
workspace_size = getrf_bufferSize(
context.cusolver_handle, n, n, A_ptr, lda)
workspace = pygpu.zeros(workspace_size, dtype=A.dtype,
context=context)
pivots = pygpu.zeros(n, dtype='int32', context=context)
dev_info = pygpu.zeros((1,), dtype='int32', context=context)
workspace_ptr = workspace.gpudata
pivots_ptr = pivots.gpudata
dev_info_ptr = dev_info.gpudata
with context:
for ib in range(nb):
#for ib in range(1):
getrf(
context.cusolver_handle, n, n, A_ptr+ib*n*n*wordlen, lda, workspace_ptr,
pivots_ptr, dev_info_ptr)
self.check_dev_info(dev_info)
getrs(
context.cusolver_handle, trans, n, m, A_ptr+ib*n*n*wordlen, lda,
pivots_ptr, b_ptr+ib*n*m*wordlen, ldb, dev_info_ptr)
#z[0] = b.reshape((nb*n,m))
z[0] = b
def L_op(self, inputs, outputs, output_gradients):
# Modified from theano/tensor/slinalg.py
A, b = inputs
nb,n,l =A.shape
nb2,n,m =b.shape
c = outputs[0]
c=c.reshape((nb,n)) # c is (nb*n,m), but grad should only be used when m=1
c_bar = output_gradients[0]
cdim=theano.tensor.prod(c_bar.shape)
ndrv = cdim//nb//n
c_bar=c_bar.reshape((nb,n,ndrv))
# FIXME: triangular structure would use GpuCublasTriangularsolve?
# no need to handle A_structure like slinalg.py?
trans_solve_op = GpuCusolverSolveBatch('general')
A_T=A.dimshuffle(0,2,1)
b_bar = trans_solve_op(A_T, c_bar)
b_bar=b_bar.reshape((nb,n,ndrv))
#A_bar = -tensor.outer(b_bar, c)
A_bar = - b_bar.dimshuffle(0,1,2,'x') * c.dimshuffle(0,'x','x',1)
A_bar=A_bar.reshape((nb*n,n,ndrv))
b_bar=b_bar.reshape(c_bar.shape)
return [A_bar, b_bar]
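# A minimal sketch of how one might spot-check the batched Cholesky gradient
# with theano's finite-difference checker (hedged: assumes a configured GPU
# context; nb and n below are arbitrary test sizes, not names defined above):
#
# import numpy as np
# import theano
# from theano.gpuarray.linalg import GpuCholeskyBatch
#
# rng = np.random.RandomState(42)
# nb, n = 3, 5
# B = rng.rand(nb, n, n).astype(theano.config.floatX)
# # make every batch member symmetric positive definite
# A_val = np.matmul(B, np.transpose(B, (0, 2, 1))) + n * np.eye(n, dtype=B.dtype)
# theano.gradient.verify_grad(GpuCholeskyBatch(lower=True), [A_val], rng=rng)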