Hi Wonghang,
So I am working toward making the Cholesky problem faster, and by that I also
mean the triangular solvers such as GpuCublasTriangularSolve(). We typically do
the Cholesky decomposition, then solve linear systems involving the Cholesky
factor, which is upper or lower triangular.
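To make the target concrete, the pattern we are speeding up is roughly the
following (CPU version with scipy; shapes and values are just for illustration):

import numpy as np
from scipy.linalg import cholesky, solve_triangular

rng = np.random.RandomState(0)
n = 5
M = rng.rand(n, n)
A = M.dot(M.T) + n * np.eye(n)              # symmetric positive definite
b = rng.rand(n)

L = cholesky(A, lower=True)                 # A = L L^T
y = solve_triangular(L, b, lower=True)      # solve L y = b
x = solve_triangular(L.T, y, lower=False)   # solve L^T x = y
assert np.allclose(A.dot(x), b)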
So I have started with GpuCublasTriangularSolve(). It has a lot of overhead from
all the GPU array creation and copying, on top of the overhead of theano.scan,
which is currently needed to handle batches. So I thought it would be much
better to have batched versions of Cholesky() and GpuCublasTriangularSolve().
I have created working versions of these batched routines (attached). It is
actually pretty simple: I index into the data by batch and call the GPU solver
once per batch, potrf() for the Cholesky and trsv() for the triangular solve.
I loop over the batch like this:
if theano.config.floatX == 'float32':
    wordlen = 4
else:
    wordlen = 8
for ib in range(nb):
    trsv(ctx.cublas_handle, uplo, trans, diag, n,
         A_ptr + ib * n * n * wordlen, lda, b_ptr + ib * n * m * wordlen, 1)
There are a few small gotchas, such as the fact that the GPU routines expect
F-ordered data, but to index by batch as I did above the buffer has to be
C-ordered. So the data has to be C-ordered across the batch dimension, but
F-ordered within each batch!
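One way to picture that layout with numpy (purely as an illustration): storing
the transpose of each matrix in a single C-contiguous buffer gives memory that
is batch-major overall but column-major within each batch. The attached ops get
the same effect without a physical transpose by flipping the trans/lower flags
passed to cuBLAS.

import numpy as np

nb, n = 4, 3
A = np.random.rand(nb, n, n)

# C-ordered across the batch, F-ordered within each batch
A_buf = np.ascontiguousarray(A.transpose(0, 2, 1))

# the flat memory of batch i is exactly A[i] laid out column-major,
# which is what cuBLAS/cuSOLVER expect for that sub-matrix
i = 2
assert np.array_equal(A_buf[i].ravel(), A[i].ravel(order='F'))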
The problem I have, and where I will need help, is in the gradients. Although I
think I compute the gradient in L_op correctly, theano.gradient is not happy
with the shapes of the matrices that L_op returns. This is probably because
gradient does not understand that the data is batched.
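As far as I understand, theano.gradient ultimately needs the gradient with
respect to each input to have the same shape as that input (the terms get
summed into other contributions of that shape), so the (nb*n, ...) reshapes in
my L_op are probably what it is unhappy about. The quickest way I know to
reproduce the complaint (assuming a configured GPU context; shapes and names
here are just an example) is a finite-difference check:

import numpy as np
import theano

rng = np.random.RandomState(42)
nb, n = 3, 4
A_val = np.array([np.tril(rng.rand(n, n)) + n * np.eye(n) for _ in range(nb)],
                 dtype=theano.config.floatX)
b_val = rng.rand(nb, n, 1).astype(theano.config.floatX)

op = GpuCublasTriangularSolveBatch(lower=True)
# verify_grad builds the graph, calls L_op, and compares the result against
# a finite-difference estimate of the gradient
theano.gradient.verify_grad(op, [A_val, b_val], rng=rng)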
Do you think you can help in this matter? I think this is over my head.
Paul
class GpuCublasTriangularSolveBatch(Op):
"""
CUBLAS GPU Triangular Solve Op - Batched.
Parameters
----------
lower
Whether system is lower-triangular (True) or upper-triangular (False).
trans
Whether to take the transpose of the input matrix or not.
"""
__props__ = ('trans', 'lower')
def __init__(self, lower=True, trans='N'):
self.trans = trans
self.lower = lower
super(GpuCublasTriangularSolveBatch, self).__init__()
def make_node(self, inp1, inp2):
if not cublas_available:
raise RuntimeError('CUBLAS is not available and '
'GpuCublasTriangularSolve Op '
'can not be constructed.')
context_name = infer_context_name(inp1, inp2)
inp1 = as_gpuarray_variable(inp1, context_name)
inp2 = as_gpuarray_variable(inp2, context_name)
inp1 = gpu_contiguous(inp1)
inp2 = gpu_contiguous(inp2)
assert inp1.ndim == 3
assert inp2.ndim == 3
assert inp1.dtype == inp2.dtype
return theano.Apply(self, [inp1, inp2],
[GpuArrayType(inp1.dtype,
broadcastable=inp2.broadcastable,
context_name=context_name)()])
def prepare_node(self, node, storage_map, compute_map, impl):
ctx = node.inputs[0].type.context
attach_cublas_handle_to_context(ctx)
def perform(self, node, inputs, outputs):
ctx = node.inputs[0].type.context
# Solution set
x = outputs[0]
# Matrix.
A = inputs[0]
# right hand side
b = inputs[1]
assert(len(A.shape) == 3) # nb, n, n
assert(len(b.shape) == 3) # nb, n, m
# implicitly deal with the difference between C order
# and fortran order by flipping the trans and lower flags
lower = not self.lower
trans = self.trans
if trans in ['T', 'C']:
trans = 'N'
nb, l, n = A.shape
elif trans == 'N':
trans = 'T'
nb, n, l = A.shape
else:
raise ValueError('Invalid value for trans')
nb2, k, m = b.shape
if nb != nb2:
raise ValueError('A and b must be aligned.')
if l != n:
raise ValueError('A must be a square matrix')
if n != k:
raise ValueError('A and b must be aligned.')
lda = max(1, n)
ldb = max(1, k)
        # the solution overwrites the right hand side on exit.
        # cuBLAS wants F-ordered data, but we need C ordering across the batch
        # dimension so that each batch can be addressed with simple pointer
        # arithmetic; this must be taken into account when b is a matrix
        b = pygpu.array(b, copy=True, order='C')  # C ordered so the batch stride is n*m
A_ptr = A.gpudata
b_ptr = b.gpudata
# unit scalar used for multiplication
alpha = 1.0
# indicates matrix A is on left of B
side = 'l'
# set whether upper or lower part of matrix A stored
uplo = 'l' if lower else 'u'
# indicates elements on diagonal of matrix A may not be unity
diag = 'n'
        if A.dtype == 'float32':
            trsv = cublas.cublasStrsv
            trsm = cublas.cublasStrsm  # currently unused; needed for a matrix RHS (m > 1)
            wordlen = 4  # bytes per element, for the pointer arithmetic below
        elif A.dtype == 'float64':
            trsv = cublas.cublasDtrsv
            trsm = cublas.cublasDtrsm
            wordlen = 8
        else:
            raise ValueError("Unsupported dtype")
        with ctx:
            for ib in range(nb):
                # matrix-vector solve for batch ib; the batch stride is
                # n*n elements for A and n*m elements for b
                trsv(ctx.cublas_handle, uplo, trans, diag, n,
                     A_ptr + ib * n * n * wordlen, lda,
                     b_ptr + ib * n * m * wordlen, 1)
        # because of theano.gradient, we cannot add another dimension for the
        # batch, so it is concatenated into the first dimension
        # (this only works for m == 1, which is all trsv handles anyway)
        x[0] = b.reshape((nb * n,))
def L_op(self, inputs, outputs, output_gradients):
# Modified from theano/tensor/slinalg.py
A, b = inputs
nb, n, l = A.shape
nb2, k, m = b.shape
# reshape variables with batch as first dimension
        c = outputs[0]
        c = c.reshape((nb, n))  # this gradient path is only used for m = 1
        # get the dimensions of the incoming gradient
        c_bar = output_gradients[0]
        # print('Cndim=', c_bar.ndim)
        cdim = theano.tensor.prod(c_bar.shape)
        ddims = c_bar.shape[2:]
        ndrv = cdim // nb // n  # number of derivative columns carried in c_bar
        c_bar = c_bar.reshape((nb, n, ndrv))
        trans_solve_op = GpuCublasTriangularSolveBatch(not self.lower)
        A_T = A.dimshuffle(0, 2, 1)
        # c_bar is special because GpuCublasTriangularSolveBatch needs 'F'
        # ordering within each batch, so we swap the last two dimensions and
        # then reshape
        c_bar = c_bar.dimshuffle(0, 2, 1).reshape((nb, n, ndrv))
        b_bar = trans_solve_op(A_T, c_bar)
        b_bar = b_bar.reshape((nb, n, ndrv))
        # operation in the non-batched version: A_bar = -tensor.outer(b_bar, c)
        A_bar = -b_bar.dimshuffle(0, 1, 2, 'x') * c.dimshuffle(0, 'x', 'x', 1)
        A_bar = A_bar.reshape((nb, n, ndrv, n))
        b_bar = b_bar.reshape((nb, n, ndrv))
        # if self.lower:
        #     A_bar, _ = theano.scan(fn=lambda A_bar: tensor.tril(A_bar),
        #                            sequences=[A_bar], non_sequences=[])
        # else:
        #     A_bar, _ = theano.scan(fn=lambda A_bar: tensor.triu(A_bar),
        #                            sequences=[A_bar], non_sequences=[])
        A_bar = A_bar.reshape((nb * n, ndrv, n))
        b_bar = b_bar.reshape((nb * n, ndrv, 1))
        # b_bar = b_bar.reshape(c_bar.shape)
        return [A_bar, b_bar]
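# Illustrative smoke test for the batched triangular solve (not part of the
# Op itself). It assumes the imports used by this module plus scipy and a
# configured GPU context; shapes, seeds and tolerances are just examples.
def _check_triangular_solve_batch_forward():
    import numpy as np
    from scipy.linalg import solve_triangular

    rng = np.random.RandomState(0)
    nb, n = 3, 5
    A_val = np.array([np.tril(rng.rand(n, n)) + n * np.eye(n) for _ in range(nb)],
                     dtype=theano.config.floatX)
    b_val = rng.rand(nb, n, 1).astype(theano.config.floatX)

    A = theano.tensor.tensor3('A')
    b = theano.tensor.tensor3('b')
    x = GpuCublasTriangularSolveBatch(lower=True)(A, b)
    f = theano.function([A, b], x)

    # the op returns the per-batch solutions concatenated along the first axis
    got = np.asarray(f(A_val, b_val)).reshape(nb, n)
    want = np.array([solve_triangular(A_val[i], b_val[i, :, 0], lower=True)
                     for i in range(nb)])
    assert np.allclose(got, want, atol=1e-4)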
class GpuCholeskyBatch(Op):
"""
    CUSOLVER GPU Cholesky Op - Batched.
    Given a batch of real positive definite matrices `A`, returns for each
    matrix either a lower triangular factor `L` such that `A == dot(L, L.T)`
    if `lower == True`, or an upper triangular factor `U` such that
    `A == dot(U.T, U)` if `lower == False`.
Parameters
----------
lower
Whether to return a lower rather than upper triangular decomposition.
"""
__props__ = ('lower', 'inplace')
def __init__(self, lower=True, inplace=False):
self.lower = lower
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [0]}
super(GpuCholeskyBatch, self).__init__()
def clone_inplace(self):
return self.__class__(lower=self.lower, inplace=True)
def make_node(self, inp):
if not cusolver_available:
raise RuntimeError('CUSOLVER is not available and '
'GpuCholesky Op can not be constructed.')
if skcuda.__version__ <= '0.5.1':
warnings.warn('The GpuCholesky op requires scikit-cuda > '
'0.5.1 to work with CUDA 8')
if not pygpu_available:
raise RuntimeError('Missing pygpu or triu/tril functions.'
'Install or update libgpuarray.')
context_name = infer_context_name(inp)
inp = as_gpuarray_variable(inp, context_name)
inp = gpu_contiguous(inp)
assert inp.ndim == 3
return theano.Apply(self, [inp], [inp.type()])
def prepare_node(self, node, storage_map, compute_map, impl):
ctx = node.inputs[0].type.context
attach_cusolver_handle_to_context(ctx)
def perform(self, node, inputs, outputs):
context = inputs[0][0].context
# Input matrix.
A = inputs[0]
nb, l, n = A.shape
if l != n:
raise ValueError('A must be a square matrix')
lda = max(1, n)
# cusolver operates on F ordered matrices, but A is expected
# to be symmetric so it does not matter.
# We copy A if needed
if self.inplace:
L = A
else:
L = pygpu.array(A, copy=True)
# The output matrix will contain only the upper or lower
# triangular factorization of A. If L is C ordered (it
# probably is as it is the default in Theano) we just switch
# the fill mode parameter of cusolver
l_parameter = 0 if self.lower else 1
if L.flags['C_CONTIGUOUS']:
l_parameter = 1 - l_parameter
L_ptr = L.gpudata
if A.dtype == 'float32':
potrf_bufferSize = cusolver.cusolverDnSpotrf_bufferSize
potrf = cusolver.cusolverDnSpotrf
elif A.dtype == 'float64':
potrf_bufferSize = cusolver.cusolverDnDpotrf_bufferSize
potrf = cusolver.cusolverDnDpotrf
else:
raise ValueError("Unsupported dtype")
with context:
workspace_size = potrf_bufferSize(
context.cusolver_handle, l_parameter, n, L_ptr, lda)
workspace = pygpu.zeros(workspace_size, dtype=A.dtype,
context=context)
dev_info = pygpu.zeros((1,), dtype='int32', context=context)
workspace_ptr = workspace.gpudata
dev_info_ptr = dev_info.gpudata
            # bytes per element for the pointer arithmetic below; this must
            # match A's dtype rather than theano.config.floatX
            word_sz = 4 if A.dtype == 'float32' else 8
            for ibatch in range(nb):
                L_ptr_batch = L.gpudata + ibatch * n * n * word_sz
                potrf(context.cusolver_handle, l_parameter, n, L_ptr_batch,
                      lda, workspace_ptr, workspace_size, dev_info_ptr)
val_dev_info = np.asarray(dev_info)[0]
if val_dev_info > 0:
raise LinAlgError('Cholesky decomposition failed (is A SPD?)')
# cusolver leaves the elements in the matrix outside the considered
# upper or lower triangle unchanged, so we need to put zeros outside
# the triangle
if self.lower:
tril(L[ibatch])
else:
triu(L[ibatch])
outputs[0][0] = L
def L_op(self, inputs, outputs, gradients):
# Modified from theano/tensor/slinalg.py
# No handling for on_error = 'nan'
dz = gradients[0]
chol_x = outputs[0]
# this is for nan mode
#
# ok = ~tensor.any(tensor.isnan(chol_x))
# chol_x = tensor.switch(ok, chol_x, 1)
# dz = tensor.switch(ok, dz, 1)
# deal with upper triangular by converting to lower triangular
        if not self.lower:
            chol_x = chol_x.dimshuffle(0, 2, 1)
            dz = dz.dimshuffle(0, 2, 1)
        # got to here -- everything below still follows the non-batched
        # formula from slinalg.py and has not been fully adapted to the
        # batch dimension yet
        def tril_and_halve_diagonal(mtx):
            """Extracts the lower triangle of each matrix in the batch and halves its diagonal."""
            out, _ = theano.scan(
                fn=lambda mtx: tensor.tril(mtx) - tensor.diag(tensor.diagonal(mtx) / 2),
                sequences=[mtx], non_sequences=[])
            return out
def conjugate_solve_triangular(outer, inner):
"""Computes L^{-T} P L^{-1} for lower-triangular L."""
return gpu_solve_upper_triangular(
outer.T, gpu_solve_upper_triangular(outer.T, inner.T).T)
s = conjugate_solve_triangular(
chol_x, tril_and_halve_diagonal(chol_x.T.dot(dz)))
if self.lower:
grad = tensor.tril(s + s.T) - tensor.diag(tensor.diagonal(s))
else:
grad = tensor.triu(s + s.T) - tensor.diag(tensor.diagonal(s))
return [grad]
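# Illustrative smoke test for the batched Cholesky (not part of the Op itself).
# It assumes the imports used by this module and a configured GPU context;
# shapes, seeds and tolerances are just examples.
def _check_cholesky_batch_forward():
    import numpy as np

    rng = np.random.RandomState(0)
    nb, n = 3, 5
    M = rng.rand(nb, n, n)
    A_val = (np.matmul(M, M.transpose(0, 2, 1)) +
             n * np.eye(n)).astype(theano.config.floatX)

    A = theano.tensor.tensor3('A')
    L = GpuCholeskyBatch(lower=True)(A)
    f = theano.function([A], L)

    got = np.asarray(f(A_val))
    want = np.array([np.linalg.cholesky(A_val[i]) for i in range(nb)])
    assert np.allclose(got, want, atol=1e-4)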