Re: [theano-users] Error using floatX = float16 to save memory

luca . wagner . 0812 Thu, 21 Jul 2016 05:35:04 -0700

Frederic,
I had to reinstall theano  theano updated version + gpuarray and pygpu.
If the flags are:
floatX = float16
device=cuda


the convnet starts without errors:

Python 2.7.11 |Anaconda custom (64-bit)| (default, Dec  6 2015, 18:08:32) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)] on linux2
Type "help", "copyright", "credits" or "license" for more information.
Anaconda is brought to you by Continuum Analytics.
Please check out: http://continuum.io/thanks and https://anaconda.org
>>> 
runfile('/home/luca/data/DeepLearningTutorials/Theano-3D-ConvNet-master/convnet3d/core/run_multi_conv.py',
 
wdir='/home/luca/data/DeepLearningTutorials/Theano-3D-ConvNet-master/convnet3d/core')
Mapped name None to device cuda: GeForce 840M
Using cuDNN version 5005 on context None
/home/luca/data/Theano-master/theano/tensor/signal/downsample.py:6: 
UserWarning: downsample module has been moved to the 
theano.tensor.signal.pool module.
  "downsample module has been moved to the theano.tensor.signal.pool 
module.")
Disabling C code for Elemwise{mul,no_inplace} due to unsupported float16
Disabling C code for Elemwise{Cast{float32}} due to unsupported float16
Disabling C code for Elemwise{Cast{float16}} due to unsupported float16
Disabling C code for Elemwise{Cast{float16}} due to unsupported float16
Disabling C code for Alloc due to unsupported float16
Disabling C code for DiagonalSubtensor{inplace} due to unsupported float16
Disabling C code for IncDiagonalSubtensor due to unsupported float16
Disabling C code for DiagonalSubtensor{inplace} due to unsupported float16
Disabling C code for MaxAndArgmax due to unsupported float16


start time:
21/07/2016
14:32:07


images for training: 594
images for validation: 82
epochs: 200


... training neural network 13


training @ iter =  0
training @ iter =  200
training @ iter =  400


training cost 0.69336
epoch 1, training batch 594/594,validation error 45.122 %
training @ iter =  600
......


but if I put instead
floatX = float32
device=gpu

the error is:

Python 2.7.11 |Anaconda custom (64-bit)| (default, Dec  6 2015, 18:08:32) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)] on linux2
Type "help", "copyright", "credits" or "license" for more information.
Anaconda is brought to you by Continuum Analytics.
Please check out: http://continuum.io/thanks and https://anaconda.org
>>> 
runfile('/home/luca/data/DeepLearningTutorials/Theano-3D-ConvNet-master/convnet3d/core/run_multi_conv.py',
 
wdir='/home/luca/data/DeepLearningTutorials/Theano-3D-ConvNet-master/convnet3d/core')
Mapped name None to device cuda: GeForce 840M
Using cuDNN version 5005 on context None
Using gpu device 0: GeForce 840M (CNMeM is disabled, cuDNN 5005)
/home/luca/data/Theano-master/theano/tensor/signal/downsample.py:6: 
UserWarning: downsample module has been moved to the 
theano.tensor.signal.pool module.
  "downsample module has been moved to the theano.tensor.signal.pool 
module.")
ERROR (theano.gof.opt): Optimization failure due to: 
LocalOptGroup(local_abstractconv_cudnn,local_conv_dnn,local_abstractconv_gemm,local_abstractconv_gradinputs_gemm,local_abstractconv_gradweight_gemm,local_conv_gemm)
ERROR (theano.gof.opt): node: AbstractConv2d{border_mode='valid', 
subsample=(1, 1), filter_flip=True, imshp=(20, 1, 20, 20), kshp=(100, 1, 5, 
5), filter_dilation=(1, 1)}(GpuFromHost.0, GpuReshape{4}.0)
ERROR (theano.gof.opt): TRACEBACK:
ERROR (theano.gof.opt): Traceback (most recent call last):
  File "/home/luca/data/Theano-master/theano/gof/opt.py", line 1820, in 
process_node
    replacements = lopt.transform(node)
  File "/home/luca/data/Theano-master/theano/gof/opt.py", line 1265, in 
transform
    repl = opt.transform(node)
  File "/home/luca/data/Theano-master/theano/sandbox/cuda/dnn.py", line 
3149, in local_abstractconv_cudnn
    conv_mode=conv_mode)
  File "/home/luca/data/Theano-master/theano/sandbox/cuda/dnn.py", line 
1181, in dnn_conv
    conv_mode=conv_mode, precision=precision)(img.shape,
  File "/home/luca/data/Theano-master/theano/sandbox/cuda/dnn.py", line 
180, in __init__
    assert precision in ['float16', 'float32', 'float64']
AssertionError

Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File 
"/home/luca/anaconda2/lib/python2.7/site-packages/spyderlib/widgets/externalshell/sitecustomize.py",
 
line 714, in runfile
    execfile(filename, namespace)
  File 
"/home/luca/anaconda2/lib/python2.7/site-packages/spyderlib/widgets/externalshell/sitecustomize.py",
 
line 81, in execfile
    builtins.execfile(filename, *where)
  File 
"/home/luca/data/DeepLearningTutorials/Theano-3D-ConvNet-master/convnet3d/core/run_multi_conv.py",
 
line 124, in <module>
    run_experiments()
  File 
"/home/luca/data/DeepLearningTutorials/Theano-3D-ConvNet-master/convnet3d/core/run_multi_conv.py",
 
line 83, in run_experiments
    Pretrained = False
  File "mpr_convnet_class.py", line 291, in __init__
    train_model = theano.function([x,y],cost, 
updates=updates)                 
  File "/home/luca/data/Theano-master/theano/compile/function.py", line 
322, in function
    output_keys=output_keys)
  File "/home/luca/data/Theano-master/theano/compile/pfunc.py", line 480, 
in pfunc
    output_keys=output_keys)
  File "/home/luca/data/Theano-master/theano/compile/function_module.py", 
line 1783, in orig_function
    output_keys=output_keys).create(
  File "/home/luca/data/Theano-master/theano/compile/function_module.py", 
line 1463, in __init__
    optimizer_profile = optimizer(fgraph)
  File "/home/luca/data/Theano-master/theano/gof/opt.py", line 102, in 
__call__
    return self.optimize(fgraph)
  File "/home/luca/data/Theano-master/theano/gof/opt.py", line 90, in 
optimize
    ret = self.apply(fgraph, *args, **kwargs)
  File "/home/luca/data/Theano-master/theano/gof/opt.py", line 235, in apply
    sub_prof = optimizer.optimize(fgraph)
  File "/home/luca/data/Theano-master/theano/gof/opt.py", line 90, in 
optimize
    ret = self.apply(fgraph, *args, **kwargs)
  File "/home/luca/data/Theano-master/theano/gof/opt.py", line 235, in apply
    sub_prof = optimizer.optimize(fgraph)
  File "/home/luca/data/Theano-master/theano/gof/opt.py", line 90, in 
optimize
    ret = self.apply(fgraph, *args, **kwargs)
  File "/home/luca/data/Theano-master/theano/gof/opt.py", line 2257, in 
apply
    lopt_change = self.process_node(fgraph, node, lopt)
  File "/home/luca/data/Theano-master/theano/gof/opt.py", line 1825, in 
process_node
    lopt, node)
  File "/home/luca/data/Theano-master/theano/gof/opt.py", line 1719, in 
warn_inplace
    return NavigatorOptimizer.warn(exc, nav, repl_pairs, local_opt, node)
  File "/home/luca/data/Theano-master/theano/gof/opt.py", line 1705, in warn
    raise exc
AssertionError

Python 2.7.11 |Anaconda custom (64-bit)| (default, Dec  6 2015, 18:08:32) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)] on linux2
Type "help", "copyright", "credits" or "license" for more information.
Anaconda is brought to you by Continuum Analytics.
Please check out: http://continuum.io/thanks and https://anaconda.org
>>> 
runfile('/home/luca/data/DeepLearningTutorials/Theano-3D-ConvNet-master/convnet3d/core/run_multi_conv.py',
 
wdir='/home/luca/data/DeepLearningTutorials/Theano-3D-ConvNet-master/convnet3d/core')
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File 
"/home/luca/anaconda2/lib/python2.7/site-packages/spyderlib/widgets/externalshell/sitecustomize.py",
 
line 714, in runfile
    execfile(filename, namespace)
  File 
"/home/luca/anaconda2/lib/python2.7/site-packages/spyderlib/widgets/externalshell/sitecustomize.py",
 
line 81, in execfile
    builtins.execfile(filename, *where)
  File 
"/home/luca/data/DeepLearningTutorials/Theano-3D-ConvNet-master/convnet3d/core/run_multi_conv.py",
 
line 1, in <module>
    import mpr_convnet_class as conv
  File "mpr_convnet_class.py", line 2, in <module>
    from convnet3d import ConvLayer, PoolLayer
  File "convnet3d.py", line 3, in <module>
    from theano.tensor.nnet.conv3d2d import conv3d
  File "/home/luca/data/Theano-master/theano/__init__.py", line 125, in 
<module>
    import theano.gpuarray
  File "/home/luca/data/Theano-master/theano/gpuarray/__init__.py", line 
31, in <module>
    from . import fft, dnn, opt, nerv, extra_ops
  File "/home/luca/data/Theano-master/theano/gpuarray/dnn.py", line 30, in 
<module>
    from .basic_ops import (as_gpuarray_variable, infer_context_name,
ImportError: cannot import name gpu_alloc_empty
>>> 


On Thursday, July 21, 2016 at 12:13:35 PM UTC+2, [email protected] 
wrote:
>
> Frederic,
> In ops.py  I can't find shape_i_op
> thanks
>
> On Thursday, July 21, 2016 at 11:50:51 AM UTC+2, [email protected] 
> wrote:
>>
>> Frederic,
>> this is the feedback afterl the upgrades about float 16.
>>
>> Python 2.7.11 |Anaconda custom (64-bit)| (default, Dec  6 2015, 18:08:32) 
>> [GCC 4.4.7 20120313 (Red Hat 4.4.7-1)] on linux2
>> Type "help", "copyright", "credits" or "license" for more information.
>> Anaconda is brought to you by Continuum Analytics.
>> Please check out: http://continuum.io/thanks and https://anaconda.org
>> >>> import run_multi_conv
>> Traceback (most recent call last):
>>   File "<stdin>", line 1, in <module>
>>   File "run_multi_conv.py", line 1, in <module>
>>     import mpr_convnet_class as conv
>>   File "mpr_convnet_class.py", line 2, in <module>
>>     from convnet3d import ConvLayer, PoolLayer
>>   File "convnet3d.py", line 3, in <module>
>>     from theano.tensor.nnet.conv3d2d import conv3d
>>   File "/home/luca/data/Theano-master/theano/__init__.py", line 125, in 
>> <module>
>>     import theano.gpuarray
>>   File "/home/luca/data/Theano-master/theano/gpuarray/__init__.py", line 
>> 31, in <module>
>>     from . import fft, dnn, opt, nerv, extra_ops
>>   File "/home/luca/data/Theano-master/theano/gpuarray/dnn.py", line 17, 
>> in <module>
>>     from theano.compile.ops import shape_i, shape_i_op
>> ImportError: cannot import name shape_i_op
>> >>> 
>>
>>
>>
>>
>>
>> On Thursday, July 21, 2016 at 11:15:06 AM UTC+2, [email protected] 
>> wrote:
>>>
>>> Frederic,
>>> I'll do it and give you a feedback,
>>> many thanks
>>> Luca
>>>
>>> On Tuesday, July 19, 2016 at 10:09:21 PM UTC+2, nouiz wrote:
>>>>
>>>> We have a PR that upgrade some stuff about float16:
>>>>
>>>> https://github.com/Theano/Theano/pull/4764/files
>>>>
>>>> It probably fix your problem. Can you try it to confirm that you don't 
>>>> have a different problem?
>>>>
>>>> thanks
>>>>
>>>> Frédéric
>>>>
>>>> On Fri, Jul 15, 2016 at 4:55 AM, <[email protected]> wrote:
>>>>
>>>>> ok I try.
>>>>> thanks
>>>>>
>>>>> On Thursday, July 14, 2016 at 11:44:41 PM UTC+2, Arnaud Bergeron wrote:
>>>>>>
>>>>>> I can't reproduce your problem using a simple convolution in float16.
>>>>>>
>>>>>> Either this is because your code is doing something unexpected or 
>>>>>> because the problem has been fixed in the development version.
>>>>>>
>>>>>> In nay case the development version is a much better option for the 
>>>>>> new backend and float16 so I encourage you to upgrade and try again: 
>>>>>> http://deeplearning.net/software/theano/install.html#bleeding-edge-install-instructions
>>>>>> .
>>>>>>
>>>>>> 2016-07-14 4:22 GMT-04:00 <[email protected]>:
>>>>>>
>>>>>>> Here is .theanorc:
>>>>>>>
>>>>>>> [global]
>>>>>>> floatX = float16
>>>>>>> device=cuda
>>>>>>> [cuda] 
>>>>>>> root = /usr/local/cuda-7.5
>>>>>>>
>>>>>>>
>>>>>>> [nvcc]
>>>>>>> fastmath=True
>>>>>>>
>>>>>>> optimizer = fast_compile
>>>>>>>
>>>>>>> On Thursday, July 14, 2016 at 10:19:56 AM UTC+2, 
>>>>>>> [email protected] wrote:
>>>>>>>>
>>>>>>>> Hi Arnaud,
>>>>>>>> I put _f16_ok = True in dnn.py ( attached).
>>>>>>>>
>>>>>>>> This is the error I received:
>>>>>>>>
>>>>>>>> Python 2.7.11 |Anaconda custom (64-bit)| (default, Dec  6 2015, 
>>>>>>>> 18:08:32) 
>>>>>>>> [GCC 4.4.7 20120313 (Red Hat 4.4.7-1)] on linux2
>>>>>>>> Type "help", "copyright", "credits" or "license" for more 
>>>>>>>> information.
>>>>>>>> Anaconda is brought to you by Continuum Analytics.
>>>>>>>> Please check out: http://continuum.io/thanks and 
>>>>>>>> https://anaconda.org
>>>>>>>> >>> import run_multi_conv
>>>>>>>>
>>>>>>>> Mapped name None to device cuda: GeForce 840M
>>>>>>>> WARNING (theano.gof.compilelock): Overriding existing lock by dead 
>>>>>>>> process '3202' (I am process '3351')
>>>>>>>> Using cuDNN version 5005 on context None
>>>>>>>> /home/luca/data/Theano-master/theano/tensor/signal/downsample.py:6: 
>>>>>>>> UserWarning: downsample module has been moved to the 
>>>>>>>> theano.tensor.signal.pool module.
>>>>>>>>   "downsample module has been moved to the 
>>>>>>>> theano.tensor.signal.pool module.")
>>>>>>>> >>> 
>>>>>>>> >>> run_multi_conv.run_experiments()
>>>>>>>> Disabling C code for Elemwise{mul,no_inplace} due to unsupported 
>>>>>>>> float16
>>>>>>>> Disabling C code for Elemwise{Cast{float32}} due to unsupported 
>>>>>>>> float16
>>>>>>>> Disabling C code for Elemwise{Cast{float16}} due to unsupported 
>>>>>>>> float16
>>>>>>>> Disabling C code for Elemwise{Cast{float16}} due to unsupported 
>>>>>>>> float16
>>>>>>>> Disabling C code for Alloc due to unsupported float16
>>>>>>>> Disabling C code for Cast{float16} due to unsupported float16
>>>>>>>> Disabling C code for Cast{float16} due to unsupported float16
>>>>>>>> Disabling C code for Cast{float16} due to unsupported float16
>>>>>>>> Disabling C code for Cast{float16} due to unsupported float16
>>>>>>>> Disabling C code for RandomFunction{binomial} due to unsupported 
>>>>>>>> float16
>>>>>>>> Disabling C code for RandomFunction{binomial} due to unsupported 
>>>>>>>> float16
>>>>>>>> ===============================
>>>>>>>> 00001    #include <Python.h>
>>>>>>>> 00002    #include <iostream>
>>>>>>>> 00003    #include "theano_mod_helper.h"
>>>>>>>> 00004    #include <gpuarray/array.h>
>>>>>>>> 00005    #include <gpuarray/kernel.h>
>>>>>>>> 00006    #include <gpuarray/error.h>
>>>>>>>> 00007    #include <gpuarray/buffer.h>
>>>>>>>> 00008    #include <gpuarray/buffer_blas.h>
>>>>>>>> 00009    #include <numpy/arrayobject.h>
>>>>>>>> 00010    #include <gpuarray_api.h>
>>>>>>>> 00011    #include <math.h>
>>>>>>>> 00012    #include <numpy/arrayscalars.h>
>>>>>>>> 00013    #include "cudnn.h"
>>>>>>>> 00014    #include "cudnn_helper.h"
>>>>>>>> 00015    #include "gpuarray_helper.h"
>>>>>>>> 00016    #include "gpuarray/types.h"
>>>>>>>> 00017    #include "gpuarray/array.h"
>>>>>>>> 00018    #include "gpuarray/util.h"
>>>>>>>> 00019    #include "gpuarray/ext_cuda.h"
>>>>>>>> 00020    #include "gpuarray_api.h"
>>>>>>>> 00021    #include "numpy_compat.h"
>>>>>>>> 00022    //////////////////////
>>>>>>>> 00023    ////  Support Code
>>>>>>>> 00024    //////////////////////
>>>>>>>> 00025    
>>>>>>>> 00026    
>>>>>>>> 00027    
>>>>>>>> 00028    static int
>>>>>>>> 00029    c_set_tensorNd(PyGpuArrayObject *var, 
>>>>>>>> cudnnTensorDescriptor_t desc) {
>>>>>>>> 00030      cudnnDataType_t dt;
>>>>>>>> 00031      size_t ds;
>>>>>>>> 00032      switch (var->ga.typecode) {
>>>>>>>> 00033      case GA_FLOAT:
>>>>>>>> 00034        dt = CUDNN_DATA_FLOAT;
>>>>>>>> 00035        break;
>>>>>>>> 00036      case GA_DOUBLE:
>>>>>>>> 00037        dt = CUDNN_DATA_DOUBLE;
>>>>>>>> 00038        break;
>>>>>>>> 00039    #if CUDNN_VERSION > 3000
>>>>>>>> 00040      case GA_HALF:
>>>>>>>> 00041        dt = CUDNN_DATA_HALF;
>>>>>>>> 00042        break;
>>>>>>>> 00043    #endif
>>>>>>>> 00044      default:
>>>>>>>> 00045        PyErr_SetString(PyExc_TypeError, "Non-float datatype 
>>>>>>>> in c_set_tensorNd");
>>>>>>>> 00046        return -1;
>>>>>>>> 00047      }
>>>>>>>> 00048      ds = gpuarray_get_elsize(var->ga.typecode);
>>>>>>>> 00049    
>>>>>>>> 00050      int strs[5], dims[5], default_stride = 1;
>>>>>>>> 00051      unsigned int nd = PyGpuArray_NDIM(var);
>>>>>>>> 00052    
>>>>>>>> 00053      if (nd > 5) {
>>>>>>>> 00054        PyErr_SetString(PyExc_TypeError, "Tensor of more than 
>>>>>>>> 5d");
>>>>>>>> 00055        return -1;
>>>>>>>> 00056      }
>>>>>>>> 00057    
>>>>>>>> 00058      for (unsigned int _i = nd; _i > 0; _i--) {
>>>>>>>> 00059        unsigned int i = _i - 1;
>>>>>>>> 00060        strs[i] = PyGpuArray_STRIDE(var, i) ?
>>>>>>>> 00061          PyGpuArray_STRIDE(var, i)/ds : default_stride;
>>>>>>>> 00062        default_stride *= PyGpuArray_DIM(var, i);
>>>>>>>> 00063        dims[i] = PyGpuArray_DIM(var, i);
>>>>>>>> 00064      }
>>>>>>>> 00065    
>>>>>>>> 00066      cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, dt, 
>>>>>>>> nd, dims, strs);
>>>>>>>> 00067      if (err != CUDNN_STATUS_SUCCESS) {
>>>>>>>> 00068        PyErr_Format(PyExc_RuntimeError,
>>>>>>>> 00069             "Could not set tensorNd descriptor: %s",
>>>>>>>> 00070             cudnnGetErrorString(err));
>>>>>>>> 00071        return -1;
>>>>>>>> 00072      }
>>>>>>>> 00073      return 0;
>>>>>>>> 00074    }
>>>>>>>> 00075    
>>>>>>>> 00076    static int
>>>>>>>> 00077    c_set_filter(PyGpuArrayObject *var, 
>>>>>>>> cudnnFilterDescriptor_t desc) {
>>>>>>>> 00078      cudnnDataType_t dt;
>>>>>>>> 00079      cudnnStatus_t err;
>>>>>>>> 00080    
>>>>>>>> 00081      if (!GpuArray_IS_C_CONTIGUOUS(&var->ga)) {
>>>>>>>> 00082        PyErr_SetString(PyExc_ValueError,
>>>>>>>> 00083                "Only contiguous filters (kernels) are 
>>>>>>>> supported.");
>>>>>>>> 00084        return -1;
>>>>>>>> 00085      }
>>>>>>>> 00086      switch (var->ga.typecode) {
>>>>>>>> 00087      case GA_FLOAT:
>>>>>>>> 00088        dt = CUDNN_DATA_FLOAT;
>>>>>>>> 00089        break;
>>>>>>>> 00090      case GA_DOUBLE:
>>>>>>>> 00091        dt = CUDNN_DATA_DOUBLE;
>>>>>>>> 00092        break;
>>>>>>>> 00093    #if CUDNN_VERSION > 3000
>>>>>>>> 00094      case GA_HALF:
>>>>>>>> 00095        dt = CUDNN_DATA_HALF;
>>>>>>>> 00096        break;
>>>>>>>> 00097    #endif
>>>>>>>> 00098      default:
>>>>>>>> 00099        PyErr_SetString(PyExc_TypeError, "Non-float datatype 
>>>>>>>> in c_set_filter");
>>>>>>>> 00100        return -1;
>>>>>>>> 00101      }
>>>>>>>> 00102    
>>>>>>>> 00103      int dims[5];
>>>>>>>> 00104      unsigned int nd = PyGpuArray_NDIM(var);
>>>>>>>> 00105    
>>>>>>>> 00106      if (nd > 5) {
>>>>>>>> 00107        PyErr_SetString(PyExc_TypeError, "Tensor of more than 
>>>>>>>> 5d");
>>>>>>>> 00108        return -1;
>>>>>>>> 00109      }
>>>>>>>> 00110    
>>>>>>>> 00111      for (unsigned int _i = nd; _i > 0; _i--) {
>>>>>>>> 00112        unsigned int i = _i - 1;
>>>>>>>> 00113        dims[i] = PyGpuArray_DIM(var, i);
>>>>>>>> 00114      }
>>>>>>>> 00115    
>>>>>>>> 00116    #if CUDNN_VERSION >= 5000
>>>>>>>> 00117        err = cudnnSetFilterNdDescriptor(desc, dt, 
>>>>>>>> CUDNN_TENSOR_NCHW, nd, dims);
>>>>>>>> 00118    #else
>>>>>>>> 00119        err = cudnnSetFilterNdDescriptor(desc, dt, nd, dims);
>>>>>>>> 00120    #endif
>>>>>>>> 00121    
>>>>>>>> 00122      if (err != CUDNN_STATUS_SUCCESS) {
>>>>>>>> 00123        PyErr_Format(PyExc_RuntimeError,
>>>>>>>> 00124             "Could not set filter descriptor: %s.",
>>>>>>>> 00125             cudnnGetErrorString(err));
>>>>>>>> 00126        return -1;
>>>>>>>> 00127      }
>>>>>>>> 00128      return 0;
>>>>>>>> 00129    }
>>>>>>>> 00130    
>>>>>>>> 00131    
>>>>>>>> 00132    
>>>>>>>> 00133        namespace {
>>>>>>>> 00134        struct 
>>>>>>>> __struct_compiled_op_86feacd077d8749f42b5d82709a80ba3 {
>>>>>>>> 00135            PyObject* __ERROR;
>>>>>>>> 00136    
>>>>>>>> 00137            PyObject* storage_V3;
>>>>>>>> 00138    PyObject* storage_V5;
>>>>>>>> 00139    PyObject* storage_V7;
>>>>>>>> 00140    PyObject* storage_V9;
>>>>>>>> 00141    PyObject* storage_V11;
>>>>>>>> 00142    PyObject* storage_V13;
>>>>>>>> 00143    PyObject* storage_V1;
>>>>>>>> 00144    PyObject* storage_V15;
>>>>>>>> 00145            
>>>>>>>> 00146        PyObject* py_V15;
>>>>>>>> 00147        PyGpuContextObject *V15;
>>>>>>>> 00148    #define DTYPE_INPUT_0 npy_float16
>>>>>>>> 00149    #define TYPENUM_INPUT_0 23
>>>>>>>> 00150    #define ITEMSIZE_INPUT_0 2
>>>>>>>> 00151    #define DTYPE_INPUT_1 npy_float16
>>>>>>>> 00152    #define TYPENUM_INPUT_1 23
>>>>>>>> 00153    #define ITEMSIZE_INPUT_1 2
>>>>>>>> 00154    #define DTYPE_INPUT_2 npy_float16
>>>>>>>> 00155    #define TYPENUM_INPUT_2 23
>>>>>>>> 00156    #define ITEMSIZE_INPUT_2 2
>>>>>>>> 00157    #define DTYPE_INPUT_4 npy_float16
>>>>>>>> 00158    #define TYPENUM_INPUT_4 23
>>>>>>>> 00159    #define ITEMSIZE_INPUT_4 2
>>>>>>>> 00160    #define DTYPE_INPUT_5 npy_float16
>>>>>>>> 00161    #define TYPENUM_INPUT_5 23
>>>>>>>> 00162    #define ITEMSIZE_INPUT_5 2
>>>>>>>> 00163    #define DTYPE_OUTPUT_0 npy_float16
>>>>>>>> 00164    #define TYPENUM_OUTPUT_0 23
>>>>>>>> 00165    #define ITEMSIZE_OUTPUT_0 2
>>>>>>>> 00166    #define APPLY_SPECIFIC(str) 
>>>>>>>> str##_node_86feacd077d8749f42b5d82709a80ba3_0
>>>>>>>> 00167    #define CONV_INPLACE 1
>>>>>>>> 00168    #define CONV_ALGO 
>>>>>>>> CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
>>>>>>>> 00169    
>>>>>>>> 00170    
>>>>>>>> 00171    PyGpuContextObject *ctx;
>>>>>>>> 00172    cudnnHandle_t APPLY_SPECIFIC(_handle);
>>>>>>>> 00173    
>>>>>>>> 00174    
>>>>>>>> 00175    cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
>>>>>>>> 00176    cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
>>>>>>>> 00177    cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns);
>>>>>>>> 00178    
>>>>>>>> 00179    
>>>>>>>> 00180    
>>>>>>>> 00181    #ifdef CHOOSE_ALGO
>>>>>>>> 00182    int reuse_algo;
>>>>>>>> 00183    cudnnConvolutionFwdAlgo_t prev_algo;
>>>>>>>> 00184    #ifndef CHOOSE_ONCE
>>>>>>>> 00185    size_t prev_img_dims[5];
>>>>>>>> 00186    size_t prev_kern_dims[5];
>>>>>>>> 00187    #endif
>>>>>>>> 00188    #endif
>>>>>>>> 00189    
>>>>>>>> 00190    int
>>>>>>>> 00191    APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, 
>>>>>>>> PyGpuArrayObject *kerns,
>>>>>>>> 00192                             PyGpuArrayObject *om,
>>>>>>>> 00193                             cudnnConvolutionDescriptor_t desc,
>>>>>>>> 00194                             double alpha, double beta,
>>>>>>>> 00195                             PyGpuArrayObject **output,
>>>>>>>> 00196                             PyGpuContextObject *c) {
>>>>>>>> 00197      cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
>>>>>>>> 00198      float af = alpha, bf = beta;
>>>>>>>> 00199      void *alpha_p;
>>>>>>>> 00200      void *beta_p;
>>>>>>>> 00201    
>>>>>>>> 00202      if (PyGpuArray_DIMS(input)[1] != 
>>>>>>>> PyGpuArray_DIMS(kerns)[1]) {
>>>>>>>> 00203        PyErr_SetString(PyExc_ValueError,
>>>>>>>> 00204                "images and kernel must have the same stack 
>>>>>>>> size");
>>>>>>>> 00205        return 1;
>>>>>>>> 00206      }
>>>>>>>> 00207    
>>>>>>>> 00208      if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
>>>>>>>> 00209        return 1;
>>>>>>>> 00210      if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
>>>>>>>> 00211        return 1;
>>>>>>>> 00212    
>>>>>>>> 00213      switch (input->ga.typecode) {
>>>>>>>> 00214      case GA_DOUBLE:
>>>>>>>> 00215        alpha_p = (void *)&alpha;
>>>>>>>> 00216        beta_p = (void *)&beta;
>>>>>>>> 00217        break;
>>>>>>>> 00218      case GA_FLOAT:
>>>>>>>> 00219      case GA_HALF:
>>>>>>>> 00220        alpha_p = (void *)&af;
>>>>>>>> 00221        beta_p = (void *)&bf;
>>>>>>>> 00222        break;
>>>>>>>> 00223      default:
>>>>>>>> 00224        PyErr_SetString(PyExc_TypeError, "Unsupported type in 
>>>>>>>> convolution");
>>>>>>>> 00225        return 1;
>>>>>>>> 00226      }
>>>>>>>> 00227    
>>>>>>>> 00228    #ifdef CONV_INPLACE
>>>>>>>> 00229      Py_XDECREF(*output);
>>>>>>>> 00230      *output = om;
>>>>>>>> 00231      Py_INCREF(*output);
>>>>>>>> 00232    #else
>>>>>>>> 00233      if (theano_prep_output(output, PyGpuArray_NDIM(om), 
>>>>>>>> PyGpuArray_DIMS(om),
>>>>>>>> 00234                             om->ga.typecode, GA_C_ORDER, c) 
>>>>>>>> != 0)
>>>>>>>> 00235        return 1;
>>>>>>>> 00236      if (beta != 0.0 && pygpu_move(*output, om))
>>>>>>>> 00237        return 1;
>>>>>>>> 00238    #endif
>>>>>>>> 00239    
>>>>>>>> 00240      if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == 
>>>>>>>> -1)
>>>>>>>> 00241        return 1;
>>>>>>>> 00242    
>>>>>>>> 00243      cudnnConvolutionFwdAlgo_t algo = CONV_ALGO;
>>>>>>>> 00244    
>>>>>>>> 00245      cuda_enter(c->ctx);
>>>>>>>> 00246    #ifdef CHOOSE_ALGO
>>>>>>>> 00247    #ifndef CHOOSE_ONCE
>>>>>>>> 00248      reuse_algo = 1;
>>>>>>>> 00249      for (unsigned int i = 0; i < PyGpuArray_NDIM(input); 
>>>>>>>> i++) {
>>>>>>>> 00250        reuse_algo = (reuse_algo &&
>>>>>>>> 00251                      PyGpuArray_DIM(input, i) == 
>>>>>>>> prev_img_dims[i]);
>>>>>>>> 00252        reuse_algo = (reuse_algo &&
>>>>>>>> 00253                      PyGpuArray_DIM(kerns, i) == 
>>>>>>>> prev_kern_dims[i]);
>>>>>>>> 00254      }
>>>>>>>> 00255    #endif
>>>>>>>> 00256    
>>>>>>>> 00257      if (!reuse_algo) {
>>>>>>>> 00258    #ifdef CHOOSE_TIME
>>>>>>>> 00259        int count;
>>>>>>>> 00260        cudnnConvolutionFwdAlgoPerf_t choice;
>>>>>>>> 00261        err = cudnnFindConvolutionForwardAlgorithm(
>>>>>>>> 00262          APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), 
>>>>>>>> APPLY_SPECIFIC(kerns),
>>>>>>>> 00263          desc, APPLY_SPECIFIC(output), 1, &count, &choice);
>>>>>>>> 00264    
>>>>>>>> 00265        if (err != CUDNN_STATUS_SUCCESS) {
>>>>>>>> 00266          PyErr_Format(PyExc_RuntimeError,
>>>>>>>> 00267                       "error selecting convolution algo: %s",
>>>>>>>> 00268                       cudnnGetErrorString(err));
>>>>>>>> 00269          cuda_exit(c->ctx);
>>>>>>>> 00270          return 1;
>>>>>>>> 00271        }
>>>>>>>> 00272        algo = choice.algo;
>>>>>>>> 00273    #else
>>>>>>>> 00274        size_t free;
>>>>>>>> 00275        int err2 = gpucontext_property(c->ctx, 
>>>>>>>> GA_CTX_PROP_FREE_GMEM, &free);
>>>>>>>> 00276    
>>>>>>>> 00277        if (err2 != GA_NO_ERROR) {
>>>>>>>> 00278          PyErr_Format(PyExc_RuntimeError, "Error when trying 
>>>>>>>> to find the "
>>>>>>>> 00279                       "memory information on the GPU");
>>>>>>>> 00280          cuda_exit(c->ctx);
>>>>>>>> 00281          return 1;
>>>>>>>> 00282        }
>>>>>>>> 00283    
>>>>>>>> 00284        err = cudnnGetConvolutionForwardAlgorithm(
>>>>>>>> 00285          APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), 
>>>>>>>> APPLY_SPECIFIC(kerns),
>>>>>>>> 00286          desc, APPLY_SPECIFIC(output),
>>>>>>>> 00287          CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, 
>>>>>>>> &algo);
>>>>>>>> 00288        if (err != CUDNN_STATUS_SUCCESS) {
>>>>>>>> 00289          PyErr_Format(PyExc_RuntimeError,
>>>>>>>> 00290                       "error selecting convolution algo: %s",
>>>>>>>> 00291                       cudnnGetErrorString(err));
>>>>>>>> 00292          cuda_exit(c->ctx);
>>>>>>>> 00293          return 1;
>>>>>>>> 00294        }
>>>>>>>> 00295    #endif
>>>>>>>> 00296        prev_algo = algo;
>>>>>>>> 00297      } else {
>>>>>>>> 00298        algo = prev_algo;
>>>>>>>> 00299      }
>>>>>>>> 00300    
>>>>>>>> 00301    #ifdef CHOOSE_ONCE
>>>>>>>> 00302      reuse_algo = 1;
>>>>>>>> 00303    #else
>>>>>>>> 00304      for (unsigned int i = 0; i < PyGpuArray_NDIM(input); 
>>>>>>>> i++) {
>>>>>>>> 00305        prev_img_dims[i] = PyGpuArray_DIM(input, i);
>>>>>>>> 00306        prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
>>>>>>>> 00307      }
>>>>>>>> 00308    #endif
>>>>>>>> 00309    
>>>>>>>> 00310    #endif
>>>>>>>> 00311    
>>>>>>>> 00312      /* These two algos are not supported for 3d conv */
>>>>>>>> 00313      if (PyGpuArray_NDIM(input) == 5 &&
>>>>>>>> 00314          (algo == 
>>>>>>>> CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
>>>>>>>> 00315           algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM))
>>>>>>>> 00316        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
>>>>>>>> 00317    
>>>>>>>> 00318      // The FFT implementation does not support strides, 1x1 
>>>>>>>> filters or inputs
>>>>>>>> 00319      // with a spatial dimension larger than 1024. The 
>>>>>>>> tiled-FFT implementation
>>>>>>>> 00320      // does not support strides.
>>>>>>>> 00321      // If the chosen implementation is FFT or tiled-FFT, 
>>>>>>>> validate that it can
>>>>>>>> 00322      // be used on the current data and default to a safe 
>>>>>>>> implementation if it
>>>>>>>> 00323      // can't.
>>>>>>>> 00324      // The following code is 2d-specific but it is fine as 
>>>>>>>> FFT and tiled-FFT are
>>>>>>>> 00325      // defined only for 2d filters
>>>>>>>> 00326      if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
>>>>>>>> 00327           algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && 
>>>>>>>> PyGpuArray_NDIM(input) == 4) {
>>>>>>>> 00328    
>>>>>>>> 00329        // Extract the properties of the convolution descriptor
>>>>>>>> 00330        int nd;
>>>>>>>> 00331        int pad[2];
>>>>>>>> 00332        int stride[2];
>>>>>>>> 00333        int upscale[2];
>>>>>>>> 00334        cudnnConvolutionMode_t mode;
>>>>>>>> 00335        cudnnDataType_t data_type;
>>>>>>>> 00336        err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, 
>>>>>>>> pad, stride,
>>>>>>>> 00337                                                 upscale, 
>>>>>>>> &mode, &data_type);
>>>>>>>> 00338        if (err != CUDNN_STATUS_SUCCESS) {
>>>>>>>> 00339          PyErr_Format(PyExc_RuntimeError,
>>>>>>>> 00340                       "error getting convolution properties: 
>>>>>>>> %s",
>>>>>>>> 00341                       cudnnGetErrorString(err));
>>>>>>>> 00342          cuda_exit(c->ctx);
>>>>>>>> 00343          return 1;
>>>>>>>> 00344        }
>>>>>>>> 00345    
>>>>>>>> 00346        if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT)
>>>>>>>> 00347        {
>>>>>>>> 00348          if (stride[0] != 1 || stride[1] != 1 ||
>>>>>>>> 00349              PyGpuArray_DIM(input, 2) > 1024 || 
>>>>>>>> PyGpuArray_DIM(input, 3) > 1024 ||
>>>>>>>> 00350              (PyGpuArray_DIM(kerns, 2) == 1 && 
>>>>>>>> PyGpuArray_DIM(kerns, 3) == 1))
>>>>>>>> 00351          {
>>>>>>>> 00352            algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
>>>>>>>> 00353          }
>>>>>>>> 00354        }
>>>>>>>> 00355        else
>>>>>>>> 00356        {
>>>>>>>> 00357          // algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
>>>>>>>> 00358          if (stride[0] != 1 || stride[1] != 1)
>>>>>>>> 00359          {
>>>>>>>> 00360            algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
>>>>>>>> 00361          }
>>>>>>>> 00362        }
>>>>>>>> 00363      }
>>>>>>>> 00364    
>>>>>>>> 00365      {
>>>>>>>> 00366        size_t worksize;
>>>>>>>> 00367        gpudata *workspace;
>>>>>>>> 00368        err = 
>>>>>>>> cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
>>>>>>>> 00369                                                      
>>>>>>>> APPLY_SPECIFIC(input),
>>>>>>>> 00370                                                      
>>>>>>>> APPLY_SPECIFIC(kerns),
>>>>>>>> 00371                                                      desc,
>>>>>>>> 00372                                                      
>>>>>>>> APPLY_SPECIFIC(output),
>>>>>>>> 00373                                                      algo,
>>>>>>>> 00374                                                      
>>>>>>>> &worksize);
>>>>>>>> 00375    
>>>>>>>> 00376        if (err == CUDNN_STATUS_NOT_SUPPORTED) {
>>>>>>>> 00377          // Fallback to none algo if not supported
>>>>>>>> 00378          // TODO: Print a warning
>>>>>>>> 00379          algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
>>>>>>>> 00380    
>>>>>>>> 00381          err = 
>>>>>>>> cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
>>>>>>>> 00382                                                        
>>>>>>>> APPLY_SPECIFIC(input),
>>>>>>>> 00383                                                        
>>>>>>>> APPLY_SPECIFIC(kerns),
>>>>>>>> 00384                                                        desc,
>>>>>>>> 00385                                                        
>>>>>>>> APPLY_SPECIFIC(output),
>>>>>>>> 00386                                                        algo,
>>>>>>>> 00387                                                        
>>>>>>>> &worksize);
>>>>>>>> 00388        }
>>>>>>>> 00389    
>>>>>>>> 00390        if (err != CUDNN_STATUS_SUCCESS) {
>>>>>>>> 00391          PyErr_Format(PyExc_RuntimeError,
>>>>>>>> 00392                       "error getting worksize: %s",
>>>>>>>> 00393                       cudnnGetErrorString(err));
>>>>>>>> 00394          cuda_exit(c->ctx);
>>>>>>>> 00395          return 1;
>>>>>>>> 00396        }
>>>>>>>> 00397    
>>>>>>>> 00398        /*
>>>>>>>> 00399         * This is less than ideal since we need to free it 
>>>>>>>> after (which
>>>>>>>> 00400         * introduces a synchronization point. But we don't 
>>>>>>>> have a module
>>>>>>>> 00401         * to place a nice get_work_mem() function in.
>>>>>>>> 00402         */
>>>>>>>> 00403        if (worksize != 0) {
>>>>>>>> 00404          workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, 
>>>>>>>> NULL);
>>>>>>>> 00405          if (workspace == NULL) {
>>>>>>>> 00406            PyErr_SetString(PyExc_RuntimeError,
>>>>>>>> 00407                            "Could not allocate working 
>>>>>>>> memory");
>>>>>>>> 00408            cuda_exit(c->ctx);
>>>>>>>> 00409            return 1;
>>>>>>>> 00410          }
>>>>>>>> 00411        }
>>>>>>>> 00412    
>>>>>>>> 00413        cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
>>>>>>>> 00414        cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
>>>>>>>> 00415        cuda_wait((*output)->ga.data, 
>>>>>>>> GPUARRAY_CUDA_WAIT_WRITE);
>>>>>>>> 00416    
>>>>>>>> 00417        err = cudnnConvolutionForward(
>>>>>>>> 00418          APPLY_SPECIFIC(_handle),
>>>>>>>> 00419          alpha_p,
>>>>>>>> 00420          APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
>>>>>>>> 00421          APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
>>>>>>>> 00422          desc, algo,
>>>>>>>> 00423          worksize == 0 ? NULL : *(void **)workspace, worksize,
>>>>>>>> 00424          beta_p,
>>>>>>>> 00425          APPLY_SPECIFIC(output), 
>>>>>>>> PyGpuArray_DEV_DATA(*output));
>>>>>>>> 00426    
>>>>>>>> 00427        if (worksize != 0)
>>>>>>>> 00428          gpudata_release(workspace);
>>>>>>>> 00429    
>>>>>>>> 00430        cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
>>>>>>>> 00431        cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
>>>>>>>> 00432        cuda_record((*output)->ga.data, 
>>>>>>>> GPUARRAY_CUDA_WAIT_WRITE);
>>>>>>>> 00433      }
>>>>>>>> 00434      cuda_exit(c->ctx);
>>>>>>>> 00435    
>>>>>>>> 00436      if (err != CUDNN_STATUS_SUCCESS) {
>>>>>>>> 00437        PyErr_Format(PyExc_RuntimeError, "error doing 
>>>>>>>> operation: %s",
>>>>>>>> 00438             cudnnGetErrorString(err));
>>>>>>>> 00439        return 1;
>>>>>>>> 00440      }
>>>>>>>> 00441      return 0;
>>>>>>>> 00442    }
>>>>>>>> 00443    
>>>>>>>> 00444    #undef DTYPE_INPUT_0
>>>>>>>> 00445    #undef TYPENUM_INPUT_0
>>>>>>>> 00446    #undef ITEMSIZE_INPUT_0
>>>>>>>> 00447    #undef DTYPE_INPUT_1
>>>>>>>> 00448    #undef TYPENUM_INPUT_1
>>>>>>>> 00449    #undef ITEMSIZE_INPUT_1
>>>>>>>> 00450    #undef DTYPE_INPUT_2
>>>>>>>> 00451    #undef TYPENUM_INPUT_2
>>>>>>>> 00452    #undef ITEMSIZE_INPUT_2
>>>>>>>> 00453    #undef DTYPE_INPUT_4
>>>>>>>> 00454    #undef TYPENUM_INPUT_4
>>>>>>>> 00455    #undef ITEMSIZE_INPUT_4
>>>>>>>> 00456    #undef DTYPE_INPUT_5
>>>>>>>> 00457    #undef TYPENUM_INPUT_5
>>>>>>>> 00458    #undef ITEMSIZE_INPUT_5
>>>>>>>> 00459    #undef DTYPE_OUTPUT_0
>>>>>>>> 00460    #undef TYPENUM_OUTPUT_0
>>>>>>>> 00461    #undef ITEMSIZE_OUTPUT_0
>>>>>>>> 00462    #undef APPLY_SPECIFIC
>>>>>>>> 00463    #undef CONV_INPLACE
>>>>>>>> 00464    #undef CONV_ALGO
>>>>>>>> 00465    
>>>>>>>> 00466            
>>>>>>>> __struct_compiled_op_86feacd077d8749f42b5d82709a80ba3() {
>>>>>>>> 00467                // This is only somewhat safe because we:
>>>>>>>> 00468                //  1) Are not a virtual class
>>>>>>>> 00469                //  2) Do not use any virtual classes in the 
>>>>>>>> members
>>>>>>>> 00470                //  3) Deal with mostly POD and pointers
>>>>>>>> 00471    
>>>>>>>> 00472                // If this changes, we would have to revise 
>>>>>>>> this, but for
>>>>>>>> 00473                // now I am tired of chasing segfaults because
>>>>>>>> 00474                // initialization code had an error and some 
>>>>>>>> pointer has
>>>>>>>> 00475                // a junk value.
>>>>>>>> 00476                memset(this, 0, sizeof(*this));
>>>>>>>> 00477            }
>>>>>>>> 00478            
>>>>>>>> ~__struct_compiled_op_86feacd077d8749f42b5d82709a80ba3(void) {
>>>>>>>> 00479                cleanup();
>>>>>>>> 00480            }
>>>>>>>> 00481    
>>>>>>>> 00482            int init(PyObject* __ERROR, PyObject* storage_V3, 
>>>>>>>> PyObject* storage_V5, PyObject* storage_V7, PyObject* storage_V9, 
>>>>>>>> PyObject* 
>>>>>>>> storage_V11, PyObject* storage_V13, PyObject* storage_V1, PyObject* 
>>>>>>>> storage_V15) {
>>>>>>>> 00483                Py_XINCREF(storage_V3);
>>>>>>>> 00484    Py_XINCREF(storage_V5);
>>>>>>>> 00485    Py_XINCREF(storage_V7);
>>>>>>>> 00486    Py_XINCREF(storage_V9);
>>>>>>>> 00487    Py_XINCREF(storage_V11);
>>>>>>>> 00488    Py_XINCREF(storage_V13);
>>>>>>>> 00489    Py_XINCREF(storage_V1);
>>>>>>>> 00490    Py_XINCREF(storage_V15);
>>>>>>>> 00491                this->storage_V3 = storage_V3;
>>>>>>>> 00492    this->storage_V5 = storage_V5;
>>>>>>>> 00493    this->storage_V7 = storage_V7;
>>>>>>>> 00494    this->storage_V9 = storage_V9;
>>>>>>>> 00495    this->storage_V11 = storage_V11;
>>>>>>>> 00496    this->storage_V13 = storage_V13;
>>>>>>>> 00497    this->storage_V1 = storage_V1;
>>>>>>>> 00498    this->storage_V15 = storage_V15;
>>>>>>>> 00499                
>>>>>>>> 00500    
>>>>>>>> 00501    
>>>>>>>> 00502    
>>>>>>>> 00503    
>>>>>>>> 00504    
>>>>>>>> 00505    
>>>>>>>> 00506    
>>>>>>>> 00507    
>>>>>>>> 00508        py_V15 = PyList_GET_ITEM(storage_V15, 0);
>>>>>>>> 00509        {Py_XINCREF(py_V15);}
>>>>>>>> 00510        
>>>>>>>> 00511    if (!PyObject_TypeCheck(py_V15, &PyGpuContextType)) {
>>>>>>>> 00512      PyErr_SetString(PyExc_TypeError, "expected a 
>>>>>>>> GpuContext");
>>>>>>>> 00513      {
>>>>>>>> 00514            if (!PyErr_Occurred()) {
>>>>>>>> 00515                PyErr_SetString(PyExc_RuntimeError,
>>>>>>>> 00516                    "Unexpected error in an Op's C code. "
>>>>>>>> 00517                    "No Python exception was set.");
>>>>>>>> 00518                }
>>>>>>>> 00519            return 15;
>>>>>>>> 00520    }
>>>>>>>> 00521    }
>>>>>>>> 00522    
>>>>>>>> 00523    V15 = (PyGpuContextObject *)py_V15;
>>>>>>>> 00524    Py_INCREF(V15);
>>>>>>>> 00525    
>>>>>>>> 00526    
>>>>>>>> 00527    #define DTYPE_INPUT_0 npy_float16
>>>>>>>> 00528    #define TYPENUM_INPUT_0 23
>>>>>>>> 00529    #define ITEMSIZE_INPUT_0 2
>>>>>>>> 00530    #define DTYPE_INPUT_1 npy_float16
>>>>>>>> 00531    #define TYPENUM_INPUT_1 23
>>>>>>>> 00532    #define ITEMSIZE_INPUT_1 2
>>>>>>>> 00533    #define DTYPE_INPUT_2 npy_float16
>>>>>>>> 00534    #define TYPENUM_INPUT_2 23
>>>>>>>> 00535    #define ITEMSIZE_INPUT_2 2
>>>>>>>> 00536    #define DTYPE_INPUT_4 npy_float16
>>>>>>>> 00537    #define TYPENUM_INPUT_4 23
>>>>>>>> 00538    #define ITEMSIZE_INPUT_4 2
>>>>>>>> 00539    #define DTYPE_INPUT_5 npy_float16
>>>>>>>> 00540    #define TYPENUM_INPUT_5 23
>>>>>>>> 00541    #define ITEMSIZE_INPUT_5 2
>>>>>>>> 00542    #define DTYPE_OUTPUT_0 npy_float16
>>>>>>>> 00543    #define TYPENUM_OUTPUT_0 23
>>>>>>>> 00544    #define ITEMSIZE_OUTPUT_0 2
>>>>>>>> 00545    #define APPLY_SPECIFIC(str) 
>>>>>>>> str##_node_86feacd077d8749f42b5d82709a80ba3_0
>>>>>>>> 00546    #define CONV_INPLACE 1
>>>>>>>> 00547    #define CONV_ALGO 
>>>>>>>> CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
>>>>>>>> 00548    #define FAIL { \
>>>>>>>> 00549            if (!PyErr_Occurred()) { \
>>>>>>>> 00550                PyErr_SetString(PyExc_RuntimeError, \
>>>>>>>> 00551                    "Unexpected error in an Op's C code. " \
>>>>>>>> 00552                    "No Python exception was set."); \
>>>>>>>> 00553                } \
>>>>>>>> 00554            return 17; \
>>>>>>>> 00555    }
>>>>>>>> 00556    #define PARAMS V15
>>>>>>>> 00557    
>>>>>>>> 00558    
>>>>>>>> 00559    {
>>>>>>>> 00560      // We need to keep a reference here to have it available 
>>>>>>>> in the destructor.
>>>>>>>> 00561      ctx = PARAMS;
>>>>>>>> 00562      Py_INCREF(ctx);
>>>>>>>> 00563    
>>>>>>>> 00564      cuda_enter(PARAMS->ctx);
>>>>>>>> 00565      cudnnStatus_t err;
>>>>>>>> 00566      APPLY_SPECIFIC(_handle) = NULL;
>>>>>>>> 00567      if ((err = cudnnCreate(&APPLY_SPECIFIC(_handle))) != 
>>>>>>>> CUDNN_STATUS_SUCCESS) {
>>>>>>>> 00568        PyErr_Format(PyExc_RuntimeError, "could not create 
>>>>>>>> cuDNN handle: %s",
>>>>>>>> 00569                     cudnnGetErrorString(err));
>>>>>>>> 00570        cuda_exit(PARAMS->ctx);
>>>>>>>> 00571        FAIL;
>>>>>>>> 00572      }
>>>>>>>> 00573      if ((err = cudnnSetStream(APPLY_SPECIFIC(_handle),
>>>>>>>> 00574                                cuda_get_stream(PARAMS->ctx))) 
>>>>>>>> != CUDNN_STATUS_SUCCESS) {
>>>>>>>> 00575        PyErr_Format(PyExc_RuntimeError, "Could not set cudnn 
>>>>>>>> stream: %s",
>>>>>>>> 00576                     cudnnGetErrorString(err));
>>>>>>>> 00577        cuda_exit(PARAMS->ctx);
>>>>>>>> 00578        FAIL;
>>>>>>>> 00579      }
>>>>>>>> 00580      cuda_exit(PARAMS->ctx);
>>>>>>>> 00581    }
>>>>>>>> 00582    
>>>>>>>> 00583    
>>>>>>>> 00584    
>>>>>>>> 00585    cudnnStatus_t APPLY_SPECIFIC(err);
>>>>>>>> 00586    APPLY_SPECIFIC(input) = NULL;
>>>>>>>> 00587    APPLY_SPECIFIC(output) = NULL;
>>>>>>>> 00588    APPLY_SPECIFIC(kerns) = NULL;
>>>>>>>> 00589    if ((APPLY_SPECIFIC(err) = 
>>>>>>>> cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != 
>>>>>>>> CUDNN_STATUS_SUCCESS) {
>>>>>>>> 00590      PyErr_Format(PyExc_MemoryError, "could not allocate 
>>>>>>>> tensor descriptor "
>>>>>>>> 00591               "(inp): %s", 
>>>>>>>> cudnnGetErrorString(APPLY_SPECIFIC(err)));
>>>>>>>> 00592      FAIL;
>>>>>>>> 00593    }
>>>>>>>> 00594    if ((APPLY_SPECIFIC(err) = 
>>>>>>>> cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != 
>>>>>>>> CUDNN_STATUS_SUCCESS) {
>>>>>>>> 00595      PyErr_Format(PyExc_MemoryError, "could not allocate 
>>>>>>>> tensor descriptor "
>>>>>>>> 00596                   "(out): %s", 
>>>>>>>> cudnnGetErrorString(APPLY_SPECIFIC(err)));
>>>>>>>> 00597      FAIL;
>>>>>>>> 00598    }
>>>>>>>> 00599    if ((APPLY_SPECIFIC(err) = 
>>>>>>>> cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns))) != 
>>>>>>>> CUDNN_STATUS_SUCCESS) {
>>>>>>>> 00600      PyErr_Format(PyExc_MemoryError, "could not allocate 
>>>>>>>> filter descriptor: %s", 
>>>>>>>> 00601               cudnnGetErrorString(APPLY_SPECIFIC(err)));
>>>>>>>> 00602      FAIL;
>>>>>>>> 00603    }
>>>>>>>> 00604    
>>>>>>>> 00605    
>>>>>>>> 00606    
>>>>>>>> 00607    #ifdef CHOOSE_ALGO
>>>>>>>> 00608    reuse_algo = 0;
>>>>>>>> 00609    prev_algo = CONV_ALGO;
>>>>>>>> 00610    #ifndef CHOOSE_ONCE
>>>>>>>> 00611    memset(prev_img_dims, 0, sizeof(prev_img_dims));
>>>>>>>> 00612    memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
>>>>>>>> 00613    #endif
>>>>>>>> 00614    #endif
>>>>>>>> 00615    
>>>>>>>> 00616    
>>>>>>>> 00617    #undef FAIL
>>>>>>>> 00618    #undef PARAMS
>>>>>>>> 00619    #undef DTYPE_INPUT_0
>>>>>>>> 00620    #undef TYPENUM_INPUT_0
>>>>>>>> 00621    #undef ITEMSIZE_INPUT_0
>>>>>>>> 00622    #undef DTYPE_INPUT_1
>>>>>>>> 00623    #undef TYPENUM_INPUT_1
>>>>>>>> 00624    #undef ITEMSIZE_INPUT_1
>>>>>>>> 00625    #undef DTYPE_INPUT_2
>>>>>>>> 00626    #undef
>>>>>>>>
>>>>>>>

-- 

--- 
You received this message because you are subscribed to the Google Groups 
"theano-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
For more options, visit https://groups.google.com/d/optout.

Re: [theano-users] Error using floatX = float16 to save memory

Reply via email to