We have a PR that upgrades some things about float16: https://github.com/Theano/Theano/pull/4764/files
It probably fix your problem. Can you try it to confirm that you don't have a different problem? thanks Frédéric On Fri, Jul 15, 2016 at 4:55 AM, <[email protected]> wrote: > ok I try. > thanks > > On Thursday, July 14, 2016 at 11:44:41 PM UTC+2, Arnaud Bergeron wrote: >> >> I can't reproduce your problem using a simple convolution in float16. >> >> Either this is because your code is doing something unexpected or because >> the problem has been fixed in the development version. >> >> In nay case the development version is a much better option for the new >> backend and float16 so I encourage you to upgrade and try again: >> http://deeplearning.net/software/theano/install.html#bleeding-edge-install-instructions >> . >> >> 2016-07-14 4:22 GMT-04:00 <[email protected]>: >> >>> Here is .theanorc: >>> >>> [global] >>> floatX = float16 >>> device=cuda >>> [cuda] >>> root = /usr/local/cuda-7.5 >>> >>> >>> [nvcc] >>> fastmath=True >>> >>> optimizer = fast_compile >>> >>> On Thursday, July 14, 2016 at 10:19:56 AM UTC+2, [email protected] >>> wrote: >>>> >>>> Hi Arnaud, >>>> I put _f16_ok = True in dnn.py ( attached). >>>> >>>> This is the error I received: >>>> >>>> Python 2.7.11 |Anaconda custom (64-bit)| (default, Dec 6 2015, >>>> 18:08:32) >>>> [GCC 4.4.7 20120313 (Red Hat 4.4.7-1)] on linux2 >>>> Type "help", "copyright", "credits" or "license" for more information. >>>> Anaconda is brought to you by Continuum Analytics. >>>> Please check out: http://continuum.io/thanks and https://anaconda.org >>>> >>> import run_multi_conv >>>> >>>> Mapped name None to device cuda: GeForce 840M >>>> WARNING (theano.gof.compilelock): Overriding existing lock by dead >>>> process '3202' (I am process '3351') >>>> Using cuDNN version 5005 on context None >>>> /home/luca/data/Theano-master/theano/tensor/signal/downsample.py:6: >>>> UserWarning: downsample module has been moved to the >>>> theano.tensor.signal.pool module. 
>>>> "downsample module has been moved to the theano.tensor.signal.pool >>>> module.") >>>> >>> >>>> >>> run_multi_conv.run_experiments() >>>> Disabling C code for Elemwise{mul,no_inplace} due to unsupported float16 >>>> Disabling C code for Elemwise{Cast{float32}} due to unsupported float16 >>>> Disabling C code for Elemwise{Cast{float16}} due to unsupported float16 >>>> Disabling C code for Elemwise{Cast{float16}} due to unsupported float16 >>>> Disabling C code for Alloc due to unsupported float16 >>>> Disabling C code for Cast{float16} due to unsupported float16 >>>> Disabling C code for Cast{float16} due to unsupported float16 >>>> Disabling C code for Cast{float16} due to unsupported float16 >>>> Disabling C code for Cast{float16} due to unsupported float16 >>>> Disabling C code for RandomFunction{binomial} due to unsupported float16 >>>> Disabling C code for RandomFunction{binomial} due to unsupported float16 >>>> =============================== >>>> 00001 #include <Python.h> >>>> 00002 #include <iostream> >>>> 00003 #include "theano_mod_helper.h" >>>> 00004 #include <gpuarray/array.h> >>>> 00005 #include <gpuarray/kernel.h> >>>> 00006 #include <gpuarray/error.h> >>>> 00007 #include <gpuarray/buffer.h> >>>> 00008 #include <gpuarray/buffer_blas.h> >>>> 00009 #include <numpy/arrayobject.h> >>>> 00010 #include <gpuarray_api.h> >>>> 00011 #include <math.h> >>>> 00012 #include <numpy/arrayscalars.h> >>>> 00013 #include "cudnn.h" >>>> 00014 #include "cudnn_helper.h" >>>> 00015 #include "gpuarray_helper.h" >>>> 00016 #include "gpuarray/types.h" >>>> 00017 #include "gpuarray/array.h" >>>> 00018 #include "gpuarray/util.h" >>>> 00019 #include "gpuarray/ext_cuda.h" >>>> 00020 #include "gpuarray_api.h" >>>> 00021 #include "numpy_compat.h" >>>> 00022 ////////////////////// >>>> 00023 //// Support Code >>>> 00024 ////////////////////// >>>> 00025 >>>> 00026 >>>> 00027 >>>> 00028 static int >>>> 00029 c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t >>>> desc) 
{ >>>> 00030 cudnnDataType_t dt; >>>> 00031 size_t ds; >>>> 00032 switch (var->ga.typecode) { >>>> 00033 case GA_FLOAT: >>>> 00034 dt = CUDNN_DATA_FLOAT; >>>> 00035 break; >>>> 00036 case GA_DOUBLE: >>>> 00037 dt = CUDNN_DATA_DOUBLE; >>>> 00038 break; >>>> 00039 #if CUDNN_VERSION > 3000 >>>> 00040 case GA_HALF: >>>> 00041 dt = CUDNN_DATA_HALF; >>>> 00042 break; >>>> 00043 #endif >>>> 00044 default: >>>> 00045 PyErr_SetString(PyExc_TypeError, "Non-float datatype in >>>> c_set_tensorNd"); >>>> 00046 return -1; >>>> 00047 } >>>> 00048 ds = gpuarray_get_elsize(var->ga.typecode); >>>> 00049 >>>> 00050 int strs[5], dims[5], default_stride = 1; >>>> 00051 unsigned int nd = PyGpuArray_NDIM(var); >>>> 00052 >>>> 00053 if (nd > 5) { >>>> 00054 PyErr_SetString(PyExc_TypeError, "Tensor of more than 5d"); >>>> 00055 return -1; >>>> 00056 } >>>> 00057 >>>> 00058 for (unsigned int _i = nd; _i > 0; _i--) { >>>> 00059 unsigned int i = _i - 1; >>>> 00060 strs[i] = PyGpuArray_STRIDE(var, i) ? >>>> 00061 PyGpuArray_STRIDE(var, i)/ds : default_stride; >>>> 00062 default_stride *= PyGpuArray_DIM(var, i); >>>> 00063 dims[i] = PyGpuArray_DIM(var, i); >>>> 00064 } >>>> 00065 >>>> 00066 cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, dt, nd, >>>> dims, strs); >>>> 00067 if (err != CUDNN_STATUS_SUCCESS) { >>>> 00068 PyErr_Format(PyExc_RuntimeError, >>>> 00069 "Could not set tensorNd descriptor: %s", >>>> 00070 cudnnGetErrorString(err)); >>>> 00071 return -1; >>>> 00072 } >>>> 00073 return 0; >>>> 00074 } >>>> 00075 >>>> 00076 static int >>>> 00077 c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t >>>> desc) { >>>> 00078 cudnnDataType_t dt; >>>> 00079 cudnnStatus_t err; >>>> 00080 >>>> 00081 if (!GpuArray_IS_C_CONTIGUOUS(&var->ga)) { >>>> 00082 PyErr_SetString(PyExc_ValueError, >>>> 00083 "Only contiguous filters (kernels) are >>>> supported."); >>>> 00084 return -1; >>>> 00085 } >>>> 00086 switch (var->ga.typecode) { >>>> 00087 case GA_FLOAT: >>>> 00088 dt = CUDNN_DATA_FLOAT; 
>>>> 00089 break; >>>> 00090 case GA_DOUBLE: >>>> 00091 dt = CUDNN_DATA_DOUBLE; >>>> 00092 break; >>>> 00093 #if CUDNN_VERSION > 3000 >>>> 00094 case GA_HALF: >>>> 00095 dt = CUDNN_DATA_HALF; >>>> 00096 break; >>>> 00097 #endif >>>> 00098 default: >>>> 00099 PyErr_SetString(PyExc_TypeError, "Non-float datatype in >>>> c_set_filter"); >>>> 00100 return -1; >>>> 00101 } >>>> 00102 >>>> 00103 int dims[5]; >>>> 00104 unsigned int nd = PyGpuArray_NDIM(var); >>>> 00105 >>>> 00106 if (nd > 5) { >>>> 00107 PyErr_SetString(PyExc_TypeError, "Tensor of more than 5d"); >>>> 00108 return -1; >>>> 00109 } >>>> 00110 >>>> 00111 for (unsigned int _i = nd; _i > 0; _i--) { >>>> 00112 unsigned int i = _i - 1; >>>> 00113 dims[i] = PyGpuArray_DIM(var, i); >>>> 00114 } >>>> 00115 >>>> 00116 #if CUDNN_VERSION >= 5000 >>>> 00117 err = cudnnSetFilterNdDescriptor(desc, dt, >>>> CUDNN_TENSOR_NCHW, nd, dims); >>>> 00118 #else >>>> 00119 err = cudnnSetFilterNdDescriptor(desc, dt, nd, dims); >>>> 00120 #endif >>>> 00121 >>>> 00122 if (err != CUDNN_STATUS_SUCCESS) { >>>> 00123 PyErr_Format(PyExc_RuntimeError, >>>> 00124 "Could not set filter descriptor: %s.", >>>> 00125 cudnnGetErrorString(err)); >>>> 00126 return -1; >>>> 00127 } >>>> 00128 return 0; >>>> 00129 } >>>> 00130 >>>> 00131 >>>> 00132 >>>> 00133 namespace { >>>> 00134 struct >>>> __struct_compiled_op_86feacd077d8749f42b5d82709a80ba3 { >>>> 00135 PyObject* __ERROR; >>>> 00136 >>>> 00137 PyObject* storage_V3; >>>> 00138 PyObject* storage_V5; >>>> 00139 PyObject* storage_V7; >>>> 00140 PyObject* storage_V9; >>>> 00141 PyObject* storage_V11; >>>> 00142 PyObject* storage_V13; >>>> 00143 PyObject* storage_V1; >>>> 00144 PyObject* storage_V15; >>>> 00145 >>>> 00146 PyObject* py_V15; >>>> 00147 PyGpuContextObject *V15; >>>> 00148 #define DTYPE_INPUT_0 npy_float16 >>>> 00149 #define TYPENUM_INPUT_0 23 >>>> 00150 #define ITEMSIZE_INPUT_0 2 >>>> 00151 #define DTYPE_INPUT_1 npy_float16 >>>> 00152 #define TYPENUM_INPUT_1 23 >>>> 00153 #define 
ITEMSIZE_INPUT_1 2 >>>> 00154 #define DTYPE_INPUT_2 npy_float16 >>>> 00155 #define TYPENUM_INPUT_2 23 >>>> 00156 #define ITEMSIZE_INPUT_2 2 >>>> 00157 #define DTYPE_INPUT_4 npy_float16 >>>> 00158 #define TYPENUM_INPUT_4 23 >>>> 00159 #define ITEMSIZE_INPUT_4 2 >>>> 00160 #define DTYPE_INPUT_5 npy_float16 >>>> 00161 #define TYPENUM_INPUT_5 23 >>>> 00162 #define ITEMSIZE_INPUT_5 2 >>>> 00163 #define DTYPE_OUTPUT_0 npy_float16 >>>> 00164 #define TYPENUM_OUTPUT_0 23 >>>> 00165 #define ITEMSIZE_OUTPUT_0 2 >>>> 00166 #define APPLY_SPECIFIC(str) >>>> str##_node_86feacd077d8749f42b5d82709a80ba3_0 >>>> 00167 #define CONV_INPLACE 1 >>>> 00168 #define CONV_ALGO >>>> CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM >>>> 00169 >>>> 00170 >>>> 00171 PyGpuContextObject *ctx; >>>> 00172 cudnnHandle_t APPLY_SPECIFIC(_handle); >>>> 00173 >>>> 00174 >>>> 00175 cudnnTensorDescriptor_t APPLY_SPECIFIC(input); >>>> 00176 cudnnTensorDescriptor_t APPLY_SPECIFIC(output); >>>> 00177 cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns); >>>> 00178 >>>> 00179 >>>> 00180 >>>> 00181 #ifdef CHOOSE_ALGO >>>> 00182 int reuse_algo; >>>> 00183 cudnnConvolutionFwdAlgo_t prev_algo; >>>> 00184 #ifndef CHOOSE_ONCE >>>> 00185 size_t prev_img_dims[5]; >>>> 00186 size_t prev_kern_dims[5]; >>>> 00187 #endif >>>> 00188 #endif >>>> 00189 >>>> 00190 int >>>> 00191 APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, >>>> PyGpuArrayObject *kerns, >>>> 00192 PyGpuArrayObject *om, >>>> 00193 cudnnConvolutionDescriptor_t desc, >>>> 00194 double alpha, double beta, >>>> 00195 PyGpuArrayObject **output, >>>> 00196 PyGpuContextObject *c) { >>>> 00197 cudnnStatus_t err = CUDNN_STATUS_SUCCESS; >>>> 00198 float af = alpha, bf = beta; >>>> 00199 void *alpha_p; >>>> 00200 void *beta_p; >>>> 00201 >>>> 00202 if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) { >>>> 00203 PyErr_SetString(PyExc_ValueError, >>>> 00204 "images and kernel must have the same stack size"); >>>> 00205 return 1; >>>> 00206 } >>>> 00207 >>>> 00208 if 
(c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1) >>>> 00209 return 1; >>>> 00210 if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) >>>> 00211 return 1; >>>> 00212 >>>> 00213 switch (input->ga.typecode) { >>>> 00214 case GA_DOUBLE: >>>> 00215 alpha_p = (void *)&alpha; >>>> 00216 beta_p = (void *)&beta; >>>> 00217 break; >>>> 00218 case GA_FLOAT: >>>> 00219 case GA_HALF: >>>> 00220 alpha_p = (void *)&af; >>>> 00221 beta_p = (void *)&bf; >>>> 00222 break; >>>> 00223 default: >>>> 00224 PyErr_SetString(PyExc_TypeError, "Unsupported type in >>>> convolution"); >>>> 00225 return 1; >>>> 00226 } >>>> 00227 >>>> 00228 #ifdef CONV_INPLACE >>>> 00229 Py_XDECREF(*output); >>>> 00230 *output = om; >>>> 00231 Py_INCREF(*output); >>>> 00232 #else >>>> 00233 if (theano_prep_output(output, PyGpuArray_NDIM(om), >>>> PyGpuArray_DIMS(om), >>>> 00234 om->ga.typecode, GA_C_ORDER, c) != 0) >>>> 00235 return 1; >>>> 00236 if (beta != 0.0 && pygpu_move(*output, om)) >>>> 00237 return 1; >>>> 00238 #endif >>>> 00239 >>>> 00240 if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1) >>>> 00241 return 1; >>>> 00242 >>>> 00243 cudnnConvolutionFwdAlgo_t algo = CONV_ALGO; >>>> 00244 >>>> 00245 cuda_enter(c->ctx); >>>> 00246 #ifdef CHOOSE_ALGO >>>> 00247 #ifndef CHOOSE_ONCE >>>> 00248 reuse_algo = 1; >>>> 00249 for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) { >>>> 00250 reuse_algo = (reuse_algo && >>>> 00251 PyGpuArray_DIM(input, i) == >>>> prev_img_dims[i]); >>>> 00252 reuse_algo = (reuse_algo && >>>> 00253 PyGpuArray_DIM(kerns, i) == >>>> prev_kern_dims[i]); >>>> 00254 } >>>> 00255 #endif >>>> 00256 >>>> 00257 if (!reuse_algo) { >>>> 00258 #ifdef CHOOSE_TIME >>>> 00259 int count; >>>> 00260 cudnnConvolutionFwdAlgoPerf_t choice; >>>> 00261 err = cudnnFindConvolutionForwardAlgorithm( >>>> 00262 APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), >>>> APPLY_SPECIFIC(kerns), >>>> 00263 desc, APPLY_SPECIFIC(output), 1, &count, &choice); >>>> 00264 >>>> 00265 if (err != CUDNN_STATUS_SUCCESS) {
>>>> 00266 PyErr_Format(PyExc_RuntimeError, >>>> 00267 "error selecting convolution algo: %s", >>>> 00268 cudnnGetErrorString(err)); >>>> 00269 cuda_exit(c->ctx); >>>> 00270 return 1; >>>> 00271 } >>>> 00272 algo = choice.algo; >>>> 00273 #else >>>> 00274 size_t free; >>>> 00275 int err2 = gpucontext_property(c->ctx, >>>> GA_CTX_PROP_FREE_GMEM, &free); >>>> 00276 >>>> 00277 if (err2 != GA_NO_ERROR) { >>>> 00278 PyErr_Format(PyExc_RuntimeError, "Error when trying to >>>> find the " >>>> 00279 "memory information on the GPU"); >>>> 00280 cuda_exit(c->ctx); >>>> 00281 return 1; >>>> 00282 } >>>> 00283 >>>> 00284 err = cudnnGetConvolutionForwardAlgorithm( >>>> 00285 APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), >>>> APPLY_SPECIFIC(kerns), >>>> 00286 desc, APPLY_SPECIFIC(output), >>>> 00287 CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, >>>> &algo); >>>> 00288 if (err != CUDNN_STATUS_SUCCESS) { >>>> 00289 PyErr_Format(PyExc_RuntimeError, >>>> 00290 "error selecting convolution algo: %s", >>>> 00291 cudnnGetErrorString(err)); >>>> 00292 cuda_exit(c->ctx); >>>> 00293 return 1; >>>> 00294 } >>>> 00295 #endif >>>> 00296 prev_algo = algo; >>>> 00297 } else { >>>> 00298 algo = prev_algo; >>>> 00299 } >>>> 00300 >>>> 00301 #ifdef CHOOSE_ONCE >>>> 00302 reuse_algo = 1; >>>> 00303 #else >>>> 00304 for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) { >>>> 00305 prev_img_dims[i] = PyGpuArray_DIM(input, i); >>>> 00306 prev_kern_dims[i] = PyGpuArray_DIM(kerns, i); >>>> 00307 } >>>> 00308 #endif >>>> 00309 >>>> 00310 #endif >>>> 00311 >>>> 00312 /* These two algos are not supported for 3d conv */ >>>> 00313 if (PyGpuArray_NDIM(input) == 5 && >>>> 00314 (algo == >>>> CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM || >>>> 00315 algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM)) >>>> 00316 algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; >>>> 00317 >>>> 00318 // The FFT implementation does not support strides, 1x1 >>>> filters or inputs >>>> 00319 // with a spatial dimension 
larger than 1024. The tiled-FFT >>>> implementation >>>> 00320 // does not support strides. >>>> 00321 // If the chosen implementation is FFT or tiled-FFT, >>>> validate that it can >>>> 00322 // be used on the current data and default to a safe >>>> implementation if it >>>> 00323 // can't. >>>> 00324 // The following code is 2d-specific but it is fine as FFT >>>> and tiled-FFT are >>>> 00325 // defined only for 2d filters >>>> 00326 if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT || >>>> 00327 algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && >>>> PyGpuArray_NDIM(input) == 4) { >>>> 00328 >>>> 00329 // Extract the properties of the convolution descriptor >>>> 00330 int nd; >>>> 00331 int pad[2]; >>>> 00332 int stride[2]; >>>> 00333 int upscale[2]; >>>> 00334 cudnnConvolutionMode_t mode; >>>> 00335 cudnnDataType_t data_type; >>>> 00336 err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, >>>> stride, >>>> 00337 upscale, &mode, >>>> &data_type); >>>> 00338 if (err != CUDNN_STATUS_SUCCESS) { >>>> 00339 PyErr_Format(PyExc_RuntimeError, >>>> 00340 "error getting convolution properties: %s", >>>> 00341 cudnnGetErrorString(err)); >>>> 00342 cuda_exit(c->ctx); >>>> 00343 return 1; >>>> 00344 } >>>> 00345 >>>> 00346 if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) >>>> 00347 { >>>> 00348 if (stride[0] != 1 || stride[1] != 1 || >>>> 00349 PyGpuArray_DIM(input, 2) > 1024 || >>>> PyGpuArray_DIM(input, 3) > 1024 || >>>> 00350 (PyGpuArray_DIM(kerns, 2) == 1 && >>>> PyGpuArray_DIM(kerns, 3) == 1)) >>>> 00351 { >>>> 00352 algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; >>>> 00353 } >>>> 00354 } >>>> 00355 else >>>> 00356 { >>>> 00357 // algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING >>>> 00358 if (stride[0] != 1 || stride[1] != 1) >>>> 00359 { >>>> 00360 algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; >>>> 00361 } >>>> 00362 } >>>> 00363 } >>>> 00364 >>>> 00365 { >>>> 00366 size_t worksize; >>>> 00367 gpudata *workspace; >>>> 00368 err = >>>> 
cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle), >>>> 00369 >>>> APPLY_SPECIFIC(input), >>>> 00370 >>>> APPLY_SPECIFIC(kerns), >>>> 00371 desc, >>>> 00372 >>>> APPLY_SPECIFIC(output), >>>> 00373 algo, >>>> 00374 &worksize); >>>> 00375 >>>> 00376 if (err == CUDNN_STATUS_NOT_SUPPORTED) { >>>> 00377 // Fallback to none algo if not supported >>>> 00378 // TODO: Print a warning >>>> 00379 algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; >>>> 00380 >>>> 00381 err = >>>> cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle), >>>> 00382 >>>> APPLY_SPECIFIC(input), >>>> 00383 >>>> APPLY_SPECIFIC(kerns), >>>> 00384 desc, >>>> 00385 >>>> APPLY_SPECIFIC(output), >>>> 00386 algo, >>>> 00387 &worksize); >>>> 00388 } >>>> 00389 >>>> 00390 if (err != CUDNN_STATUS_SUCCESS) { >>>> 00391 PyErr_Format(PyExc_RuntimeError, >>>> 00392 "error getting worksize: %s", >>>> 00393 cudnnGetErrorString(err)); >>>> 00394 cuda_exit(c->ctx); >>>> 00395 return 1; >>>> 00396 } >>>> 00397 >>>> 00398 /* >>>> 00399 * This is less than ideal since we need to free it after >>>> (which >>>> 00400 * introduces a synchronization point. But we don't have a >>>> module >>>> 00401 * to place a nice get_work_mem() function in. 
>>>> 00402 */ >>>> 00403 if (worksize != 0) { >>>> 00404 workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, >>>> NULL); >>>> 00405 if (workspace == NULL) { >>>> 00406 PyErr_SetString(PyExc_RuntimeError, >>>> 00407 "Could not allocate working memory"); >>>> 00408 cuda_exit(c->ctx); >>>> 00409 return 1; >>>> 00410 } >>>> 00411 } >>>> 00412 >>>> 00413 cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ); >>>> 00414 cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); >>>> 00415 cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); >>>> 00416 >>>> 00417 err = cudnnConvolutionForward( >>>> 00418 APPLY_SPECIFIC(_handle), >>>> 00419 alpha_p, >>>> 00420 APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), >>>> 00421 APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), >>>> 00422 desc, algo, >>>> 00423 worksize == 0 ? NULL : *(void **)workspace, worksize, >>>> 00424 beta_p, >>>> 00425 APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output)); >>>> 00426 >>>> 00427 if (worksize != 0) >>>> 00428 gpudata_release(workspace); >>>> 00429 >>>> 00430 cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ); >>>> 00431 cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); >>>> 00432 cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); >>>> 00433 } >>>> 00434 cuda_exit(c->ctx); >>>> 00435 >>>> 00436 if (err != CUDNN_STATUS_SUCCESS) { >>>> 00437 PyErr_Format(PyExc_RuntimeError, "error doing operation: >>>> %s", >>>> 00438 cudnnGetErrorString(err)); >>>> 00439 return 1; >>>> 00440 } >>>> 00441 return 0; >>>> 00442 } >>>> 00443 >>>> 00444 #undef DTYPE_INPUT_0 >>>> 00445 #undef TYPENUM_INPUT_0 >>>> 00446 #undef ITEMSIZE_INPUT_0 >>>> 00447 #undef DTYPE_INPUT_1 >>>> 00448 #undef TYPENUM_INPUT_1 >>>> 00449 #undef ITEMSIZE_INPUT_1 >>>> 00450 #undef DTYPE_INPUT_2 >>>> 00451 #undef TYPENUM_INPUT_2 >>>> 00452 #undef ITEMSIZE_INPUT_2 >>>> 00453 #undef DTYPE_INPUT_4 >>>> 00454 #undef TYPENUM_INPUT_4 >>>> 00455 #undef ITEMSIZE_INPUT_4 >>>> 00456 #undef DTYPE_INPUT_5 >>>> 00457 #undef TYPENUM_INPUT_5 
>>>> 00458 #undef ITEMSIZE_INPUT_5 >>>> 00459 #undef DTYPE_OUTPUT_0 >>>> 00460 #undef TYPENUM_OUTPUT_0 >>>> 00461 #undef ITEMSIZE_OUTPUT_0 >>>> 00462 #undef APPLY_SPECIFIC >>>> 00463 #undef CONV_INPLACE >>>> 00464 #undef CONV_ALGO >>>> 00465 >>>> 00466 >>>> __struct_compiled_op_86feacd077d8749f42b5d82709a80ba3() { >>>> 00467 // This is only somewhat safe because we: >>>> 00468 // 1) Are not a virtual class >>>> 00469 // 2) Do not use any virtual classes in the >>>> members >>>> 00470 // 3) Deal with mostly POD and pointers >>>> 00471 >>>> 00472 // If this changes, we would have to revise this, >>>> but for >>>> 00473 // now I am tired of chasing segfaults because >>>> 00474 // initialization code had an error and some >>>> pointer has >>>> 00475 // a junk value. >>>> 00476 memset(this, 0, sizeof(*this)); >>>> 00477 } >>>> 00478 >>>> ~__struct_compiled_op_86feacd077d8749f42b5d82709a80ba3(void) { >>>> 00479 cleanup(); >>>> 00480 } >>>> 00481 >>>> 00482 int init(PyObject* __ERROR, PyObject* storage_V3, >>>> PyObject* storage_V5, PyObject* storage_V7, PyObject* storage_V9, PyObject* >>>> storage_V11, PyObject* storage_V13, PyObject* storage_V1, PyObject* >>>> storage_V15) { >>>> 00483 Py_XINCREF(storage_V3); >>>> 00484 Py_XINCREF(storage_V5); >>>> 00485 Py_XINCREF(storage_V7); >>>> 00486 Py_XINCREF(storage_V9); >>>> 00487 Py_XINCREF(storage_V11); >>>> 00488 Py_XINCREF(storage_V13); >>>> 00489 Py_XINCREF(storage_V1); >>>> 00490 Py_XINCREF(storage_V15); >>>> 00491 this->storage_V3 = storage_V3; >>>> 00492 this->storage_V5 = storage_V5; >>>> 00493 this->storage_V7 = storage_V7; >>>> 00494 this->storage_V9 = storage_V9; >>>> 00495 this->storage_V11 = storage_V11; >>>> 00496 this->storage_V13 = storage_V13; >>>> 00497 this->storage_V1 = storage_V1; >>>> 00498 this->storage_V15 = storage_V15; >>>> 00499 >>>> 00500 >>>> 00501 >>>> 00502 >>>> 00503 >>>> 00504 >>>> 00505 >>>> 00506 >>>> 00507 >>>> 00508 py_V15 = PyList_GET_ITEM(storage_V15, 0); >>>> 00509 {Py_XINCREF(py_V15);} 
>>>> 00510 >>>> 00511 if (!PyObject_TypeCheck(py_V15, &PyGpuContextType)) { >>>> 00512 PyErr_SetString(PyExc_TypeError, "expected a GpuContext"); >>>> 00513 { >>>> 00514 if (!PyErr_Occurred()) { >>>> 00515 PyErr_SetString(PyExc_RuntimeError, >>>> 00516 "Unexpected error in an Op's C code. " >>>> 00517 "No Python exception was set."); >>>> 00518 } >>>> 00519 return 15; >>>> 00520 } >>>> 00521 } >>>> 00522 >>>> 00523 V15 = (PyGpuContextObject *)py_V15; >>>> 00524 Py_INCREF(V15); >>>> 00525 >>>> 00526 >>>> 00527 #define DTYPE_INPUT_0 npy_float16 >>>> 00528 #define TYPENUM_INPUT_0 23 >>>> 00529 #define ITEMSIZE_INPUT_0 2 >>>> 00530 #define DTYPE_INPUT_1 npy_float16 >>>> 00531 #define TYPENUM_INPUT_1 23 >>>> 00532 #define ITEMSIZE_INPUT_1 2 >>>> 00533 #define DTYPE_INPUT_2 npy_float16 >>>> 00534 #define TYPENUM_INPUT_2 23 >>>> 00535 #define ITEMSIZE_INPUT_2 2 >>>> 00536 #define DTYPE_INPUT_4 npy_float16 >>>> 00537 #define TYPENUM_INPUT_4 23 >>>> 00538 #define ITEMSIZE_INPUT_4 2 >>>> 00539 #define DTYPE_INPUT_5 npy_float16 >>>> 00540 #define TYPENUM_INPUT_5 23 >>>> 00541 #define ITEMSIZE_INPUT_5 2 >>>> 00542 #define DTYPE_OUTPUT_0 npy_float16 >>>> 00543 #define TYPENUM_OUTPUT_0 23 >>>> 00544 #define ITEMSIZE_OUTPUT_0 2 >>>> 00545 #define APPLY_SPECIFIC(str) >>>> str##_node_86feacd077d8749f42b5d82709a80ba3_0 >>>> 00546 #define CONV_INPLACE 1 >>>> 00547 #define CONV_ALGO >>>> CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM >>>> 00548 #define FAIL { \ >>>> 00549 if (!PyErr_Occurred()) { \ >>>> 00550 PyErr_SetString(PyExc_RuntimeError, \ >>>> 00551 "Unexpected error in an Op's C code. " \ >>>> 00552 "No Python exception was set."); \ >>>> 00553 } \ >>>> 00554 return 17; \ >>>> 00555 } >>>> 00556 #define PARAMS V15 >>>> 00557 >>>> 00558 >>>> 00559 { >>>> 00560 // We need to keep a reference here to have it available in >>>> the destructor. 
>>>> 00561 ctx = PARAMS; >>>> 00562 Py_INCREF(ctx); >>>> 00563 >>>> 00564 cuda_enter(PARAMS->ctx); >>>> 00565 cudnnStatus_t err; >>>> 00566 APPLY_SPECIFIC(_handle) = NULL; >>>> 00567 if ((err = cudnnCreate(&APPLY_SPECIFIC(_handle))) != >>>> CUDNN_STATUS_SUCCESS) { >>>> 00568 PyErr_Format(PyExc_RuntimeError, "could not create cuDNN >>>> handle: %s", >>>> 00569 cudnnGetErrorString(err)); >>>> 00570 cuda_exit(PARAMS->ctx); >>>> 00571 FAIL; >>>> 00572 } >>>> 00573 if ((err = cudnnSetStream(APPLY_SPECIFIC(_handle), >>>> 00574 cuda_get_stream(PARAMS->ctx))) != >>>> CUDNN_STATUS_SUCCESS) { >>>> 00575 PyErr_Format(PyExc_RuntimeError, "Could not set cudnn >>>> stream: %s", >>>> 00576 cudnnGetErrorString(err)); >>>> 00577 cuda_exit(PARAMS->ctx); >>>> 00578 FAIL; >>>> 00579 } >>>> 00580 cuda_exit(PARAMS->ctx); >>>> 00581 } >>>> 00582 >>>> 00583 >>>> 00584 >>>> 00585 cudnnStatus_t APPLY_SPECIFIC(err); >>>> 00586 APPLY_SPECIFIC(input) = NULL; >>>> 00587 APPLY_SPECIFIC(output) = NULL; >>>> 00588 APPLY_SPECIFIC(kerns) = NULL; >>>> 00589 if ((APPLY_SPECIFIC(err) = >>>> cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != >>>> CUDNN_STATUS_SUCCESS) { >>>> 00590 PyErr_Format(PyExc_MemoryError, "could not allocate tensor >>>> descriptor " >>>> 00591 "(inp): %s", >>>> cudnnGetErrorString(APPLY_SPECIFIC(err))); >>>> 00592 FAIL; >>>> 00593 } >>>> 00594 if ((APPLY_SPECIFIC(err) = >>>> cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != >>>> CUDNN_STATUS_SUCCESS) { >>>> 00595 PyErr_Format(PyExc_MemoryError, "could not allocate tensor >>>> descriptor " >>>> 00596 "(out): %s", >>>> cudnnGetErrorString(APPLY_SPECIFIC(err))); >>>> 00597 FAIL; >>>> 00598 } >>>> 00599 if ((APPLY_SPECIFIC(err) = >>>> cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns))) != >>>> CUDNN_STATUS_SUCCESS) { >>>> 00600 PyErr_Format(PyExc_MemoryError, "could not allocate filter >>>> descriptor: %s", >>>> 00601 cudnnGetErrorString(APPLY_SPECIFIC(err))); >>>> 00602 FAIL; >>>> 00603 } >>>> 00604 >>>> 00605 >>>> 
00606 >>>> 00607 #ifdef CHOOSE_ALGO >>>> 00608 reuse_algo = 0; >>>> 00609 prev_algo = CONV_ALGO; >>>> 00610 #ifndef CHOOSE_ONCE >>>> 00611 memset(prev_img_dims, 0, sizeof(prev_img_dims)); >>>> 00612 memset(prev_kern_dims, 0, sizeof(prev_kern_dims)); >>>> 00613 #endif >>>> 00614 #endif >>>> 00615 >>>> 00616 >>>> 00617 #undef FAIL >>>> 00618 #undef PARAMS >>>> 00619 #undef DTYPE_INPUT_0 >>>> 00620 #undef TYPENUM_INPUT_0 >>>> 00621 #undef ITEMSIZE_INPUT_0 >>>> 00622 #undef DTYPE_INPUT_1 >>>> 00623 #undef TYPENUM_INPUT_1 >>>> 00624 #undef ITEMSIZE_INPUT_1 >>>> 00625 #undef DTYPE_INPUT_2 >>>> 00626 #undef TYPENUM_INPUT_2 >>>> 00627 #undef ITEMSIZE_INPUT_2 >>>> 00628 #undef DTYPE_INPUT_4 >>>> 00629 #undef TYPENUM_INPUT_4 >>>> 00630 #undef ITEMSIZE_INPUT_4 >>>> 00631 #undef DTYPE_INPUT_5 >>>> 00632 #undef TYPENUM_INPUT_5 >>>> 00633 #undef ITEMSIZE_INPUT_5 >>>> 00634 #undef DTYPE_OUTPUT_0 >>>> 00635 #undef TYPENUM_OUTPUT_0 >>>> 00636 #undef ITEMSIZE_OUTPUT_0 >>>> 00637 #undef APPLY_SPECIFIC >>>> 00638 #undef CONV_INPLACE >>>> 00639 #undef CONV_ALGO >>>> 00640 this->__ERROR = __ERROR; >>>> 00641 return 0; >>>> 00642 } >>>> 00643 void cleanup(void) { >>>> 00644 __label_1: >>>> 00645 >>>> 00646 double __DUMMY_1; >>>> 00647 __label_3: >>>> 00648 >>>> 00649 double __DUMMY_3; >>>> 00650 __label_5: >>>> 00651 >>>> 00652 double __DUMMY_5; >>>> 00653 __label_7: >>>> 00654 >>>> 00655 double __DUMMY_7; >>>> 00656 __label_9: >>>> 00657 >>>> 00658 double __DUMMY_9; >>>> 00659 __label_11: >>>> 00660 >>>> 00661 double __DUMMY_11; >>>> 00662 __label_13: >>>> 00663 >>>> 00664 double __DUMMY_13; >>>> 00665 __label_15: >>>> 00666 Py_XDECREF(V15); V15 = NULL; >>>> 00667 {Py_XDECREF(py_V15);} >>>> 00668 >>>> 00669 double __DUMMY_15; >>>> 00670 __label_18: >>>> 00671 >>>> 00672 #define DTYPE_INPUT_0 npy_float16 >>>> 00673 #define TYPENUM_INPUT_0 23 >>>> 00674 #define ITEMSIZE_INPUT_0 2 >>>> 00675 #define DTYPE_INPUT_1 npy_float16 >>>> 00676 #define TYPENUM_INPUT_1 23 >>>> 00677 #define 
ITEMSIZE_INPUT_1 2 >>>> 00678 #define DTYPE_INPUT_2 npy_float16 >>>> 00679 #define TYPENUM_INPUT_2 23 >>>> 00680 #define ITEMSIZE_INPUT_2 2 >>>> 00681 #define DTYPE_INPUT_4 npy_float16 >>>> 00682 #define TYPENUM_INPUT_4 23 >>>> 00683 #define ITEMSIZE_INPUT_4 2 >>>> 00684 #define DTYPE_INPUT_5 npy_float16 >>>> 00685 #define TYPENUM_INPUT_5 23 >>>> 00686 #define ITEMSIZE_INPUT_5 2 >>>> 00687 #define DTYPE_OUTPUT_0 npy_float16 >>>> 00688 #define TYPENUM_OUTPUT_0 23 >>>> 00689 #define ITEMSIZE_OUTPUT_0 2 >>>> 00690 #define APPLY_SPECIFIC(str) >>>> str##_node_86feacd077d8749f42b5d82709a80ba3_0 >>>> 00691 #define CONV_INPLACE 1 >>>> 00692 #define CONV_ALGO >>>> CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM >>>> 00693 >>>> 00694 >>>> 00695 cuda_enter(ctx->ctx); >>>> 00696 cudnnDestroy(APPLY_SPECIFIC(_handle)); >>>> 00697 cuda_exit(ctx->ctx); >>>> 00698 Py_DECREF((PyObject *)ctx); >>>> 00699 >>>> 00700 >>>> 00701 if (APPLY_SPECIFIC(input) != NULL) >>>> 00702 cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input)); >>>> 00703 if (APPLY_SPECIFIC(output) != NULL) >>>> 00704 cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); >>>> 00705 if (APPLY_SPECIFIC(kerns) != NULL) >>>> 00706 cudnnDestroyFilterDescriptor(APPLY_SPECIFIC(kerns)); >>>> 00707 >>>> 00708 #undef DTYPE_INPUT_0 >>>> 00709 #undef TYPENUM_INPUT_0 >>>> 00710 #undef ITEMSIZE_INPUT_0 >>>> 00711 #undef DTYPE_INPUT_1 >>>> 00712 #undef TYPENUM_INPUT_1 >>>> 00713 #undef ITEMSIZE_INPUT_1 >>>> 00714 #undef DTYPE_INPUT_2 >>>> 00715 #undef TYPENUM_INPUT_2 >>>> 00716 #undef ITEMSIZE_INPUT_2 >>>> 00717 #undef DTYPE_INPUT_4 >>>> 00718 #undef TYPENUM_INPUT_4 >>>> 00719 #undef ITEMSIZE_INPUT_4 >>>> 00720 #undef DTYPE_INPUT_5 >>>> 00721 #undef TYPENUM_INPUT_5 >>>> 00722 #undef ITEMSIZE_INPUT_5 >>>> 00723 #undef DTYPE_OUTPUT_0 >>>> 00724 #undef TYPENUM_OUTPUT_0 >>>> 00725 #undef ITEMSIZE_OUTPUT_0 >>>> 00726 #undef APPLY_SPECIFIC >>>> 00727 #undef CONV_INPLACE >>>> 00728 #undef CONV_ALGO >>>> 00729 double __DUMMY_18; >>>> 00730 
>>>> 00731 Py_XDECREF(this->storage_V3); >>>> 00732 Py_XDECREF(this->storage_V5); >>>> 00733 Py_XDECREF(this->storage_V7); >>>> 00734 Py_XDECREF(this->storage_V9); >>>> 00735 Py_XDECREF(this->storage_V11); >>>> 00736 Py_XDECREF(this->storage_V13); >>>> 00737 Py_XDECREF(this->storage_V1); >>>> 00738 Py_XDECREF(this->storage_V15); >>>> 00739 } >>>> 00740 int run(void) { >>>> 00741 int __failure = 0; >>>> 00742 >>>> 00743 PyObject* py_V1; >>>> 00744 >>>> 00745 PyGpuArrayObject *V1; >>>> 00746 >>>> 00747 PyObject* py_V3; >>>> 00748 >>>> 00749 PyGpuArrayObject *V3; >>>> 00750 >>>> 00751 PyObject* py_V5; >>>> 00752 >>>> 00753 PyGpuArrayObject *V5; >>>> 00754 >>>> 00755 PyObject* py_V7; >>>> 00756 >>>> 00757 PyGpuArrayObject *V7; >>>> 00758 >>>> 00759 PyObject* py_V9; >>>> 00760 >>>> 00761 cudnnConvolutionDescriptor_t V9; >>>> 00762 >>>> 00763 PyObject* py_V11; >>>> 00764 >>>> 00765 typedef npy_float16 V11_dtype; // Deprecated >>>> use dtype_V11 instead. >>>> 00766 typedef npy_float16 dtype_V11; >>>> 00767 >>>> 00768 npy_float16 V11; >>>> 00769 >>>> 00770 PyObject* py_V13; >>>> 00771 >>>> 00772 typedef npy_float16 V13_dtype; // Deprecated >>>> use dtype_V13 instead. >>>> 00773 typedef npy_float16 dtype_V13; >>>> 00774 >>>> 00775 npy_float16 V13; >>>> 00776 >>>> 00777 { >>>> 00778 >>>> 00779 py_V1 = PyList_GET_ITEM(storage_V1, 0); >>>> 00780 {Py_XINCREF(py_V1);} >>>> 00781 >>>> 00782 if (py_V1 == Py_None) >>>> 00783 { >>>> 00784 V1 = NULL; >>>> 00785 } >>>> 00786 else >>>> 00787 { >>>> 00788 >>>> 00789 V1 = NULL; >>>> 00790 if (py_V1 == Py_None) { >>>> 00791 PyErr_SetString(PyExc_ValueError, "expected a >>>> GpuArray, not None"); >>>> 00792 { >>>> 00793 __failure = 2; >>>> 00794 if (!PyErr_Occurred()) { >>>> 00795 PyErr_SetString(PyExc_RuntimeError, >>>> 00796 "Unexpected error in an Op's C code. 
" >>>> 00797 "No Python exception was set."); >>>> 00798 } >>>> 00799 goto __label_2;} >>>> 00800 } >>>> 00801 /* First check if we are the base type exactly (the >>>> most common case), >>>> 00802 then do the full subclass check if needed. */ >>>> 00803 if (py_V1->ob_type != &PyGpuArrayType && >>>> 00804 !PyObject_TypeCheck(py_V1, &PyGpuArrayType)) { >>>> 00805 PyE >>>> >>> -- >>> >>> --- >>> You received this message because you are subscribed to the Google >>> Groups "theano-users" group. >>> To unsubscribe from this group and stop receiving emails from it, send >>> an email to [email protected]. >>> >>> For more options, visit https://groups.google.com/d/optout. >>> >> >> -- > > --- > You received this message because you are subscribed to the Google Groups > "theano-users" group. > To unsubscribe from this group and stop receiving emails from it, send an > email to [email protected]. > For more options, visit https://groups.google.com/d/optout. > -- --- You received this message because you are subscribed to the Google Groups "theano-users" group. To unsubscribe from this group and stop receiving emails from it, send an email to [email protected]. For more options, visit https://groups.google.com/d/optout.
