OK, I'll try. Thanks.

On Thursday, July 14, 2016 at 11:44:41 PM UTC+2, Arnaud Bergeron wrote:
>
> I can't reproduce your problem using a simple convolution in float16.
> Either your code is doing something unexpected, or the problem has been
> fixed in the development version.
>
> In any case, the development version is a much better option for the new
> backend and float16, so I encourage you to upgrade and try again:
> http://deeplearning.net/software/theano/install.html#bleeding-edge-install-instructions
>
> 2016-07-14 4:22 GMT-04:00 <[email protected]>:
>
>> Here is .theanorc:
>>
>> [global]
>> floatX = float16
>> device = cuda
>>
>> [cuda]
>> root = /usr/local/cuda-7.5
>>
>> [nvcc]
>> fastmath = True
>>
>> optimizer = fast_compile
>>
>> On Thursday, July 14, 2016 at 10:19:56 AM UTC+2, [email protected] wrote:
>>>
>>> Hi Arnaud,
>>> I put _f16_ok = True in dnn.py (attached).
>>>
>>> This is the error I received:

Python 2.7.11 |Anaconda custom (64-bit)| (default, Dec 6 2015, 18:08:32)
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)] on linux2
Type "help", "copyright", "credits" or "license" for more information.
Anaconda is brought to you by Continuum Analytics.
Please check out: http://continuum.io/thanks and https://anaconda.org
>>> import run_multi_conv
Mapped name None to device cuda: GeForce 840M
WARNING (theano.gof.compilelock): Overriding existing lock by dead process '3202' (I am process '3351')
Using cuDNN version 5005 on context None
/home/luca/data/Theano-master/theano/tensor/signal/downsample.py:6: UserWarning: downsample module has been moved to the theano.tensor.signal.pool module.
  "downsample module has been moved to the theano.tensor.signal.pool module.")
>>> run_multi_conv.run_experiments()
Disabling C code for Elemwise{mul,no_inplace} due to unsupported float16
Disabling C code for Elemwise{Cast{float32}} due to unsupported float16
Disabling C code for Elemwise{Cast{float16}} due to unsupported float16
Disabling C code for Elemwise{Cast{float16}} due to unsupported float16
Disabling C code for Alloc due to unsupported float16
Disabling C code for Cast{float16} due to unsupported float16
Disabling C code for Cast{float16} due to unsupported float16
Disabling C code for Cast{float16} due to unsupported float16
Disabling C code for Cast{float16} due to unsupported float16
Disabling C code for RandomFunction{binomial} due to unsupported float16
Disabling C code for RandomFunction{binomial} due to unsupported float16
===============================
#include <Python.h>
#include <iostream>
#include "theano_mod_helper.h"
#include <gpuarray/array.h>
#include <gpuarray/kernel.h>
#include <gpuarray/error.h>
#include <gpuarray/buffer.h>
#include <gpuarray/buffer_blas.h>
#include <numpy/arrayobject.h>
#include <gpuarray_api.h>
#include <math.h>
#include <numpy/arrayscalars.h>
#include "cudnn.h"
#include "cudnn_helper.h"
#include "gpuarray_helper.h"
#include "gpuarray/types.h"
#include "gpuarray/array.h"
#include "gpuarray/util.h"
#include "gpuarray/ext_cuda.h"
#include "gpuarray_api.h"
#include "numpy_compat.h"
//////////////////////
//// Support Code
//////////////////////
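/* The two helpers below translate a GpuArray's dtype, shape and strides
   into cuDNN descriptors. Note that the GA_HALF -> CUDNN_DATA_HALF case is
   only compiled in for CUDNN_VERSION > 3000; this is where float16 support
   enters the module. */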
static int
c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
  cudnnDataType_t dt;
  size_t ds;
  switch (var->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
#if CUDNN_VERSION > 3000
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
#endif
  default:
    PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensorNd");
    return -1;
  }
  ds = gpuarray_get_elsize(var->ga.typecode);

  int strs[5], dims[5], default_stride = 1;
  unsigned int nd = PyGpuArray_NDIM(var);

  if (nd > 5) {
    PyErr_SetString(PyExc_TypeError, "Tensor of more than 5d");
    return -1;
  }

  for (unsigned int _i = nd; _i > 0; _i--) {
    unsigned int i = _i - 1;
    strs[i] = PyGpuArray_STRIDE(var, i) ?
      PyGpuArray_STRIDE(var, i)/ds : default_stride;
    default_stride *= PyGpuArray_DIM(var, i);
    dims[i] = PyGpuArray_DIM(var, i);
  }

  cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, dt, nd, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set tensorNd descriptor: %s",
                 cudnnGetErrorString(err));
    return -1;
  }
  return 0;
}

static int
c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
  cudnnDataType_t dt;
  cudnnStatus_t err;

  if (!GpuArray_IS_C_CONTIGUOUS(&var->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "Only contiguous filters (kernels) are supported.");
    return -1;
  }
  switch (var->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
#if CUDNN_VERSION > 3000
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
#endif
  default:
    PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_filter");
    return -1;
  }

  int dims[5];
  unsigned int nd = PyGpuArray_NDIM(var);

  if (nd > 5) {
    PyErr_SetString(PyExc_TypeError, "Tensor of more than 5d");
    return -1;
  }

  for (unsigned int _i = nd; _i > 0; _i--) {
    unsigned int i = _i - 1;
    dims[i] = PyGpuArray_DIM(var, i);
  }

#if CUDNN_VERSION >= 5000
  err = cudnnSetFilterNdDescriptor(desc, dt, CUDNN_TENSOR_NCHW, nd, dims);
#else
  err = cudnnSetFilterNdDescriptor(desc, dt, nd, dims);
#endif

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set filter descriptor: %s.",
                 cudnnGetErrorString(err));
    return -1;
  }
  return 0;
}
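/* Everything below is specialized for one node of the compiled graph: the
   storage_V* members are the node's input/output storage cells, and the
   DTYPE/TYPENUM defines (npy_float16, NumPy typenum 23) show that every
   input and the output of this node are float16. */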
namespace {
struct __struct_compiled_op_86feacd077d8749f42b5d82709a80ba3 {
  PyObject* __ERROR;

  PyObject* storage_V3;
  PyObject* storage_V5;
  PyObject* storage_V7;
  PyObject* storage_V9;
  PyObject* storage_V11;
  PyObject* storage_V13;
  PyObject* storage_V1;
  PyObject* storage_V15;

  PyObject* py_V15;
  PyGpuContextObject *V15;
#define DTYPE_INPUT_0 npy_float16
#define TYPENUM_INPUT_0 23
#define ITEMSIZE_INPUT_0 2
#define DTYPE_INPUT_1 npy_float16
#define TYPENUM_INPUT_1 23
#define ITEMSIZE_INPUT_1 2
#define DTYPE_INPUT_2 npy_float16
#define TYPENUM_INPUT_2 23
#define ITEMSIZE_INPUT_2 2
#define DTYPE_INPUT_4 npy_float16
#define TYPENUM_INPUT_4 23
#define ITEMSIZE_INPUT_4 2
#define DTYPE_INPUT_5 npy_float16
#define TYPENUM_INPUT_5 23
#define ITEMSIZE_INPUT_5 2
#define DTYPE_OUTPUT_0 npy_float16
#define TYPENUM_OUTPUT_0 23
#define ITEMSIZE_OUTPUT_0 2
#define APPLY_SPECIFIC(str) str##_node_86feacd077d8749f42b5d82709a80ba3_0
#define CONV_INPLACE 1
#define CONV_ALGO CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM

  PyGpuContextObject *ctx;
  cudnnHandle_t APPLY_SPECIFIC(_handle);

  cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
  cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
  cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns);

#ifdef CHOOSE_ALGO
  int reuse_algo;
  cudnnConvolutionFwdAlgo_t prev_algo;
#ifndef CHOOSE_ONCE
  size_t prev_img_dims[5];
  size_t prev_kern_dims[5];
#endif
#endif

  int
  APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
                           PyGpuArrayObject *om,
                           cudnnConvolutionDescriptor_t desc,
                           double alpha, double beta,
                           PyGpuArrayObject **output,
                           PyGpuContextObject *c) {
    cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
    float af = alpha, bf = beta;
    void *alpha_p;
    void *beta_p;

    if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
      PyErr_SetString(PyExc_ValueError,
                      "images and kernel must have the same stack size");
      return 1;
    }

    if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
      return 1;
    if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
      return 1;

    switch (input->ga.typecode) {
    case GA_DOUBLE:
      alpha_p = (void *)&alpha;
      beta_p = (void *)&beta;
      break;
    case GA_FLOAT:
    case GA_HALF:
      alpha_p = (void *)&af;
      beta_p = (void *)&bf;
      break;
    default:
      PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
      return 1;
    }

#ifdef CONV_INPLACE
    Py_XDECREF(*output);
    *output = om;
    Py_INCREF(*output);
#else
    if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
                           om->ga.typecode, GA_C_ORDER, c) != 0)
      return 1;
    if (beta != 0.0 && pygpu_move(*output, om))
      return 1;
#endif

    if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1)
      return 1;

    cudnnConvolutionFwdAlgo_t algo = CONV_ALGO;

    cuda_enter(c->ctx);
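/* Algorithm selection (compiled only when CHOOSE_ALGO is defined): the
   previously chosen algorithm is reused when the input/kernel shapes have
   not changed; otherwise it is picked either by benchmarking
   (cudnnFindConvolutionForwardAlgorithm, under CHOOSE_TIME) or by cuDNN's
   heuristic bounded by the free GPU memory
   (cudnnGetConvolutionForwardAlgorithm). */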
#ifdef CHOOSE_ALGO
#ifndef CHOOSE_ONCE
    reuse_algo = 1;
    for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
      reuse_algo = (reuse_algo &&
                    PyGpuArray_DIM(input, i) == prev_img_dims[i]);
      reuse_algo = (reuse_algo &&
                    PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
    }
#endif

    if (!reuse_algo) {
#ifdef CHOOSE_TIME
      int count;
      cudnnConvolutionFwdAlgoPerf_t choice;
      err = cudnnFindConvolutionForwardAlgorithm(
        APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
        desc, APPLY_SPECIFIC(output), 1, &count, &choice);

      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "error selecting convolution algo: %s",
                     cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
      }
      algo = choice.algo;
#else
      size_t free;
      int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_FREE_GMEM, &free);

      if (err2 != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
                     "memory information on the GPU");
        cuda_exit(c->ctx);
        return 1;
      }

      err = cudnnGetConvolutionForwardAlgorithm(
        APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
        desc, APPLY_SPECIFIC(output),
        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "error selecting convolution algo: %s",
                     cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
      }
#endif
      prev_algo = algo;
    } else {
      algo = prev_algo;
    }

#ifdef CHOOSE_ONCE
    reuse_algo = 1;
#else
    for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
      prev_img_dims[i] = PyGpuArray_DIM(input, i);
      prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
    }
#endif

#endif
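    /* Whatever was selected above, the checks below fall back to
       CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM whenever the chosen
       algorithm is known not to support the current input: the GEMM
       variants for 3d convolutions, and strided, oversized or 1x1 inputs
       for the FFT variants. */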
    /* These two algos are not supported for 3d conv */
    if (PyGpuArray_NDIM(input) == 5 &&
        (algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
         algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM))
      algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;

    // The FFT implementation does not support strides, 1x1 filters or inputs
    // with a spatial dimension larger than 1024. The tiled-FFT implementation
    // does not support strides.
    // If the chosen implementation is FFT or tiled-FFT, validate that it can
    // be used on the current data and default to a safe implementation if it
    // can't.
    // The following code is 2d-specific but it is fine as FFT and tiled-FFT
    // are defined only for 2d filters
    if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
         algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) &&
        PyGpuArray_NDIM(input) == 4) {

      // Extract the properties of the convolution descriptor
      int nd;
      int pad[2];
      int stride[2];
      int upscale[2];
      cudnnConvolutionMode_t mode;
      cudnnDataType_t data_type;
      err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
                                            upscale, &mode, &data_type);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "error getting convolution properties: %s",
                     cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
      }

      if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT)
      {
        if (stride[0] != 1 || stride[1] != 1 ||
            PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
            (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
        {
          algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
        }
      }
      else
      {
        // algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
        if (stride[0] != 1 || stride[1] != 1)
        {
          algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
        }
      }
    }

    {
      size_t worksize;
      gpudata *workspace;
      err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
                                                    APPLY_SPECIFIC(input),
                                                    APPLY_SPECIFIC(kerns),
                                                    desc,
                                                    APPLY_SPECIFIC(output),
                                                    algo,
                                                    &worksize);

      if (err == CUDNN_STATUS_NOT_SUPPORTED) {
        // Fallback to none algo if not supported
        // TODO: Print a warning
        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;

        err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
                                                      APPLY_SPECIFIC(input),
                                                      APPLY_SPECIFIC(kerns),
                                                      desc,
                                                      APPLY_SPECIFIC(output),
                                                      algo,
                                                      &worksize);
      }

      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "error getting worksize: %s",
                     cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
      }

      /*
       * This is less than ideal since we need to free it after (which
       * introduces a synchronization point. But we don't have a module
       * to place a nice get_work_mem() function in.
       */
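      /* The forward pass proper: allocate the workspace cuDNN asked for,
         order this work after pending reads/writes on the buffers
         (cuda_wait), launch cudnnConvolutionForward, release the workspace,
         and mark the buffers (cuda_record) so later operations on them
         synchronize correctly. */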
      if (worksize != 0) {
        workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
        if (workspace == NULL) {
          PyErr_SetString(PyExc_RuntimeError,
                          "Could not allocate working memory");
          cuda_exit(c->ctx);
          return 1;
        }
      }

      cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
      cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
      cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);

      err = cudnnConvolutionForward(
        APPLY_SPECIFIC(_handle),
        alpha_p,
        APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
        APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
        desc, algo,
        worksize == 0 ? NULL : *(void **)workspace, worksize,
        beta_p,
        APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output));

      if (worksize != 0)
        gpudata_release(workspace);

      cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
      cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
      cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
    }
    cuda_exit(c->ctx);

    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
                   cudnnGetErrorString(err));
      return 1;
    }
    return 0;
  }

#undef DTYPE_INPUT_0
#undef TYPENUM_INPUT_0
#undef ITEMSIZE_INPUT_0
#undef DTYPE_INPUT_1
#undef TYPENUM_INPUT_1
#undef ITEMSIZE_INPUT_1
#undef DTYPE_INPUT_2
#undef TYPENUM_INPUT_2
#undef ITEMSIZE_INPUT_2
#undef DTYPE_INPUT_4
#undef TYPENUM_INPUT_4
#undef ITEMSIZE_INPUT_4
#undef DTYPE_INPUT_5
#undef TYPENUM_INPUT_5
#undef ITEMSIZE_INPUT_5
#undef DTYPE_OUTPUT_0
#undef TYPENUM_OUTPUT_0
#undef ITEMSIZE_OUTPUT_0
#undef APPLY_SPECIFIC
#undef CONV_INPLACE
#undef CONV_ALGO
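  /* Boilerplate shared by Theano-generated modules: a constructor that
     zero-fills the (POD-only) struct, a destructor that delegates to
     cleanup(), an init() that takes ownership of the storage cells and
     creates the cuDNN handle and descriptors, and a run() that executes
     the node. */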
  __struct_compiled_op_86feacd077d8749f42b5d82709a80ba3() {
    // This is only somewhat safe because we:
    // 1) Are not a virtual class
    // 2) Do not use any virtual classes in the members
    // 3) Deal with mostly POD and pointers

    // If this changes, we would have to revise this, but for
    // now I am tired of chasing segfaults because
    // initialization code had an error and some pointer has
    // a junk value.
    memset(this, 0, sizeof(*this));
  }

  ~__struct_compiled_op_86feacd077d8749f42b5d82709a80ba3(void) {
    cleanup();
  }

  int init(PyObject* __ERROR, PyObject* storage_V3, PyObject* storage_V5,
           PyObject* storage_V7, PyObject* storage_V9, PyObject* storage_V11,
           PyObject* storage_V13, PyObject* storage_V1, PyObject* storage_V15) {
    Py_XINCREF(storage_V3);
    Py_XINCREF(storage_V5);
    Py_XINCREF(storage_V7);
    Py_XINCREF(storage_V9);
    Py_XINCREF(storage_V11);
    Py_XINCREF(storage_V13);
    Py_XINCREF(storage_V1);
    Py_XINCREF(storage_V15);
    this->storage_V3 = storage_V3;
    this->storage_V5 = storage_V5;
    this->storage_V7 = storage_V7;
    this->storage_V9 = storage_V9;
    this->storage_V11 = storage_V11;
    this->storage_V13 = storage_V13;
    this->storage_V1 = storage_V1;
    this->storage_V15 = storage_V15;

    py_V15 = PyList_GET_ITEM(storage_V15, 0);
    {Py_XINCREF(py_V15);}

    if (!PyObject_TypeCheck(py_V15, &PyGpuContextType)) {
      PyErr_SetString(PyExc_TypeError, "expected a GpuContext");
      {
        if (!PyErr_Occurred()) {
          PyErr_SetString(PyExc_RuntimeError,
                          "Unexpected error in an Op's C code. "
                          "No Python exception was set.");
        }
        return 15;
      }
    }

    V15 = (PyGpuContextObject *)py_V15;
    Py_INCREF(V15);

#define DTYPE_INPUT_0 npy_float16
#define TYPENUM_INPUT_0 23
#define ITEMSIZE_INPUT_0 2
#define DTYPE_INPUT_1 npy_float16
#define TYPENUM_INPUT_1 23
#define ITEMSIZE_INPUT_1 2
#define DTYPE_INPUT_2 npy_float16
#define TYPENUM_INPUT_2 23
#define ITEMSIZE_INPUT_2 2
#define DTYPE_INPUT_4 npy_float16
#define TYPENUM_INPUT_4 23
#define ITEMSIZE_INPUT_4 2
#define DTYPE_INPUT_5 npy_float16
#define TYPENUM_INPUT_5 23
#define ITEMSIZE_INPUT_5 2
#define DTYPE_OUTPUT_0 npy_float16
#define TYPENUM_OUTPUT_0 23
#define ITEMSIZE_OUTPUT_0 2
#define APPLY_SPECIFIC(str) str##_node_86feacd077d8749f42b5d82709a80ba3_0
#define CONV_INPLACE 1
#define CONV_ALGO CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
#define FAIL { \
  if (!PyErr_Occurred()) { \
    PyErr_SetString(PyExc_RuntimeError, \
                    "Unexpected error in an Op's C code. " \
                    "No Python exception was set."); \
  } \
  return 17; \
}
#define PARAMS V15
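    /* Create a cuDNN handle tied to this GPU context and bind it to the
       context's CUDA stream; any failure raises a Python RuntimeError
       through the FAIL macro defined above. */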
    {
      // We need to keep a reference here to have it available in the destructor.
      ctx = PARAMS;
      Py_INCREF(ctx);

      cuda_enter(PARAMS->ctx);
      cudnnStatus_t err;
      APPLY_SPECIFIC(_handle) = NULL;
      if ((err = cudnnCreate(&APPLY_SPECIFIC(_handle))) != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s",
                     cudnnGetErrorString(err));
        cuda_exit(PARAMS->ctx);
        FAIL;
      }
      if ((err = cudnnSetStream(APPLY_SPECIFIC(_handle),
                                cuda_get_stream(PARAMS->ctx))) != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError, "Could not set cudnn stream: %s",
                     cudnnGetErrorString(err));
        cuda_exit(PARAMS->ctx);
        FAIL;
      }
      cuda_exit(PARAMS->ctx);
    }

    cudnnStatus_t APPLY_SPECIFIC(err);
    APPLY_SPECIFIC(input) = NULL;
    APPLY_SPECIFIC(output) = NULL;
    APPLY_SPECIFIC(kerns) = NULL;
    if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
                   "(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
      FAIL;
    }
    if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
                   "(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
      FAIL;
    }
    if ((APPLY_SPECIFIC(err) = cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns))) != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_MemoryError, "could not allocate filter descriptor: %s",
                   cudnnGetErrorString(APPLY_SPECIFIC(err)));
      FAIL;
    }

#ifdef CHOOSE_ALGO
    reuse_algo = 0;
    prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
    memset(prev_img_dims, 0, sizeof(prev_img_dims));
    memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
#endif
#endif

#undef FAIL
#undef PARAMS
#undef DTYPE_INPUT_0
#undef TYPENUM_INPUT_0
#undef ITEMSIZE_INPUT_0
#undef DTYPE_INPUT_1
#undef TYPENUM_INPUT_1
#undef ITEMSIZE_INPUT_1
#undef DTYPE_INPUT_2
#undef TYPENUM_INPUT_2
#undef ITEMSIZE_INPUT_2
#undef DTYPE_INPUT_4
#undef TYPENUM_INPUT_4
#undef ITEMSIZE_INPUT_4
#undef DTYPE_INPUT_5
#undef TYPENUM_INPUT_5
#undef ITEMSIZE_INPUT_5
#undef DTYPE_OUTPUT_0
#undef TYPENUM_OUTPUT_0
#undef ITEMSIZE_OUTPUT_0
#undef APPLY_SPECIFIC
#undef CONV_INPLACE
#undef CONV_ALGO
    this->__ERROR = __ERROR;
    return 0;
  }
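  /* cleanup() releases, in reverse order, everything init() set up: the
     per-variable blocks under the numbered labels, the cuDNN handle bound
     to the context, the tensor/filter descriptors, and finally the
     references to the storage cells. */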
  void cleanup(void) {
    __label_1:

    double __DUMMY_1;
    __label_3:

    double __DUMMY_3;
    __label_5:

    double __DUMMY_5;
    __label_7:

    double __DUMMY_7;
    __label_9:

    double __DUMMY_9;
    __label_11:

    double __DUMMY_11;
    __label_13:

    double __DUMMY_13;
    __label_15:
    Py_XDECREF(V15); V15 = NULL;
    {Py_XDECREF(py_V15);}

    double __DUMMY_15;
    __label_18:

#define DTYPE_INPUT_0 npy_float16
#define TYPENUM_INPUT_0 23
#define ITEMSIZE_INPUT_0 2
#define DTYPE_INPUT_1 npy_float16
#define TYPENUM_INPUT_1 23
#define ITEMSIZE_INPUT_1 2
#define DTYPE_INPUT_2 npy_float16
#define TYPENUM_INPUT_2 23
#define ITEMSIZE_INPUT_2 2
#define DTYPE_INPUT_4 npy_float16
#define TYPENUM_INPUT_4 23
#define ITEMSIZE_INPUT_4 2
#define DTYPE_INPUT_5 npy_float16
#define TYPENUM_INPUT_5 23
#define ITEMSIZE_INPUT_5 2
#define DTYPE_OUTPUT_0 npy_float16
#define TYPENUM_OUTPUT_0 23
#define ITEMSIZE_OUTPUT_0 2
#define APPLY_SPECIFIC(str) str##_node_86feacd077d8749f42b5d82709a80ba3_0
#define CONV_INPLACE 1
#define CONV_ALGO CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM

    cuda_enter(ctx->ctx);
    cudnnDestroy(APPLY_SPECIFIC(_handle));
    cuda_exit(ctx->ctx);
    Py_DECREF((PyObject *)ctx);

    if (APPLY_SPECIFIC(input) != NULL)
      cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input));
    if (APPLY_SPECIFIC(output) != NULL)
      cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output));
    if (APPLY_SPECIFIC(kerns) != NULL)
      cudnnDestroyFilterDescriptor(APPLY_SPECIFIC(kerns));

#undef DTYPE_INPUT_0
#undef TYPENUM_INPUT_0
#undef ITEMSIZE_INPUT_0
#undef DTYPE_INPUT_1
#undef TYPENUM_INPUT_1
#undef ITEMSIZE_INPUT_1
#undef DTYPE_INPUT_2
#undef TYPENUM_INPUT_2
#undef ITEMSIZE_INPUT_2
#undef DTYPE_INPUT_4
#undef TYPENUM_INPUT_4
#undef ITEMSIZE_INPUT_4
#undef DTYPE_INPUT_5
#undef TYPENUM_INPUT_5
#undef ITEMSIZE_INPUT_5
#undef DTYPE_OUTPUT_0
#undef TYPENUM_OUTPUT_0
#undef ITEMSIZE_OUTPUT_0
#undef APPLY_SPECIFIC
#undef CONV_INPLACE
#undef CONV_ALGO
    double __DUMMY_18;

    Py_XDECREF(this->storage_V3);
    Py_XDECREF(this->storage_V5);
    Py_XDECREF(this->storage_V7);
    Py_XDECREF(this->storage_V9);
    Py_XDECREF(this->storage_V11);
    Py_XDECREF(this->storage_V13);
    Py_XDECREF(this->storage_V1);
    Py_XDECREF(this->storage_V15);
  }
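  /* run() is the per-call entry point: it unpacks the node's arguments
     from the storage cells (V1 the output array, V3/V5/V7 the three
     GpuArray arguments, V9 the convolution descriptor, and V11/V13 two
     float16 scalars, presumably the alpha/beta passed to conv_fwd),
     type-checks them, and then invokes conv_fwd. */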
  int run(void) {
    int __failure = 0;

    PyObject* py_V1;

    PyGpuArrayObject *V1;

    PyObject* py_V3;

    PyGpuArrayObject *V3;

    PyObject* py_V5;

    PyGpuArrayObject *V5;

    PyObject* py_V7;

    PyGpuArrayObject *V7;

    PyObject* py_V9;

    cudnnConvolutionDescriptor_t V9;

    PyObject* py_V11;

    typedef npy_float16 V11_dtype; // Deprecated use dtype_V11 instead.
    typedef npy_float16 dtype_V11;

    npy_float16 V11;

    PyObject* py_V13;

    typedef npy_float16 V13_dtype; // Deprecated use dtype_V13 instead.
    typedef npy_float16 dtype_V13;

    npy_float16 V13;

    {

      py_V1 = PyList_GET_ITEM(storage_V1, 0);
      {Py_XINCREF(py_V1);}

      if (py_V1 == Py_None)
      {
        V1 = NULL;
      }
      else
      {

        V1 = NULL;
        if (py_V1 == Py_None) {
          PyErr_SetString(PyExc_ValueError, "expected a GpuArray, not None");
          {
            __failure = 2;
            if (!PyErr_Occurred()) {
              PyErr_SetString(PyExc_RuntimeError,
                              "Unexpected error in an Op's C code. "
                              "No Python exception was set.");
            }
            goto __label_2;}
        }
        /* First check if we are the base type exactly (the most common case),
           then do the full subclass check if needed. */
        if (py_V1->ob_type != &PyGpuArrayType &&
            !PyObject_TypeCheck(py_V1, &PyGpuArrayType)) {
          PyE
--
---
You received this message because you are subscribed to the Google Groups "theano-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email to [email protected].
For more options, visit https://groups.google.com/d/optout.
