Here is .theanorc:

[global]
floatX = float16
device = cuda

[cuda]
root = /usr/local/cuda-7.5

[nvcc]
fastmath = True
optimizer = fast_compile
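As a quick sanity check that these flags are actually picked up (a minimal sketch, nothing specific to this setup: theano.config is populated from .theanorc and THEANO_FLAGS at import time):

import theano

print(theano.config.floatX)  # expected: float16
print(theano.config.device)  # expected: cuda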
On Thursday, July 14, 2016 at 10:19:56 AM UTC+2, [email protected] wrote:
>
> Hi Arnaud,
> I put _f16_ok = True in dnn.py (attached).
>
> This is the error I received:
>
> Python 2.7.11 |Anaconda custom (64-bit)| (default, Dec 6 2015, 18:08:32)
> [GCC 4.4.7 20120313 (Red Hat 4.4.7-1)] on linux2
> Type "help", "copyright", "credits" or "license" for more information.
> Anaconda is brought to you by Continuum Analytics.
> Please check out: http://continuum.io/thanks and https://anaconda.org
> >>> import run_multi_conv
>
> Mapped name None to device cuda: GeForce 840M
> WARNING (theano.gof.compilelock): Overriding existing lock by dead process '3202' (I am process '3351')
> Using cuDNN version 5005 on context None
> /home/luca/data/Theano-master/theano/tensor/signal/downsample.py:6: UserWarning: downsample module has been moved to the theano.tensor.signal.pool module.
>   "downsample module has been moved to the theano.tensor.signal.pool module.")
> >>>
> >>> run_multi_conv.run_experiments()
> Disabling C code for Elemwise{mul,no_inplace} due to unsupported float16
> Disabling C code for Elemwise{Cast{float32}} due to unsupported float16
> Disabling C code for Elemwise{Cast{float16}} due to unsupported float16
> Disabling C code for Elemwise{Cast{float16}} due to unsupported float16
> Disabling C code for Alloc due to unsupported float16
> Disabling C code for Cast{float16} due to unsupported float16
> Disabling C code for Cast{float16} due to unsupported float16
> Disabling C code for Cast{float16} due to unsupported float16
> Disabling C code for Cast{float16} due to unsupported float16
> Disabling C code for RandomFunction{binomial} due to unsupported float16
> Disabling C code for RandomFunction{binomial} due to unsupported float16
> ===============================
> 00001 #include <Python.h>
> 00002 #include <iostream>
> 00003 #include "theano_mod_helper.h"
> 00004 #include <gpuarray/array.h>
> 00005 #include <gpuarray/kernel.h>
> 00006 #include <gpuarray/error.h>
> 00007 #include <gpuarray/buffer.h>
> 00008 #include <gpuarray/buffer_blas.h>
> 00009 #include <numpy/arrayobject.h>
> 00010 #include <gpuarray_api.h>
> 00011 #include <math.h>
> 00012 #include <numpy/arrayscalars.h>
> 00013 #include "cudnn.h"
> 00014 #include "cudnn_helper.h"
> 00015 #include "gpuarray_helper.h"
> 00016 #include "gpuarray/types.h"
> 00017 #include "gpuarray/array.h"
> 00018 #include "gpuarray/util.h"
> 00019 #include "gpuarray/ext_cuda.h"
> 00020 #include "gpuarray_api.h"
> 00021 #include "numpy_compat.h"
> 00022 //////////////////////
> 00023 ////  Support Code
> 00024 //////////////////////
> 00025
> 00026
> 00027
> 00028 static int
> 00029 c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
> 00030   cudnnDataType_t dt;
> 00031   size_t ds;
> 00032   switch (var->ga.typecode) {
> 00033   case GA_FLOAT:
> 00034     dt = CUDNN_DATA_FLOAT;
> 00035     break;
> 00036   case GA_DOUBLE:
> 00037     dt = CUDNN_DATA_DOUBLE;
> 00038     break;
> 00039 #if CUDNN_VERSION > 3000
> 00040   case GA_HALF:
> 00041     dt = CUDNN_DATA_HALF;
> 00042     break;
> 00043 #endif
> 00044   default:
> 00045     PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensorNd");
> 00046     return -1;
> 00047   }
> 00048   ds = gpuarray_get_elsize(var->ga.typecode);
> 00049
> 00050   int strs[5], dims[5], default_stride = 1;
> 00051   unsigned int nd = PyGpuArray_NDIM(var);
> 00052
> 00053   if (nd > 5) {
> 00054     PyErr_SetString(PyExc_TypeError, "Tensor of more than 5d");
> 00055     return -1;
> 00056   }
> 00057
> 00058   for (unsigned int _i = nd; _i > 0; _i--) {
> 00059     unsigned int i = _i - 1;
> 00060     strs[i] = PyGpuArray_STRIDE(var, i) ?
> 00061       PyGpuArray_STRIDE(var, i)/ds : default_stride;
> 00062     default_stride *= PyGpuArray_DIM(var, i);
> 00063     dims[i] = PyGpuArray_DIM(var, i);
> 00064   }
> 00065
> 00066   cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, dt, nd, dims, strs);
> 00067   if (err != CUDNN_STATUS_SUCCESS) {
> 00068     PyErr_Format(PyExc_RuntimeError,
> 00069                  "Could not set tensorNd descriptor: %s",
> 00070                  cudnnGetErrorString(err));
> 00071     return -1;
> 00072   }
> 00073   return 0;
> 00074 }
> 00075
> 00076 static int
> 00077 c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
> 00078   cudnnDataType_t dt;
> 00079   cudnnStatus_t err;
> 00080
> 00081   if (!GpuArray_IS_C_CONTIGUOUS(&var->ga)) {
> 00082     PyErr_SetString(PyExc_ValueError,
> 00083                     "Only contiguous filters (kernels) are supported.");
> 00084     return -1;
> 00085   }
> 00086   switch (var->ga.typecode) {
> 00087   case GA_FLOAT:
> 00088     dt = CUDNN_DATA_FLOAT;
> 00089     break;
> 00090   case GA_DOUBLE:
> 00091     dt = CUDNN_DATA_DOUBLE;
> 00092     break;
> 00093 #if CUDNN_VERSION > 3000
> 00094   case GA_HALF:
> 00095     dt = CUDNN_DATA_HALF;
> 00096     break;
> 00097 #endif
> 00098   default:
> 00099     PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_filter");
> 00100     return -1;
> 00101   }
> 00102
> 00103   int dims[5];
> 00104   unsigned int nd = PyGpuArray_NDIM(var);
> 00105
> 00106   if (nd > 5) {
> 00107     PyErr_SetString(PyExc_TypeError, "Tensor of more than 5d");
> 00108     return -1;
> 00109   }
> 00110
> 00111   for (unsigned int _i = nd; _i > 0; _i--) {
> 00112     unsigned int i = _i - 1;
> 00113     dims[i] = PyGpuArray_DIM(var, i);
> 00114   }
> 00115
> 00116 #if CUDNN_VERSION >= 5000
> 00117   err = cudnnSetFilterNdDescriptor(desc, dt, CUDNN_TENSOR_NCHW, nd, dims);
> 00118 #else
> 00119   err = cudnnSetFilterNdDescriptor(desc, dt, nd, dims);
> 00120 #endif
> 00121
> 00122   if (err != CUDNN_STATUS_SUCCESS) {
> 00123     PyErr_Format(PyExc_RuntimeError,
> 00124                  "Could not set filter descriptor: %s.",
> 00125                  cudnnGetErrorString(err));
> 00126     return -1;
> 00127   }
> 00128   return 0;
> 00129 }
> 00130
> 00131
> 00132
> 00133 namespace {
> 00134 struct __struct_compiled_op_86feacd077d8749f42b5d82709a80ba3 {
> 00135   PyObject* __ERROR;
> 00136
> 00137   PyObject* storage_V3;
> 00138   PyObject* storage_V5;
> 00139   PyObject* storage_V7;
> 00140   PyObject* storage_V9;
> 00141   PyObject* storage_V11;
> 00142   PyObject* storage_V13;
> 00143   PyObject* storage_V1;
> 00144   PyObject* storage_V15;
> 00145
> 00146   PyObject* py_V15;
> 00147   PyGpuContextObject *V15;
> 00148 #define DTYPE_INPUT_0 npy_float16
> 00149 #define TYPENUM_INPUT_0 23
> 00150 #define ITEMSIZE_INPUT_0 2
> 00151 #define DTYPE_INPUT_1 npy_float16
> 00152 #define TYPENUM_INPUT_1 23
> 00153 #define ITEMSIZE_INPUT_1 2
> 00154 #define DTYPE_INPUT_2 npy_float16
> 00155 #define TYPENUM_INPUT_2 23
> 00156 #define ITEMSIZE_INPUT_2 2
> 00157 #define DTYPE_INPUT_4 npy_float16
> 00158 #define TYPENUM_INPUT_4 23
> 00159 #define ITEMSIZE_INPUT_4 2
> 00160 #define DTYPE_INPUT_5 npy_float16
> 00161 #define TYPENUM_INPUT_5 23
> 00162 #define ITEMSIZE_INPUT_5 2
> 00163 #define DTYPE_OUTPUT_0 npy_float16
> 00164 #define TYPENUM_OUTPUT_0 23
> 00165 #define ITEMSIZE_OUTPUT_0 2
> 00166 #define APPLY_SPECIFIC(str) str##_node_86feacd077d8749f42b5d82709a80ba3_0
> 00167 #define CONV_INPLACE 1
> 00168 #define CONV_ALGO CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
> 00169
> 00170
> 00171   PyGpuContextObject *ctx;
> 00172   cudnnHandle_t APPLY_SPECIFIC(_handle);
> 00173
> 00174
> 00175   cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
> 00176   cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
> 00177   cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns);
> 00178
> 00179
> 00180
> 00181 #ifdef CHOOSE_ALGO
> 00182   int reuse_algo;
> 00183   cudnnConvolutionFwdAlgo_t prev_algo;
> 00184 #ifndef CHOOSE_ONCE
> 00185   size_t prev_img_dims[5];
> 00186   size_t prev_kern_dims[5];
> 00187 #endif
> 00188 #endif
> 00189
> 00190 int
> 00191 APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
> 00192                          PyGpuArrayObject *om,
> 00193                          cudnnConvolutionDescriptor_t desc,
> 00194                          double alpha, double beta,
> 00195                          PyGpuArrayObject **output,
> 00196                          PyGpuContextObject *c) {
> 00197   cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
> 00198   float af = alpha, bf = beta;
> 00199   void *alpha_p;
> 00200   void *beta_p;
> 00201
> 00202   if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
> 00203     PyErr_SetString(PyExc_ValueError,
> 00204                     "images and kernel must have the same stack size");
> 00205     return 1;
> 00206   }
> 00207
> 00208   if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
> 00209     return 1;
> 00210   if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
> 00211     return 1;
> 00212
> 00213   switch (input->ga.typecode) {
> 00214   case GA_DOUBLE:
> 00215     alpha_p = (void *)&alpha;
> 00216     beta_p = (void *)&beta;
> 00217     break;
> 00218   case GA_FLOAT:
> 00219   case GA_HALF:
> 00220     alpha_p = (void *)&af;
> 00221     beta_p = (void *)&bf;
> 00222     break;
> 00223   default:
> 00224     PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
> 00225     return 1;
> 00226   }
> 00227
> 00228 #ifdef CONV_INPLACE
> 00229   Py_XDECREF(*output);
> 00230   *output = om;
> 00231   Py_INCREF(*output);
> 00232 #else
> 00233   if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
> 00234                          om->ga.typecode, GA_C_ORDER, c) != 0)
> 00235     return 1;
> 00236   if (beta != 0.0 && pygpu_move(*output, om))
> 00237     return 1;
> 00238 #endif
> 00239
> 00240   if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1)
> 00241     return 1;
> 00242
> 00243   cudnnConvolutionFwdAlgo_t algo = CONV_ALGO;
> 00244
> 00245   cuda_enter(c->ctx);
> 00246 #ifdef CHOOSE_ALGO
> 00247 #ifndef CHOOSE_ONCE
> 00248   reuse_algo = 1;
> 00249   for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
> 00250     reuse_algo = (reuse_algo &&
> 00251                   PyGpuArray_DIM(input, i) == prev_img_dims[i]);
> 00252     reuse_algo = (reuse_algo &&
> 00253                   PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
> 00254   }
> 00255 #endif
> 00256
> 00257   if (!reuse_algo) {
> 00258 #ifdef CHOOSE_TIME
> 00259     int count;
> 00260     cudnnConvolutionFwdAlgoPerf_t choice;
> 00261     err = cudnnFindConvolutionForwardAlgorithm(
> 00262       APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
> 00263       desc, APPLY_SPECIFIC(output), 1, &count, &choice);
> 00264
> 00265     if (err != CUDNN_STATUS_SUCCESS) {
> 00266       PyErr_Format(PyExc_RuntimeError,
> 00267                    "error selecting convolution algo: %s",
> 00268                    cudnnGetErrorString(err));
> 00269       cuda_exit(c->ctx);
> 00270       return 1;
> 00271     }
> 00272     algo = choice.algo;
> 00273 #else
> 00274     size_t free;
> 00275     int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_FREE_GMEM, &free);
> 00276
> 00277     if (err2 != GA_NO_ERROR) {
> 00278       PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
> 00279                    "memory information on the GPU");
> 00280       cuda_exit(c->ctx);
> 00281       return 1;
> 00282     }
> 00283
> 00284     err = cudnnGetConvolutionForwardAlgorithm(
> 00285       APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
> 00286       desc, APPLY_SPECIFIC(output),
> 00287       CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
> 00288     if (err != CUDNN_STATUS_SUCCESS) {
> 00289       PyErr_Format(PyExc_RuntimeError,
> 00290                    "error selecting convolution algo: %s",
> 00291                    cudnnGetErrorString(err));
> 00292       cuda_exit(c->ctx);
> 00293       return 1;
> 00294     }
> 00295 #endif
> 00296     prev_algo = algo;
> 00297   } else {
> 00298     algo = prev_algo;
> 00299   }
> 00300
> 00301 #ifdef CHOOSE_ONCE
> 00302   reuse_algo = 1;
> 00303 #else
> 00304   for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
> 00305     prev_img_dims[i] = PyGpuArray_DIM(input, i);
> 00306     prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
> 00307   }
> 00308 #endif
> 00309
> 00310 #endif
> 00311
> 00312   /* These two algos are not supported for 3d conv */
> 00313   if (PyGpuArray_NDIM(input) == 5 &&
> 00314       (algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
> 00315        algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM))
> 00316     algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
> 00317
> 00318   // The FFT implementation does not support strides, 1x1 filters or inputs
> 00319   // with a spatial dimension larger than 1024. The tiled-FFT implementation
> 00320   // does not support strides.
> 00321   // If the chosen implementation is FFT or tiled-FFT, validate that it can
> 00322   // be used on the current data and default to a safe implementation if it
> 00323   // can't.
> 00324   // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
> 00325   // defined only for 2d filters
> 00326   if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
> 00327        algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && PyGpuArray_NDIM(input) == 4) {
> 00328
> 00329     // Extract the properties of the convolution descriptor
> 00330     int nd;
> 00331     int pad[2];
> 00332     int stride[2];
> 00333     int upscale[2];
> 00334     cudnnConvolutionMode_t mode;
> 00335     cudnnDataType_t data_type;
> 00336     err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
> 00337                                           upscale, &mode, &data_type);
> 00338     if (err != CUDNN_STATUS_SUCCESS) {
> 00339       PyErr_Format(PyExc_RuntimeError,
> 00340                    "error getting convolution properties: %s",
> 00341                    cudnnGetErrorString(err));
> 00342       cuda_exit(c->ctx);
> 00343       return 1;
> 00344     }
> 00345
> 00346     if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT)
> 00347     {
> 00348       if (stride[0] != 1 || stride[1] != 1 ||
> 00349           PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
> 00350           (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
> 00351       {
> 00352         algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
> 00353       }
> 00354     }
> 00355     else
> 00356     {
> 00357       // algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
> 00358       if (stride[0] != 1 || stride[1] != 1)
> 00359       {
> 00360         algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
> 00361       }
> 00362     }
> 00363   }
> 00364
> 00365   {
> 00366     size_t worksize;
> 00367     gpudata *workspace;
> 00368     err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
> 00369                                                   APPLY_SPECIFIC(input),
> 00370                                                   APPLY_SPECIFIC(kerns),
> 00371                                                   desc,
> 00372                                                   APPLY_SPECIFIC(output),
> 00373                                                   algo,
> 00374                                                   &worksize);
> 00375
> 00376     if (err == CUDNN_STATUS_NOT_SUPPORTED) {
> 00377       // Fallback to none algo if not supported
> 00378       // TODO: Print a warning
> 00379       algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
> 00380
> 00381       err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
> 00382                                                     APPLY_SPECIFIC(input),
> 00383                                                     APPLY_SPECIFIC(kerns),
> 00384                                                     desc,
> 00385                                                     APPLY_SPECIFIC(output),
> 00386                                                     algo,
> 00387                                                     &worksize);
> 00388     }
> 00389
> 00390     if (err != CUDNN_STATUS_SUCCESS) {
> 00391       PyErr_Format(PyExc_RuntimeError,
> 00392                    "error getting worksize: %s",
> 00393                    cudnnGetErrorString(err));
> 00394       cuda_exit(c->ctx);
> 00395       return 1;
> 00396     }
> 00397
> 00398     /*
> 00399      * This is less than ideal since we need to free it after (which
> 00400      * introduces a synchronization point. But we don't have a module
> 00401      * to place a nice get_work_mem() function in.
> 00402      */
> 00403     if (worksize != 0) {
> 00404       workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
> 00405       if (workspace == NULL) {
> 00406         PyErr_SetString(PyExc_RuntimeError,
> 00407                         "Could not allocate working memory");
> 00408         cuda_exit(c->ctx);
> 00409         return 1;
> 00410       }
> 00411     }
> 00412
> 00413     cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
> 00414     cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
> 00415     cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
> 00416
> 00417     err = cudnnConvolutionForward(
> 00418       APPLY_SPECIFIC(_handle),
> 00419       alpha_p,
> 00420       APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
> 00421       APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
> 00422       desc, algo,
> 00423       worksize == 0 ? NULL : *(void **)workspace, worksize,
> 00424       beta_p,
> 00425       APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output));
> 00426
> 00427     if (worksize != 0)
> 00428       gpudata_release(workspace);
> 00429
> 00430     cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
> 00431     cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
> 00432     cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
> 00433   }
> 00434   cuda_exit(c->ctx);
> 00435
> 00436   if (err != CUDNN_STATUS_SUCCESS) {
> 00437     PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
> 00438                  cudnnGetErrorString(err));
> 00439     return 1;
> 00440   }
> 00441   return 0;
> 00442 }
> 00443
> 00444 #undef DTYPE_INPUT_0
> 00445 #undef TYPENUM_INPUT_0
> 00446 #undef ITEMSIZE_INPUT_0
> 00447 #undef DTYPE_INPUT_1
> 00448 #undef TYPENUM_INPUT_1
> 00449 #undef ITEMSIZE_INPUT_1
> 00450 #undef DTYPE_INPUT_2
> 00451 #undef TYPENUM_INPUT_2
> 00452 #undef ITEMSIZE_INPUT_2
> 00453 #undef DTYPE_INPUT_4
> 00454 #undef TYPENUM_INPUT_4
> 00455 #undef ITEMSIZE_INPUT_4
> 00456 #undef DTYPE_INPUT_5
> 00457 #undef TYPENUM_INPUT_5
> 00458 #undef ITEMSIZE_INPUT_5
> 00459 #undef DTYPE_OUTPUT_0
> 00460 #undef TYPENUM_OUTPUT_0
> 00461 #undef ITEMSIZE_OUTPUT_0
> 00462 #undef APPLY_SPECIFIC
> 00463 #undef CONV_INPLACE
> 00464 #undef CONV_ALGO
> 00465
> 00466   __struct_compiled_op_86feacd077d8749f42b5d82709a80ba3() {
> 00467     // This is only somewhat safe because we:
> 00468     // 1) Are not a virtual class
> 00469     // 2) Do not use any virtual classes in the members
> 00470     // 3) Deal with mostly POD and pointers
> 00471
> 00472     // If this changes, we would have to revise this, but for
> 00473     // now I am tired of chasing segfaults because
> 00474     // initialization code had an error and some pointer has
> 00475     // a junk value.
> 00476     memset(this, 0, sizeof(*this));
> 00477   }
> 00478   ~__struct_compiled_op_86feacd077d8749f42b5d82709a80ba3(void) {
> 00479     cleanup();
> 00480   }
> 00481
> 00482   int init(PyObject* __ERROR, PyObject* storage_V3, PyObject* storage_V5, PyObject* storage_V7, PyObject* storage_V9, PyObject* storage_V11, PyObject* storage_V13, PyObject* storage_V1, PyObject* storage_V15) {
> 00483     Py_XINCREF(storage_V3);
> 00484     Py_XINCREF(storage_V5);
> 00485     Py_XINCREF(storage_V7);
> 00486     Py_XINCREF(storage_V9);
> 00487     Py_XINCREF(storage_V11);
> 00488     Py_XINCREF(storage_V13);
> 00489     Py_XINCREF(storage_V1);
> 00490     Py_XINCREF(storage_V15);
> 00491     this->storage_V3 = storage_V3;
> 00492     this->storage_V5 = storage_V5;
> 00493     this->storage_V7 = storage_V7;
> 00494     this->storage_V9 = storage_V9;
> 00495     this->storage_V11 = storage_V11;
> 00496     this->storage_V13 = storage_V13;
> 00497     this->storage_V1 = storage_V1;
> 00498     this->storage_V15 = storage_V15;
> 00499
> 00500
> 00501
> 00502
> 00503
> 00504
> 00505
> 00506
> 00507
> 00508     py_V15 = PyList_GET_ITEM(storage_V15, 0);
> 00509     {Py_XINCREF(py_V15);}
> 00510
> 00511     if (!PyObject_TypeCheck(py_V15, &PyGpuContextType)) {
> 00512       PyErr_SetString(PyExc_TypeError, "expected a GpuContext");
> 00513       {
> 00514         if (!PyErr_Occurred()) {
> 00515           PyErr_SetString(PyExc_RuntimeError,
> 00516                           "Unexpected error in an Op's C code. "
> 00517                           "No Python exception was set.");
> 00518         }
> 00519         return 15;
> 00520       }
> 00521     }
> 00522
> 00523     V15 = (PyGpuContextObject *)py_V15;
> 00524     Py_INCREF(V15);
> 00525
> 00526
> 00527 #define DTYPE_INPUT_0 npy_float16
> 00528 #define TYPENUM_INPUT_0 23
> 00529 #define ITEMSIZE_INPUT_0 2
> 00530 #define DTYPE_INPUT_1 npy_float16
> 00531 #define TYPENUM_INPUT_1 23
> 00532 #define ITEMSIZE_INPUT_1 2
> 00533 #define DTYPE_INPUT_2 npy_float16
> 00534 #define TYPENUM_INPUT_2 23
> 00535 #define ITEMSIZE_INPUT_2 2
> 00536 #define DTYPE_INPUT_4 npy_float16
> 00537 #define TYPENUM_INPUT_4 23
> 00538 #define ITEMSIZE_INPUT_4 2
> 00539 #define DTYPE_INPUT_5 npy_float16
> 00540 #define TYPENUM_INPUT_5 23
> 00541 #define ITEMSIZE_INPUT_5 2
> 00542 #define DTYPE_OUTPUT_0 npy_float16
> 00543 #define TYPENUM_OUTPUT_0 23
> 00544 #define ITEMSIZE_OUTPUT_0 2
> 00545 #define APPLY_SPECIFIC(str) str##_node_86feacd077d8749f42b5d82709a80ba3_0
> 00546 #define CONV_INPLACE 1
> 00547 #define CONV_ALGO CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
> 00548 #define FAIL { \
> 00549     if (!PyErr_Occurred()) { \
> 00550         PyErr_SetString(PyExc_RuntimeError, \
> 00551             "Unexpected error in an Op's C code. " \
> 00552             "No Python exception was set."); \
> 00553     } \
> 00554     return 17; \
> 00555 }
> 00556 #define PARAMS V15
> 00557
> 00558
> 00559 {
> 00560   // We need to keep a reference here to have it available in the destructor.
> 00561   ctx = PARAMS;
> 00562   Py_INCREF(ctx);
> 00563
> 00564   cuda_enter(PARAMS->ctx);
> 00565   cudnnStatus_t err;
> 00566   APPLY_SPECIFIC(_handle) = NULL;
> 00567   if ((err = cudnnCreate(&APPLY_SPECIFIC(_handle))) != CUDNN_STATUS_SUCCESS) {
> 00568     PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s",
> 00569                  cudnnGetErrorString(err));
> 00570     cuda_exit(PARAMS->ctx);
> 00571     FAIL;
> 00572   }
> 00573   if ((err = cudnnSetStream(APPLY_SPECIFIC(_handle),
> 00574                             cuda_get_stream(PARAMS->ctx))) != CUDNN_STATUS_SUCCESS) {
> 00575     PyErr_Format(PyExc_RuntimeError, "Could not set cudnn stream: %s",
> 00576                  cudnnGetErrorString(err));
> 00577     cuda_exit(PARAMS->ctx);
> 00578     FAIL;
> 00579   }
> 00580   cuda_exit(PARAMS->ctx);
> 00581 }
> 00582
> 00583
> 00584
> 00585 cudnnStatus_t APPLY_SPECIFIC(err);
> 00586 APPLY_SPECIFIC(input) = NULL;
> 00587 APPLY_SPECIFIC(output) = NULL;
> 00588 APPLY_SPECIFIC(kerns) = NULL;
> 00589 if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
> 00590   PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
> 00591                "(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
> 00592   FAIL;
> 00593 }
> 00594 if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
> 00595   PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
> 00596                "(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
> 00597   FAIL;
> 00598 }
> 00599 if ((APPLY_SPECIFIC(err) = cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns))) != CUDNN_STATUS_SUCCESS) {
> 00600   PyErr_Format(PyExc_MemoryError, "could not allocate filter descriptor: %s",
> 00601                cudnnGetErrorString(APPLY_SPECIFIC(err)));
> 00602   FAIL;
> 00603 }
> 00604
> 00605
> 00606
> 00607 #ifdef CHOOSE_ALGO
> 00608 reuse_algo = 0;
> 00609 prev_algo = CONV_ALGO;
> 00610 #ifndef CHOOSE_ONCE
> 00611 memset(prev_img_dims, 0, sizeof(prev_img_dims));
> 00612 memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
> 00613 #endif
> 00614 #endif
> 00615
> 00616
> 00617 #undef FAIL
> 00618 #undef PARAMS
> 00619 #undef DTYPE_INPUT_0
> 00620 #undef TYPENUM_INPUT_0
> 00621 #undef ITEMSIZE_INPUT_0
> 00622 #undef DTYPE_INPUT_1
> 00623 #undef TYPENUM_INPUT_1
> 00624 #undef ITEMSIZE_INPUT_1
> 00625 #undef DTYPE_INPUT_2
> 00626 #undef TYPENUM_INPUT_2
> 00627 #undef ITEMSIZE_INPUT_2
> 00628 #undef DTYPE_INPUT_4
> 00629 #undef TYPENUM_INPUT_4
> 00630 #undef ITEMSIZE_INPUT_4
> 00631 #undef DTYPE_INPUT_5
> 00632 #undef TYPENUM_INPUT_5
> 00633 #undef ITEMSIZE_INPUT_5
> 00634 #undef DTYPE_OUTPUT_0
> 00635 #undef TYPENUM_OUTPUT_0
> 00636 #undef ITEMSIZE_OUTPUT_0
> 00637 #undef APPLY_SPECIFIC
> 00638 #undef CONV_INPLACE
> 00639 #undef CONV_ALGO
> 00640   this->__ERROR = __ERROR;
> 00641   return 0;
> 00642 }
> 00643 void cleanup(void) {
> 00644   __label_1:
> 00645
> 00646   double __DUMMY_1;
> 00647   __label_3:
> 00648
> 00649   double __DUMMY_3;
> 00650   __label_5:
> 00651
> 00652   double __DUMMY_5;
> 00653   __label_7:
> 00654
> 00655   double __DUMMY_7;
> 00656   __label_9:
> 00657
> 00658   double __DUMMY_9;
> 00659   __label_11:
> 00660
> 00661   double __DUMMY_11;
> 00662   __label_13:
> 00663
> 00664   double __DUMMY_13;
> 00665   __label_15:
> 00666   Py_XDECREF(V15); V15 = NULL;
> 00667   {Py_XDECREF(py_V15);}
> 00668
> 00669   double __DUMMY_15;
> 00670   __label_18:
> 00671
> 00672 #define DTYPE_INPUT_0 npy_float16
> 00673 #define TYPENUM_INPUT_0 23
> 00674 #define ITEMSIZE_INPUT_0 2
> 00675 #define DTYPE_INPUT_1 npy_float16
> 00676 #define TYPENUM_INPUT_1 23
> 00677 #define ITEMSIZE_INPUT_1 2
> 00678 #define DTYPE_INPUT_2 npy_float16
> 00679 #define TYPENUM_INPUT_2 23
> 00680 #define ITEMSIZE_INPUT_2 2
> 00681 #define DTYPE_INPUT_4 npy_float16
> 00682 #define TYPENUM_INPUT_4 23
> 00683 #define ITEMSIZE_INPUT_4 2
> 00684 #define DTYPE_INPUT_5 npy_float16
> 00685 #define TYPENUM_INPUT_5 23
> 00686 #define ITEMSIZE_INPUT_5 2
> 00687 #define DTYPE_OUTPUT_0 npy_float16
> 00688 #define TYPENUM_OUTPUT_0 23
> 00689 #define ITEMSIZE_OUTPUT_0 2
> 00690 #define APPLY_SPECIFIC(str) str##_node_86feacd077d8749f42b5d82709a80ba3_0
> 00691 #define CONV_INPLACE 1
> 00692 #define CONV_ALGO CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
> 00693
> 00694
> 00695 cuda_enter(ctx->ctx);
> 00696 cudnnDestroy(APPLY_SPECIFIC(_handle));
> 00697 cuda_exit(ctx->ctx);
> 00698 Py_DECREF((PyObject *)ctx);
> 00699
> 00700
> 00701 if (APPLY_SPECIFIC(input) != NULL)
> 00702   cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input));
> 00703 if (APPLY_SPECIFIC(output) != NULL)
> 00704   cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output));
> 00705 if (APPLY_SPECIFIC(kerns) != NULL)
> 00706   cudnnDestroyFilterDescriptor(APPLY_SPECIFIC(kerns));
> 00707
> 00708 #undef DTYPE_INPUT_0
> 00709 #undef TYPENUM_INPUT_0
> 00710 #undef ITEMSIZE_INPUT_0
> 00711 #undef DTYPE_INPUT_1
> 00712 #undef TYPENUM_INPUT_1
> 00713 #undef ITEMSIZE_INPUT_1
> 00714 #undef DTYPE_INPUT_2
> 00715 #undef TYPENUM_INPUT_2
> 00716 #undef ITEMSIZE_INPUT_2
> 00717 #undef DTYPE_INPUT_4
> 00718 #undef TYPENUM_INPUT_4
> 00719 #undef ITEMSIZE_INPUT_4
> 00720 #undef DTYPE_INPUT_5
> 00721 #undef TYPENUM_INPUT_5
> 00722 #undef ITEMSIZE_INPUT_5
> 00723 #undef DTYPE_OUTPUT_0
> 00724 #undef TYPENUM_OUTPUT_0
> 00725 #undef ITEMSIZE_OUTPUT_0
> 00726 #undef APPLY_SPECIFIC
> 00727 #undef CONV_INPLACE
> 00728 #undef CONV_ALGO
> 00729   double __DUMMY_18;
> 00730
> 00731   Py_XDECREF(this->storage_V3);
> 00732   Py_XDECREF(this->storage_V5);
> 00733   Py_XDECREF(this->storage_V7);
> 00734   Py_XDECREF(this->storage_V9);
> 00735   Py_XDECREF(this->storage_V11);
> 00736   Py_XDECREF(this->storage_V13);
> 00737   Py_XDECREF(this->storage_V1);
> 00738   Py_XDECREF(this->storage_V15);
> 00739 }
> 00740 int run(void) {
> 00741   int __failure = 0;
> 00742
> 00743   PyObject* py_V1;
> 00744
> 00745   PyGpuArrayObject *V1;
> 00746
> 00747   PyObject* py_V3;
> 00748
> 00749   PyGpuArrayObject *V3;
> 00750
> 00751   PyObject* py_V5;
> 00752
> 00753   PyGpuArrayObject *V5;
> 00754
> 00755   PyObject* py_V7;
> 00756
> 00757   PyGpuArrayObject *V7;
> 00758
> 00759   PyObject* py_V9;
> 00760
> 00761   cudnnConvolutionDescriptor_t V9;
> 00762
> 00763   PyObject* py_V11;
> 00764
> 00765   typedef npy_float16 V11_dtype; // Deprecated use dtype_V11 instead.
> 00766   typedef npy_float16 dtype_V11;
> 00767
> 00768   npy_float16 V11;
> 00769
> 00770   PyObject* py_V13;
> 00771
> 00772   typedef npy_float16 V13_dtype; // Deprecated use dtype_V13 instead.
> 00773   typedef npy_float16 dtype_V13;
> 00774
> 00775   npy_float16 V13;
> 00776
> 00777   {
> 00778
> 00779     py_V1 = PyList_GET_ITEM(storage_V1, 0);
> 00780     {Py_XINCREF(py_V1);}
> 00781
> 00782     if (py_V1 == Py_None)
> 00783     {
> 00784       V1 = NULL;
> 00785     }
> 00786     else
> 00787     {
> 00788
> 00789       V1 = NULL;
> 00790       if (py_V1 == Py_None) {
> 00791         PyErr_SetString(PyExc_ValueError, "expected a GpuArray, not None");
> 00792         {
> 00793           __failure = 2;
> 00794           if (!PyErr_Occurred()) {
> 00795             PyErr_SetString(PyExc_RuntimeError,
> 00796                             "Unexpected error in an Op's C code. "
" > 00797 "No Python exception was set."); > 00798 } > 00799 goto __label_2;} > 00800 } > 00801 /* First check if we are the base type exactly (the most > common case), > 00802 then do the full subclass check if needed. */ > 00803 if (py_V1->ob_type != &PyGpuArrayType && > 00804 !PyObject_TypeCheck(py_V1, &PyGpuArrayType)) { > 00805 PyE > -- --- You received this message because you are subscribed to the Google Groups "theano-users" group. To unsubscribe from this group and stop receiving emails from it, send an email to [email protected]. For more options, visit https://groups.google.com/d/optout.
