I can't reproduce your problem with a simple convolution in float16. Either your code is doing something unexpected, or the problem has already been fixed in the development version.
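For reference, the test I ran was roughly the minimal sketch below, not your exact script (the shapes and names are hypothetical; it assumes the development version with device=cuda, floatX=float16 and cuDNN available):

    import numpy as np
    import theano
    import theano.tensor as T

    # Minimal float16 convolution check; the shapes are arbitrary.
    imgs = T.tensor4('imgs', dtype='float16')    # (batch, channels, rows, cols)
    kerns = T.tensor4('kerns', dtype='float16')  # (filters, channels, rows, cols)
    f = theano.function([imgs, kerns], T.nnet.conv2d(imgs, kerns))

    i = np.random.rand(2, 3, 8, 8).astype('float16')
    k = np.random.rand(4, 3, 3, 3).astype('float16')
    print(f(i, k).shape)  # expected: (2, 4, 6, 6), with no cuDNN error

If a sketch this small fails on your installation, the problem is in the setup rather than in your model code.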
In any case, the development version is a much better option for the new backend and float16, so I encourage you to upgrade and try again: http://deeplearning.net/software/theano/install.html#bleeding-edge-install-instructions
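The linked page has the exact commands; it is usually something along the lines of

    pip install --upgrade --no-deps git+https://github.com/Theano/Theano.git

plus a separate install of libgpuarray for the new (gpuarray) backend.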
2016-07-14 4:22 GMT-04:00 <[email protected]>:

> Here is .theanorc:
>
> [global]
> floatX = float16
> device=cuda
>
> [cuda]
> root = /usr/local/cuda-7.5
>
> [nvcc]
> fastmath=True
>
> optimizer = fast_compile
>
> On Thursday, July 14, 2016 at 10:19:56 AM UTC+2, [email protected] wrote:
>>
>> Hi Arnaud,
>> I put _f16_ok = True in dnn.py (attached).
>>
>> This is the error I received:
>>
>> Python 2.7.11 |Anaconda custom (64-bit)| (default, Dec 6 2015, 18:08:32)
>> [GCC 4.4.7 20120313 (Red Hat 4.4.7-1)] on linux2
>> Type "help", "copyright", "credits" or "license" for more information.
>> Anaconda is brought to you by Continuum Analytics.
>> Please check out: http://continuum.io/thanks and https://anaconda.org
>> >>> import run_multi_conv
>> Mapped name None to device cuda: GeForce 840M
>> WARNING (theano.gof.compilelock): Overriding existing lock by dead
>> process '3202' (I am process '3351')
>> Using cuDNN version 5005 on context None
>> /home/luca/data/Theano-master/theano/tensor/signal/downsample.py:6:
>> UserWarning: downsample module has been moved to the
>> theano.tensor.signal.pool module.
>>   "downsample module has been moved to the theano.tensor.signal.pool
>> module.")
>> >>> run_multi_conv.run_experiments()
>> Disabling C code for Elemwise{mul,no_inplace} due to unsupported float16
>> Disabling C code for Elemwise{Cast{float32}} due to unsupported float16
>> Disabling C code for Elemwise{Cast{float16}} due to unsupported float16
>> Disabling C code for Elemwise{Cast{float16}} due to unsupported float16
>> Disabling C code for Alloc due to unsupported float16
>> Disabling C code for Cast{float16} due to unsupported float16
>> Disabling C code for Cast{float16} due to unsupported float16
>> Disabling C code for Cast{float16} due to unsupported float16
>> Disabling C code for Cast{float16} due to unsupported float16
>> Disabling C code for RandomFunction{binomial} due to unsupported float16
>> Disabling C code for RandomFunction{binomial} due to unsupported float16
>> ===============================
>> #include <Python.h>
>> #include <iostream>
>> #include "theano_mod_helper.h"
>> #include <gpuarray/array.h>
>> #include <gpuarray/kernel.h>
>> #include <gpuarray/error.h>
>> #include <gpuarray/buffer.h>
>> #include <gpuarray/buffer_blas.h>
>> #include <numpy/arrayobject.h>
>> #include <gpuarray_api.h>
>> #include <math.h>
>> #include <numpy/arrayscalars.h>
>> #include "cudnn.h"
>> #include "cudnn_helper.h"
>> #include "gpuarray_helper.h"
>> #include "gpuarray/types.h"
>> #include "gpuarray/array.h"
>> #include "gpuarray/util.h"
>> #include "gpuarray/ext_cuda.h"
>> #include "gpuarray_api.h"
>> #include "numpy_compat.h"
>> //////////////////////
>> //// Support Code
>> //////////////////////
>>
>> static int
>> c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
>>   cudnnDataType_t dt;
>>   size_t ds;
>>   switch (var->ga.typecode) {
>>   case GA_FLOAT:
>>     dt = CUDNN_DATA_FLOAT;
>>     break;
>>   case GA_DOUBLE:
>>     dt = CUDNN_DATA_DOUBLE;
>>     break;
>> #if CUDNN_VERSION > 3000
>>   case GA_HALF:
>>     dt = CUDNN_DATA_HALF;
>>     break;
>> #endif
>>   default:
>>     PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensorNd");
>>     return -1;
>>   }
>>   ds = gpuarray_get_elsize(var->ga.typecode);
>>
>>   int strs[5], dims[5], default_stride = 1;
>>   unsigned int nd = PyGpuArray_NDIM(var);
>>
>>   if (nd > 5) {
>>     PyErr_SetString(PyExc_TypeError, "Tensor of more than 5d");
>>     return -1;
>>   }
>>
>>   for (unsigned int _i = nd; _i > 0; _i--) {
>>     unsigned int i = _i - 1;
>>     strs[i] = PyGpuArray_STRIDE(var, i) ?
>>       PyGpuArray_STRIDE(var, i)/ds : default_stride;
>>     default_stride *= PyGpuArray_DIM(var, i);
>>     dims[i] = PyGpuArray_DIM(var, i);
>>   }
>>
>>   cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, dt, nd, dims, strs);
>>   if (err != CUDNN_STATUS_SUCCESS) {
>>     PyErr_Format(PyExc_RuntimeError,
>>                  "Could not set tensorNd descriptor: %s",
>>                  cudnnGetErrorString(err));
>>     return -1;
>>   }
>>   return 0;
>> }
>>
>> static int
>> c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
>>   cudnnDataType_t dt;
>>   cudnnStatus_t err;
>>
>>   if (!GpuArray_IS_C_CONTIGUOUS(&var->ga)) {
>>     PyErr_SetString(PyExc_ValueError,
>>                     "Only contiguous filters (kernels) are supported.");
>>     return -1;
>>   }
>>   switch (var->ga.typecode) {
>>   case GA_FLOAT:
>>     dt = CUDNN_DATA_FLOAT;
>>     break;
>>   case GA_DOUBLE:
>>     dt = CUDNN_DATA_DOUBLE;
>>     break;
>> #if CUDNN_VERSION > 3000
>>   case GA_HALF:
>>     dt = CUDNN_DATA_HALF;
>>     break;
>> #endif
>>   default:
>>     PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_filter");
>>     return -1;
>>   }
>>
>>   int dims[5];
>>   unsigned int nd = PyGpuArray_NDIM(var);
>>
>>   if (nd > 5) {
>>     PyErr_SetString(PyExc_TypeError, "Tensor of more than 5d");
>>     return -1;
>>   }
>>
>>   for (unsigned int _i = nd; _i > 0; _i--) {
>>     unsigned int i = _i - 1;
>>     dims[i] = PyGpuArray_DIM(var, i);
>>   }
>>
>> #if CUDNN_VERSION >= 5000
>>   err = cudnnSetFilterNdDescriptor(desc, dt, CUDNN_TENSOR_NCHW, nd, dims);
>> #else
>>   err = cudnnSetFilterNdDescriptor(desc, dt, nd, dims);
>> #endif
>>
>>   if (err != CUDNN_STATUS_SUCCESS) {
>>     PyErr_Format(PyExc_RuntimeError,
>>                  "Could not set filter descriptor: %s.",
>>                  cudnnGetErrorString(err));
>>     return -1;
>>   }
>>   return 0;
>> }
>>
>> namespace {
>> struct __struct_compiled_op_86feacd077d8749f42b5d82709a80ba3 {
>>   PyObject* __ERROR;
>>
>>   PyObject* storage_V3;
>>   PyObject* storage_V5;
>>   PyObject* storage_V7;
>>   PyObject* storage_V9;
>>   PyObject* storage_V11;
>>   PyObject* storage_V13;
>>   PyObject* storage_V1;
>>   PyObject* storage_V15;
>>
>>   PyObject* py_V15;
>>   PyGpuContextObject *V15;
>> #define DTYPE_INPUT_0 npy_float16
>> #define TYPENUM_INPUT_0 23
>> #define ITEMSIZE_INPUT_0 2
>> #define DTYPE_INPUT_1 npy_float16
>> #define TYPENUM_INPUT_1 23
>> #define ITEMSIZE_INPUT_1 2
>> #define DTYPE_INPUT_2 npy_float16
>> #define TYPENUM_INPUT_2 23
>> #define ITEMSIZE_INPUT_2 2
>> #define DTYPE_INPUT_4 npy_float16
>> #define TYPENUM_INPUT_4 23
>> #define ITEMSIZE_INPUT_4 2
>> #define DTYPE_INPUT_5 npy_float16
>> #define TYPENUM_INPUT_5 23
>> #define ITEMSIZE_INPUT_5 2
>> #define DTYPE_OUTPUT_0 npy_float16
>> #define TYPENUM_OUTPUT_0 23
>> #define ITEMSIZE_OUTPUT_0 2
>> #define APPLY_SPECIFIC(str) str##_node_86feacd077d8749f42b5d82709a80ba3_0
>> #define CONV_INPLACE 1
>> #define CONV_ALGO CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
>>
>>   PyGpuContextObject *ctx;
>>   cudnnHandle_t APPLY_SPECIFIC(_handle);
>>
>>   cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
>>   cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
>>   cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns);
>>
>> #ifdef CHOOSE_ALGO
>>   int reuse_algo;
>>   cudnnConvolutionFwdAlgo_t prev_algo;
>> #ifndef CHOOSE_ONCE
>>   size_t prev_img_dims[5];
>>   size_t prev_kern_dims[5];
>> #endif
>> #endif
>>
>>   int
>>   APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
>>                            PyGpuArrayObject *om,
>>                            cudnnConvolutionDescriptor_t desc,
>>                            double alpha, double beta,
>>                            PyGpuArrayObject **output,
>>                            PyGpuContextObject *c) {
>>     cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
>>     float af = alpha, bf = beta;
>>     void *alpha_p;
>>     void *beta_p;
>>
>>     if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
>>       PyErr_SetString(PyExc_ValueError,
>>                       "images and kernel must have the same stack size");
>>       return 1;
>>     }
>>
>>     if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
>>       return 1;
>>     if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
>>       return 1;
>>
>>     switch (input->ga.typecode) {
>>     case GA_DOUBLE:
>>       alpha_p = (void *)&alpha;
>>       beta_p = (void *)&beta;
>>       break;
>>     case GA_FLOAT:
>>     case GA_HALF:
>>       alpha_p = (void *)&af;
>>       beta_p = (void *)&bf;
>>       break;
>>     default:
>>       PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
>>       return 1;
>>     }
>>
>> #ifdef CONV_INPLACE
>>     Py_XDECREF(*output);
>>     *output = om;
>>     Py_INCREF(*output);
>> #else
>>     if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
>>                            om->ga.typecode, GA_C_ORDER, c) != 0)
>>       return 1;
>>     if (beta != 0.0 && pygpu_move(*output, om))
>>       return 1;
>> #endif
>>
>>     if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1)
>>       return 1;
>>
>>     cudnnConvolutionFwdAlgo_t algo = CONV_ALGO;
>>
>>     cuda_enter(c->ctx);
>> #ifdef CHOOSE_ALGO
>> #ifndef CHOOSE_ONCE
>>     reuse_algo = 1;
>>     for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
>>       reuse_algo = (reuse_algo &&
>>                     PyGpuArray_DIM(input, i) == prev_img_dims[i]);
>>       reuse_algo = (reuse_algo &&
>>                     PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
>>     }
>> #endif
>>
>>     if (!reuse_algo) {
>> #ifdef CHOOSE_TIME
>>       int count;
>>       cudnnConvolutionFwdAlgoPerf_t choice;
>>       err = cudnnFindConvolutionForwardAlgorithm(
>>           APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
>>           desc,
>>           APPLY_SPECIFIC(output), 1, &count, &choice);
>>
>>       if (err != CUDNN_STATUS_SUCCESS) {
>>         PyErr_Format(PyExc_RuntimeError,
>>                      "error selecting convolution algo: %s",
>>                      cudnnGetErrorString(err));
>>         cuda_exit(c->ctx);
>>         return 1;
>>       }
>>       algo = choice.algo;
>> #else
>>       size_t free;
>>       int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_FREE_GMEM, &free);
>>
>>       if (err2 != GA_NO_ERROR) {
>>         PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
>>                      "memory information on the GPU");
>>         cuda_exit(c->ctx);
>>         return 1;
>>       }
>>
>>       err = cudnnGetConvolutionForwardAlgorithm(
>>           APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
>>           desc, APPLY_SPECIFIC(output),
>>           CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
>>       if (err != CUDNN_STATUS_SUCCESS) {
>>         PyErr_Format(PyExc_RuntimeError,
>>                      "error selecting convolution algo: %s",
>>                      cudnnGetErrorString(err));
>>         cuda_exit(c->ctx);
>>         return 1;
>>       }
>> #endif
>>       prev_algo = algo;
>>     } else {
>>       algo = prev_algo;
>>     }
>>
>> #ifdef CHOOSE_ONCE
>>     reuse_algo = 1;
>> #else
>>     for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
>>       prev_img_dims[i] = PyGpuArray_DIM(input, i);
>>       prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
>>     }
>> #endif
>>
>> #endif
>>
>>     /* These two algos are not supported for 3d conv */
>>     if (PyGpuArray_NDIM(input) == 5 &&
>>         (algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
>>          algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM))
>>       algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
>>
>>     // The FFT implementation does not support strides, 1x1 filters or inputs
>>     // with a spatial dimension larger than 1024. The tiled-FFT implementation
>>     // does not support strides.
>>     // If the chosen implementation is FFT or tiled-FFT, validate that it can
>>     // be used on the current data and default to a safe implementation if it
>>     // can't.
>>     // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
>>     // defined only for 2d filters
>>     if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
>>          algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) &&
>>         PyGpuArray_NDIM(input) == 4) {
>>
>>       // Extract the properties of the convolution descriptor
>>       int nd;
>>       int pad[2];
>>       int stride[2];
>>       int upscale[2];
>>       cudnnConvolutionMode_t mode;
>>       cudnnDataType_t data_type;
>>       err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
>>                                             upscale, &mode, &data_type);
>>       if (err != CUDNN_STATUS_SUCCESS) {
>>         PyErr_Format(PyExc_RuntimeError,
>>                      "error getting convolution properties: %s",
>>                      cudnnGetErrorString(err));
>>         cuda_exit(c->ctx);
>>         return 1;
>>       }
>>
>>       if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT)
>>       {
>>         if (stride[0] != 1 || stride[1] != 1 ||
>>             PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
>>             (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
>>         {
>>           algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
>>         }
>>       }
>>       else
>>       {
>>         // algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
>>         if (stride[0] != 1 || stride[1] != 1)
>>         {
>>           algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
>>         }
>>       }
>>     }
>>
>>     {
>>       size_t worksize;
>>       gpudata *workspace;
>>       err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
>>                                                     APPLY_SPECIFIC(input),
>>                                                     APPLY_SPECIFIC(kerns),
>>                                                     desc,
>>                                                     APPLY_SPECIFIC(output),
>>                                                     algo,
>>                                                     &worksize);
>>
>>       if (err == CUDNN_STATUS_NOT_SUPPORTED) {
>>         // Fallback to none algo if not supported
>>         // TODO: Print a warning
>>         algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
>>
>>         err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
>>                                                       APPLY_SPECIFIC(input),
>>                                                       APPLY_SPECIFIC(kerns),
>>                                                       desc,
>>                                                       APPLY_SPECIFIC(output),
>>                                                       algo,
>>                                                       &worksize);
>>       }
>>
>>       if (err != CUDNN_STATUS_SUCCESS) {
>>         PyErr_Format(PyExc_RuntimeError,
>>                      "error getting worksize: %s",
>>                      cudnnGetErrorString(err));
>>         cuda_exit(c->ctx);
>>         return 1;
>>       }
>>
>>       /*
>>        * This is less than ideal since we need to free it after (which
>>        * introduces a synchronization point. But we don't have a module
>>        * to place a nice get_work_mem() function in.
>>        */
>>       if (worksize != 0) {
>>         workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
>>         if (workspace == NULL) {
>>           PyErr_SetString(PyExc_RuntimeError,
>>                           "Could not allocate working memory");
>>           cuda_exit(c->ctx);
>>           return 1;
>>         }
>>       }
>>
>>       cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
>>       cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
>>       cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
>>
>>       err = cudnnConvolutionForward(
>>           APPLY_SPECIFIC(_handle),
>>           alpha_p,
>>           APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
>>           APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
>>           desc, algo,
>>           worksize == 0 ? NULL : *(void **)workspace, worksize,
>>           beta_p,
>>           APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output));
>>
>>       if (worksize != 0)
>>         gpudata_release(workspace);
>>
>>       cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
>>       cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
>>       cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
>>     }
>>     cuda_exit(c->ctx);
>>
>>     if (err != CUDNN_STATUS_SUCCESS) {
>>       PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
>>                    cudnnGetErrorString(err));
>>       return 1;
>>     }
>>     return 0;
>>   }
>>
>> #undef DTYPE_INPUT_0
>> #undef TYPENUM_INPUT_0
>> #undef ITEMSIZE_INPUT_0
>> #undef DTYPE_INPUT_1
>> #undef TYPENUM_INPUT_1
>> #undef ITEMSIZE_INPUT_1
>> #undef DTYPE_INPUT_2
>> #undef TYPENUM_INPUT_2
>> #undef ITEMSIZE_INPUT_2
>> #undef DTYPE_INPUT_4
>> #undef TYPENUM_INPUT_4
>> #undef ITEMSIZE_INPUT_4
>> #undef DTYPE_INPUT_5
>> #undef TYPENUM_INPUT_5
>> #undef ITEMSIZE_INPUT_5
>> #undef DTYPE_OUTPUT_0
>> #undef TYPENUM_OUTPUT_0
>> #undef ITEMSIZE_OUTPUT_0
>> #undef APPLY_SPECIFIC
>> #undef CONV_INPLACE
>> #undef CONV_ALGO
>>
>>   __struct_compiled_op_86feacd077d8749f42b5d82709a80ba3() {
>>     // This is only somewhat safe because we:
>>     // 1) Are not a virtual class
>>     // 2) Do not use any virtual classes in the members
>>     // 3) Deal with mostly POD and pointers
>>
>>     // If this changes, we would have to revise this, but for
>>     // now I am tired of chasing segfaults because
>>     // initialization code had an error and some pointer has
>>     // a junk value.
>>     memset(this, 0, sizeof(*this));
>>   }
>>   ~__struct_compiled_op_86feacd077d8749f42b5d82709a80ba3(void) {
>>     cleanup();
>>   }
>>
>>   int init(PyObject* __ERROR, PyObject* storage_V3, PyObject* storage_V5, PyObject* storage_V7, PyObject* storage_V9, PyObject* storage_V11, PyObject* storage_V13, PyObject* storage_V1, PyObject* storage_V15) {
>>     Py_XINCREF(storage_V3);
>>     Py_XINCREF(storage_V5);
>>     Py_XINCREF(storage_V7);
>>     Py_XINCREF(storage_V9);
>>     Py_XINCREF(storage_V11);
>>     Py_XINCREF(storage_V13);
>>     Py_XINCREF(storage_V1);
>>     Py_XINCREF(storage_V15);
>>     this->storage_V3 = storage_V3;
>>     this->storage_V5 = storage_V5;
>>     this->storage_V7 = storage_V7;
>>     this->storage_V9 = storage_V9;
>>     this->storage_V11 = storage_V11;
>>     this->storage_V13 = storage_V13;
>>     this->storage_V1 = storage_V1;
>>     this->storage_V15 = storage_V15;
>>
>>     py_V15 = PyList_GET_ITEM(storage_V15, 0);
>>     {Py_XINCREF(py_V15);}
>>
>>     if (!PyObject_TypeCheck(py_V15, &PyGpuContextType)) {
>>       PyErr_SetString(PyExc_TypeError, "expected a GpuContext");
>>       {
>>         if (!PyErr_Occurred()) {
>>           PyErr_SetString(PyExc_RuntimeError,
>>                           "Unexpected error in an Op's C code. "
" >> 00517 "No Python exception was set."); >> 00518 } >> 00519 return 15; >> 00520 } >> 00521 } >> 00522 >> 00523 V15 = (PyGpuContextObject *)py_V15; >> 00524 Py_INCREF(V15); >> 00525 >> 00526 >> 00527 #define DTYPE_INPUT_0 npy_float16 >> 00528 #define TYPENUM_INPUT_0 23 >> 00529 #define ITEMSIZE_INPUT_0 2 >> 00530 #define DTYPE_INPUT_1 npy_float16 >> 00531 #define TYPENUM_INPUT_1 23 >> 00532 #define ITEMSIZE_INPUT_1 2 >> 00533 #define DTYPE_INPUT_2 npy_float16 >> 00534 #define TYPENUM_INPUT_2 23 >> 00535 #define ITEMSIZE_INPUT_2 2 >> 00536 #define DTYPE_INPUT_4 npy_float16 >> 00537 #define TYPENUM_INPUT_4 23 >> 00538 #define ITEMSIZE_INPUT_4 2 >> 00539 #define DTYPE_INPUT_5 npy_float16 >> 00540 #define TYPENUM_INPUT_5 23 >> 00541 #define ITEMSIZE_INPUT_5 2 >> 00542 #define DTYPE_OUTPUT_0 npy_float16 >> 00543 #define TYPENUM_OUTPUT_0 23 >> 00544 #define ITEMSIZE_OUTPUT_0 2 >> 00545 #define APPLY_SPECIFIC(str) >> str##_node_86feacd077d8749f42b5d82709a80ba3_0 >> 00546 #define CONV_INPLACE 1 >> 00547 #define CONV_ALGO >> CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM >> 00548 #define FAIL { \ >> 00549 if (!PyErr_Occurred()) { \ >> 00550 PyErr_SetString(PyExc_RuntimeError, \ >> 00551 "Unexpected error in an Op's C code. " \ >> 00552 "No Python exception was set."); \ >> 00553 } \ >> 00554 return 17; \ >> 00555 } >> 00556 #define PARAMS V15 >> 00557 >> 00558 >> 00559 { >> 00560 // We need to keep a reference here to have it available in >> the destructor. >> 00561 ctx = PARAMS; >> 00562 Py_INCREF(ctx); >> 00563 >> 00564 cuda_enter(PARAMS->ctx); >> 00565 cudnnStatus_t err; >> 00566 APPLY_SPECIFIC(_handle) = NULL; >> 00567 if ((err = cudnnCreate(&APPLY_SPECIFIC(_handle))) != >> CUDNN_STATUS_SUCCESS) { >> 00568 PyErr_Format(PyExc_RuntimeError, "could not create cuDNN >> handle: %s", >> 00569 cudnnGetErrorString(err)); >> 00570 cuda_exit(PARAMS->ctx); >> 00571 FAIL; >> 00572 } >> 00573 if ((err = cudnnSetStream(APPLY_SPECIFIC(_handle), >> 00574 cuda_get_stream(PARAMS->ctx))) != >> CUDNN_STATUS_SUCCESS) { >> 00575 PyErr_Format(PyExc_RuntimeError, "Could not set cudnn >> stream: %s", >> 00576 cudnnGetErrorString(err)); >> 00577 cuda_exit(PARAMS->ctx); >> 00578 FAIL; >> 00579 } >> 00580 cuda_exit(PARAMS->ctx); >> 00581 } >> 00582 >> 00583 >> 00584 >> 00585 cudnnStatus_t APPLY_SPECIFIC(err); >> 00586 APPLY_SPECIFIC(input) = NULL; >> 00587 APPLY_SPECIFIC(output) = NULL; >> 00588 APPLY_SPECIFIC(kerns) = NULL; >> 00589 if ((APPLY_SPECIFIC(err) = >> cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != >> CUDNN_STATUS_SUCCESS) { >> 00590 PyErr_Format(PyExc_MemoryError, "could not allocate tensor >> descriptor " >> 00591 "(inp): %s", >> cudnnGetErrorString(APPLY_SPECIFIC(err))); >> 00592 FAIL; >> 00593 } >> 00594 if ((APPLY_SPECIFIC(err) = >> cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != >> CUDNN_STATUS_SUCCESS) { >> 00595 PyErr_Format(PyExc_MemoryError, "could not allocate tensor >> descriptor " >> 00596 "(out): %s", >> cudnnGetErrorString(APPLY_SPECIFIC(err))); >> 00597 FAIL; >> 00598 } >> 00599 if ((APPLY_SPECIFIC(err) = >> cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns))) != >> CUDNN_STATUS_SUCCESS) { >> 00600 PyErr_Format(PyExc_MemoryError, "could not allocate filter >> descriptor: %s", >> 00601 cudnnGetErrorString(APPLY_SPECIFIC(err))); >> 00602 FAIL; >> 00603 } >> 00604 >> 00605 >> 00606 >> 00607 #ifdef CHOOSE_ALGO >> 00608 reuse_algo = 0; >> 00609 prev_algo = CONV_ALGO; >> 00610 #ifndef CHOOSE_ONCE >> 00611 memset(prev_img_dims, 0, sizeof(prev_img_dims)); >> 00612 
>>     memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
>> #endif
>> #endif
>>
>> #undef FAIL
>> #undef PARAMS
>> #undef DTYPE_INPUT_0
>> #undef TYPENUM_INPUT_0
>> #undef ITEMSIZE_INPUT_0
>> #undef DTYPE_INPUT_1
>> #undef TYPENUM_INPUT_1
>> #undef ITEMSIZE_INPUT_1
>> #undef DTYPE_INPUT_2
>> #undef TYPENUM_INPUT_2
>> #undef ITEMSIZE_INPUT_2
>> #undef DTYPE_INPUT_4
>> #undef TYPENUM_INPUT_4
>> #undef ITEMSIZE_INPUT_4
>> #undef DTYPE_INPUT_5
>> #undef TYPENUM_INPUT_5
>> #undef ITEMSIZE_INPUT_5
>> #undef DTYPE_OUTPUT_0
>> #undef TYPENUM_OUTPUT_0
>> #undef ITEMSIZE_OUTPUT_0
>> #undef APPLY_SPECIFIC
>> #undef CONV_INPLACE
>> #undef CONV_ALGO
>>     this->__ERROR = __ERROR;
>>     return 0;
>>   }
>>   void cleanup(void) {
>>     __label_1:
>>
>>     double __DUMMY_1;
>>     __label_3:
>>
>>     double __DUMMY_3;
>>     __label_5:
>>
>>     double __DUMMY_5;
>>     __label_7:
>>
>>     double __DUMMY_7;
>>     __label_9:
>>
>>     double __DUMMY_9;
>>     __label_11:
>>
>>     double __DUMMY_11;
>>     __label_13:
>>
>>     double __DUMMY_13;
>>     __label_15:
>>     Py_XDECREF(V15); V15 = NULL;
>>     {Py_XDECREF(py_V15);}
>>
>>     double __DUMMY_15;
>>     __label_18:
>>
>> #define DTYPE_INPUT_0 npy_float16
>> #define TYPENUM_INPUT_0 23
>> #define ITEMSIZE_INPUT_0 2
>> #define DTYPE_INPUT_1 npy_float16
>> #define TYPENUM_INPUT_1 23
>> #define ITEMSIZE_INPUT_1 2
>> #define DTYPE_INPUT_2 npy_float16
>> #define TYPENUM_INPUT_2 23
>> #define ITEMSIZE_INPUT_2 2
>> #define DTYPE_INPUT_4 npy_float16
>> #define TYPENUM_INPUT_4 23
>> #define ITEMSIZE_INPUT_4 2
>> #define DTYPE_INPUT_5 npy_float16
>> #define TYPENUM_INPUT_5 23
>> #define ITEMSIZE_INPUT_5 2
>> #define DTYPE_OUTPUT_0 npy_float16
>> #define TYPENUM_OUTPUT_0 23
>> #define ITEMSIZE_OUTPUT_0 2
>> #define APPLY_SPECIFIC(str) str##_node_86feacd077d8749f42b5d82709a80ba3_0
>> #define CONV_INPLACE 1
>> #define CONV_ALGO CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
>>
>>     cuda_enter(ctx->ctx);
>>     cudnnDestroy(APPLY_SPECIFIC(_handle));
>>     cuda_exit(ctx->ctx);
>>     Py_DECREF((PyObject *)ctx);
>>
>>     if (APPLY_SPECIFIC(input) != NULL)
>>       cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input));
>>     if (APPLY_SPECIFIC(output) != NULL)
>>       cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output));
>>     if (APPLY_SPECIFIC(kerns) != NULL)
>>       cudnnDestroyFilterDescriptor(APPLY_SPECIFIC(kerns));
>>
>> #undef DTYPE_INPUT_0
>> #undef TYPENUM_INPUT_0
>> #undef ITEMSIZE_INPUT_0
>> #undef DTYPE_INPUT_1
>> #undef TYPENUM_INPUT_1
>> #undef ITEMSIZE_INPUT_1
>> #undef DTYPE_INPUT_2
>> #undef TYPENUM_INPUT_2
>> #undef ITEMSIZE_INPUT_2
>> #undef DTYPE_INPUT_4
>> #undef TYPENUM_INPUT_4
>> #undef ITEMSIZE_INPUT_4
>> #undef DTYPE_INPUT_5
>> #undef TYPENUM_INPUT_5
>> #undef ITEMSIZE_INPUT_5
>> #undef DTYPE_OUTPUT_0
>> #undef TYPENUM_OUTPUT_0
>> #undef ITEMSIZE_OUTPUT_0
>> #undef APPLY_SPECIFIC
>> #undef CONV_INPLACE
>> #undef CONV_ALGO
>>     double __DUMMY_18;
>>
>>     Py_XDECREF(this->storage_V3);
>>     Py_XDECREF(this->storage_V5);
>>     Py_XDECREF(this->storage_V7);
>>     Py_XDECREF(this->storage_V9);
>>     Py_XDECREF(this->storage_V11);
>>     Py_XDECREF(this->storage_V13);
>>     Py_XDECREF(this->storage_V1);
>>     Py_XDECREF(this->storage_V15);
>>   }
>>   int run(void) {
>>     int __failure = 0;
>>
>>     PyObject* py_V1;
>>
>>     PyGpuArrayObject *V1;
>>
>>     PyObject* py_V3;
>>
>>     PyGpuArrayObject *V3;
>>
>>     PyObject* py_V5;
>>
>>     PyGpuArrayObject *V5;
>>
>>     PyObject* py_V7;
>>
>>     PyGpuArrayObject *V7;
>>
>>     PyObject* py_V9;
>>
>>     cudnnConvolutionDescriptor_t V9;
>>
>>     PyObject* py_V11;
>>
>>     typedef npy_float16 V11_dtype; // Deprecated use dtype_V11 instead.
>>     typedef npy_float16 dtype_V11;
>>
>>     npy_float16 V11;
>>
>>     PyObject* py_V13;
>>
>>     typedef npy_float16 V13_dtype; // Deprecated use dtype_V13 instead.
>>     typedef npy_float16 dtype_V13;
>>
>>     npy_float16 V13;
>>
>>     {
>>
>>     py_V1 = PyList_GET_ITEM(storage_V1, 0);
>>     {Py_XINCREF(py_V1);}
>>
>>     if (py_V1 == Py_None)
>>     {
>>       V1 = NULL;
>>     }
>>     else
>>     {
>>
>>       V1 = NULL;
>>       if (py_V1 == Py_None) {
>>         PyErr_SetString(PyExc_ValueError, "expected a GpuArray, not None");
>>         {
>>           __failure = 2;
>>           if (!PyErr_Occurred()) {
>>             PyErr_SetString(PyExc_RuntimeError,
>>                             "Unexpected error in an Op's C code. "
>>                             "No Python exception was set.");
>>           }
>>           goto __label_2;}
>>       }
>>       /* First check if we are the base type exactly (the most common case),
>>          then do the full subclass check if needed. */
>>       if (py_V1->ob_type != &PyGpuArrayType &&
>>           !PyObject_TypeCheck(py_V1, &PyGpuArrayType)) {
>>         PyE
