Frederic, I'll do it and give you feedback, many thanks Luca On Tuesday, July 19, 2016 at 10:09:21 PM UTC+2, nouiz wrote: > > We have a PR that upgrades some stuff about float16: > > https://github.com/Theano/Theano/pull/4764/files > > It probably fixes your problem. Can you try it to confirm that you don't > have a different problem? > > thanks > > Frédéric > > On Fri, Jul 15, 2016 at 4:55 AM, <[email protected] <javascript:>> > wrote: > >> ok I try. >> thanks >> >> On Thursday, July 14, 2016 at 11:44:41 PM UTC+2, Arnaud Bergeron wrote: >>> >>> I can't reproduce your problem using a simple convolution in float16. >>> >>> Either this is because your code is doing something unexpected or >>> because the problem has been fixed in the development version. >>> >>> In any case the development version is a much better option for the new >>> backend and float16 so I encourage you to upgrade and try again: >>> http://deeplearning.net/software/theano/install.html#bleeding-edge-install-instructions >>> . >>> >>> 2016-07-14 4:22 GMT-04:00 <[email protected]>: >>> >>>> Here is .theanorc: >>>> >>>> [global] >>>> floatX = float16 >>>> device=cuda >>>> [cuda] >>>> root = /usr/local/cuda-7.5 >>>> >>>> >>>> [nvcc] >>>> fastmath=True >>>> >>>> optimizer = fast_compile >>>> >>>> On Thursday, July 14, 2016 at 10:19:56 AM UTC+2, [email protected] >>>> wrote: >>>>> >>>>> Hi Arnaud, >>>>> I put _f16_ok = True in dnn.py ( attached). >>>>> >>>>> This is the error I received: >>>>> >>>>> Python 2.7.11 |Anaconda custom (64-bit)| (default, Dec 6 2015, >>>>> 18:08:32) >>>>> [GCC 4.4.7 20120313 (Red Hat 4.4.7-1)] on linux2 >>>>> Type "help", "copyright", "credits" or "license" for more information. >>>>> Anaconda is brought to you by Continuum Analytics. 
>>>>> Please check out: http://continuum.io/thanks and https://anaconda.org >>>>> >>> import run_multi_conv >>>>> >>>>> Mapped name None to device cuda: GeForce 840M >>>>> WARNING (theano.gof.compilelock): Overriding existing lock by dead >>>>> process '3202' (I am process '3351') >>>>> Using cuDNN version 5005 on context None >>>>> /home/luca/data/Theano-master/theano/tensor/signal/downsample.py:6: >>>>> UserWarning: downsample module has been moved to the >>>>> theano.tensor.signal.pool module. >>>>> "downsample module has been moved to the theano.tensor.signal.pool >>>>> module.") >>>>> >>> >>>>> >>> run_multi_conv.run_experiments() >>>>> Disabling C code for Elemwise{mul,no_inplace} due to unsupported >>>>> float16 >>>>> Disabling C code for Elemwise{Cast{float32}} due to unsupported float16 >>>>> Disabling C code for Elemwise{Cast{float16}} due to unsupported float16 >>>>> Disabling C code for Elemwise{Cast{float16}} due to unsupported float16 >>>>> Disabling C code for Alloc due to unsupported float16 >>>>> Disabling C code for Cast{float16} due to unsupported float16 >>>>> Disabling C code for Cast{float16} due to unsupported float16 >>>>> Disabling C code for Cast{float16} due to unsupported float16 >>>>> Disabling C code for Cast{float16} due to unsupported float16 >>>>> Disabling C code for RandomFunction{binomial} due to unsupported >>>>> float16 >>>>> Disabling C code for RandomFunction{binomial} due to unsupported >>>>> float16 >>>>> =============================== >>>>> 00001 #include <Python.h> >>>>> 00002 #include <iostream> >>>>> 00003 #include "theano_mod_helper.h" >>>>> 00004 #include <gpuarray/array.h> >>>>> 00005 #include <gpuarray/kernel.h> >>>>> 00006 #include <gpuarray/error.h> >>>>> 00007 #include <gpuarray/buffer.h> >>>>> 00008 #include <gpuarray/buffer_blas.h> >>>>> 00009 #include <numpy/arrayobject.h> >>>>> 00010 #include <gpuarray_api.h> >>>>> 00011 #include <math.h> >>>>> 00012 #include <numpy/arrayscalars.h> >>>>> 00013 #include 
"cudnn.h" >>>>> 00014 #include "cudnn_helper.h" >>>>> 00015 #include "gpuarray_helper.h" >>>>> 00016 #include "gpuarray/types.h" >>>>> 00017 #include "gpuarray/array.h" >>>>> 00018 #include "gpuarray/util.h" >>>>> 00019 #include "gpuarray/ext_cuda.h" >>>>> 00020 #include "gpuarray_api.h" >>>>> 00021 #include "numpy_compat.h" >>>>> 00022 ////////////////////// >>>>> 00023 //// Support Code >>>>> 00024 ////////////////////// >>>>> 00025 >>>>> 00026 >>>>> 00027 >>>>> 00028 static int >>>>> 00029 c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t >>>>> desc) { >>>>> 00030 cudnnDataType_t dt; >>>>> 00031 size_t ds; >>>>> 00032 switch (var->ga.typecode) { >>>>> 00033 case GA_FLOAT: >>>>> 00034 dt = CUDNN_DATA_FLOAT; >>>>> 00035 break; >>>>> 00036 case GA_DOUBLE: >>>>> 00037 dt = CUDNN_DATA_DOUBLE; >>>>> 00038 break; >>>>> 00039 #if CUDNN_VERSION > 3000 >>>>> 00040 case GA_HALF: >>>>> 00041 dt = CUDNN_DATA_HALF; >>>>> 00042 break; >>>>> 00043 #endif >>>>> 00044 default: >>>>> 00045 PyErr_SetString(PyExc_TypeError, "Non-float datatype in >>>>> c_set_tensorNd"); >>>>> 00046 return -1; >>>>> 00047 } >>>>> 00048 ds = gpuarray_get_elsize(var->ga.typecode); >>>>> 00049 >>>>> 00050 int strs[5], dims[5], default_stride = 1; >>>>> 00051 unsigned int nd = PyGpuArray_NDIM(var); >>>>> 00052 >>>>> 00053 if (nd > 5) { >>>>> 00054 PyErr_SetString(PyExc_TypeError, "Tensor of more than >>>>> 5d"); >>>>> 00055 return -1; >>>>> 00056 } >>>>> 00057 >>>>> 00058 for (unsigned int _i = nd; _i > 0; _i--) { >>>>> 00059 unsigned int i = _i - 1; >>>>> 00060 strs[i] = PyGpuArray_STRIDE(var, i) ? 
>>>>> 00061 PyGpuArray_STRIDE(var, i)/ds : default_stride; >>>>> 00062 default_stride *= PyGpuArray_DIM(var, i); >>>>> 00063 dims[i] = PyGpuArray_DIM(var, i); >>>>> 00064 } >>>>> 00065 >>>>> 00066 cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, dt, >>>>> nd, dims, strs); >>>>> 00067 if (err != CUDNN_STATUS_SUCCESS) { >>>>> 00068 PyErr_Format(PyExc_RuntimeError, >>>>> 00069 "Could not set tensorNd descriptor: %s", >>>>> 00070 cudnnGetErrorString(err)); >>>>> 00071 return -1; >>>>> 00072 } >>>>> 00073 return 0; >>>>> 00074 } >>>>> 00075 >>>>> 00076 static int >>>>> 00077 c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t >>>>> desc) { >>>>> 00078 cudnnDataType_t dt; >>>>> 00079 cudnnStatus_t err; >>>>> 00080 >>>>> 00081 if (!GpuArray_IS_C_CONTIGUOUS(&var->ga)) { >>>>> 00082 PyErr_SetString(PyExc_ValueError, >>>>> 00083 "Only contiguous filters (kernels) are >>>>> supported."); >>>>> 00084 return -1; >>>>> 00085 } >>>>> 00086 switch (var->ga.typecode) { >>>>> 00087 case GA_FLOAT: >>>>> 00088 dt = CUDNN_DATA_FLOAT; >>>>> 00089 break; >>>>> 00090 case GA_DOUBLE: >>>>> 00091 dt = CUDNN_DATA_DOUBLE; >>>>> 00092 break; >>>>> 00093 #if CUDNN_VERSION > 3000 >>>>> 00094 case GA_HALF: >>>>> 00095 dt = CUDNN_DATA_HALF; >>>>> 00096 break; >>>>> 00097 #endif >>>>> 00098 default: >>>>> 00099 PyErr_SetString(PyExc_TypeError, "Non-float datatype in >>>>> c_set_filter"); >>>>> 00100 return -1; >>>>> 00101 } >>>>> 00102 >>>>> 00103 int dims[5]; >>>>> 00104 unsigned int nd = PyGpuArray_NDIM(var); >>>>> 00105 >>>>> 00106 if (nd > 5) { >>>>> 00107 PyErr_SetString(PyExc_TypeError, "Tensor of more than >>>>> 5d"); >>>>> 00108 return -1; >>>>> 00109 } >>>>> 00110 >>>>> 00111 for (unsigned int _i = nd; _i > 0; _i--) { >>>>> 00112 unsigned int i = _i - 1; >>>>> 00113 dims[i] = PyGpuArray_DIM(var, i); >>>>> 00114 } >>>>> 00115 >>>>> 00116 #if CUDNN_VERSION >= 5000 >>>>> 00117 err = cudnnSetFilterNdDescriptor(desc, dt, >>>>> CUDNN_TENSOR_NCHW, nd, dims); >>>>> 00118 #else >>>>> 
00119 err = cudnnSetFilterNdDescriptor(desc, dt, nd, dims); >>>>> 00120 #endif >>>>> 00121 >>>>> 00122 if (err != CUDNN_STATUS_SUCCESS) { >>>>> 00123 PyErr_Format(PyExc_RuntimeError, >>>>> 00124 "Could not set filter descriptor: %s.", >>>>> 00125 cudnnGetErrorString(err)); >>>>> 00126 return -1; >>>>> 00127 } >>>>> 00128 return 0; >>>>> 00129 } >>>>> 00130 >>>>> 00131 >>>>> 00132 >>>>> 00133 namespace { >>>>> 00134 struct >>>>> __struct_compiled_op_86feacd077d8749f42b5d82709a80ba3 { >>>>> 00135 PyObject* __ERROR; >>>>> 00136 >>>>> 00137 PyObject* storage_V3; >>>>> 00138 PyObject* storage_V5; >>>>> 00139 PyObject* storage_V7; >>>>> 00140 PyObject* storage_V9; >>>>> 00141 PyObject* storage_V11; >>>>> 00142 PyObject* storage_V13; >>>>> 00143 PyObject* storage_V1; >>>>> 00144 PyObject* storage_V15; >>>>> 00145 >>>>> 00146 PyObject* py_V15; >>>>> 00147 PyGpuContextObject *V15; >>>>> 00148 #define DTYPE_INPUT_0 npy_float16 >>>>> 00149 #define TYPENUM_INPUT_0 23 >>>>> 00150 #define ITEMSIZE_INPUT_0 2 >>>>> 00151 #define DTYPE_INPUT_1 npy_float16 >>>>> 00152 #define TYPENUM_INPUT_1 23 >>>>> 00153 #define ITEMSIZE_INPUT_1 2 >>>>> 00154 #define DTYPE_INPUT_2 npy_float16 >>>>> 00155 #define TYPENUM_INPUT_2 23 >>>>> 00156 #define ITEMSIZE_INPUT_2 2 >>>>> 00157 #define DTYPE_INPUT_4 npy_float16 >>>>> 00158 #define TYPENUM_INPUT_4 23 >>>>> 00159 #define ITEMSIZE_INPUT_4 2 >>>>> 00160 #define DTYPE_INPUT_5 npy_float16 >>>>> 00161 #define TYPENUM_INPUT_5 23 >>>>> 00162 #define ITEMSIZE_INPUT_5 2 >>>>> 00163 #define DTYPE_OUTPUT_0 npy_float16 >>>>> 00164 #define TYPENUM_OUTPUT_0 23 >>>>> 00165 #define ITEMSIZE_OUTPUT_0 2 >>>>> 00166 #define APPLY_SPECIFIC(str) >>>>> str##_node_86feacd077d8749f42b5d82709a80ba3_0 >>>>> 00167 #define CONV_INPLACE 1 >>>>> 00168 #define CONV_ALGO >>>>> CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM >>>>> 00169 >>>>> 00170 >>>>> 00171 PyGpuContextObject *ctx; >>>>> 00172 cudnnHandle_t APPLY_SPECIFIC(_handle); >>>>> 00173 >>>>> 00174 >>>>> 00175 
cudnnTensorDescriptor_t APPLY_SPECIFIC(input); >>>>> 00176 cudnnTensorDescriptor_t APPLY_SPECIFIC(output); >>>>> 00177 cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns); >>>>> 00178 >>>>> 00179 >>>>> 00180 >>>>> 00181 #ifdef CHOOSE_ALGO >>>>> 00182 int reuse_algo; >>>>> 00183 cudnnConvolutionFwdAlgo_t prev_algo; >>>>> 00184 #ifndef CHOOSE_ONCE >>>>> 00185 size_t prev_img_dims[5]; >>>>> 00186 size_t prev_kern_dims[5]; >>>>> 00187 #endif >>>>> 00188 #endif >>>>> 00189 >>>>> 00190 int >>>>> 00191 APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, >>>>> PyGpuArrayObject *kerns, >>>>> 00192 PyGpuArrayObject *om, >>>>> 00193 cudnnConvolutionDescriptor_t desc, >>>>> 00194 double alpha, double beta, >>>>> 00195 PyGpuArrayObject **output, >>>>> 00196 PyGpuContextObject *c) { >>>>> 00197 cudnnStatus_t err = CUDNN_STATUS_SUCCESS; >>>>> 00198 float af = alpha, bf = beta; >>>>> 00199 void *alpha_p; >>>>> 00200 void *beta_p; >>>>> 00201 >>>>> 00202 if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) >>>>> { >>>>> 00203 PyErr_SetString(PyExc_ValueError, >>>>> 00204 "images and kernel must have the same stack >>>>> size"); >>>>> 00205 return 1; >>>>> 00206 } >>>>> 00207 >>>>> 00208 if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1) >>>>> 00209 return 1; >>>>> 00210 if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) >>>>> 00211 return 1; >>>>> 00212 >>>>> 00213 switch (input->ga.typecode) { >>>>> 00214 case GA_DOUBLE: >>>>> 00215 alpha_p = (void *)&alpha; >>>>> 00216 beta_p = (void *)&beta; >>>>> 00217 break; >>>>> 00218 case GA_FLOAT: >>>>> 00219 case GA_HALF: >>>>> 00220 alpha_p = (void *)&af; >>>>> 00221 beta_p = (void *)&bf; >>>>> 00222 break; >>>>> 00223 default: >>>>> 00224 PyErr_SetString(PyExc_TypeError, "Unsupported type in >>>>> convolution"); >>>>> 00225 return 1; >>>>> 00226 } >>>>> 00227 >>>>> 00228 #ifdef CONV_INPLACE >>>>> 00229 Py_XDECREF(*output); >>>>> 00230 *output = om; >>>>> 00231 Py_INCREF(*output); >>>>> 00232 #else >>>>> 00233 if (theano_prep_output(output, 
PyGpuArray_NDIM(om), >>>>> PyGpuArray_DIMS(om), >>>>> 00234 om->ga.typecode, GA_C_ORDER, c) != 0) >>>>> 00235 return 1; >>>>> 00236 if (beta != 0.0 && pygpu_move(*output, om)) >>>>> 00237 return 1; >>>>> 00238 #endif >>>>> 00239 >>>>> 00240 if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1) >>>>> 00241 return 1; >>>>> 00242 >>>>> 00243 cudnnConvolutionFwdAlgo_t algo = CONV_ALGO; >>>>> 00244 >>>>> 00245 cuda_enter(c->ctx); >>>>> 00246 #ifdef CHOOSE_ALGO >>>>> 00247 #ifndef CHOOSE_ONCE >>>>> 00248 reuse_algo = 1; >>>>> 00249 for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) { >>>>> 00250 reuse_algo = (reuse_algo && >>>>> 00251 PyGpuArray_DIM(input, i) == >>>>> prev_img_dims[i]); >>>>> 00252 reuse_algo = (reuse_algo && >>>>> 00253 PyGpuArray_DIM(kerns, i) == >>>>> prev_kern_dims[i]); >>>>> 00254 } >>>>> 00255 #endif >>>>> 00256 >>>>> 00257 if (!reuse_algo) { >>>>> 00258 #ifdef CHOOSE_TIME >>>>> 00259 int count; >>>>> 00260 cudnnConvolutionFwdAlgoPerf_t choice; >>>>> 00261 err = cudnnFindConvolutionForwardAlgorithm( >>>>> 00262 APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), >>>>> APPLY_SPECIFIC(kerns), >>>>> 00263 desc, APPLY_SPECIFIC(output), 1, &count, &choice); >>>>> 00264 >>>>> 00265 if (err != CUDNN_STATUS_SUCCESS) { >>>>> 00266 PyErr_Format(PyExc_RuntimeError, >>>>> 00267 "error selecting convolution algo: %s", >>>>> 00268 cudnnGetErrorString(err)); >>>>> 00269 cuda_exit(c->ctx); >>>>> 00270 return 1; >>>>> 00271 } >>>>> 00272 algo = choice.algo; >>>>> 00273 #else >>>>> 00274 size_t free; >>>>> 00275 int err2 = gpucontext_property(c->ctx, >>>>> GA_CTX_PROP_FREE_GMEM, &free); >>>>> 00276 >>>>> 00277 if (err2 != GA_NO_ERROR) { >>>>> 00278 PyErr_Format(PyExc_RuntimeError, "Error when trying to >>>>> find the " >>>>> 00279 "memory information on the GPU"); >>>>> 00280 cuda_exit(c->ctx); >>>>> 00281 return 1; >>>>> 00282 } >>>>> 00283 >>>>> 00284 err = cudnnGetConvolutionForwardAlgorithm( >>>>> 00285 APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), 
>>>>> APPLY_SPECIFIC(kerns), >>>>> 00286 desc, APPLY_SPECIFIC(output), >>>>> 00287 CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, >>>>> &algo); >>>>> 00288 if (err != CUDNN_STATUS_SUCCESS) { >>>>> 00289 PyErr_Format(PyExc_RuntimeError, >>>>> 00290 "error selecting convolution algo: %s", >>>>> 00291 cudnnGetErrorString(err)); >>>>> 00292 cuda_exit(c->ctx); >>>>> 00293 return 1; >>>>> 00294 } >>>>> 00295 #endif >>>>> 00296 prev_algo = algo; >>>>> 00297 } else { >>>>> 00298 algo = prev_algo; >>>>> 00299 } >>>>> 00300 >>>>> 00301 #ifdef CHOOSE_ONCE >>>>> 00302 reuse_algo = 1; >>>>> 00303 #else >>>>> 00304 for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) { >>>>> 00305 prev_img_dims[i] = PyGpuArray_DIM(input, i); >>>>> 00306 prev_kern_dims[i] = PyGpuArray_DIM(kerns, i); >>>>> 00307 } >>>>> 00308 #endif >>>>> 00309 >>>>> 00310 #endif >>>>> 00311 >>>>> 00312 /* These two algos are not supported for 3d conv */ >>>>> 00313 if (PyGpuArray_NDIM(input) == 5 && >>>>> 00314 (algo == >>>>> CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM || >>>>> 00315 algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM)) >>>>> 00316 algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; >>>>> 00317 >>>>> 00318 // The FFT implementation does not support strides, 1x1 >>>>> filters or inputs >>>>> 00319 // with a spatial dimension larger than 1024. The tiled-FFT >>>>> implementation >>>>> 00320 // does not support strides. >>>>> 00321 // If the chosen implementation is FFT or tiled-FFT, >>>>> validate that it can >>>>> 00322 // be used on the current data and default to a safe >>>>> implementation if it >>>>> 00323 // can't. 
>>>>> 00324 // The following code is 2d-specific but it is fine as FFT >>>>> and tiled-FFT are >>>>> 00325 // defined only for 2d filters >>>>> 00326 if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT || >>>>> 00327 algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && >>>>> PyGpuArray_NDIM(input) == 4) { >>>>> 00328 >>>>> 00329 // Extract the properties of the convolution descriptor >>>>> 00330 int nd; >>>>> 00331 int pad[2]; >>>>> 00332 int stride[2]; >>>>> 00333 int upscale[2]; >>>>> 00334 cudnnConvolutionMode_t mode; >>>>> 00335 cudnnDataType_t data_type; >>>>> 00336 err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, >>>>> stride, >>>>> 00337 upscale, &mode, >>>>> &data_type); >>>>> 00338 if (err != CUDNN_STATUS_SUCCESS) { >>>>> 00339 PyErr_Format(PyExc_RuntimeError, >>>>> 00340 "error getting convolution properties: %s", >>>>> 00341 cudnnGetErrorString(err)); >>>>> 00342 cuda_exit(c->ctx); >>>>> 00343 return 1; >>>>> 00344 } >>>>> 00345 >>>>> 00346 if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) >>>>> 00347 { >>>>> 00348 if (stride[0] != 1 || stride[1] != 1 || >>>>> 00349 PyGpuArray_DIM(input, 2) > 1024 || >>>>> PyGpuArray_DIM(input, 3) > 1024 || >>>>> 00350 (PyGpuArray_DIM(kerns, 2) == 1 && >>>>> PyGpuArray_DIM(kerns, 3) == 1)) >>>>> 00351 { >>>>> 00352 algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; >>>>> 00353 } >>>>> 00354 } >>>>> 00355 else >>>>> 00356 { >>>>> 00357 // algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING >>>>> 00358 if (stride[0] != 1 || stride[1] != 1) >>>>> 00359 { >>>>> 00360 algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; >>>>> 00361 } >>>>> 00362 } >>>>> 00363 } >>>>> 00364 >>>>> 00365 { >>>>> 00366 size_t worksize; >>>>> 00367 gpudata *workspace; >>>>> 00368 err = >>>>> cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle), >>>>> 00369 >>>>> APPLY_SPECIFIC(input), >>>>> 00370 >>>>> APPLY_SPECIFIC(kerns), >>>>> 00371 desc, >>>>> 00372 >>>>> APPLY_SPECIFIC(output), >>>>> 00373 algo, >>>>> 00374 &worksize); >>>>> 00375 >>>>> 00376 if (err 
== CUDNN_STATUS_NOT_SUPPORTED) { >>>>> 00377 // Fallback to none algo if not supported >>>>> 00378 // TODO: Print a warning >>>>> 00379 algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; >>>>> 00380 >>>>> 00381 err = >>>>> cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle), >>>>> 00382 >>>>> APPLY_SPECIFIC(input), >>>>> 00383 >>>>> APPLY_SPECIFIC(kerns), >>>>> 00384 desc, >>>>> 00385 >>>>> APPLY_SPECIFIC(output), >>>>> 00386 algo, >>>>> 00387 >>>>> &worksize); >>>>> 00388 } >>>>> 00389 >>>>> 00390 if (err != CUDNN_STATUS_SUCCESS) { >>>>> 00391 PyErr_Format(PyExc_RuntimeError, >>>>> 00392 "error getting worksize: %s", >>>>> 00393 cudnnGetErrorString(err)); >>>>> 00394 cuda_exit(c->ctx); >>>>> 00395 return 1; >>>>> 00396 } >>>>> 00397 >>>>> 00398 /* >>>>> 00399 * This is less than ideal since we need to free it after >>>>> (which >>>>> 00400 * introduces a synchronization point. But we don't have >>>>> a module >>>>> 00401 * to place a nice get_work_mem() function in. >>>>> 00402 */ >>>>> 00403 if (worksize != 0) { >>>>> 00404 workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, >>>>> NULL); >>>>> 00405 if (workspace == NULL) { >>>>> 00406 PyErr_SetString(PyExc_RuntimeError, >>>>> 00407 "Could not allocate working memory"); >>>>> 00408 cuda_exit(c->ctx); >>>>> 00409 return 1; >>>>> 00410 } >>>>> 00411 } >>>>> 00412 >>>>> 00413 cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ); >>>>> 00414 cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); >>>>> 00415 cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); >>>>> 00416 >>>>> 00417 err = cudnnConvolutionForward( >>>>> 00418 APPLY_SPECIFIC(_handle), >>>>> 00419 alpha_p, >>>>> 00420 APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), >>>>> 00421 APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), >>>>> 00422 desc, algo, >>>>> 00423 worksize == 0 ? 
NULL : *(void **)workspace, worksize, >>>>> 00424 beta_p, >>>>> 00425 APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output)); >>>>> 00426 >>>>> 00427 if (worksize != 0) >>>>> 00428 gpudata_release(workspace); >>>>> 00429 >>>>> 00430 cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ); >>>>> 00431 cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); >>>>> 00432 cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); >>>>> 00433 } >>>>> 00434 cuda_exit(c->ctx); >>>>> 00435 >>>>> 00436 if (err != CUDNN_STATUS_SUCCESS) { >>>>> 00437 PyErr_Format(PyExc_RuntimeError, "error doing operation: >>>>> %s", >>>>> 00438 cudnnGetErrorString(err)); >>>>> 00439 return 1; >>>>> 00440 } >>>>> 00441 return 0; >>>>> 00442 } >>>>> 00443 >>>>> 00444 #undef DTYPE_INPUT_0 >>>>> 00445 #undef TYPENUM_INPUT_0 >>>>> 00446 #undef ITEMSIZE_INPUT_0 >>>>> 00447 #undef DTYPE_INPUT_1 >>>>> 00448 #undef TYPENUM_INPUT_1 >>>>> 00449 #undef ITEMSIZE_INPUT_1 >>>>> 00450 #undef DTYPE_INPUT_2 >>>>> 00451 #undef TYPENUM_INPUT_2 >>>>> 00452 #undef ITEMSIZE_INPUT_2 >>>>> 00453 #undef DTYPE_INPUT_4 >>>>> 00454 #undef TYPENUM_INPUT_4 >>>>> 00455 #undef ITEMSIZE_INPUT_4 >>>>> 00456 #undef DTYPE_INPUT_5 >>>>> 00457 #undef TYPENUM_INPUT_5 >>>>> 00458 #undef ITEMSIZE_INPUT_5 >>>>> 00459 #undef DTYPE_OUTPUT_0 >>>>> 00460 #undef TYPENUM_OUTPUT_0 >>>>> 00461 #undef ITEMSIZE_OUTPUT_0 >>>>> 00462 #undef APPLY_SPECIFIC >>>>> 00463 #undef CONV_INPLACE >>>>> 00464 #undef CONV_ALGO >>>>> 00465 >>>>> 00466 >>>>> __struct_compiled_op_86feacd077d8749f42b5d82709a80ba3() { >>>>> 00467 // This is only somewhat safe because we: >>>>> 00468 // 1) Are not a virtual class >>>>> 00469 // 2) Do not use any virtual classes in the >>>>> members >>>>> 00470 // 3) Deal with mostly POD and pointers >>>>> 00471 >>>>> 00472 // If this changes, we would have to revise this, >>>>> but for >>>>> 00473 // now I am tired of chasing segfaults because >>>>> 00474 // initialization code had an error and some >>>>> pointer has >>>>> 00475 // a 
junk value. >>>>> 00476 memset(this, 0, sizeof(*this)); >>>>> 00477 } >>>>> 00478 >>>>> ~__struct_compiled_op_86feacd077d8749f42b5d82709a80ba3(void) { >>>>> 00479 cleanup(); >>>>> 00480 } >>>>> 00481 >>>>> 00482 int init(PyObject* __ERROR, PyObject* storage_V3, >>>>> PyObject* storage_V5, PyObject* storage_V7, PyObject* storage_V9, >>>>> PyObject* >>>>> storage_V11, PyObject* storage_V13, PyObject* storage_V1, PyObject* >>>>> storage_V15) { >>>>> 00483 Py_XINCREF(storage_V3); >>>>> 00484 Py_XINCREF(storage_V5); >>>>> 00485 Py_XINCREF(storage_V7); >>>>> 00486 Py_XINCREF(storage_V9); >>>>> 00487 Py_XINCREF(storage_V11); >>>>> 00488 Py_XINCREF(storage_V13); >>>>> 00489 Py_XINCREF(storage_V1); >>>>> 00490 Py_XINCREF(storage_V15); >>>>> 00491 this->storage_V3 = storage_V3; >>>>> 00492 this->storage_V5 = storage_V5; >>>>> 00493 this->storage_V7 = storage_V7; >>>>> 00494 this->storage_V9 = storage_V9; >>>>> 00495 this->storage_V11 = storage_V11; >>>>> 00496 this->storage_V13 = storage_V13; >>>>> 00497 this->storage_V1 = storage_V1; >>>>> 00498 this->storage_V15 = storage_V15; >>>>> 00499 >>>>> 00500 >>>>> 00501 >>>>> 00502 >>>>> 00503 >>>>> 00504 >>>>> 00505 >>>>> 00506 >>>>> 00507 >>>>> 00508 py_V15 = PyList_GET_ITEM(storage_V15, 0); >>>>> 00509 {Py_XINCREF(py_V15);} >>>>> 00510 >>>>> 00511 if (!PyObject_TypeCheck(py_V15, &PyGpuContextType)) { >>>>> 00512 PyErr_SetString(PyExc_TypeError, "expected a GpuContext"); >>>>> 00513 { >>>>> 00514 if (!PyErr_Occurred()) { >>>>> 00515 PyErr_SetString(PyExc_RuntimeError, >>>>> 00516 "Unexpected error in an Op's C code. 
" >>>>> 00517 "No Python exception was set."); >>>>> 00518 } >>>>> 00519 return 15; >>>>> 00520 } >>>>> 00521 } >>>>> 00522 >>>>> 00523 V15 = (PyGpuContextObject *)py_V15; >>>>> 00524 Py_INCREF(V15); >>>>> 00525 >>>>> 00526 >>>>> 00527 #define DTYPE_INPUT_0 npy_float16 >>>>> 00528 #define TYPENUM_INPUT_0 23 >>>>> 00529 #define ITEMSIZE_INPUT_0 2 >>>>> 00530 #define DTYPE_INPUT_1 npy_float16 >>>>> 00531 #define TYPENUM_INPUT_1 23 >>>>> 00532 #define ITEMSIZE_INPUT_1 2 >>>>> 00533 #define DTYPE_INPUT_2 npy_float16 >>>>> 00534 #define TYPENUM_INPUT_2 23 >>>>> 00535 #define ITEMSIZE_INPUT_2 2 >>>>> 00536 #define DTYPE_INPUT_4 npy_float16 >>>>> 00537 #define TYPENUM_INPUT_4 23 >>>>> 00538 #define ITEMSIZE_INPUT_4 2 >>>>> 00539 #define DTYPE_INPUT_5 npy_float16 >>>>> 00540 #define TYPENUM_INPUT_5 23 >>>>> 00541 #define ITEMSIZE_INPUT_5 2 >>>>> 00542 #define DTYPE_OUTPUT_0 npy_float16 >>>>> 00543 #define TYPENUM_OUTPUT_0 23 >>>>> 00544 #define ITEMSIZE_OUTPUT_0 2 >>>>> 00545 #define APPLY_SPECIFIC(str) >>>>> str##_node_86feacd077d8749f42b5d82709a80ba3_0 >>>>> 00546 #define CONV_INPLACE 1 >>>>> 00547 #define CONV_ALGO >>>>> CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM >>>>> 00548 #define FAIL { \ >>>>> 00549 if (!PyErr_Occurred()) { \ >>>>> 00550 PyErr_SetString(PyExc_RuntimeError, \ >>>>> 00551 "Unexpected error in an Op's C code. " \ >>>>> 00552 "No Python exception was set."); \ >>>>> 00553 } \ >>>>> 00554 return 17; \ >>>>> 00555 } >>>>> 00556 #define PARAMS V15 >>>>> 00557 >>>>> 00558 >>>>> 00559 { >>>>> 00560 // We need to keep a reference here to have it available in >>>>> the destructor. 
>>>>> 00561 ctx = PARAMS; >>>>> 00562 Py_INCREF(ctx); >>>>> 00563 >>>>> 00564 cuda_enter(PARAMS->ctx); >>>>> 00565 cudnnStatus_t err; >>>>> 00566 APPLY_SPECIFIC(_handle) = NULL; >>>>> 00567 if ((err = cudnnCreate(&APPLY_SPECIFIC(_handle))) != >>>>> CUDNN_STATUS_SUCCESS) { >>>>> 00568 PyErr_Format(PyExc_RuntimeError, "could not create cuDNN >>>>> handle: %s", >>>>> 00569 cudnnGetErrorString(err)); >>>>> 00570 cuda_exit(PARAMS->ctx); >>>>> 00571 FAIL; >>>>> 00572 } >>>>> 00573 if ((err = cudnnSetStream(APPLY_SPECIFIC(_handle), >>>>> 00574 cuda_get_stream(PARAMS->ctx))) != >>>>> CUDNN_STATUS_SUCCESS) { >>>>> 00575 PyErr_Format(PyExc_RuntimeError, "Could not set cudnn >>>>> stream: %s", >>>>> 00576 cudnnGetErrorString(err)); >>>>> 00577 cuda_exit(PARAMS->ctx); >>>>> 00578 FAIL; >>>>> 00579 } >>>>> 00580 cuda_exit(PARAMS->ctx); >>>>> 00581 } >>>>> 00582 >>>>> 00583 >>>>> 00584 >>>>> 00585 cudnnStatus_t APPLY_SPECIFIC(err); >>>>> 00586 APPLY_SPECIFIC(input) = NULL; >>>>> 00587 APPLY_SPECIFIC(output) = NULL; >>>>> 00588 APPLY_SPECIFIC(kerns) = NULL; >>>>> 00589 if ((APPLY_SPECIFIC(err) = >>>>> cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != >>>>> CUDNN_STATUS_SUCCESS) { >>>>> 00590 PyErr_Format(PyExc_MemoryError, "could not allocate tensor >>>>> descriptor " >>>>> 00591 "(inp): %s", >>>>> cudnnGetErrorString(APPLY_SPECIFIC(err))); >>>>> 00592 FAIL; >>>>> 00593 } >>>>> 00594 if ((APPLY_SPECIFIC(err) = >>>>> cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != >>>>> CUDNN_STATUS_SUCCESS) { >>>>> 00595 PyErr_Format(PyExc_MemoryError, "could not allocate tensor >>>>> descriptor " >>>>> 00596 "(out): %s", >>>>> cudnnGetErrorString(APPLY_SPECIFIC(err))); >>>>> 00597 FAIL; >>>>> 00598 } >>>>> 00599 if ((APPLY_SPECIFIC(err) = >>>>> cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns))) != >>>>> CUDNN_STATUS_SUCCESS) { >>>>> 00600 PyErr_Format(PyExc_MemoryError, "could not allocate filter >>>>> descriptor: %s", >>>>> 00601 cudnnGetErrorString(APPLY_SPECIFIC(err))); >>>>> 
00602 FAIL; >>>>> 00603 } >>>>> 00604 >>>>> 00605 >>>>> 00606 >>>>> 00607 #ifdef CHOOSE_ALGO >>>>> 00608 reuse_algo = 0; >>>>> 00609 prev_algo = CONV_ALGO; >>>>> 00610 #ifndef CHOOSE_ONCE >>>>> 00611 memset(prev_img_dims, 0, sizeof(prev_img_dims)); >>>>> 00612 memset(prev_kern_dims, 0, sizeof(prev_kern_dims)); >>>>> 00613 #endif >>>>> 00614 #endif >>>>> 00615 >>>>> 00616 >>>>> 00617 #undef FAIL >>>>> 00618 #undef PARAMS >>>>> 00619 #undef DTYPE_INPUT_0 >>>>> 00620 #undef TYPENUM_INPUT_0 >>>>> 00621 #undef ITEMSIZE_INPUT_0 >>>>> 00622 #undef DTYPE_INPUT_1 >>>>> 00623 #undef TYPENUM_INPUT_1 >>>>> 00624 #undef ITEMSIZE_INPUT_1 >>>>> 00625 #undef DTYPE_INPUT_2 >>>>> 00626 #undef TYPENUM_INPUT_2 >>>>> 00627 #undef ITEMSIZE_INPUT_2 >>>>> 00628 #undef DTYPE_INPUT_4 >>>>> 00629 #undef TYPENUM_INPUT_4 >>>>> 00630 #undef ITEMSIZE_INPUT_4 >>>>> 00631 #undef DTYPE_INPUT_5 >>>>> 00632 #undef TYPENUM_INPUT_5 >>>>> 00633 #undef ITEMSIZE_INPUT_5 >>>>> 00634 #undef DTYPE_OUTPUT_0 >>>>> 00635 #undef TYPENUM_OUTPUT_0 >>>>> 00636 #undef ITEMSIZE_OUTPUT_0 >>>>> 00637 #undef APPLY_SPECIFIC >>>>> 00638 #undef CONV_INPLACE >>>>> 00639 #undef CONV_ALGO >>>>> 00640 this->__ERROR = __ERROR; >>>>> 00641 return 0; >>>>> 00642 } >>>>> 00643 void cleanup(void) { >>>>> 00644 __label_1: >>>>> 00645 >>>>> 00646 double __DUMMY_1; >>>>> 00647 __label_3: >>>>> 00648 >>>>> 00649 double __DUMMY_3; >>>>> 00650 __label_5: >>>>> 00651 >>>>> 00652 double __DUMMY_5; >>>>> 00653 __label_7: >>>>> 00654 >>>>> 00655 double __DUMMY_7; >>>>> 00656 __label_9: >>>>> 00657 >>>>> 00658 double __DUMMY_9; >>>>> 00659 __label_11: >>>>> 00660 >>>>> 00661 double __DUMMY_11; >>>>> 00662 __label_13: >>>>> 00663 >>>>> 00664 double __DUMMY_13; >>>>> 00665 __label_15: >>>>> 00666 Py_XDECREF(V15); V15 = NULL; >>>>> 00667 {Py_XDECREF(py_V15);} >>>>> 00668 >>>>> 00669 double __DUMMY_15; >>>>> 00670 __label_18: >>>>> 00671 >>>>> 00672 #define DTYPE_INPUT_0 npy_float16 >>>>> 00673 #define TYPENUM_INPUT_0 23 >>>>> 00674 #define 
ITEMSIZE_INPUT_0 2 >>>>> 00675 #define DTYPE_INPUT_1 npy_float16 >>>>> 00676 #define TYPENUM_INPUT_1 23 >>>>> 00677 #define ITEMSIZE_INPUT_1 2 >>>>> 00678 #define DTYPE_INPUT_2 npy_float16 >>>>> 00679 #define TYPENUM_INPUT_2 23 >>>>> 00680 #define ITEMSIZE_INPUT_2 2 >>>>> 00681 #define DTYPE_INPUT_4 npy_float16 >>>>> 00682 #define TYPENUM_INPUT_4 23 >>>>> 00683 #define ITEMSIZE_INPUT_4 2 >>>>> 00684 #define DTYPE_INPUT_5 npy_float16 >>>>> 00685 #define TYPENUM_INPUT_5 23 >>>>> 00686 #define ITEMSIZE_INPUT_5 2 >>>>> 00687 #define DTYPE_OUTPUT_0 npy_float16 >>>>> 00688 #define TYPENUM_OUTPUT_0 23 >>>>> 00689 #define ITEMSIZE_OUTPUT_0 2 >>>>> 00690 #define APPLY_SPECIFIC(str) >>>>> str##_node_86feacd077d8749f42b5d82709a80ba3_0 >>>>> 00691 #define CONV_INPLACE 1 >>>>> 00692 #define CONV_ALGO >>>>> CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM >>>>> 00693 >>>>> 00694 >>>>> 00695 cuda_enter(ctx->ctx); >>>>> 00696 cudnnDestroy(APPLY_SPECIFIC(_handle)); >>>>> 00697 cuda_exit(ctx->ctx); >>>>> 00698 Py_DECREF((PyObject *)ctx); >>>>> 00699 >>>>> 00700 >>>>> 00701 if (APPLY_SPECIFIC(input) != NULL) >>>>> 00702 cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input)); >>>>> 00703 if (APPLY_SPECIFIC(output) != NULL) >>>>> 00704 cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); >>>>> 00705 if (APPLY_SPECIFIC(kerns) != NULL) >>>>> 00706 cudnnDestroyFilterDescriptor(APPLY_SPECIFIC(kerns)); >>>>> 00707 >>>>> 00708 #undef DTYPE_INPUT_0 >>>>> 00709 #undef TYPENUM_INPUT_0 >>>>> 00710 #undef ITEMSIZE_INPUT_0 >>>>> 00711 #undef DTYPE_INPUT_1 >>>>> 00712 #undef TYPENUM_INPUT_1 >>>>> 00713 #undef ITEMSIZE_INPUT_1 >>>>> 00714 #undef DTYPE_INPUT_2 >>>>> 00715 #undef TYPENUM_INPUT_2 >>>>> 00716 #undef ITEMSIZE_INPUT_2 >>>>> 00717 #undef DTYPE_INPUT_4 >>>>> 00718 #undef TYPENUM_INPUT_4 >>>>> 00719 #undef ITEMSIZE_INPUT_4 >>>>> 00720 #undef DTYPE_INPUT_5 >>>>> 00721 #undef TYPENUM_INPUT_5 >>>>> 00722 #undef ITEMSIZE_INPUT_5 >>>>> 00723 #undef DTYPE_OUTPUT_0 >>>>> 00724 #undef TYPENUM_OUTPUT_0 
>>>>> 00725 #undef ITEMSIZE_OUTPUT_0 >>>>> 00726 #undef APPLY_SPECIFIC >>>>> 00727 #undef CONV_INPLACE >>>>> 00728 #undef CONV_ALGO >>>>> 00729 double __DUMMY_18; >>>>> 00730 >>>>> 00731 Py_XDECREF(this->storage_V3); >>>>> 00732 Py_XDECREF(this->storage_V5); >>>>> 00733 Py_XDECREF(this->storage_V7); >>>>> 00734 Py_XDECREF(this->storage_V9); >>>>> 00735 Py_XDECREF(this->storage_V11); >>>>> 00736 Py_XDECREF(this->storage_V13); >>>>> 00737 Py_XDECREF(this->storage_V1); >>>>> 00738 Py_XDECREF(this->storage_V15); >>>>> 00739 } >>>>> 00740 int run(void) { >>>>> 00741 int __failure = 0; >>>>> 00742 >>>>> 00743 PyObject* py_V1; >>>>> 00744 >>>>> 00745 PyGpuArrayObject *V1; >>>>> 00746 >>>>> 00747 PyObject* py_V3; >>>>> 00748 >>>>> 00749 PyGpuArrayObject *V3; >>>>> 00750 >>>>> 00751 PyObject* py_V5; >>>>> 00752 >>>>> 00753 PyGpuArrayObject *V5; >>>>> 00754 >>>>> 00755 PyObject* py_V7; >>>>> 00756 >>>>> 00757 PyGpuArrayObject *V7; >>>>> 00758 >>>>> 00759 PyObject* py_V9; >>>>> 00760 >>>>> 00761 cudnnConvolutionDescriptor_t V9; >>>>> 00762 >>>>> 00763 PyObject* py_V11; >>>>> 00764 >>>>> 00765 typedef npy_float16 V11_dtype; // Deprecated >>>>> use dtype_V11 instead. >>>>> 00766 typedef npy_float16 dtype_V11; >>>>> 00767 >>>>> 00768 npy_float16 V11; >>>>> 00769 >>>>> 00770 PyObject* py_V13; >>>>> 00771 >>>>> 00772 typedef npy_float16 V13_dtype; // Deprecated >>>>> use dtype_V13 instead. 
>>>>> 00773 typedef npy_float16 dtype_V13; >>>>> 00774 >>>>> 00775 npy_float16 V13; >>>>> 00776 >>>>> 00777 { >>>>> 00778 >>>>> 00779 py_V1 = PyList_GET_ITEM(storage_V1, 0); >>>>> 00780 {Py_XINCREF(py_V1);} >>>>> 00781 >>>>> 00782 if (py_V1 == Py_None) >>>>> 00783 { >>>>> 00784 V1 = NULL; >>>>> 00785 } >>>>> 00786 else >>>>> 00787 { >>>>> 00788 >>>>> 00789 V1 = NULL; >>>>> 00790 if (py_V1 == Py_None) { >>>>> 00791 PyErr_SetString(PyExc_ValueError, "expected a >>>>> GpuArray, not None"); >>>>> 00792 { >>>>> 00793 __failure = 2; >>>>> 00794 if (!PyErr_Occurred()) { >>>>> 00795 PyErr_SetString(PyExc_RuntimeError, >>>>> 00796 "Unexpected error in an Op's C code. " >>>>> 00797 "No Python exception was set."); >>>>> 00798 } >>>>> 00799 goto __label_2;} >>>>> 00800 } >>>>> 00801 /* First check if we are the base type exactly (the >>>>> most common case), >>>>> 00802 then do the full subclass check if needed. */ >>>>> 00803 if (py_V1->ob_type != &PyGpuArrayType && >>>>> 00804 !PyObject_TypeCheck(py_V1, &PyGpuArrayType)) { >>>>> 00805 PyE >>>>> >>>> -- >>>> >>>> --- >>>> You received this message because you are subscribed to the Google >>>> Groups "theano-users" group. >>>> To unsubscribe from this group and stop receiving emails from it, send >>>> an email to [email protected]. >>>> >>>> For more options, visit https://groups.google.com/d/optout. >>>> >>> >>> -- >> >> --- >> You received this message because you are subscribed to the Google Groups >> "theano-users" group. >> To unsubscribe from this group and stop receiving emails from it, send an >> email to [email protected] <javascript:>. >> For more options, visit https://groups.google.com/d/optout. >> > >
-- --- You received this message because you are subscribed to the Google Groups "theano-users" group. To unsubscribe from this group and stop receiving emails from it, send an email to [email protected]. For more options, visit https://groups.google.com/d/optout.
