Can you let us know which compiler (and which version) you are using?
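
For instance, something along these lines should tell us what we need (a
minimal sketch, assuming the compiler Theano uses is on your PATH):

    import subprocess
    import theano

    print(theano.__version__)    # Theano version
    print(theano.config.cxx)     # the C++ compiler Theano invokes (usually g++)
    # ask that same compiler for its own version string
    print(subprocess.check_output([theano.config.cxx, "--version"]))

The cuDNN version (CUDNN_VERSION in cudnn.h) would help too.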

On Thu, Sep 22, 2016, D. Kit wrote:
> Hello,
> 
> I'm attempting to use conv3D2D in Theano, but when I get to the function 
> compilation stage it fails with the following errors:
> 
> mod.cpp:684:1: error: expected unqualified-id before '{' token
> 
> mod.cpp:1345:1: error: expected '}' at end of input
> mod.cpp: In destructor '{anonymous}::__struct_compiled_op_d7db26d1cc9f
> 
> mod.cpp:573:21: error: 'cleanup' was not declared in this scope
> mod.cpp: At global scope:
> mod.cpp:678:1: error: expected unqualified-id at end of input
>  #undef ITEMSIZE_INPUT_2
>  ^
> mod.cpp:678:1: error: expected '}' at end of input
> 
> I'm attaching the mod.cpp file that was generated by cc.py, op.py, dnn.py,
> etc. The initialization code from dnn_base.c starts at line 684, but no
> function name is defined there, so the code sits outside any enclosing
> block. The "#section cleanup_code_struct" section also appears to have
> been inserted in the wrong spot.
> 
> Can someone tell me how to fix this?
> 
> --Dmitry
> 

> #include <Python.h>
> #include <iostream>
> #include "theano_mod_helper.h"
> #include <gpuarray/array.h>
> #include <gpuarray/kernel.h>
> #include <gpuarray/error.h>
> #include <gpuarray/buffer.h>
> #include <gpuarray/buffer_blas.h>
> #include <numpy/arrayobject.h>
> #include <gpuarray_api.h>
> #include <math.h>
> #include <numpy/arrayscalars.h>
> #include "cudnn.h"
> #include "cudnn_helper.h"
> #include "gpuarray_helper.h"
> #include "gpuarray/types.h"
> #include "gpuarray/array.h"
> #include "gpuarray/util.h"
> #include "gpuarray/ext_cuda.h"
> #include "gpuarray_api.h"
> #include "numpy_compat.h"
> //////////////////////
> ////  Support Code
> //////////////////////
> 
> void _capsule_destructor(PyObject *o) {
>     void *d = PyCapsule_GetContext(o);
>     void *p = PyCapsule_GetPointer(o, NULL);
>     void (*f)(void *) = (void (*)(void *))d;
>     if (f != NULL) f(p);
> }
> 
> 
> 
> 
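> /* Fill a cudnnTensorDescriptor_t from a PyGpuArrayObject: map the gpuarray
>  * typecode to a cuDNN data type, convert byte strides to element strides
>  * (substituting the C-contiguous stride for 0-stride broadcast dims), and
>  * pad everything to at least 3 dimensions, as cuDNN requires. */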
> static int
> c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
>   cudnnDataType_t dt;
>   size_t ds;
>   switch (var->ga.typecode) {
>   case GA_FLOAT:
>     dt = CUDNN_DATA_FLOAT;
>     break;
>   case GA_DOUBLE:
>     dt = CUDNN_DATA_DOUBLE;
>     break;
>   case GA_HALF:
>     dt = CUDNN_DATA_HALF;
>     break;
>   default:
>     PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensorNd");
>     return -1;
>   }
>   ds = gpuarray_get_elsize(var->ga.typecode);
> 
>   int strs[8], dims[8], default_stride = 1;
>   unsigned int nd = PyGpuArray_NDIM(var);
> 
>   if (nd > 8) {
>     PyErr_SetString(PyExc_TypeError, "Tensor of more than 8d");
>     return -1;
>   }
> 
>   for (unsigned int _i = nd; _i > 0; _i--) {
>     unsigned int i = _i - 1;
>     strs[i] = PyGpuArray_STRIDE(var, i) ?
>       PyGpuArray_STRIDE(var, i)/ds : default_stride;
>     default_stride *= PyGpuArray_DIM(var, i);
>     dims[i] = PyGpuArray_DIM(var, i);
>   }
> 
>   /* Tensors can't be smaller than 3d for cudnn so we pad the
>    * descriptor if they are */
>   for (unsigned int i = nd; i < 3; i++) {
>     strs[i] = 1;
>     dims[i] = 1;
>   }
> 
>   cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, dt, nd < 3 ? 3 : nd,
>                                                  dims, strs);
>   if (err != CUDNN_STATUS_SUCCESS) {
>     PyErr_Format(PyExc_RuntimeError,
>                "Could not set tensorNd descriptor: %s",
>                cudnnGetErrorString(err));
>     return -1;
>   }
>   return 0;
> }
> 
> static int c_make_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t *desc) {
>   cudnnStatus_t err;
>   err = cudnnCreateTensorDescriptor(desc);
>   if (err != CUDNN_STATUS_SUCCESS) {
>     PyErr_Format(PyExc_RuntimeError,
>                  "Could not create tensor descriptor: %s",
>                  cudnnGetErrorString(err));
>     return -1;
>   }
>   if (c_set_tensorNd(var, *desc) != 0) {
>     cudnnDestroyTensorDescriptor(*desc);
>     return -1;
>   }
>   return 0;
> }
> 
> static int
> c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
>   cudnnDataType_t dt;
>   cudnnStatus_t err;
> 
>   if (!GpuArray_IS_C_CONTIGUOUS(&var->ga)) {
>     PyErr_SetString(PyExc_ValueError,
>                   "Only contiguous filters (kernels) are supported.");
>     return -1;
>   }
>   switch (var->ga.typecode) {
>   case GA_FLOAT:
>     dt = CUDNN_DATA_FLOAT;
>     break;
>   case GA_DOUBLE:
>     dt = CUDNN_DATA_DOUBLE;
>     break;
>   case GA_HALF:
>     dt = CUDNN_DATA_HALF;
>     break;
>   default:
>     PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_filter");
>     return -1;
>   }
> 
>   int dims[8];
>   unsigned int nd = PyGpuArray_NDIM(var);
> 
>   if (nd > 8) {
>     PyErr_SetString(PyExc_TypeError, "Tensor of more than 8d");
>     return -1;
>   }
> 
>   for (unsigned int _i = nd; _i > 0; _i--) {
>     unsigned int i = _i - 1;
>     dims[i] = PyGpuArray_DIM(var, i);
>   }
> 
>   /* Filters can't be less than 3d so we pad */
>   for (unsigned int i = nd; i < 3; i++)
>     dims[i] = 1;
> 
>   if (nd < 3)
>     nd = 3;
> 
> #if CUDNN_VERSION >= 5000
>     err = cudnnSetFilterNdDescriptor(desc, dt, CUDNN_TENSOR_NCHW, nd, dims);
> #else
>     err = cudnnSetFilterNdDescriptor(desc, dt, nd, dims);
> #endif
> 
>   if (err != CUDNN_STATUS_SUCCESS) {
>     PyErr_Format(PyExc_RuntimeError,
>                "Could not set filter descriptor: %s.",
>                cudnnGetErrorString(err));
>     return -1;
>   }
>   return 0;
> }
> 
> static int c_make_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t *desc) {
>   cudnnStatus_t err;
>   err = cudnnCreateFilterDescriptor(desc);
>   if (err != CUDNN_STATUS_SUCCESS) {
>     PyErr_Format(PyExc_RuntimeError,
>                  "Could not create filter descriptor: %s",
>                  cudnnGetErrorString(err));
>     return -1;
>   }
>   if (c_set_filter(var, *desc) != 0) {
>     cudnnDestroyFilterDescriptor(*desc);
>     return -1;
>   }
>   return 0;
> }
> 
> 
> 
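> /* Generated struct wrapping the compiled Op: init() takes ownership of the
>  * storage cells and creates the cuDNN handle and descriptors, run() pulls
>  * the inputs out of storage, calls conv_fwd() and stores the output, and
>  * cleanup() (also called from the destructor) releases everything. */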
>     namespace {
>     struct __struct_compiled_op_d7db26d1cc9fb59ee56289f4dda72577 {
>         PyObject* __ERROR;
> 
>         PyObject* storage_V3;
> PyObject* storage_V5;
> PyObject* storage_V7;
> PyObject* storage_V9;
> PyObject* storage_V11;
> PyObject* storage_V13;
> PyObject* storage_V1;
> PyObject* storage_V15;
>         
>     PyObject* py_V15;
>     PyGpuContextObject *V15;
> 
> #define DTYPE_INPUT_0 npy_float32
> 
> #define TYPENUM_INPUT_0 11
> 
> #define ITEMSIZE_INPUT_0 4
> 
> #define DTYPE_INPUT_1 npy_float32
> 
> #define TYPENUM_INPUT_1 11
> 
> #define ITEMSIZE_INPUT_1 4
> 
> #define DTYPE_INPUT_2 npy_float32
> 
> #define TYPENUM_INPUT_2 11
> 
> #define ITEMSIZE_INPUT_2 4
> 
> #define DTYPE_INPUT_4 npy_float32
> 
> #define TYPENUM_INPUT_4 11
> 
> #define ITEMSIZE_INPUT_4 4
> 
> #define DTYPE_INPUT_5 npy_float32
> 
> #define TYPENUM_INPUT_5 11
> 
> #define ITEMSIZE_INPUT_5 4
> 
> #define DTYPE_OUTPUT_0 npy_float32
> 
> #define TYPENUM_OUTPUT_0 11
> 
> #define ITEMSIZE_OUTPUT_0 4
> 
> #define APPLY_SPECIFIC(str) str##_node_d7db26d1cc9fb59ee56289f4dda72577_0
> 
> #define CONV_INPLACE 1
> 
> #define CONV_ALGO CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
> 
> 
> 
> PyGpuContextObject *ctx;
> cudnnHandle_t APPLY_SPECIFIC(_handle);
> 
> 
> cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
> cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
> cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns);
> 
> 
> 
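> /* Present only when algorithm selection is requested at compile time:
>  * caches the algorithm that was chosen (and, unless it is chosen once and
>  * for all, the shapes it was chosen for) so it can be reused. */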
> #ifdef CHOOSE_ALGO
> int reuse_algo;
> cudnnConvolutionFwdAlgo_t prev_algo;
> #ifndef CHOOSE_ONCE
> size_t prev_img_dims[5];
> size_t prev_kern_dims[5];
> #endif
> #endif
> 
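> /* Forward convolution: fill the cached cuDNN descriptors from the input,
>  * kernel and output arrays, pick or validate an algorithm, size and
>  * allocate the workspace, then run cudnnConvolutionForward(). Returns 0
>  * on success, 1 with a Python exception set on failure. */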
> int
> APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
>                          PyGpuArrayObject *om,
>                          cudnnConvolutionDescriptor_t desc,
>                          double alpha, double beta,
>                          PyGpuArrayObject **output,
>                          PyGpuContextObject *c) {
>   cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
>   float af = alpha, bf = beta;
>   void *alpha_p;
>   void *beta_p;
> 
>   if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
>     PyErr_SetString(PyExc_ValueError,
>                   "images and kernel must have the same stack size");
>     return 1;
>   }
> 
>   if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
>     return 1;
>   if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
>     return 1;
> 
>   switch (input->ga.typecode) {
>   case GA_DOUBLE:
>     alpha_p = (void *)&alpha;
>     beta_p = (void *)&beta;
>     break;
>   case GA_FLOAT:
>   case GA_HALF:
>     alpha_p = (void *)&af;
>     beta_p = (void *)&bf;
>     break;
>   default:
>     PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
>     return 1;
>   }
> 
> #ifdef CONV_INPLACE
>   Py_XDECREF(*output);
>   *output = om;
>   Py_INCREF(*output);
> #else
>   if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
>                          om->ga.typecode, GA_C_ORDER, c) != 0)
>     return 1;
>   if (beta != 0.0 && pygpu_move(*output, om))
>     return 1;
> #endif
> 
>   if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1)
>     return 1;
> 
>   cudnnConvolutionFwdAlgo_t algo = CONV_ALGO;
> 
>   cuda_enter(c->ctx);
> #ifdef CHOOSE_ALGO
> #ifndef CHOOSE_ONCE
>   reuse_algo = 1;
>   for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
>     reuse_algo = (reuse_algo &&
>                   PyGpuArray_DIM(input, i) == prev_img_dims[i]);
>     reuse_algo = (reuse_algo &&
>                   PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
>   }
> #endif
> 
>   if (!reuse_algo) {
> #ifdef CHOOSE_TIME
>     int count;
>     cudnnConvolutionFwdAlgoPerf_t choice;
>     err = cudnnFindConvolutionForwardAlgorithm(
>       APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
>       desc, APPLY_SPECIFIC(output), 1, &count, &choice);
> 
>     if (err != CUDNN_STATUS_SUCCESS) {
>       PyErr_Format(PyExc_RuntimeError,
>                    "error selecting convolution algo: %s",
>                    cudnnGetErrorString(err));
>       cuda_exit(c->ctx);
>       return 1;
>     }
>     algo = choice.algo;
> #else
>     size_t free;
>     int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_FREE_GMEM, &free);
> 
>     if (err2 != GA_NO_ERROR) {
>       PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
>                    "memory information on the GPU");
>       cuda_exit(c->ctx);
>       return 1;
>     }
> 
>     err = cudnnGetConvolutionForwardAlgorithm(
>       APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
>       desc, APPLY_SPECIFIC(output),
>       CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
>     if (err != CUDNN_STATUS_SUCCESS) {
>       PyErr_Format(PyExc_RuntimeError,
>                    "error selecting convolution algo: %s",
>                    cudnnGetErrorString(err));
>       cuda_exit(c->ctx);
>       return 1;
>     }
> #endif
>     prev_algo = algo;
>   } else {
>     algo = prev_algo;
>   }
> 
> #ifdef CHOOSE_ONCE
>   reuse_algo = 1;
> #else
>   for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
>     prev_img_dims[i] = PyGpuArray_DIM(input, i);
>     prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
>   }
> #endif
> 
> #endif
> 
>   /* These two algos are not supported for 3d conv */
>   if (PyGpuArray_NDIM(input) == 5 &&
>       (algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
>        algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM))
>     algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
> 
>   // The FFT implementation does not support strides, 1x1 filters or inputs
>   // with a spatial dimension larger than 1024. The tiled-FFT implementation
>   // does not support strides.
>   // If the chosen implementation is FFT or tiled-FFT, validate that it can
>   // be used on the current data and default to a safe implementation if it
>   // can't.
>   // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
>   // defined only for 2d filters
>   if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
>        algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) &&
>       PyGpuArray_NDIM(input) == 4) {
> 
>     // Extract the properties of the convolution descriptor
>     int nd;
>     int pad[2];
>     int stride[2];
>     int upscale[2];
>     cudnnConvolutionMode_t mode;
>     cudnnDataType_t data_type;
>     err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
>                                              upscale, &mode, &data_type);
>     if (err != CUDNN_STATUS_SUCCESS) {
>       PyErr_Format(PyExc_RuntimeError,
>                    "error getting convolution properties: %s",
>                    cudnnGetErrorString(err));
>       cuda_exit(c->ctx);
>       return 1;
>     }
> 
>     if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT)
>     {
>       if (stride[0] != 1 || stride[1] != 1 ||
>           PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
>           (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
>       {
>         algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
>       }
>     }
>     else
>     {
>       // algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
>       if (stride[0] != 1 || stride[1] != 1)
>       {
>         algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
>       }
>     }
>   }
> 
>   {
>     size_t worksize;
>     gpudata *workspace;
>     err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
>                                                   APPLY_SPECIFIC(input),
>                                                   APPLY_SPECIFIC(kerns),
>                                                   desc,
>                                                   APPLY_SPECIFIC(output),
>                                                   algo,
>                                                   &worksize);
> 
>     if (err == CUDNN_STATUS_NOT_SUPPORTED) {
>       // Fallback to none algo if not supported
>       // TODO: Print a warning
>       algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
> 
>       err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
>                                                     APPLY_SPECIFIC(input),
>                                                     APPLY_SPECIFIC(kerns),
>                                                     desc,
>                                                     APPLY_SPECIFIC(output),
>                                                     algo,
>                                                     &worksize);
>     }
> 
>     if (err != CUDNN_STATUS_SUCCESS) {
>       PyErr_Format(PyExc_RuntimeError,
>                    "error getting worksize: %s",
>                    cudnnGetErrorString(err));
>       cuda_exit(c->ctx);
>       return 1;
>     }
> 
>     /*
>      * This is less than ideal since we need to free it after (which
>      * introduces a synchronization point. But we don't have a module
>      * to place a nice get_work_mem() function in.
>      */
>     if (worksize != 0) {
>       workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
>       if (workspace == NULL) {
>         PyErr_SetString(PyExc_RuntimeError,
>                         "Could not allocate working memory");
>         cuda_exit(c->ctx);
>         return 1;
>       }
>     }
> 
>     cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
>     cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
>     cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
> 
>     err = cudnnConvolutionForward(
>       APPLY_SPECIFIC(_handle),
>       alpha_p,
>       APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
>       APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
>       desc, algo,
>       worksize == 0 ? NULL : *(void **)workspace, worksize,
>       beta_p,
>       APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output));
> 
>     if (worksize != 0)
>       gpudata_release(workspace);
> 
>     cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
>     cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
>     cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
>   }
>   cuda_exit(c->ctx);
> 
>   if (err != CUDNN_STATUS_SUCCESS) {
>     PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
>                cudnnGetErrorString(err));
>     return 1;
>   }
>   return 0;
> }
> 
> 
> #undef DTYPE_INPUT_0
> 
> #undef TYPENUM_INPUT_0
> 
> #undef ITEMSIZE_INPUT_0
> 
> #undef DTYPE_INPUT_1
> 
> #undef TYPENUM_INPUT_1
> 
> #undef ITEMSIZE_INPUT_1
> 
> #undef DTYPE_INPUT_2
> 
> #undef TYPENUM_INPUT_2
> 
> #undef ITEMSIZE_INPUT_2
> 
> #undef DTYPE_INPUT_4
> 
> #undef TYPENUM_INPUT_4
> 
> #undef ITEMSIZE_INPUT_4
> 
> #undef DTYPE_INPUT_5
> 
> #undef TYPENUM_INPUT_5
> 
> #undef ITEMSIZE_INPUT_5
> 
> #undef DTYPE_OUTPUT_0
> 
> #undef TYPENUM_OUTPUT_0
> 
> #undef ITEMSIZE_OUTPUT_0
> 
> #undef APPLY_SPECIFIC
> 
> #undef CONV_INPLACE
> 
> #undef CONV_ALGO
> 
>         __struct_compiled_op_d7db26d1cc9fb59ee56289f4dda72577() {
>             // This is only somewhat safe because we:
>             //  1) Are not a virtual class
>             //  2) Do not use any virtual classes in the members
>             //  3) Deal with mostly POD and pointers
> 
>             // If this changes, we would have to revise this, but for
>             // now I am tired of chasing segfaults because
>             // initialization code had an error and some pointer has
>             // a junk value.
>             memset(this, 0, sizeof(*this));
>         }
>         ~__struct_compiled_op_d7db26d1cc9fb59ee56289f4dda72577(void) {
>             cleanup();
>         }
> 
>         int init(PyObject* __ERROR, PyObject* storage_V3, PyObject* storage_V5,
>                  PyObject* storage_V7, PyObject* storage_V9, PyObject* storage_V11,
>                  PyObject* storage_V13, PyObject* storage_V1, PyObject* storage_V15) {
>             Py_XINCREF(storage_V3);
> Py_XINCREF(storage_V5);
> Py_XINCREF(storage_V7);
> Py_XINCREF(storage_V9);
> Py_XINCREF(storage_V11);
> Py_XINCREF(storage_V13);
> Py_XINCREF(storage_V1);
> Py_XINCREF(storage_V15);
>             this->storage_V3 = storage_V3;
> this->storage_V5 = storage_V5;
> this->storage_V7 = storage_V7;
> this->storage_V9 = storage_V9;
> this->storage_V11 = storage_V11;
> this->storage_V13 = storage_V13;
> this->storage_V1 = storage_V1;
> this->storage_V15 = storage_V15;
>             
> 
> 
> 
> 
> 
> 
> 
> 
>     py_V15 = PyList_GET_ITEM(storage_V15, 0);
>     {Py_XINCREF(py_V15);}
>     
> if (!PyObject_TypeCheck(py_V15, &PyGpuContextType)) {
>   PyErr_SetString(PyExc_TypeError, "expected a GpuContext");
>   {
>         if (!PyErr_Occurred()) {
>             PyErr_SetString(PyExc_RuntimeError,
>                 "Unexpected error in an Op's C code. "
>                 "No Python exception was set.");
>             }
>         return 15;
> }
> }
> 
> V15 = (PyGpuContextObject *)py_V15;
> Py_INCREF(V15);
> 
> 
> 
> #define DTYPE_INPUT_0 npy_float32
> 
> #define TYPENUM_INPUT_0 11
> 
> #define ITEMSIZE_INPUT_0 4
> 
> #define DTYPE_INPUT_1 npy_float32
> 
> #define TYPENUM_INPUT_1 11
> 
> #define ITEMSIZE_INPUT_1 4
> 
> #define DTYPE_INPUT_2 npy_float32
> 
> #define TYPENUM_INPUT_2 11
> 
> #define ITEMSIZE_INPUT_2 4
> 
> #define DTYPE_INPUT_4 npy_float32
> 
> #define TYPENUM_INPUT_4 11
> 
> #define ITEMSIZE_INPUT_4 4
> 
> #define DTYPE_INPUT_5 npy_float32
> 
> #define TYPENUM_INPUT_5 11
> 
> #define ITEMSIZE_INPUT_5 4
> 
> #define DTYPE_OUTPUT_0 npy_float32
> 
> #define TYPENUM_OUTPUT_0 11
> 
> #define ITEMSIZE_OUTPUT_0 4
> 
> #define APPLY_SPECIFIC(str) str##_node_d7db26d1cc9fb59ee56289f4dda72577_0
> 
> #define CONV_INPLACE 1
> 
> #define CONV_ALGO CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
> 
> #define FAIL { \
>         if (!PyErr_Occurred()) { \
>             PyErr_SetString(PyExc_RuntimeError, \
>                 "Unexpected error in an Op's C code. " \
>                 "No Python exception was set."); \
>             } \
>         return 17; \
> }
> 
> #define PARAMS V15
> 
> 
> 
> {
>   // We need to keep a reference here to have it available in the destructor.
>   ctx = PARAMS;
>   Py_INCREF(ctx);
> 
>   cuda_enter(PARAMS->ctx);
>   cudnnStatus_t err;
>   APPLY_SPECIFIC(_handle) = NULL;
>   if ((err = cudnnCreate(&APPLY_SPECIFIC(_handle))) != CUDNN_STATUS_SUCCESS) {
>     PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s",
>                  cudnnGetErrorString(err));
>     cuda_exit(PARAMS->ctx);
>     FAIL;
>   }
>   if ((err = cudnnSetStream(APPLY_SPECIFIC(_handle),
>                             cuda_get_stream(PARAMS->ctx))) != CUDNN_STATUS_SUCCESS) {
>     PyErr_Format(PyExc_RuntimeError, "Could not set cudnn stream: %s",
>                  cudnnGetErrorString(err));
>     cuda_exit(PARAMS->ctx);
>     FAIL;
>   }
>   cuda_exit(PARAMS->ctx);
> }
> 
> 
> 
> cudnnStatus_t APPLY_SPECIFIC(err);
> APPLY_SPECIFIC(input) = NULL;
> APPLY_SPECIFIC(output) = NULL;
> APPLY_SPECIFIC(kerns) = NULL;
> if ((APPLY_SPECIFIC(err) =
>      cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
>   PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
>              "(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
>   FAIL;
> }
> if ((APPLY_SPECIFIC(err) =
>      cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
>   PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
>                "(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
>   FAIL;
> }
> if ((APPLY_SPECIFIC(err) =
>      cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns))) != CUDNN_STATUS_SUCCESS) {
>   PyErr_Format(PyExc_MemoryError, "could not allocate filter descriptor: %s", 
>              cudnnGetErrorString(APPLY_SPECIFIC(err)));
>   FAIL;
> }
> 
> 
> 
> #ifdef CHOOSE_ALGO
> reuse_algo = 0;
> prev_algo = CONV_ALGO;
> #ifndef CHOOSE_ONCE
> memset(prev_img_dims, 0, sizeof(prev_img_dims));
> memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
> #endif
> #endif
> 
> 
> 
> #undef FAIL
> 
> #undef PARAMS
> 
> #undef DTYPE_INPUT_0
> 
> #undef TYPENUM_INPUT_0
> 
> #undef ITEMSIZE_INPUT_0
> 
> #undef DTYPE_INPUT_1
> 
> #undef TYPENUM_INPUT_1
> 
> #undef ITEMSIZE_INPUT_1
> 
> #undef DTYPE_INPUT_2
> 
> #undef TYPENUM_INPUT_2
> 
> #undef ITEMSIZE_INPUT_2
> 
> #undef DTYPE_INPUT_4
> 
> #undef TYPENUM_INPUT_4
> 
> #undef ITEMSIZE_INPUT_4
> 
> #undef DTYPE_INPUT_5
> 
> #undef TYPENUM_INPUT_5
> 
> #undef ITEMSIZE_INPUT_5
> 
> #undef DTYPE_OUTPUT_0
> 
> #undef TYPENUM_OUTPUT_0
> 
> #undef ITEMSIZE_OUTPUT_0
> 
> #undef APPLY_SPECIFIC
> 
> #undef CONV_INPLACE
> 
> #undef CONV_ALGO
>             this->__ERROR = __ERROR;
>             return 0;
>         }
>         void cleanup(void) {
>             __label_1:
> 
> double __DUMMY_1;
> __label_3:
> 
> double __DUMMY_3;
> __label_5:
> 
> double __DUMMY_5;
> __label_7:
> 
> double __DUMMY_7;
> __label_9:
> 
> double __DUMMY_9;
> __label_11:
> 
> double __DUMMY_11;
> __label_13:
> 
> double __DUMMY_13;
> __label_15:
> Py_XDECREF(V15); V15 = NULL;
>     {Py_XDECREF(py_V15);}
>     
> double __DUMMY_15;
> __label_18:
> 
> 
> #define DTYPE_INPUT_0 npy_float32
> 
> #define TYPENUM_INPUT_0 11
> 
> #define ITEMSIZE_INPUT_0 4
> 
> #define DTYPE_INPUT_1 npy_float32
> 
> #define TYPENUM_INPUT_1 11
> 
> #define ITEMSIZE_INPUT_1 4
> 
> #define DTYPE_INPUT_2 npy_float32
> 
> #define TYPENUM_INPUT_2 11
> 
> #define ITEMSIZE_INPUT_2 4
> 
> #define DTYPE_INPUT_4 npy_float32
> 
> #define TYPENUM_INPUT_4 11
> 
> #define ITEMSIZE_INPUT_4 4
> 
> #define DTYPE_INPUT_5 npy_float32
> 
> #define TYPENUM_INPUT_5 11
> 
> #define ITEMSIZE_INPUT_5 4
> 
> #define DTYPE_OUTPUT_0 npy_float32
> 
> #define TYPENUM_OUTPUT_0 11
> 
> #define ITEMSIZE_OUTPUT_0 4
> 
> #define APPLY_SPECIFIC(str) str##_node_d7db26d1cc9fb59ee56289f4dda72577_0
> 
> #define CONV_INPLACE 1
> 
> #define CONV_ALGO CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
> 
> 
> 
> cuda_enter(ctx->ctx);
> cudnnDestroy(APPLY_SPECIFIC(_handle));
> cuda_exit(ctx->ctx);
> Py_DECREF((PyObject *)ctx);
> 
> 
> if (APPLY_SPECIFIC(input) != NULL)
>   cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input));
> if (APPLY_SPECIFIC(output) != NULL)
>   cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output));
> if (APPLY_SPECIFIC(kerns) != NULL)
>   cudnnDestroyFilterDescriptor(APPLY_SPECIFIC(kerns));
> 
> 
> #undef DTYPE_INPUT_0
> 
> #undef TYPENUM_INPUT_0
> 
> #undef ITEMSIZE_INPUT_0
> 
> #undef DTYPE_INPUT_1
> 
> #undef TYPENUM_INPUT_1
> 
> #undef ITEMSIZE_INPUT_1
> 
> #undef DTYPE_INPUT_2
> 
> #undef TYPENUM_INPUT_2
> 
> #undef ITEMSIZE_INPUT_2
> 
> #undef DTYPE_INPUT_4
> 
> #undef TYPENUM_INPUT_4
> 
> #undef ITEMSIZE_INPUT_4
> 
> #undef DTYPE_INPUT_5
> 
> #undef TYPENUM_INPUT_5
> 
> #undef ITEMSIZE_INPUT_5
> 
> #undef DTYPE_OUTPUT_0
> 
> #undef TYPENUM_OUTPUT_0
> 
> #undef ITEMSIZE_OUTPUT_0
> 
> #undef APPLY_SPECIFIC
> 
> #undef CONV_INPLACE
> 
> #undef CONV_ALGO
> double __DUMMY_18;
> 
>             Py_XDECREF(this->storage_V3);
> Py_XDECREF(this->storage_V5);
> Py_XDECREF(this->storage_V7);
> Py_XDECREF(this->storage_V9);
> Py_XDECREF(this->storage_V11);
> Py_XDECREF(this->storage_V13);
> Py_XDECREF(this->storage_V1);
> Py_XDECREF(this->storage_V15);
>         }
>         int run(void) {
>             int __failure = 0;
>             
>     PyObject* py_V1;
>     
>         PyGpuArrayObject *V1;
>         
>     PyObject* py_V3;
>     
>         PyGpuArrayObject *V3;
>         
>     PyObject* py_V5;
>     
>         PyGpuArrayObject *V5;
>         
>     PyObject* py_V7;
>     
>         PyGpuArrayObject *V7;
>         
>     PyObject* py_V9;
>     
>         cudnnConvolutionDescriptor_t V9;
>         
>     PyObject* py_V11;
>     
>                 typedef npy_float32 V11_dtype; // Deprecated: use dtype_V11 instead.
>                 typedef npy_float32 dtype_V11;
>             
>         npy_float32 V11;
>         
>     PyObject* py_V13;
>     
>                 typedef npy_float32 V13_dtype; // Deprecated: use dtype_V13 instead.
>                 typedef npy_float32 dtype_V13;
>             
>         npy_float32 V13;
>         
> {
> 
>     py_V1 = PyList_GET_ITEM(storage_V1, 0);
>     {Py_XINCREF(py_V1);}
>     
>         if (py_V1 == Py_None)
>         {
>             V1 = NULL;
>         }
>         else
>         {
>             
>         V1 = NULL;
>         if (py_V1 == Py_None) {
>             PyErr_SetString(PyExc_ValueError, "expected a GpuArray, not 
> None");
>             {
>         __failure = 2;
>         if (!PyErr_Occurred()) {
>             PyErr_SetString(PyExc_RuntimeError,
>                 "Unexpected error in an Op's C code. "
>                 "No Python exception was set.");
>             }
>         goto __label_2;}
>         }
>         /* First check if we are the base type exactly (the most common case),
>            then do the full subclass check if needed. */
>         if (py_V1->ob_type != &PyGpuArrayType &&
>             !PyObject_TypeCheck(py_V1, &PyGpuArrayType)) {
>             PyErr_SetString(PyExc_ValueError, "expected a GpuArray");
>             {
>         __failure = 2;
>         if (!PyErr_Occurred()) {
>             PyErr_SetString(PyExc_RuntimeError,
>                 "Unexpected error in an Op's C code. "
>                 "No Python exception was set.");
>             }
>         goto __label_2;}
>         }
>         V1 = (PyGpuArrayObject *)py_V1;
>         Py_INCREF(V1);
>         
>         }
>         
> {
> 
>     py_V3 = PyList_GET_ITEM(storage_V3, 0);
>     {Py_XINCREF(py_V3);}
>     
>         V3 = NULL;
>         if (py_V3 == Py_None) {
>             PyErr_SetString(PyExc_ValueError, "expected a GpuArray, not 
> None");
>             {
>         __failure = 4;
>         if (!PyErr_Occurred()) {
>             PyErr_SetString(PyExc_RuntimeError,
>                 "Unexpected error in an Op's C code. "
>                 "No Python exception was set.");
>             }
>         goto __label_4;}
>         }
>         /* First check if we are the base type exactly (the most common case),
>            then do the full subclass check if needed. */
>         if (py_V3->ob_type != &PyGpuArrayType &&
>             !PyObject_TypeCheck(py_V3, &PyGpuArrayType)) {
>             PyErr_SetString(PyExc_ValueError, "expected a GpuArray");
>             {
>         __failure = 4;
>         if (!PyErr_Occurred()) {
>             PyErr_SetString(PyExc_RuntimeError,
>                 "Unexpected error in an Op's C code. "
>                 "No Python exception was set.");
>             }
>         goto __label_4;}
>         }
>         V3 = (PyGpuArrayObject *)py_V3;
>         Py_INCREF(V3);
>         
> {
> 
>     py_V5 = PyList_GET_ITEM(storage_V5, 0);
>     {Py_XINCREF(py_V5);}
>     
>         V5 = NULL;
>         if (py_V5 == Py_None) {
>             PyErr_SetString(PyExc_ValueError, "expected a GpuArray, not 
> None");
>             {
>         __failure = 6;
>         if (!PyErr_Occurred()) {
>             PyErr_SetString(PyExc_RuntimeError,
>                 "Unexpected error in an Op's C code. "
>                 "No Python exception was set.");
>             }
>         goto __label_6;}
>         }
>         /* First check if we are the base type exactly (the most common case),
>            then do the full subclass check if needed. */
>         if (py_V5->ob_type != &PyGpuArrayType &&
>             !PyObject_TypeCheck(py_V5, &PyGpuArrayType)) {
>             PyErr_SetString(PyExc_ValueError, "expected a GpuArray");
>             {
>         __failure = 6;
>         if (!PyErr_Occurred()) {
>             PyErr_SetString(PyExc_RuntimeError,
>                 "Unexpected error in an Op's C code. "
>                 "No Python exception was set.");
>             }
>         goto __label_6;}
>         }
>         V5 = (PyGpuArrayObject *)py_V5;
>         Py_INCREF(V5);
>         
> {
> 
>     py_V7 = PyList_GET_ITEM(storage_V7, 0);
>     {Py_XINCREF(py_V7);}
>     
>         V7 = NULL;
>         if (py_V7 == Py_None) {
>             PyErr_SetString(PyExc_ValueError, "expected a GpuArray, not 
> None");
>             {
>         __failure = 8;
>         if (!PyErr_Occurred()) {
>             PyErr_SetString(PyExc_RuntimeError,
>                 "Unexpected error in an Op's C code. "
>                 "No Python exception was set.");
>             }
>         goto __label_8;}
>         }
>         /* First check if we are the base type exactly (the most common case),
>            then do the full subclass check if needed. */
>         if (py_V7->ob_type != &PyGpuArrayType &&
>             !PyObject_TypeCheck(py_V7, &PyGpuArrayType)) {
>             PyErr_SetString(PyExc_ValueError, "expected a GpuArray");
>             {
>         __failure = 8;
>         if (!PyErr_Occurred()) {
>             PyErr_SetString(PyExc_RuntimeError,
>                 "Unexpected error in an Op's C code. "
>                 "No Python exception was set.");
>             }
>         goto __label_8;}
>         }
>         V7 = (PyGpuArrayObject *)py_V7;
>         Py_INCREF(V7);
>         
> {
> 
>     py_V9 = PyList_GET_ITEM(storage_V9, 0);
>     {Py_XINCREF(py_V9);}
>     
>   V9 = (cudnnConvolutionDescriptor_t)PyCapsule_GetPointer(py_V9, NULL);
>   if (V9 == NULL) {
>         __failure = 10;
>         if (!PyErr_Occurred()) {
>             PyErr_SetString(PyExc_RuntimeError,
>                 "Unexpected error in an Op's C code. "
>                 "No Python exception was set.");
>             }
>         goto __label_10;}
>         
> {
> 
>     py_V11 = PyList_GET_ITEM(storage_V11, 0);
>     {Py_XINCREF(py_V11);}
>     
>             if (!PyObject_TypeCheck(py_V11, &PyFloat32ArrType_Type))
>             {
>                 PyErr_Format(PyExc_ValueError,
>                     "Scalar check failed (npy_float32)");
>                 {
>         __failure = 12;
>         if (!PyErr_Occurred()) {
>             PyErr_SetString(PyExc_RuntimeError,
>                 "Unexpected error in an Op's C code. "
>                 "No Python exception was set.");
>             }
>         goto __label_12;}
>             }
>             
>         PyArray_ScalarAsCtype(py_V11, &V11);
>         
> {
> 
>     py_V13 = PyList_GET_ITEM(storage_V13, 0);
>     {Py_XINCREF(py_V13);}
>     
>             if (!PyObject_TypeCheck(py_V13, &PyFloat32ArrType_Type))
>             {
>                 PyErr_Format(PyExc_ValueError,
>                     "Scalar check failed (npy_float32)");
>                 {
>         __failure = 14;
>         if (!PyErr_Occurred()) {
>             PyErr_SetString(PyExc_RuntimeError,
>                 "Unexpected error in an Op's C code. "
>                 "No Python exception was set.");
>             }
>         goto __label_14;}
>             }
>             
>         PyArray_ScalarAsCtype(py_V13, &V13);
>         
> {
> 
> {
> // Op class GpuDnnConv
> 
>                 #define APPLY_SPECIFIC(str) str##_node_d7db26d1cc9fb59ee56289f4dda72577_0
> 
> #define CONV_INPLACE 1
> 
> #define CONV_ALGO CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
>                 {
>                   if (APPLY_SPECIFIC(conv_fwd)(V3, V5, V7, V9, V11, V13,
>                                                &V1, V15) != 0) {
>                     {
>         __failure = 17;
>         if (!PyErr_Occurred()) {
>             PyErr_SetString(PyExc_RuntimeError,
>                 "Unexpected error in an Op's C code. "
>                 "No Python exception was set.");
>             }
>         goto __label_17;}
>                   }
>                 }
>                 #undef APPLY_SPECIFIC
> 
> #undef CONV_INPLACE
> 
> #undef CONV_ALGO
>                 __label_17:
> 
> double __DUMMY_17;
> 
> }
> __label_16:
> 
> double __DUMMY_16;
> 
> }
> __label_14:
> 
>     {Py_XDECREF(py_V13);}
>     
> double __DUMMY_14;
> 
> }
> __label_12:
> 
>     {Py_XDECREF(py_V11);}
>     
> double __DUMMY_12;
> 
> }
> __label_10:
> 
>     {Py_XDECREF(py_V9);}
>     
> double __DUMMY_10;
> 
> }
> __label_8:
> Py_XDECREF(V7); V7 = NULL;
>     {Py_XDECREF(py_V7);}
>     
> double __DUMMY_8;
> 
> }
> __label_6:
> Py_XDECREF(V5); V5 = NULL;
>     {Py_XDECREF(py_V5);}
>     
> double __DUMMY_6;
> 
> }
> __label_4:
> Py_XDECREF(V3); V3 = NULL;
>     {Py_XDECREF(py_V3);}
>     
> double __DUMMY_4;
> 
> }
> __label_2:
> 
>     if (!__failure) {
>       
>         if (!V1) {
>             Py_XDECREF(py_V1);
>             Py_INCREF(Py_None);
>             py_V1 = Py_None;
>         } else if ((void *)py_V1 != (void *)V1) {
>             Py_XDECREF(py_V1);
>             py_V1 = (PyObject *)V1;
>             Py_INCREF(py_V1);
>         }
>         
>       PyObject* old = PyList_GET_ITEM(storage_V1, 0);
>       {Py_XINCREF(py_V1);}
>       PyList_SET_ITEM(storage_V1, 0, py_V1);
>       {Py_XDECREF(old);}
>     }
>     Py_XDECREF(V1); V1 = NULL;
>     {Py_XDECREF(py_V1);}
>     
> double __DUMMY_2;
> 
> }
> 
>             
>         if (__failure) {
>             // When there is a failure, this code puts the exception
>             // in __ERROR.
>             PyObject* err_type = NULL;
>             PyObject* err_msg = NULL;
>             PyObject* err_traceback = NULL;
>             PyErr_Fetch(&err_type, &err_msg, &err_traceback);
>             if (!err_type) {err_type = Py_None;Py_INCREF(Py_None);}
>             if (!err_msg) {err_msg = Py_None; Py_INCREF(Py_None);}
>             if (!err_traceback) {err_traceback = Py_None; Py_INCREF(Py_None);}
>             PyObject* old_err_type = PyList_GET_ITEM(__ERROR, 0);
>             PyObject* old_err_msg = PyList_GET_ITEM(__ERROR, 1);
>             PyObject* old_err_traceback = PyList_GET_ITEM(__ERROR, 2);
>             PyList_SET_ITEM(__ERROR, 0, err_type);
>             PyList_SET_ITEM(__ERROR, 1, err_msg);
>             PyList_SET_ITEM(__ERROR, 2, err_traceback);
>             {Py_XDECREF(old_err_type);}
>             {Py_XDECREF(old_err_msg);}
>             {Py_XDECREF(old_err_traceback);}
>         }
>         // The failure code is returned to index what code block failed.
>         return __failure;
>         
>         }
>     };
>     }
>     
> 
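> /* C-level trampolines: Python holds the struct through a thunk object;
>  * the executor forwards each call to run(), and the destructor deletes
>  * the struct when the thunk is garbage collected. */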
>         static int __struct_compiled_op_d7db26d1cc9fb59ee56289f4dda72577_executor(
>             __struct_compiled_op_d7db26d1cc9fb59ee56289f4dda72577* self) {
>             return self->run();
>         }
> 
>         static void __struct_compiled_op_d7db26d1cc9fb59ee56289f4dda72577_destructor(
>             void* executor, void* self) {
>             delete ((__struct_compiled_op_d7db26d1cc9fb59ee56289f4dda72577*)self);
>         }
>         
> //////////////////////
> ////  Functions
> //////////////////////
> static PyObject * instantiate(PyObject * self, PyObject *argtuple) {
>   assert(PyTuple_Check(argtuple));
>   if (9 != PyTuple_Size(argtuple)) {
>      PyErr_Format(PyExc_TypeError, "Wrong number of arguments, expected 9, got %i",
>                   (int)PyTuple_Size(argtuple));
>      return NULL;
>   }
>   __struct_compiled_op_d7db26d1cc9fb59ee56289f4dda72577* struct_ptr =
>       new __struct_compiled_op_d7db26d1cc9fb59ee56289f4dda72577();
>   if (struct_ptr->init(PyTuple_GET_ITEM(argtuple, 0), PyTuple_GET_ITEM(argtuple, 1),
>                        PyTuple_GET_ITEM(argtuple, 2), PyTuple_GET_ITEM(argtuple, 3),
>                        PyTuple_GET_ITEM(argtuple, 4), PyTuple_GET_ITEM(argtuple, 5),
>                        PyTuple_GET_ITEM(argtuple, 6), PyTuple_GET_ITEM(argtuple, 7),
>                        PyTuple_GET_ITEM(argtuple, 8)) != 0) {
>     delete struct_ptr;
>     return NULL;
>   }
>   PyObject* thunk = PyCObject_FromVoidPtrAndDesc(
>       (void*)(&__struct_compiled_op_d7db26d1cc9fb59ee56289f4dda72577_executor),
>       struct_ptr,
>       __struct_compiled_op_d7db26d1cc9fb59ee56289f4dda72577_destructor);
>   return thunk;
> }
> 
> //////////////////////
> ////  Module init
> //////////////////////
> static PyMethodDef MyMethods[] = {
>       {"instantiate", instantiate, METH_VARARGS, "undocumented"} ,
>       {NULL, NULL, 0, NULL}
> };
> PyMODINIT_FUNC initd7db26d1cc9fb59ee56289f4dda72577(void){
>    import_pygpu__gpuarray();
>    import_array();
>    
> 
> setup_ext_cuda();
> 
> 
>    (void) Py_InitModule("d7db26d1cc9fb59ee56289f4dda72577", MyMethods);
> }


-- 
Pascal
