[theano-users] Running code with CuDNN fails during compilation (mod.cpp) (Windows)

D. Kit Thu, 22 Sep 2016 02:17:52 -0700

Hello,

I'm attempting to use conv3D2D in Theano, but when I get to the function 
compilation stage it fails with the following errors:


mod.cpp:684:1: error: expected unqualified-id before '{' token

mod.cpp:1345:1: error: expected '}' at end of input
mod.cpp: In destructor '{anonymous}::__struct_compiled_op_d7db26d1cc9f

mod.cpp:573:21: error: 'cleanup' was not declared in this scope
mod.cpp: At global scope:
mod.cpp:678:1: error: expected unqualified-id at end of input
 #undef ITEMSIZE_INPUT_2
 ^
mod.cpp:678:1: error: expected '}' at end of input

I'm attaching the mod.cpp file that was generated by cc.py, op.py, dnn.py, 
etc., The initialization code from dnn_base.c starts at line 684, but there 
is no function name defined, so the code sits outside any useful code 
block. The "#section cleanup_code_struct" code section also appears to have 
been inserted into the wrong spot.

Can someone tell me how to fix this?

--Dmitry

-- 

--- 
You received this message because you are subscribed to the Google Groups 
"theano-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
For more options, visit https://groups.google.com/d/optout.

#include <Python.h>
#include <iostream>
#include "theano_mod_helper.h"
#include <gpuarray/array.h>
#include <gpuarray/kernel.h>
#include <gpuarray/error.h>
#include <gpuarray/buffer.h>
#include <gpuarray/buffer_blas.h>
#include <numpy/arrayobject.h>
#include <gpuarray_api.h>
#include <math.h>
#include <numpy/arrayscalars.h>
#include "cudnn.h"
#include "cudnn_helper.h"
#include "gpuarray_helper.h"
#include "gpuarray/types.h"
#include "gpuarray/array.h"
#include "gpuarray/util.h"
#include "gpuarray/ext_cuda.h"
#include "gpuarray_api.h"
#include "numpy_compat.h"
//////////////////////
////  Support Code
//////////////////////

void _capsule_destructor(PyObject *o) {
    void *d = PyCapsule_GetContext(o);
    void *p = PyCapsule_GetPointer(o, NULL);
    void (*f)(void *) = (void (*)(void *))d;
    if (f != NULL) f(p);
}




static int
c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
  cudnnDataType_t dt;
  size_t ds;
  switch (var->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensorNd");
    return -1;
  }
  ds = gpuarray_get_elsize(var->ga.typecode);

  int strs[8], dims[8], default_stride = 1;
  unsigned int nd = PyGpuArray_NDIM(var);

  if (nd > 8) {
    PyErr_SetString(PyExc_TypeError, "Tensor of more than 8d");
    return -1;
  }

  for (unsigned int _i = nd; _i > 0; _i--) {
    unsigned int i = _i - 1;
    strs[i] = PyGpuArray_STRIDE(var, i) ?
      PyGpuArray_STRIDE(var, i)/ds : default_stride;
    default_stride *= PyGpuArray_DIM(var, i);
    dims[i] = PyGpuArray_DIM(var, i);
  }

  /* Tensors can't be smaller than 3d for cudnn so we pad the
   * descriptor if they are */
  for (unsigned int i = nd; i < 3; i++) {
    strs[i] = 1;
    dims[i] = 1;
  }

  cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, dt, nd < 3 ? 3 : nd,
                                                 dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
		 "Could not set tensorNd descriptor: %s",
		 cudnnGetErrorString(err));
    return -1;
  }
  return 0;
}

static int c_make_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t *desc) {
  cudnnStatus_t err;
  err = cudnnCreateTensorDescriptor(desc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create tensor descriptor: %s",
                 cudnnGetErrorString(err));
    return -1;
  }
  if (c_set_tensorNd(var, *desc) != 0) {
    cudnnDestroyTensorDescriptor(*desc);
    return -1;
  }
  return 0;
}

static int
c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
  cudnnDataType_t dt;
  cudnnStatus_t err;

  if (!GpuArray_IS_C_CONTIGUOUS(&var->ga)) {
    PyErr_SetString(PyExc_ValueError,
		    "Only contiguous filters (kernels) are supported.");
    return -1;
  }
  switch (var->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_filter");
    return -1;
  }

  int dims[8];
  unsigned int nd = PyGpuArray_NDIM(var);

  if (nd > 8) {
    PyErr_SetString(PyExc_TypeError, "Tensor of more than 8d");
    return -1;
  }

  for (unsigned int _i = nd; _i > 0; _i--) {
    unsigned int i = _i - 1;
    dims[i] = PyGpuArray_DIM(var, i);
  }

  /* Filters can't be less than 3d so we pad */
  for (unsigned int i = nd; i < 3; i++)
    dims[i] = 1;

  if (nd < 3)
    nd = 3;

#if CUDNN_VERSION >= 5000
    err = cudnnSetFilterNdDescriptor(desc, dt, CUDNN_TENSOR_NCHW, nd, dims);
#else
    err = cudnnSetFilterNdDescriptor(desc, dt, nd, dims);
#endif

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
		 "Could not set filter descriptor: %s.",
		 cudnnGetErrorString(err));
    return -1;
  }
  return 0;
}

static int c_make_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t *desc) {
  cudnnStatus_t err;
  err = cudnnCreateFilterDescriptor(desc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create tensor descriptor: %s",
                 cudnnGetErrorString(err));
    return -1;
  }
  if (c_set_filter(var, *desc) != 0) {
    cudnnDestroyFilterDescriptor(*desc);
    return -1;
  }
  return 0;
}



    namespace {
    struct __struct_compiled_op_d7db26d1cc9fb59ee56289f4dda72577 {
        PyObject* __ERROR;

        PyObject* storage_V3;
PyObject* storage_V5;
PyObject* storage_V7;
PyObject* storage_V9;
PyObject* storage_V11;
PyObject* storage_V13;
PyObject* storage_V1;
PyObject* storage_V15;
        
    PyObject* py_V15;
    PyGpuContextObject *V15;

#define DTYPE_INPUT_0 npy_float32

#define TYPENUM_INPUT_0 11

#define ITEMSIZE_INPUT_0 4

#define DTYPE_INPUT_1 npy_float32

#define TYPENUM_INPUT_1 11

#define ITEMSIZE_INPUT_1 4

#define DTYPE_INPUT_2 npy_float32

#define TYPENUM_INPUT_2 11

#define ITEMSIZE_INPUT_2 4

#define DTYPE_INPUT_4 npy_float32

#define TYPENUM_INPUT_4 11

#define ITEMSIZE_INPUT_4 4

#define DTYPE_INPUT_5 npy_float32

#define TYPENUM_INPUT_5 11

#define ITEMSIZE_INPUT_5 4

#define DTYPE_OUTPUT_0 npy_float32

#define TYPENUM_OUTPUT_0 11

#define ITEMSIZE_OUTPUT_0 4

#define APPLY_SPECIFIC(str) str##_node_d7db26d1cc9fb59ee56289f4dda72577_0

#define CONV_INPLACE 1

#define CONV_ALGO CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM



PyGpuContextObject *ctx;
cudnnHandle_t APPLY_SPECIFIC(_handle);


cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns);



#ifdef CHOOSE_ALGO
int reuse_algo;
cudnnConvolutionFwdAlgo_t prev_algo;
#ifndef CHOOSE_ONCE
size_t prev_img_dims[5];
size_t prev_kern_dims[5];
#endif
#endif

int
APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
                         PyGpuArrayObject *om,
                         cudnnConvolutionDescriptor_t desc,
                         double alpha, double beta,
                         PyGpuArrayObject **output,
                         PyGpuContextObject *c) {
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  float af = alpha, bf = beta;
  void *alpha_p;
  void *beta_p;

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError,
		    "images and kernel must have the same stack size");
    return 1;
  }

  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;

  switch (input->ga.typecode) {
  case GA_DOUBLE:
    alpha_p = (void *)&alpha;
    beta_p = (void *)&beta;
    break;
  case GA_FLOAT:
  case GA_HALF:
    alpha_p = (void *)&af;
    beta_p = (void *)&bf;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
    return 1;
  }

#ifdef CONV_INPLACE
  Py_XDECREF(*output);
  *output = om;
  Py_INCREF(*output);
#else
  if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
                         om->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (beta != 0.0 && pygpu_move(*output, om))
    return 1;
#endif

  if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1)
    return 1;

  cudnnConvolutionFwdAlgo_t algo = CONV_ALGO;

  cuda_enter(c->ctx);
#ifdef CHOOSE_ALGO
#ifndef CHOOSE_ONCE
  reuse_algo = 1;
  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
    reuse_algo = (reuse_algo &&
                  PyGpuArray_DIM(input, i) == prev_img_dims[i]);
    reuse_algo = (reuse_algo &&
                  PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
  }
#endif

  if (!reuse_algo) {
#ifdef CHOOSE_TIME
    int count;
    cudnnConvolutionFwdAlgoPerf_t choice;
    err = cudnnFindConvolutionForwardAlgorithm(
      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
      desc, APPLY_SPECIFIC(output), 1, &count, &choice);

    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error selecting convolution algo: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }
    algo = choice.algo;
#else
    size_t free;
    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_FREE_GMEM, &free);

    if (err2 != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
                   "memory information on the GPU");
      cuda_exit(c->ctx);
      return 1;
    }

    err = cudnnGetConvolutionForwardAlgorithm(
      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
      desc, APPLY_SPECIFIC(output),
      CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error selecting convolution algo: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }
#endif
    prev_algo = algo;
  } else {
    algo = prev_algo;
  }

#ifdef CHOOSE_ONCE
  reuse_algo = 1;
#else
  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
    prev_img_dims[i] = PyGpuArray_DIM(input, i);
    prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
  }
#endif

#endif

  /* These two algos are not supported for 3d conv */
  if (PyGpuArray_NDIM(input) == 5 &&
      (algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
       algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM))
    algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;

  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024. The tiled-FFT implementation
  // does not support strides.
  // If the chosen implementation is FFT or tiled-FFT, validate that it can
  // be used on the current data and default to a safe implementation if it
  // can't.
  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
  // defined only for 2d filters
  if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
       algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && PyGpuArray_NDIM(input) == 4) {

    // Extract the properties of the convolution descriptor
    int nd;
    int pad[2];
    int stride[2];
    int upscale[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
                                             upscale, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT)
    {
      if (stride[0] != 1 || stride[1] != 1 ||
          PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
          (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
      {
        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
      }
    }
    else
    {
      // algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
      if (stride[0] != 1 || stride[1] != 1)
      {
        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
      }
    }
  }

  {
    size_t worksize;
    gpudata *workspace;
    err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
                                                  APPLY_SPECIFIC(input),
                                                  APPLY_SPECIFIC(kerns),
                                                  desc,
                                                  APPLY_SPECIFIC(output),
                                                  algo,
                                                  &worksize);

    if (err == CUDNN_STATUS_NOT_SUPPORTED) {
      // Fallback to none algo if not supported
      // TODO: Print a warning
      algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;

      err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
                                                    APPLY_SPECIFIC(input),
                                                    APPLY_SPECIFIC(kerns),
                                                    desc,
                                                    APPLY_SPECIFIC(output),
                                                    algo,
                                                    &worksize);
    }

    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting worksize: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    /*
     * This is less than ideal since we need to free it after (which
     * introduces a synchronization point. But we don't have a module
     * to place a nice get_work_mem() function in.
     */
    if (worksize != 0) {
      workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
      if (workspace == NULL) {
        PyErr_SetString(PyExc_RuntimeError,
                        "Could not allocate working memory");
        cuda_exit(c->ctx);
        return 1;
      }
    }

    cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);

    err = cudnnConvolutionForward(
      APPLY_SPECIFIC(_handle),
      alpha_p,
      APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
      APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
      desc, algo,
      worksize == 0 ? NULL : *(void **)workspace, worksize,
      beta_p,
      APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output));

    if (worksize != 0)
      gpudata_release(workspace);

    cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
  }
  cuda_exit(c->ctx);

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
		 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}


#undef DTYPE_INPUT_0

#undef TYPENUM_INPUT_0

#undef ITEMSIZE_INPUT_0

#undef DTYPE_INPUT_1

#undef TYPENUM_INPUT_1

#undef ITEMSIZE_INPUT_1

#undef DTYPE_INPUT_2

#undef TYPENUM_INPUT_2

#undef ITEMSIZE_INPUT_2

#undef DTYPE_INPUT_4

#undef TYPENUM_INPUT_4

#undef ITEMSIZE_INPUT_4

#undef DTYPE_INPUT_5

#undef TYPENUM_INPUT_5

#undef ITEMSIZE_INPUT_5

#undef DTYPE_OUTPUT_0

#undef TYPENUM_OUTPUT_0

#undef ITEMSIZE_OUTPUT_0

#undef APPLY_SPECIFIC

#undef CONV_INPLACE

#undef CONV_ALGO

        __struct_compiled_op_d7db26d1cc9fb59ee56289f4dda72577() {
            // This is only somewhat safe because we:
            //  1) Are not a virtual class
            //  2) Do not use any virtual classes in the members
            //  3) Deal with mostly POD and pointers

            // If this changes, we would have to revise this, but for
            // now I am tired of chasing segfaults because
            // initialization code had an error and some pointer has
            // a junk value.
            memset(this, 0, sizeof(*this));
        }
        ~__struct_compiled_op_d7db26d1cc9fb59ee56289f4dda72577(void) {
            cleanup();
        }

        int init(PyObject* __ERROR, PyObject* storage_V3, PyObject* storage_V5, PyObject* storage_V7, PyObject* storage_V9, PyObject* storage_V11, PyObject* storage_V13, PyObject* storage_V1, PyObject* storage_V15) {
            Py_XINCREF(storage_V3);
Py_XINCREF(storage_V5);
Py_XINCREF(storage_V7);
Py_XINCREF(storage_V9);
Py_XINCREF(storage_V11);
Py_XINCREF(storage_V13);
Py_XINCREF(storage_V1);
Py_XINCREF(storage_V15);
            this->storage_V3 = storage_V3;
this->storage_V5 = storage_V5;
this->storage_V7 = storage_V7;
this->storage_V9 = storage_V9;
this->storage_V11 = storage_V11;
this->storage_V13 = storage_V13;
this->storage_V1 = storage_V1;
this->storage_V15 = storage_V15;
            








    py_V15 = PyList_GET_ITEM(storage_V15, 0);
    {Py_XINCREF(py_V15);}
    
if (!PyObject_TypeCheck(py_V15, &PyGpuContextType)) {
  PyErr_SetString(PyExc_TypeError, "expected a GpuContext");
  {
        if (!PyErr_Occurred()) {
            PyErr_SetString(PyExc_RuntimeError,
                "Unexpected error in an Op's C code. "
                "No Python exception was set.");
            }
        return 15;
}
}

V15 = (PyGpuContextObject *)py_V15;
Py_INCREF(V15);



#define DTYPE_INPUT_0 npy_float32

#define TYPENUM_INPUT_0 11

#define ITEMSIZE_INPUT_0 4

#define DTYPE_INPUT_1 npy_float32

#define TYPENUM_INPUT_1 11

#define ITEMSIZE_INPUT_1 4

#define DTYPE_INPUT_2 npy_float32

#define TYPENUM_INPUT_2 11

#define ITEMSIZE_INPUT_2 4

#define DTYPE_INPUT_4 npy_float32

#define TYPENUM_INPUT_4 11

#define ITEMSIZE_INPUT_4 4

#define DTYPE_INPUT_5 npy_float32

#define TYPENUM_INPUT_5 11

#define ITEMSIZE_INPUT_5 4

#define DTYPE_OUTPUT_0 npy_float32

#define TYPENUM_OUTPUT_0 11

#define ITEMSIZE_OUTPUT_0 4

#define APPLY_SPECIFIC(str) str##_node_d7db26d1cc9fb59ee56289f4dda72577_0

#define CONV_INPLACE 1

#define CONV_ALGO CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM

#define FAIL { \

        if (!PyErr_Occurred()) { \

            PyErr_SetString(PyExc_RuntimeError, \

                "Unexpected error in an Op's C code. " \

                "No Python exception was set."); \

            } \

        return 17; \

}

#define PARAMS V15



{
  // We need to keep a reference here to have it available in the destructor.
  ctx = PARAMS;
  Py_INCREF(ctx);

  cuda_enter(PARAMS->ctx);
  cudnnStatus_t err;
  APPLY_SPECIFIC(_handle) = NULL;
  if ((err = cudnnCreate(&APPLY_SPECIFIC(_handle))) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s",
                 cudnnGetErrorString(err));
    cuda_exit(PARAMS->ctx);
    FAIL;
  }
  if ((err = cudnnSetStream(APPLY_SPECIFIC(_handle),
                            cuda_get_stream(PARAMS->ctx))) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "Could not set cudnn stream: %s",
                 cudnnGetErrorString(err));
    cuda_exit(PARAMS->ctx);
    FAIL;
  }
  cuda_exit(PARAMS->ctx);
}



cudnnStatus_t APPLY_SPECIFIC(err);
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(kerns) = NULL;
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
	       "(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
               "(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate filter descriptor: %s", 
	       cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}



#ifdef CHOOSE_ALGO
reuse_algo = 0;
prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
#endif
#endif



#undef FAIL

#undef PARAMS

#undef DTYPE_INPUT_0

#undef TYPENUM_INPUT_0

#undef ITEMSIZE_INPUT_0

#undef DTYPE_INPUT_1

#undef TYPENUM_INPUT_1

#undef ITEMSIZE_INPUT_1

#undef DTYPE_INPUT_2

#undef TYPENUM_INPUT_2

#undef ITEMSIZE_INPUT_2

#undef DTYPE_INPUT_4

#undef TYPENUM_INPUT_4

#undef ITEMSIZE_INPUT_4

#undef DTYPE_INPUT_5

#undef TYPENUM_INPUT_5

#undef ITEMSIZE_INPUT_5

#undef DTYPE_OUTPUT_0

#undef TYPENUM_OUTPUT_0

#undef ITEMSIZE_OUTPUT_0

#undef APPLY_SPECIFIC

#undef CONV_INPLACE

#undef CONV_ALGO
            this->__ERROR = __ERROR;
            return 0;
        }
        void cleanup(void) {
            __label_1:

double __DUMMY_1;
__label_3:

double __DUMMY_3;
__label_5:

double __DUMMY_5;
__label_7:

double __DUMMY_7;
__label_9:

double __DUMMY_9;
__label_11:

double __DUMMY_11;
__label_13:

double __DUMMY_13;
__label_15:
Py_XDECREF(V15); V15 = NULL;
    {Py_XDECREF(py_V15);}
    
double __DUMMY_15;
__label_18:


#define DTYPE_INPUT_0 npy_float32

#define TYPENUM_INPUT_0 11

#define ITEMSIZE_INPUT_0 4

#define DTYPE_INPUT_1 npy_float32

#define TYPENUM_INPUT_1 11

#define ITEMSIZE_INPUT_1 4

#define DTYPE_INPUT_2 npy_float32

#define TYPENUM_INPUT_2 11

#define ITEMSIZE_INPUT_2 4

#define DTYPE_INPUT_4 npy_float32

#define TYPENUM_INPUT_4 11

#define ITEMSIZE_INPUT_4 4

#define DTYPE_INPUT_5 npy_float32

#define TYPENUM_INPUT_5 11

#define ITEMSIZE_INPUT_5 4

#define DTYPE_OUTPUT_0 npy_float32

#define TYPENUM_OUTPUT_0 11

#define ITEMSIZE_OUTPUT_0 4

#define APPLY_SPECIFIC(str) str##_node_d7db26d1cc9fb59ee56289f4dda72577_0

#define CONV_INPLACE 1

#define CONV_ALGO CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM



cuda_enter(ctx->ctx);
cudnnDestroy(APPLY_SPECIFIC(_handle));
cuda_exit(ctx->ctx);
Py_DECREF((PyObject *)ctx);


if (APPLY_SPECIFIC(input) != NULL)
  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input));
if (APPLY_SPECIFIC(output) != NULL)
  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output));
if (APPLY_SPECIFIC(kerns) != NULL)
  cudnnDestroyFilterDescriptor(APPLY_SPECIFIC(kerns));


#undef DTYPE_INPUT_0

#undef TYPENUM_INPUT_0

#undef ITEMSIZE_INPUT_0

#undef DTYPE_INPUT_1

#undef TYPENUM_INPUT_1

#undef ITEMSIZE_INPUT_1

#undef DTYPE_INPUT_2

#undef TYPENUM_INPUT_2

#undef ITEMSIZE_INPUT_2

#undef DTYPE_INPUT_4

#undef TYPENUM_INPUT_4

#undef ITEMSIZE_INPUT_4

#undef DTYPE_INPUT_5

#undef TYPENUM_INPUT_5

#undef ITEMSIZE_INPUT_5

#undef DTYPE_OUTPUT_0

#undef TYPENUM_OUTPUT_0

#undef ITEMSIZE_OUTPUT_0

#undef APPLY_SPECIFIC

#undef CONV_INPLACE

#undef CONV_ALGO
double __DUMMY_18;

            Py_XDECREF(this->storage_V3);
Py_XDECREF(this->storage_V5);
Py_XDECREF(this->storage_V7);
Py_XDECREF(this->storage_V9);
Py_XDECREF(this->storage_V11);
Py_XDECREF(this->storage_V13);
Py_XDECREF(this->storage_V1);
Py_XDECREF(this->storage_V15);
        }
        int run(void) {
            int __failure = 0;
            
    PyObject* py_V1;
    
        PyGpuArrayObject *V1;
        
    PyObject* py_V3;
    
        PyGpuArrayObject *V3;
        
    PyObject* py_V5;
    
        PyGpuArrayObject *V5;
        
    PyObject* py_V7;
    
        PyGpuArrayObject *V7;
        
    PyObject* py_V9;
    
        cudnnConvolutionDescriptor_t V9;
        
    PyObject* py_V11;
    
                typedef npy_float32 V11_dtype; // Deprecated use dtype_V11 instead.
                typedef npy_float32 dtype_V11;
            
        npy_float32 V11;
        
    PyObject* py_V13;
    
                typedef npy_float32 V13_dtype; // Deprecated use dtype_V13 instead.
                typedef npy_float32 dtype_V13;
            
        npy_float32 V13;
        
{

    py_V1 = PyList_GET_ITEM(storage_V1, 0);
    {Py_XINCREF(py_V1);}
    
        if (py_V1 == Py_None)
        {
            V1 = NULL;
        }
        else
        {
            
        V1 = NULL;
        if (py_V1 == Py_None) {
            PyErr_SetString(PyExc_ValueError, "expected a GpuArray, not None");
            {
        __failure = 2;
        if (!PyErr_Occurred()) {
            PyErr_SetString(PyExc_RuntimeError,
                "Unexpected error in an Op's C code. "
                "No Python exception was set.");
            }
        goto __label_2;}
        }
        /* First check if we are the base type exactly (the most common case),
           then do the full subclass check if needed. */
        if (py_V1->ob_type != &PyGpuArrayType &&
            !PyObject_TypeCheck(py_V1, &PyGpuArrayType)) {
            PyErr_SetString(PyExc_ValueError, "expected a GpuArray");
            {
        __failure = 2;
        if (!PyErr_Occurred()) {
            PyErr_SetString(PyExc_RuntimeError,
                "Unexpected error in an Op's C code. "
                "No Python exception was set.");
            }
        goto __label_2;}
        }
        V1 = (PyGpuArrayObject *)py_V1;
        Py_INCREF(V1);
        
        }
        
{

    py_V3 = PyList_GET_ITEM(storage_V3, 0);
    {Py_XINCREF(py_V3);}
    
        V3 = NULL;
        if (py_V3 == Py_None) {
            PyErr_SetString(PyExc_ValueError, "expected a GpuArray, not None");
            {
        __failure = 4;
        if (!PyErr_Occurred()) {
            PyErr_SetString(PyExc_RuntimeError,
                "Unexpected error in an Op's C code. "
                "No Python exception was set.");
            }
        goto __label_4;}
        }
        /* First check if we are the base type exactly (the most common case),
           then do the full subclass check if needed. */
        if (py_V3->ob_type != &PyGpuArrayType &&
            !PyObject_TypeCheck(py_V3, &PyGpuArrayType)) {
            PyErr_SetString(PyExc_ValueError, "expected a GpuArray");
            {
        __failure = 4;
        if (!PyErr_Occurred()) {
            PyErr_SetString(PyExc_RuntimeError,
                "Unexpected error in an Op's C code. "
                "No Python exception was set.");
            }
        goto __label_4;}
        }
        V3 = (PyGpuArrayObject *)py_V3;
        Py_INCREF(V3);
        
{

    py_V5 = PyList_GET_ITEM(storage_V5, 0);
    {Py_XINCREF(py_V5);}
    
        V5 = NULL;
        if (py_V5 == Py_None) {
            PyErr_SetString(PyExc_ValueError, "expected a GpuArray, not None");
            {
        __failure = 6;
        if (!PyErr_Occurred()) {
            PyErr_SetString(PyExc_RuntimeError,
                "Unexpected error in an Op's C code. "
                "No Python exception was set.");
            }
        goto __label_6;}
        }
        /* First check if we are the base type exactly (the most common case),
           then do the full subclass check if needed. */
        if (py_V5->ob_type != &PyGpuArrayType &&
            !PyObject_TypeCheck(py_V5, &PyGpuArrayType)) {
            PyErr_SetString(PyExc_ValueError, "expected a GpuArray");
            {
        __failure = 6;
        if (!PyErr_Occurred()) {
            PyErr_SetString(PyExc_RuntimeError,
                "Unexpected error in an Op's C code. "
                "No Python exception was set.");
            }
        goto __label_6;}
        }
        V5 = (PyGpuArrayObject *)py_V5;
        Py_INCREF(V5);
        
{

    py_V7 = PyList_GET_ITEM(storage_V7, 0);
    {Py_XINCREF(py_V7);}
    
        V7 = NULL;
        if (py_V7 == Py_None) {
            PyErr_SetString(PyExc_ValueError, "expected a GpuArray, not None");
            {
        __failure = 8;
        if (!PyErr_Occurred()) {
            PyErr_SetString(PyExc_RuntimeError,
                "Unexpected error in an Op's C code. "
                "No Python exception was set.");
            }
        goto __label_8;}
        }
        /* First check if we are the base type exactly (the most common case),
           then do the full subclass check if needed. */
        if (py_V7->ob_type != &PyGpuArrayType &&
            !PyObject_TypeCheck(py_V7, &PyGpuArrayType)) {
            PyErr_SetString(PyExc_ValueError, "expected a GpuArray");
            {
        __failure = 8;
        if (!PyErr_Occurred()) {
            PyErr_SetString(PyExc_RuntimeError,
                "Unexpected error in an Op's C code. "
                "No Python exception was set.");
            }
        goto __label_8;}
        }
        V7 = (PyGpuArrayObject *)py_V7;
        Py_INCREF(V7);
        
{

    py_V9 = PyList_GET_ITEM(storage_V9, 0);
    {Py_XINCREF(py_V9);}
    
  V9 = (cudnnConvolutionDescriptor_t)PyCapsule_GetPointer(py_V9, NULL);
  if (V9 == NULL) {
        __failure = 10;
        if (!PyErr_Occurred()) {
            PyErr_SetString(PyExc_RuntimeError,
                "Unexpected error in an Op's C code. "
                "No Python exception was set.");
            }
        goto __label_10;}
        
{

    py_V11 = PyList_GET_ITEM(storage_V11, 0);
    {Py_XINCREF(py_V11);}
    
            if (!PyObject_TypeCheck(py_V11, &PyFloat32ArrType_Type))
            {
                PyErr_Format(PyExc_ValueError,
                    "Scalar check failed (npy_float32)");
                {
        __failure = 12;
        if (!PyErr_Occurred()) {
            PyErr_SetString(PyExc_RuntimeError,
                "Unexpected error in an Op's C code. "
                "No Python exception was set.");
            }
        goto __label_12;}
            }
            
        PyArray_ScalarAsCtype(py_V11, &V11);
        
{

    py_V13 = PyList_GET_ITEM(storage_V13, 0);
    {Py_XINCREF(py_V13);}
    
            if (!PyObject_TypeCheck(py_V13, &PyFloat32ArrType_Type))
            {
                PyErr_Format(PyExc_ValueError,
                    "Scalar check failed (npy_float32)");
                {
        __failure = 14;
        if (!PyErr_Occurred()) {
            PyErr_SetString(PyExc_RuntimeError,
                "Unexpected error in an Op's C code. "
                "No Python exception was set.");
            }
        goto __label_14;}
            }
            
        PyArray_ScalarAsCtype(py_V13, &V13);
        
{

{
// Op class GpuDnnConv

                #define APPLY_SPECIFIC(str) str##_node_d7db26d1cc9fb59ee56289f4dda72577_0

#define CONV_INPLACE 1

#define CONV_ALGO CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
                {
                  if (APPLY_SPECIFIC(conv_fwd)(V3, V5, V7, V9, V11, V13, &V1, V15) != 0) {
                    {
        __failure = 17;
        if (!PyErr_Occurred()) {
            PyErr_SetString(PyExc_RuntimeError,
                "Unexpected error in an Op's C code. "
                "No Python exception was set.");
            }
        goto __label_17;}
                  }
                }
                #undef APPLY_SPECIFIC

#undef CONV_INPLACE

#undef CONV_ALGO
                __label_17:

double __DUMMY_17;

}
__label_16:

double __DUMMY_16;

}
__label_14:

    {Py_XDECREF(py_V13);}
    
double __DUMMY_14;

}
__label_12:

    {Py_XDECREF(py_V11);}
    
double __DUMMY_12;

}
__label_10:

    {Py_XDECREF(py_V9);}
    
double __DUMMY_10;

}
__label_8:
Py_XDECREF(V7); V7 = NULL;
    {Py_XDECREF(py_V7);}
    
double __DUMMY_8;

}
__label_6:
Py_XDECREF(V5); V5 = NULL;
    {Py_XDECREF(py_V5);}
    
double __DUMMY_6;

}
__label_4:
Py_XDECREF(V3); V3 = NULL;
    {Py_XDECREF(py_V3);}
    
double __DUMMY_4;

}
__label_2:

    if (!__failure) {
      
        if (!V1) {
            Py_XDECREF(py_V1);
            Py_INCREF(Py_None);
            py_V1 = Py_None;
        } else if ((void *)py_V1 != (void *)V1) {
            Py_XDECREF(py_V1);
            py_V1 = (PyObject *)V1;
            Py_INCREF(py_V1);
        }
        
      PyObject* old = PyList_GET_ITEM(storage_V1, 0);
      {Py_XINCREF(py_V1);}
      PyList_SET_ITEM(storage_V1, 0, py_V1);
      {Py_XDECREF(old);}
    }
    Py_XDECREF(V1); V1 = NULL;
    {Py_XDECREF(py_V1);}
    
double __DUMMY_2;

}

            
        if (__failure) {
            // When there is a failure, this code puts the exception
            // in __ERROR.
            PyObject* err_type = NULL;
            PyObject* err_msg = NULL;
            PyObject* err_traceback = NULL;
            PyErr_Fetch(&err_type, &err_msg, &err_traceback);
            if (!err_type) {err_type = Py_None;Py_INCREF(Py_None);}
            if (!err_msg) {err_msg = Py_None; Py_INCREF(Py_None);}
            if (!err_traceback) {err_traceback = Py_None; Py_INCREF(Py_None);}
            PyObject* old_err_type = PyList_GET_ITEM(__ERROR, 0);
            PyObject* old_err_msg = PyList_GET_ITEM(__ERROR, 1);
            PyObject* old_err_traceback = PyList_GET_ITEM(__ERROR, 2);
            PyList_SET_ITEM(__ERROR, 0, err_type);
            PyList_SET_ITEM(__ERROR, 1, err_msg);
            PyList_SET_ITEM(__ERROR, 2, err_traceback);
            {Py_XDECREF(old_err_type);}
            {Py_XDECREF(old_err_msg);}
            {Py_XDECREF(old_err_traceback);}
        }
        // The failure code is returned to index what code block failed.
        return __failure;
        
        }
    };
    }
    

        static int __struct_compiled_op_d7db26d1cc9fb59ee56289f4dda72577_executor(__struct_compiled_op_d7db26d1cc9fb59ee56289f4dda72577* self) {
            return self->run();
        }

        static void __struct_compiled_op_d7db26d1cc9fb59ee56289f4dda72577_destructor(void* executor, void* self) {
            delete ((__struct_compiled_op_d7db26d1cc9fb59ee56289f4dda72577*)self);
        }
        
//////////////////////
////  Functions
//////////////////////
static PyObject * instantiate(PyObject * self, PyObject *argtuple) {
  assert(PyTuple_Check(argtuple));
  if (9 != PyTuple_Size(argtuple)){ 
     PyErr_Format(PyExc_TypeError, "Wrong number of arguments, expected 9, got %i", (int)PyTuple_Size(argtuple));
     return NULL;
  }
  __struct_compiled_op_d7db26d1cc9fb59ee56289f4dda72577* struct_ptr = new __struct_compiled_op_d7db26d1cc9fb59ee56289f4dda72577();
  if (struct_ptr->init( PyTuple_GET_ITEM(argtuple, 0),PyTuple_GET_ITEM(argtuple, 1),PyTuple_GET_ITEM(argtuple, 2),PyTuple_GET_ITEM(argtuple, 3),PyTuple_GET_ITEM(argtuple, 4),PyTuple_GET_ITEM(argtuple, 5),PyTuple_GET_ITEM(argtuple, 6),PyTuple_GET_ITEM(argtuple, 7),PyTuple_GET_ITEM(argtuple, 8) ) != 0) {
    delete struct_ptr;
    return NULL;
  }
  PyObject* thunk = PyCObject_FromVoidPtrAndDesc((void*)(&__struct_compiled_op_d7db26d1cc9fb59ee56289f4dda72577_executor), struct_ptr, __struct_compiled_op_d7db26d1cc9fb59ee56289f4dda72577_destructor);
  return thunk; }

//////////////////////
////  Module init
//////////////////////
static PyMethodDef MyMethods[] = {
	{"instantiate", instantiate, METH_VARARGS, "undocumented"} ,
	{NULL, NULL, 0, NULL}
};
PyMODINIT_FUNC initd7db26d1cc9fb59ee56289f4dda72577(void){
   import_pygpu__gpuarray();
   import_array();
   

setup_ext_cuda();


   (void) Py_InitModule("d7db26d1cc9fb59ee56289f4dda72577", MyMethods);
}

[theano-users] Running code with CuDNN fails during compilation (mod.cpp) (Windows)

Reply via email to