I can't reproduce your problem with a simple convolution in float16. Either your code is doing something unexpected, or the problem has already been fixed in the development version.
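For reference, the test I ran was roughly the minimal sketch below, not your exact script (the shapes and names are hypothetical; it assumes the development version with device=cuda, floatX=float16 and cuDNN available):

    import numpy as np
    import theano
    import theano.tensor as T

    # Minimal float16 convolution check; the shapes are arbitrary.
    imgs = T.tensor4('imgs', dtype='float16')    # (batch, channels, rows, cols)
    kerns = T.tensor4('kerns', dtype='float16')  # (filters, channels, rows, cols)
    f = theano.function([imgs, kerns], T.nnet.conv2d(imgs, kerns))

    i = np.random.rand(2, 3, 8, 8).astype('float16')
    k = np.random.rand(4, 3, 3, 3).astype('float16')
    print(f(i, k).shape)  # expected: (2, 4, 6, 6), with no cuDNN error

If a sketch this small fails on your installation, the problem is in the setup rather than in your model code.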
In any case, the development version is a much better option for the new backend and float16, so I encourage you to upgrade and try again: http://deeplearning.net/software/theano/install.html#bleeding-edge-install-instructions
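The linked page has the exact commands; it is usually something along the lines of

    pip install --upgrade --no-deps git+https://github.com/Theano/Theano.git

plus a separate install of libgpuarray for the new (gpuarray) backend.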
2016-07-14 4:22 GMT-04:00 <[email protected]>:

> Here is .theanorc:
>
> [global]
> floatX = float16
> device=cuda
>
> [cuda]
> root = /usr/local/cuda-7.5
>
> [nvcc]
> fastmath=True
>
> optimizer = fast_compile
>
> On Thursday, July 14, 2016 at 10:19:56 AM UTC+2, [email protected] wrote:
>>
>> Hi Arnaud,
>> I put _f16_ok = True in dnn.py (attached).
>>
>> This is the error I received:
>>
>> Python 2.7.11 |Anaconda custom (64-bit)| (default, Dec 6 2015, 18:08:32)
>> [GCC 4.4.7 20120313 (Red Hat 4.4.7-1)] on linux2
>> Type "help", "copyright", "credits" or "license" for more information.
>> Anaconda is brought to you by Continuum Analytics.
>> Please check out: http://continuum.io/thanks and https://anaconda.org
>> >>> import run_multi_conv
>> Mapped name None to device cuda: GeForce 840M
>> WARNING (theano.gof.compilelock): Overriding existing lock by dead
>> process '3202' (I am process '3351')
>> Using cuDNN version 5005 on context None
>> /home/luca/data/Theano-master/theano/tensor/signal/downsample.py:6:
>> UserWarning: downsample module has been moved to the
>> theano.tensor.signal.pool module.
>>   "downsample module has been moved to the theano.tensor.signal.pool
>> module.")
>> >>> run_multi_conv.run_experiments()
>> Disabling C code for Elemwise{mul,no_inplace} due to unsupported float16
>> Disabling C code for Elemwise{Cast{float32}} due to unsupported float16
>> Disabling C code for Elemwise{Cast{float16}} due to unsupported float16
>> Disabling C code for Elemwise{Cast{float16}} due to unsupported float16
>> Disabling C code for Alloc due to unsupported float16
>> Disabling C code for Cast{float16} due to unsupported float16
>> Disabling C code for Cast{float16} due to unsupported float16
>> Disabling C code for Cast{float16} due to unsupported float16
>> Disabling C code for Cast{float16} due to unsupported float16
>> Disabling C code for RandomFunction{binomial} due to unsupported float16
>> Disabling C code for RandomFunction{binomial} due to unsupported float16
>> ===============================
>> #include <Python.h>
>> #include <iostream>
>> #include "theano_mod_helper.h"
>> #include <gpuarray/array.h>
>> #include <gpuarray/kernel.h>
>> #include <gpuarray/error.h>
>> #include <gpuarray/buffer.h>
>> #include <gpuarray/buffer_blas.h>
>> #include <numpy/arrayobject.h>
>> #include <gpuarray_api.h>
>> #include <math.h>
>> #include <numpy/arrayscalars.h>
>> #include "cudnn.h"
>> #include "cudnn_helper.h"
>> #include "gpuarray_helper.h"
>> #include "gpuarray/types.h"
>> #include "gpuarray/array.h"
>> #include "gpuarray/util.h"
>> #include "gpuarray/ext_cuda.h"
>> #include "gpuarray_api.h"
>> #include "numpy_compat.h"
>> //////////////////////
>> //// Support Code
>> //////////////////////
>>
>> static int
>> c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
>>   cudnnDataType_t dt;
>>   size_t ds;
>>   switch (var->ga.typecode) {
>>   case GA_FLOAT:
>>     dt = CUDNN_DATA_FLOAT;
>>     break;
>>   case GA_DOUBLE:
>>     dt = CUDNN_DATA_DOUBLE;
>>     break;
>> #if CUDNN_VERSION > 3000
>>   case GA_HALF:
>>     dt = CUDNN_DATA_HALF;
>>     break;
>> #endif
>>   default:
>>     PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensorNd");
>>     return -1;
>>   }
>>   ds = gpuarray_get_elsize(var->ga.typecode);
>>
>>   int strs[5], dims[5], default_stride = 1;
>>   unsigned int nd = PyGpuArray_NDIM(var);
>>
>>   if (nd > 5) {
>>     PyErr_SetString(PyExc_TypeError, "Tensor of more than 5d");
>>     return -1;
>>   }
>>
>>   for (unsigned int _i = nd; _i > 0; _i--) {
>>     unsigned int i = _i - 1;
>>     strs[i] = PyGpuArray_STRIDE(var, i) ?
>>       PyGpuArray_STRIDE(var, i)/ds : default_stride;
>>     default_stride *= PyGpuArray_DIM(var, i);
>>     dims[i] = PyGpuArray_DIM(var, i);
>>   }
>>
>>   cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, dt, nd, dims, strs);
>>   if (err != CUDNN_STATUS_SUCCESS) {
>>     PyErr_Format(PyExc_RuntimeError,
>>                  "Could not set tensorNd descriptor: %s",
>>                  cudnnGetErrorString(err));
>>     return -1;
>>   }
>>   return 0;
>> }
>>
>> static int
>> c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
>>   cudnnDataType_t dt;
>>   cudnnStatus_t err;
>>
>>   if (!GpuArray_IS_C_CONTIGUOUS(&var->ga)) {
>>     PyErr_SetString(PyExc_ValueError,
>>                     "Only contiguous filters (kernels) are supported.");
>>     return -1;
>>   }
>>   switch (var->ga.typecode) {
>>   case GA_FLOAT:
>>     dt = CUDNN_DATA_FLOAT;
>>     break;
>>   case GA_DOUBLE:
>>     dt = CUDNN_DATA_DOUBLE;
>>     break;
>> #if CUDNN_VERSION > 3000
>>   case GA_HALF:
>>     dt = CUDNN_DATA_HALF;
>>     break;
>> #endif
>>   default:
>>     PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_filter");
>>     return -1;
>>   }
>>
>>   int dims[5];
>>   unsigned int nd = PyGpuArray_NDIM(var);
>>
>>   if (nd > 5) {
>>     PyErr_SetString(PyExc_TypeError, "Tensor of more than 5d");
>>     return -1;
>>   }
>>
>>   for (unsigned int _i = nd; _i > 0; _i--) {
>>     unsigned int i = _i - 1;
>>     dims[i] = PyGpuArray_DIM(var, i);
>>   }
>>
>> #if CUDNN_VERSION >= 5000
>>   err = cudnnSetFilterNdDescriptor(desc, dt, CUDNN_TENSOR_NCHW, nd, dims);
>> #else
>>   err = cudnnSetFilterNdDescriptor(desc, dt, nd, dims);
>> #endif
>>
>>   if (err != CUDNN_STATUS_SUCCESS) {
>>     PyErr_Format(PyExc_RuntimeError,
>>                  "Could not set filter descriptor: %s.",
>>                  cudnnGetErrorString(err));
>>     return -1;
>>   }
>>   return 0;
>> }
>>
>> namespace {
>> struct __struct_compiled_op_86feacd077d8749f42b5d82709a80ba3 {
>>   PyObject* __ERROR;
>>
>>   PyObject* storage_V3;
>>   PyObject* storage_V5;
>>   PyObject* storage_V7;
>>   PyObject* storage_V9;
>>   PyObject* storage_V11;
>>   PyObject* storage_V13;
>>   PyObject* storage_V1;
>>   PyObject* storage_V15;
>>
>>   PyObject* py_V15;
>>   PyGpuContextObject *V15;
>> #define DTYPE_INPUT_0 npy_float16
>> #define TYPENUM_INPUT_0 23
>> #define ITEMSIZE_INPUT_0 2
>> #define DTYPE_INPUT_1 npy_float16
>> #define TYPENUM_INPUT_1 23
>> #define ITEMSIZE_INPUT_1 2
>> #define DTYPE_INPUT_2 npy_float16
>> #define TYPENUM_INPUT_2 23
>> #define ITEMSIZE_INPUT_2 2
>> #define DTYPE_INPUT_4 npy_float16
>> #define TYPENUM_INPUT_4 23
>> #define ITEMSIZE_INPUT_4 2
>> #define DTYPE_INPUT_5 npy_float16
>> #define TYPENUM_INPUT_5 23
>> #define ITEMSIZE_INPUT_5 2
>> #define DTYPE_OUTPUT_0 npy_float16
>> #define TYPENUM_OUTPUT_0 23
>> #define ITEMSIZE_OUTPUT_0 2
>> #define APPLY_SPECIFIC(str) str##_node_86feacd077d8749f42b5d82709a80ba3_0
>> #define CONV_INPLACE 1
>> #define CONV_ALGO CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
>>
>>   PyGpuContextObject *ctx;
>>   cudnnHandle_t APPLY_SPECIFIC(_handle);
>>
>>   cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
>>   cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
>>   cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns);
>>
>> #ifdef CHOOSE_ALGO
>>   int reuse_algo;
>>   cudnnConvolutionFwdAlgo_t prev_algo;
>> #ifndef CHOOSE_ONCE
>>   size_t prev_img_dims[5];
>>   size_t prev_kern_dims[5];
>> #endif
>> #endif
>>
>>   int
>>   APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
>>                            PyGpuArrayObject *om,
>>                            cudnnConvolutionDescriptor_t desc,
>>                            double alpha, double beta,
>>                            PyGpuArrayObject **output,
>>                            PyGpuContextObject *c) {
>>     cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
>>     float af = alpha, bf = beta;
>>     void *alpha_p;
>>     void *beta_p;
>>
>>     if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
>>       PyErr_SetString(PyExc_ValueError,
>>                       "images and kernel must have the same stack size");
>>       return 1;
>>     }
>>
>>     if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
>>       return 1;
>>     if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
>>       return 1;
>>
>>     switch (input->ga.typecode) {
>>     case GA_DOUBLE:
>>       alpha_p = (void *)&alpha;
>>       beta_p = (void *)&beta;
>>       break;
>>     case GA_FLOAT:
>>     case GA_HALF:
>>       alpha_p = (void *)&af;
>>       beta_p = (void *)&bf;
>>       break;
>>     default:
>>       PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
>>       return 1;
>>     }
>>
>> #ifdef CONV_INPLACE
>>     Py_XDECREF(*output);
>>     *output = om;
>>     Py_INCREF(*output);
>> #else
>>     if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
>>                            om->ga.typecode, GA_C_ORDER, c) != 0)
>>       return 1;
>>     if (beta != 0.0 && pygpu_move(*output, om))
>>       return 1;
>> #endif
>>
>>     if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1)
>>       return 1;
>>
>>     cudnnConvolutionFwdAlgo_t algo = CONV_ALGO;
>>
>>     cuda_enter(c->ctx);
>> #ifdef CHOOSE_ALGO
>> #ifndef CHOOSE_ONCE
>>     reuse_algo = 1;
>>     for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
>>       reuse_algo = (reuse_algo &&
>>                     PyGpuArray_DIM(input, i) == prev_img_dims[i]);
>>       reuse_algo = (reuse_algo &&
>>                     PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
>>     }
>> #endif
>>
>>     if (!reuse_algo) {
>> #ifdef CHOOSE_TIME
>>       int count;
>>       cudnnConvolutionFwdAlgoPerf_t choice;
>>       err = cudnnFindConvolutionForwardAlgorithm(
>>           APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
>>           desc,
>>           APPLY_SPECIFIC(output), 1, &count, &choice);
>>
>>       if (err != CUDNN_STATUS_SUCCESS) {
>>         PyErr_Format(PyExc_RuntimeError,
>>                      "error selecting convolution algo: %s",
>>                      cudnnGetErrorString(err));
>>         cuda_exit(c->ctx);
>>         return 1;
>>       }
>>       algo = choice.algo;
>> #else
>>       size_t free;
>>       int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_FREE_GMEM, &free);
>>
>>       if (err2 != GA_NO_ERROR) {
>>         PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
>>                      "memory information on the GPU");
>>         cuda_exit(c->ctx);
>>         return 1;
>>       }
>>
>>       err = cudnnGetConvolutionForwardAlgorithm(
>>           APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
>>           desc, APPLY_SPECIFIC(output),
>>           CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
>>       if (err != CUDNN_STATUS_SUCCESS) {
>>         PyErr_Format(PyExc_RuntimeError,
>>                      "error selecting convolution algo: %s",
>>                      cudnnGetErrorString(err));
>>         cuda_exit(c->ctx);
>>         return 1;
>>       }
>> #endif
>>       prev_algo = algo;
>>     } else {
>>       algo = prev_algo;
>>     }
>>
>> #ifdef CHOOSE_ONCE
>>     reuse_algo = 1;
>> #else
>>     for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
>>       prev_img_dims[i] = PyGpuArray_DIM(input, i);
>>       prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
>>     }
>> #endif
>>
>> #endif
>>
>>     /* These two algos are not supported for 3d conv */
>>     if (PyGpuArray_NDIM(input) == 5 &&
>>         (algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
>>          algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM))
>>       algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
>>
>>     // The FFT implementation does not support strides, 1x1 filters or inputs
>>     // with a spatial dimension larger than 1024. The tiled-FFT implementation
>>     // does not support strides.
>>     // If the chosen implementation is FFT or tiled-FFT, validate that it can
>>     // be used on the current data and default to a safe implementation if it
>>     // can't.
>>     // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
>>     // defined only for 2d filters
>>     if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
>>          algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) &&
>>         PyGpuArray_NDIM(input) == 4) {
>>
>>       // Extract the properties of the convolution descriptor
>>       int nd;
>>       int pad[2];
>>       int stride[2];
>>       int upscale[2];
>>       cudnnConvolutionMode_t mode;
>>       cudnnDataType_t data_type;
>>       err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
>>                                             upscale, &mode, &data_type);
>>       if (err != CUDNN_STATUS_SUCCESS) {
>>         PyErr_Format(PyExc_RuntimeError,
>>                      "error getting convolution properties: %s",
>>                      cudnnGetErrorString(err));
>>         cuda_exit(c->ctx);
>>         return 1;
>>       }
>>
>>       if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT)
>>       {
>>         if (stride[0] != 1 || stride[1] != 1 ||
>>             PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
>>             (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
>>         {
>>           algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
>>         }
>>       }
>>       else
>>       {
>>         // algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
>>         if (stride[0] != 1 || stride[1] != 1)
>>         {
>>           algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
>>         }
>>       }
>>     }
>>
>>     {
>>       size_t worksize;
>>       gpudata *workspace;
>>       err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
>>                                                     APPLY_SPECIFIC(input),
>>                                                     APPLY_SPECIFIC(kerns),
>>                                                     desc,
>>                                                     APPLY_SPECIFIC(output),
>>                                                     algo,
>>                                                     &worksize);
>>
>>       if (err == CUDNN_STATUS_NOT_SUPPORTED) {
>>         // Fallback to none algo if not supported
>>         // TODO: Print a warning
>>         algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
>>
>>         err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
>>                                                       APPLY_SPECIFIC(input),
>>                                                       APPLY_SPECIFIC(kerns),
>>                                                       desc,
>>                                                       APPLY_SPECIFIC(output),
>>                                                       algo,
>>                                                       &worksize);
>>       }
>>
>>       if (err != CUDNN_STATUS_SUCCESS) {
>>         PyErr_Format(PyExc_RuntimeError,
>>                      "error getting worksize: %s",
>>                      cudnnGetErrorString(err));
>>         cuda_exit(c->ctx);
>>         return 1;
>>       }
>>
>>       /*
>>        * This is less than ideal since we need to free it after (which
>>        * introduces a synchronization point. But we don't have a module
>>        * to place a nice get_work_mem() function in.
>>        */
>>       if (worksize != 0) {
>>         workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
>>         if (workspace == NULL) {
>>           PyErr_SetString(PyExc_RuntimeError,
>>                           "Could not allocate working memory");
>>           cuda_exit(c->ctx);
>>           return 1;
>>         }
>>       }
>>
>>       cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
>>       cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
>>       cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
>>
>>       err = cudnnConvolutionForward(
>>           APPLY_SPECIFIC(_handle),
>>           alpha_p,
>>           APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
>>           APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
>>           desc, algo,
>>           worksize == 0 ? NULL : *(void **)workspace, worksize,
>>           beta_p,
>>           APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output));
>>
>>       if (worksize != 0)
>>         gpudata_release(workspace);
>>
>>       cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
>>       cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
>>       cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
>>     }
>>     cuda_exit(c->ctx);
>>
>>     if (err != CUDNN_STATUS_SUCCESS) {
>>       PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
>>                    cudnnGetErrorString(err));
>>       return 1;
>>     }
>>     return 0;
>>   }
>>
>> #undef DTYPE_INPUT_0
>> #undef TYPENUM_INPUT_0
>> #undef ITEMSIZE_INPUT_0
>> #undef DTYPE_INPUT_1
>> #undef TYPENUM_INPUT_1
>> #undef ITEMSIZE_INPUT_1
>> #undef DTYPE_INPUT_2
>> #undef TYPENUM_INPUT_2
>> #undef ITEMSIZE_INPUT_2
>> #undef DTYPE_INPUT_4
>> #undef TYPENUM_INPUT_4
>> #undef ITEMSIZE_INPUT_4
>> #undef DTYPE_INPUT_5
>> #undef TYPENUM_INPUT_5
>> #undef ITEMSIZE_INPUT_5
>> #undef DTYPE_OUTPUT_0
>> #undef TYPENUM_OUTPUT_0
>> #undef ITEMSIZE_OUTPUT_0
>> #undef APPLY_SPECIFIC
>> #undef CONV_INPLACE
>> #undef CONV_ALGO
>>
>>   __struct_compiled_op_86feacd077d8749f42b5d82709a80ba3() {
>>     // This is only somewhat safe because we:
>>     // 1) Are not a virtual class
>>     // 2) Do not use any virtual classes in the members
>>     // 3) Deal with mostly POD and pointers
>>
>>     // If this changes, we would have to revise this, but for
>>     // now I am tired of chasing segfaults because
>>     // initialization code had an error and some pointer has
>>     // a junk value.
>>     memset(this, 0, sizeof(*this));
>>   }
>>   ~__struct_compiled_op_86feacd077d8749f42b5d82709a80ba3(void) {
>>     cleanup();
>>   }
>>
>>   int init(PyObject* __ERROR, PyObject* storage_V3, PyObject* storage_V5, PyObject* storage_V7, PyObject* storage_V9, PyObject* storage_V11, PyObject* storage_V13, PyObject* storage_V1, PyObject* storage_V15) {
>>     Py_XINCREF(storage_V3);
>>     Py_XINCREF(storage_V5);
>>     Py_XINCREF(storage_V7);
>>     Py_XINCREF(storage_V9);
>>     Py_XINCREF(storage_V11);
>>     Py_XINCREF(storage_V13);
>>     Py_XINCREF(storage_V1);
>>     Py_XINCREF(storage_V15);
>>     this->storage_V3 = storage_V3;
>>     this->storage_V5 = storage_V5;
>>     this->storage_V7 = storage_V7;
>>     this->storage_V9 = storage_V9;
>>     this->storage_V11 = storage_V11;
>>     this->storage_V13 = storage_V13;
>>     this->storage_V1 = storage_V1;
>>     this->storage_V15 = storage_V15;
>>
>>     py_V15 = PyList_GET_ITEM(storage_V15, 0);
>>     {Py_XINCREF(py_V15);}
>>
>>     if (!PyObject_TypeCheck(py_V15, &PyGpuContextType)) {
>>       PyErr_SetString(PyExc_TypeError, "expected a GpuContext");
>>       {
>>         if (!PyErr_Occurred()) {
>>           PyErr_SetString(PyExc_RuntimeError,
>>                           "Unexpected error in an Op's C code. "
" >> 00517 "No Python exception was set."); >> 00518 } >> 00519 return 15; >> 00520 } >> 00521 } >> 00522 >> 00523 V15 = (PyGpuContextObject *)py_V15; >> 00524 Py_INCREF(V15); >> 00525 >> 00526 >> 00527 #define DTYPE_INPUT_0 npy_float16 >> 00528 #define TYPENUM_INPUT_0 23 >> 00529 #define ITEMSIZE_INPUT_0 2 >> 00530 #define DTYPE_INPUT_1 npy_float16 >> 00531 #define TYPENUM_INPUT_1 23 >> 00532 #define ITEMSIZE_INPUT_1 2 >> 00533 #define DTYPE_INPUT_2 npy_float16 >> 00534 #define TYPENUM_INPUT_2 23 >> 00535 #define ITEMSIZE_INPUT_2 2 >> 00536 #define DTYPE_INPUT_4 npy_float16 >> 00537 #define TYPENUM_INPUT_4 23 >> 00538 #define ITEMSIZE_INPUT_4 2 >> 00539 #define DTYPE_INPUT_5 npy_float16 >> 00540 #define TYPENUM_INPUT_5 23 >> 00541 #define ITEMSIZE_INPUT_5 2 >> 00542 #define DTYPE_OUTPUT_0 npy_float16 >> 00543 #define TYPENUM_OUTPUT_0 23 >> 00544 #define ITEMSIZE_OUTPUT_0 2 >> 00545 #define APPLY_SPECIFIC(str) >> str##_node_86feacd077d8749f42b5d82709a80ba3_0 >> 00546 #define CONV_INPLACE 1 >> 00547 #define CONV_ALGO >> CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM >> 00548 #define FAIL { \ >> 00549 if (!PyErr_Occurred()) { \ >> 00550 PyErr_SetString(PyExc_RuntimeError, \ >> 00551 "Unexpected error in an Op's C code. " \ >> 00552 "No Python exception was set."); \ >> 00553 } \ >> 00554 return 17; \ >> 00555 } >> 00556 #define PARAMS V15 >> 00557 >> 00558 >> 00559 { >> 00560 // We need to keep a reference here to have it available in >> the destructor. >> 00561 ctx = PARAMS; >> 00562 Py_INCREF(ctx); >> 00563 >> 00564 cuda_enter(PARAMS->ctx); >> 00565 cudnnStatus_t err; >> 00566 APPLY_SPECIFIC(_handle) = NULL; >> 00567 if ((err = cudnnCreate(&APPLY_SPECIFIC(_handle))) != >> CUDNN_STATUS_SUCCESS) { >> 00568 PyErr_Format(PyExc_RuntimeError, "could not create cuDNN >> handle: %s", >> 00569 cudnnGetErrorString(err)); >> 00570 cuda_exit(PARAMS->ctx); >> 00571 FAIL; >> 00572 } >> 00573 if ((err = cudnnSetStream(APPLY_SPECIFIC(_handle), >> 00574 cuda_get_stream(PARAMS->ctx))) != >> CUDNN_STATUS_SUCCESS) { >> 00575 PyErr_Format(PyExc_RuntimeError, "Could not set cudnn >> stream: %s", >> 00576 cudnnGetErrorString(err)); >> 00577 cuda_exit(PARAMS->ctx); >> 00578 FAIL; >> 00579 } >> 00580 cuda_exit(PARAMS->ctx); >> 00581 } >> 00582 >> 00583 >> 00584 >> 00585 cudnnStatus_t APPLY_SPECIFIC(err); >> 00586 APPLY_SPECIFIC(input) = NULL; >> 00587 APPLY_SPECIFIC(output) = NULL; >> 00588 APPLY_SPECIFIC(kerns) = NULL; >> 00589 if ((APPLY_SPECIFIC(err) = >> cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != >> CUDNN_STATUS_SUCCESS) { >> 00590 PyErr_Format(PyExc_MemoryError, "could not allocate tensor >> descriptor " >> 00591 "(inp): %s", >> cudnnGetErrorString(APPLY_SPECIFIC(err))); >> 00592 FAIL; >> 00593 } >> 00594 if ((APPLY_SPECIFIC(err) = >> cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != >> CUDNN_STATUS_SUCCESS) { >> 00595 PyErr_Format(PyExc_MemoryError, "could not allocate tensor >> descriptor " >> 00596 "(out): %s", >> cudnnGetErrorString(APPLY_SPECIFIC(err))); >> 00597 FAIL; >> 00598 } >> 00599 if ((APPLY_SPECIFIC(err) = >> cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns))) != >> CUDNN_STATUS_SUCCESS) { >> 00600 PyErr_Format(PyExc_MemoryError, "could not allocate filter >> descriptor: %s", >> 00601 cudnnGetErrorString(APPLY_SPECIFIC(err))); >> 00602 FAIL; >> 00603 } >> 00604 >> 00605 >> 00606 >> 00607 #ifdef CHOOSE_ALGO >> 00608 reuse_algo = 0; >> 00609 prev_algo = CONV_ALGO; >> 00610 #ifndef CHOOSE_ONCE >> 00611 memset(prev_img_dims, 0, sizeof(prev_img_dims)); >> 00612 
>>     memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
>> #endif
>> #endif
>>
>> #undef FAIL
>> #undef PARAMS
>> #undef DTYPE_INPUT_0
>> #undef TYPENUM_INPUT_0
>> #undef ITEMSIZE_INPUT_0
>> #undef DTYPE_INPUT_1
>> #undef TYPENUM_INPUT_1
>> #undef ITEMSIZE_INPUT_1
>> #undef DTYPE_INPUT_2
>> #undef TYPENUM_INPUT_2
>> #undef ITEMSIZE_INPUT_2
>> #undef DTYPE_INPUT_4
>> #undef TYPENUM_INPUT_4
>> #undef ITEMSIZE_INPUT_4
>> #undef DTYPE_INPUT_5
>> #undef TYPENUM_INPUT_5
>> #undef ITEMSIZE_INPUT_5
>> #undef DTYPE_OUTPUT_0
>> #undef TYPENUM_OUTPUT_0
>> #undef ITEMSIZE_OUTPUT_0
>> #undef APPLY_SPECIFIC
>> #undef CONV_INPLACE
>> #undef CONV_ALGO
>>     this->__ERROR = __ERROR;
>>     return 0;
>>   }
>>   void cleanup(void) {
>>     __label_1:
>>
>>     double __DUMMY_1;
>>     __label_3:
>>
>>     double __DUMMY_3;
>>     __label_5:
>>
>>     double __DUMMY_5;
>>     __label_7:
>>
>>     double __DUMMY_7;
>>     __label_9:
>>
>>     double __DUMMY_9;
>>     __label_11:
>>
>>     double __DUMMY_11;
>>     __label_13:
>>
>>     double __DUMMY_13;
>>     __label_15:
>>     Py_XDECREF(V15); V15 = NULL;
>>     {Py_XDECREF(py_V15);}
>>
>>     double __DUMMY_15;
>>     __label_18:
>>
>> #define DTYPE_INPUT_0 npy_float16
>> #define TYPENUM_INPUT_0 23
>> #define ITEMSIZE_INPUT_0 2
>> #define DTYPE_INPUT_1 npy_float16
>> #define TYPENUM_INPUT_1 23
>> #define ITEMSIZE_INPUT_1 2
>> #define DTYPE_INPUT_2 npy_float16
>> #define TYPENUM_INPUT_2 23
>> #define ITEMSIZE_INPUT_2 2
>> #define DTYPE_INPUT_4 npy_float16
>> #define TYPENUM_INPUT_4 23
>> #define ITEMSIZE_INPUT_4 2
>> #define DTYPE_INPUT_5 npy_float16
>> #define TYPENUM_INPUT_5 23
>> #define ITEMSIZE_INPUT_5 2
>> #define DTYPE_OUTPUT_0 npy_float16
>> #define TYPENUM_OUTPUT_0 23
>> #define ITEMSIZE_OUTPUT_0 2
>> #define APPLY_SPECIFIC(str) str##_node_86feacd077d8749f42b5d82709a80ba3_0
>> #define CONV_INPLACE 1
>> #define CONV_ALGO CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
>>
>>     cuda_enter(ctx->ctx);
>>     cudnnDestroy(APPLY_SPECIFIC(_handle));
>>     cuda_exit(ctx->ctx);
>>     Py_DECREF((PyObject *)ctx);
>>
>>     if (APPLY_SPECIFIC(input) != NULL)
>>       cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input));
>>     if (APPLY_SPECIFIC(output) != NULL)
>>       cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output));
>>     if (APPLY_SPECIFIC(kerns) != NULL)
>>       cudnnDestroyFilterDescriptor(APPLY_SPECIFIC(kerns));
>>
>> #undef DTYPE_INPUT_0
>> #undef TYPENUM_INPUT_0
>> #undef ITEMSIZE_INPUT_0
>> #undef DTYPE_INPUT_1
>> #undef TYPENUM_INPUT_1
>> #undef ITEMSIZE_INPUT_1
>> #undef DTYPE_INPUT_2
>> #undef TYPENUM_INPUT_2
>> #undef ITEMSIZE_INPUT_2
>> #undef DTYPE_INPUT_4
>> #undef TYPENUM_INPUT_4
>> #undef ITEMSIZE_INPUT_4
>> #undef DTYPE_INPUT_5
>> #undef TYPENUM_INPUT_5
>> #undef ITEMSIZE_INPUT_5
>> #undef DTYPE_OUTPUT_0
>> #undef TYPENUM_OUTPUT_0
>> #undef ITEMSIZE_OUTPUT_0
>> #undef APPLY_SPECIFIC
>> #undef CONV_INPLACE
>> #undef CONV_ALGO
>>     double __DUMMY_18;
>>
>>     Py_XDECREF(this->storage_V3);
>>     Py_XDECREF(this->storage_V5);
>>     Py_XDECREF(this->storage_V7);
>>     Py_XDECREF(this->storage_V9);
>>     Py_XDECREF(this->storage_V11);
>>     Py_XDECREF(this->storage_V13);
>>     Py_XDECREF(this->storage_V1);
>>     Py_XDECREF(this->storage_V15);
>>   }
>>   int run(void) {
>>     int __failure = 0;
>>
>>     PyObject* py_V1;
>>
>>     PyGpuArrayObject *V1;
>>
>>     PyObject* py_V3;
>>
>>     PyGpuArrayObject *V3;
>>
>>     PyObject* py_V5;
>>
>>     PyGpuArrayObject *V5;
>>
>>     PyObject* py_V7;
>>
>>     PyGpuArrayObject *V7;
>>
>>     PyObject* py_V9;
>>
>>     cudnnConvolutionDescriptor_t V9;
>>
>>     PyObject* py_V11;
>>
>>     typedef npy_float16 V11_dtype; // Deprecated use dtype_V11 instead.
>>     typedef npy_float16 dtype_V11;
>>
>>     npy_float16 V11;
>>
>>     PyObject* py_V13;
>>
>>     typedef npy_float16 V13_dtype; // Deprecated use dtype_V13 instead.
>>     typedef npy_float16 dtype_V13;
>>
>>     npy_float16 V13;
>>
>>     {
>>
>>     py_V1 = PyList_GET_ITEM(storage_V1, 0);
>>     {Py_XINCREF(py_V1);}
>>
>>     if (py_V1 == Py_None)
>>     {
>>       V1 = NULL;
>>     }
>>     else
>>     {
>>
>>       V1 = NULL;
>>       if (py_V1 == Py_None) {
>>         PyErr_SetString(PyExc_ValueError, "expected a GpuArray, not None");
>>         {
>>           __failure = 2;
>>           if (!PyErr_Occurred()) {
>>             PyErr_SetString(PyExc_RuntimeError,
>>                             "Unexpected error in an Op's C code. "
>>                             "No Python exception was set.");
>>           }
>>           goto __label_2;}
>>       }
>>       /* First check if we are the base type exactly (the most common case),
>>          then do the full subclass check if needed. */
>>       if (py_V1->ob_type != &PyGpuArrayType &&
>>           !PyObject_TypeCheck(py_V1, &PyGpuArrayType)) {
>>         PyE
