Here is .theanorc:

[global]
floatX = float16
device = cuda

[cuda]
root = /usr/local/cuda-7.5

[nvcc]
fastmath = True
optimizer = fast_compile
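As a quick sanity check that these flags are actually picked up (a minimal sketch, nothing specific to this setup: theano.config is populated from .theanorc and THEANO_FLAGS at import time):

import theano

print(theano.config.floatX)  # expected: float16
print(theano.config.device)  # expected: cuda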
On Thursday, July 14, 2016 at 10:19:56 AM UTC+2, [email protected] wrote:
>
> Hi Arnaud,
> I put _f16_ok = True in dnn.py (attached).
>
> This is the error I received:
>
> Python 2.7.11 |Anaconda custom (64-bit)| (default, Dec 6 2015, 18:08:32)
> [GCC 4.4.7 20120313 (Red Hat 4.4.7-1)] on linux2
> Type "help", "copyright", "credits" or "license" for more information.
> Anaconda is brought to you by Continuum Analytics.
> Please check out: http://continuum.io/thanks and https://anaconda.org
> >>> import run_multi_conv
>
> Mapped name None to device cuda: GeForce 840M
> WARNING (theano.gof.compilelock): Overriding existing lock by dead process '3202' (I am process '3351')
> Using cuDNN version 5005 on context None
> /home/luca/data/Theano-master/theano/tensor/signal/downsample.py:6: UserWarning: downsample module has been moved to the theano.tensor.signal.pool module.
>   "downsample module has been moved to the theano.tensor.signal.pool module.")
> >>>
> >>> run_multi_conv.run_experiments()
> Disabling C code for Elemwise{mul,no_inplace} due to unsupported float16
> Disabling C code for Elemwise{Cast{float32}} due to unsupported float16
> Disabling C code for Elemwise{Cast{float16}} due to unsupported float16
> Disabling C code for Elemwise{Cast{float16}} due to unsupported float16
> Disabling C code for Alloc due to unsupported float16
> Disabling C code for Cast{float16} due to unsupported float16
> Disabling C code for Cast{float16} due to unsupported float16
> Disabling C code for Cast{float16} due to unsupported float16
> Disabling C code for Cast{float16} due to unsupported float16
> Disabling C code for RandomFunction{binomial} due to unsupported float16
> Disabling C code for RandomFunction{binomial} due to unsupported float16
> ===============================
> 00001 #include <Python.h>
> 00002 #include <iostream>
> 00003 #include "theano_mod_helper.h"
> 00004 #include <gpuarray/array.h>
> 00005 #include <gpuarray/kernel.h>
> 00006 #include <gpuarray/error.h>
> 00007 #include <gpuarray/buffer.h>
> 00008 #include <gpuarray/buffer_blas.h>
> 00009 #include <numpy/arrayobject.h>
> 00010 #include <gpuarray_api.h>
> 00011 #include <math.h>
> 00012 #include <numpy/arrayscalars.h>
> 00013 #include "cudnn.h"
> 00014 #include "cudnn_helper.h"
> 00015 #include "gpuarray_helper.h"
> 00016 #include "gpuarray/types.h"
> 00017 #include "gpuarray/array.h"
> 00018 #include "gpuarray/util.h"
> 00019 #include "gpuarray/ext_cuda.h"
> 00020 #include "gpuarray_api.h"
> 00021 #include "numpy_compat.h"
> 00022 //////////////////////
> 00023 ////  Support Code
> 00024 //////////////////////
> 00025
> 00026
> 00027
> 00028 static int
> 00029 c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
> 00030   cudnnDataType_t dt;
> 00031   size_t ds;
> 00032   switch (var->ga.typecode) {
> 00033   case GA_FLOAT:
> 00034     dt = CUDNN_DATA_FLOAT;
> 00035     break;
> 00036   case GA_DOUBLE:
> 00037     dt = CUDNN_DATA_DOUBLE;
> 00038     break;
> 00039 #if CUDNN_VERSION > 3000
> 00040   case GA_HALF:
> 00041     dt = CUDNN_DATA_HALF;
> 00042     break;
> 00043 #endif
> 00044   default:
> 00045     PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensorNd");
> 00046     return -1;
> 00047   }
> 00048   ds = gpuarray_get_elsize(var->ga.typecode);
> 00049
> 00050   int strs[5], dims[5], default_stride = 1;
> 00051   unsigned int nd = PyGpuArray_NDIM(var);
> 00052
> 00053   if (nd > 5) {
> 00054     PyErr_SetString(PyExc_TypeError, "Tensor of more than 5d");
> 00055     return -1;
> 00056   }
> 00057
> 00058   for (unsigned int _i = nd; _i > 0; _i--) {
> 00059     unsigned int i = _i - 1;
> 00060     strs[i] = PyGpuArray_STRIDE(var, i) ?
> 00061       PyGpuArray_STRIDE(var, i)/ds : default_stride;
> 00062     default_stride *= PyGpuArray_DIM(var, i);
> 00063     dims[i] = PyGpuArray_DIM(var, i);
> 00064   }
> 00065
> 00066   cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, dt, nd, dims, strs);
> 00067   if (err != CUDNN_STATUS_SUCCESS) {
> 00068     PyErr_Format(PyExc_RuntimeError,
> 00069                  "Could not set tensorNd descriptor: %s",
> 00070                  cudnnGetErrorString(err));
> 00071     return -1;
> 00072   }
> 00073   return 0;
> 00074 }
> 00075
> 00076 static int
> 00077 c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
> 00078   cudnnDataType_t dt;
> 00079   cudnnStatus_t err;
> 00080
> 00081   if (!GpuArray_IS_C_CONTIGUOUS(&var->ga)) {
> 00082     PyErr_SetString(PyExc_ValueError,
> 00083                     "Only contiguous filters (kernels) are supported.");
> 00084     return -1;
> 00085   }
> 00086   switch (var->ga.typecode) {
> 00087   case GA_FLOAT:
> 00088     dt = CUDNN_DATA_FLOAT;
> 00089     break;
> 00090   case GA_DOUBLE:
> 00091     dt = CUDNN_DATA_DOUBLE;
> 00092     break;
> 00093 #if CUDNN_VERSION > 3000
> 00094   case GA_HALF:
> 00095     dt = CUDNN_DATA_HALF;
> 00096     break;
> 00097 #endif
> 00098   default:
> 00099     PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_filter");
> 00100     return -1;
> 00101   }
> 00102
> 00103   int dims[5];
> 00104   unsigned int nd = PyGpuArray_NDIM(var);
> 00105
> 00106   if (nd > 5) {
> 00107     PyErr_SetString(PyExc_TypeError, "Tensor of more than 5d");
> 00108     return -1;
> 00109   }
> 00110
> 00111   for (unsigned int _i = nd; _i > 0; _i--) {
> 00112     unsigned int i = _i - 1;
> 00113     dims[i] = PyGpuArray_DIM(var, i);
> 00114   }
> 00115
> 00116 #if CUDNN_VERSION >= 5000
> 00117   err = cudnnSetFilterNdDescriptor(desc, dt, CUDNN_TENSOR_NCHW, nd, dims);
> 00118 #else
> 00119   err = cudnnSetFilterNdDescriptor(desc, dt, nd, dims);
> 00120 #endif
> 00121
> 00122   if (err != CUDNN_STATUS_SUCCESS) {
> 00123     PyErr_Format(PyExc_RuntimeError,
> 00124                  "Could not set filter descriptor: %s.",
> 00125                  cudnnGetErrorString(err));
> 00126     return -1;
> 00127   }
> 00128   return 0;
> 00129 }
> 00130
> 00131
> 00132
> 00133 namespace {
> 00134 struct __struct_compiled_op_86feacd077d8749f42b5d82709a80ba3 {
> 00135   PyObject* __ERROR;
> 00136
> 00137   PyObject* storage_V3;
> 00138   PyObject* storage_V5;
> 00139   PyObject* storage_V7;
> 00140   PyObject* storage_V9;
> 00141   PyObject* storage_V11;
> 00142   PyObject* storage_V13;
> 00143   PyObject* storage_V1;
> 00144   PyObject* storage_V15;
> 00145
> 00146   PyObject* py_V15;
> 00147   PyGpuContextObject *V15;
> 00148 #define DTYPE_INPUT_0 npy_float16
> 00149 #define TYPENUM_INPUT_0 23
> 00150 #define ITEMSIZE_INPUT_0 2
> 00151 #define DTYPE_INPUT_1 npy_float16
> 00152 #define TYPENUM_INPUT_1 23
> 00153 #define ITEMSIZE_INPUT_1 2
> 00154 #define DTYPE_INPUT_2 npy_float16
> 00155 #define TYPENUM_INPUT_2 23
> 00156 #define ITEMSIZE_INPUT_2 2
> 00157 #define DTYPE_INPUT_4 npy_float16
> 00158 #define TYPENUM_INPUT_4 23
> 00159 #define ITEMSIZE_INPUT_4 2
> 00160 #define DTYPE_INPUT_5 npy_float16
> 00161 #define TYPENUM_INPUT_5 23
> 00162 #define ITEMSIZE_INPUT_5 2
> 00163 #define DTYPE_OUTPUT_0 npy_float16
> 00164 #define TYPENUM_OUTPUT_0 23
> 00165 #define ITEMSIZE_OUTPUT_0 2
> 00166 #define APPLY_SPECIFIC(str) str##_node_86feacd077d8749f42b5d82709a80ba3_0
> 00167 #define CONV_INPLACE 1
> 00168 #define CONV_ALGO CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
> 00169
> 00170
> 00171   PyGpuContextObject *ctx;
> 00172   cudnnHandle_t APPLY_SPECIFIC(_handle);
> 00173
> 00174
> 00175   cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
> 00176   cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
> 00177   cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns);
> 00178
> 00179
> 00180
> 00181 #ifdef CHOOSE_ALGO
> 00182   int reuse_algo;
> 00183   cudnnConvolutionFwdAlgo_t prev_algo;
> 00184 #ifndef CHOOSE_ONCE
> 00185   size_t prev_img_dims[5];
> 00186   size_t prev_kern_dims[5];
> 00187 #endif
> 00188 #endif
> 00189
> 00190 int
> 00191 APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
> 00192                          PyGpuArrayObject *om,
> 00193                          cudnnConvolutionDescriptor_t desc,
> 00194                          double alpha, double beta,
> 00195                          PyGpuArrayObject **output,
> 00196                          PyGpuContextObject *c) {
> 00197   cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
> 00198   float af = alpha, bf = beta;
> 00199   void *alpha_p;
> 00200   void *beta_p;
> 00201
> 00202   if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
> 00203     PyErr_SetString(PyExc_ValueError,
> 00204                     "images and kernel must have the same stack size");
> 00205     return 1;
> 00206   }
> 00207
> 00208   if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
> 00209     return 1;
> 00210   if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
> 00211     return 1;
> 00212
> 00213   switch (input->ga.typecode) {
> 00214   case GA_DOUBLE:
> 00215     alpha_p = (void *)&alpha;
> 00216     beta_p = (void *)&beta;
> 00217     break;
> 00218   case GA_FLOAT:
> 00219   case GA_HALF:
> 00220     alpha_p = (void *)&af;
> 00221     beta_p = (void *)&bf;
> 00222     break;
> 00223   default:
> 00224     PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
> 00225     return 1;
> 00226   }
> 00227
> 00228 #ifdef CONV_INPLACE
> 00229   Py_XDECREF(*output);
> 00230   *output = om;
> 00231   Py_INCREF(*output);
> 00232 #else
> 00233   if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
> 00234                          om->ga.typecode, GA_C_ORDER, c) != 0)
> 00235     return 1;
> 00236   if (beta != 0.0 && pygpu_move(*output, om))
> 00237     return 1;
> 00238 #endif
> 00239
> 00240   if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1)
> 00241     return 1;
> 00242
> 00243   cudnnConvolutionFwdAlgo_t algo = CONV_ALGO;
> 00244
> 00245   cuda_enter(c->ctx);
> 00246 #ifdef CHOOSE_ALGO
> 00247 #ifndef CHOOSE_ONCE
> 00248   reuse_algo = 1;
> 00249   for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
> 00250     reuse_algo = (reuse_algo &&
> 00251                   PyGpuArray_DIM(input, i) == prev_img_dims[i]);
> 00252     reuse_algo = (reuse_algo &&
> 00253                   PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
> 00254   }
> 00255 #endif
> 00256
> 00257   if (!reuse_algo) {
> 00258 #ifdef CHOOSE_TIME
> 00259     int count;
> 00260     cudnnConvolutionFwdAlgoPerf_t choice;
> 00261     err = cudnnFindConvolutionForwardAlgorithm(
> 00262       APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
> 00263       desc, APPLY_SPECIFIC(output), 1, &count, &choice);
> 00264
> 00265     if (err != CUDNN_STATUS_SUCCESS) {
> 00266       PyErr_Format(PyExc_RuntimeError,
> 00267                    "error selecting convolution algo: %s",
> 00268                    cudnnGetErrorString(err));
> 00269       cuda_exit(c->ctx);
> 00270       return 1;
> 00271     }
> 00272     algo = choice.algo;
> 00273 #else
> 00274     size_t free;
> 00275     int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_FREE_GMEM, &free);
> 00276
> 00277     if (err2 != GA_NO_ERROR) {
> 00278       PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
> 00279                    "memory information on the GPU");
> 00280       cuda_exit(c->ctx);
> 00281       return 1;
> 00282     }
> 00283
> 00284     err = cudnnGetConvolutionForwardAlgorithm(
> 00285       APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
> 00286       desc, APPLY_SPECIFIC(output),
> 00287       CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
> 00288     if (err != CUDNN_STATUS_SUCCESS) {
> 00289       PyErr_Format(PyExc_RuntimeError,
> 00290                    "error selecting convolution algo: %s",
> 00291                    cudnnGetErrorString(err));
> 00292       cuda_exit(c->ctx);
> 00293       return 1;
> 00294     }
> 00295 #endif
> 00296     prev_algo = algo;
> 00297   } else {
> 00298     algo = prev_algo;
> 00299   }
> 00300
> 00301 #ifdef CHOOSE_ONCE
> 00302   reuse_algo = 1;
> 00303 #else
> 00304   for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
> 00305     prev_img_dims[i] = PyGpuArray_DIM(input, i);
> 00306     prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
> 00307   }
> 00308 #endif
> 00309
> 00310 #endif
> 00311
> 00312   /* These two algos are not supported for 3d conv */
> 00313   if (PyGpuArray_NDIM(input) == 5 &&
> 00314       (algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
> 00315        algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM))
> 00316     algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
> 00317
> 00318   // The FFT implementation does not support strides, 1x1 filters or inputs
> 00319   // with a spatial dimension larger than 1024. The tiled-FFT implementation
> 00320   // does not support strides.
> 00321   // If the chosen implementation is FFT or tiled-FFT, validate that it can
> 00322   // be used on the current data and default to a safe implementation if it
> 00323   // can't.
> 00324   // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
> 00325   // defined only for 2d filters
> 00326   if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
> 00327        algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && PyGpuArray_NDIM(input) == 4) {
> 00328
> 00329     // Extract the properties of the convolution descriptor
> 00330     int nd;
> 00331     int pad[2];
> 00332     int stride[2];
> 00333     int upscale[2];
> 00334     cudnnConvolutionMode_t mode;
> 00335     cudnnDataType_t data_type;
> 00336     err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
> 00337                                           upscale, &mode, &data_type);
> 00338     if (err != CUDNN_STATUS_SUCCESS) {
> 00339       PyErr_Format(PyExc_RuntimeError,
> 00340                    "error getting convolution properties: %s",
> 00341                    cudnnGetErrorString(err));
> 00342       cuda_exit(c->ctx);
> 00343       return 1;
> 00344     }
> 00345
> 00346     if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT)
> 00347     {
> 00348       if (stride[0] != 1 || stride[1] != 1 ||
> 00349           PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
> 00350           (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
> 00351       {
> 00352         algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
> 00353       }
> 00354     }
> 00355     else
> 00356     {
> 00357       // algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
> 00358       if (stride[0] != 1 || stride[1] != 1)
> 00359       {
> 00360         algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
> 00361       }
> 00362     }
> 00363   }
> 00364
> 00365   {
> 00366     size_t worksize;
> 00367     gpudata *workspace;
> 00368     err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
> 00369                                                   APPLY_SPECIFIC(input),
> 00370                                                   APPLY_SPECIFIC(kerns),
> 00371                                                   desc,
> 00372                                                   APPLY_SPECIFIC(output),
> 00373                                                   algo,
> 00374                                                   &worksize);
> 00375
> 00376     if (err == CUDNN_STATUS_NOT_SUPPORTED) {
> 00377       // Fallback to none algo if not supported
> 00378       // TODO: Print a warning
> 00379       algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
> 00380
> 00381       err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
> 00382                                                     APPLY_SPECIFIC(input),
> 00383                                                     APPLY_SPECIFIC(kerns),
> 00384                                                     desc,
> 00385                                                     APPLY_SPECIFIC(output),
> 00386                                                     algo,
> 00387                                                     &worksize);
> 00388     }
> 00389
> 00390     if (err != CUDNN_STATUS_SUCCESS) {
> 00391       PyErr_Format(PyExc_RuntimeError,
> 00392                    "error getting worksize: %s",
> 00393                    cudnnGetErrorString(err));
> 00394       cuda_exit(c->ctx);
> 00395       return 1;
> 00396     }
> 00397
> 00398     /*
> 00399      * This is less than ideal since we need to free it after (which
> 00400      * introduces a synchronization point. But we don't have a module
> 00401      * to place a nice get_work_mem() function in.
> 00402      */
> 00403     if (worksize != 0) {
> 00404       workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
> 00405       if (workspace == NULL) {
> 00406         PyErr_SetString(PyExc_RuntimeError,
> 00407                         "Could not allocate working memory");
> 00408         cuda_exit(c->ctx);
> 00409         return 1;
> 00410       }
> 00411     }
> 00412
> 00413     cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
> 00414     cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
> 00415     cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
> 00416
> 00417     err = cudnnConvolutionForward(
> 00418       APPLY_SPECIFIC(_handle),
> 00419       alpha_p,
> 00420       APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
> 00421       APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
> 00422       desc, algo,
> 00423       worksize == 0 ? NULL : *(void **)workspace, worksize,
> 00424       beta_p,
> 00425       APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output));
> 00426
> 00427     if (worksize != 0)
> 00428       gpudata_release(workspace);
> 00429
> 00430     cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
> 00431     cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
> 00432     cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
> 00433   }
> 00434   cuda_exit(c->ctx);
> 00435
> 00436   if (err != CUDNN_STATUS_SUCCESS) {
> 00437     PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
> 00438                  cudnnGetErrorString(err));
> 00439     return 1;
> 00440   }
> 00441   return 0;
> 00442 }
> 00443
> 00444 #undef DTYPE_INPUT_0
> 00445 #undef TYPENUM_INPUT_0
> 00446 #undef ITEMSIZE_INPUT_0
> 00447 #undef DTYPE_INPUT_1
> 00448 #undef TYPENUM_INPUT_1
> 00449 #undef ITEMSIZE_INPUT_1
> 00450 #undef DTYPE_INPUT_2
> 00451 #undef TYPENUM_INPUT_2
> 00452 #undef ITEMSIZE_INPUT_2
> 00453 #undef DTYPE_INPUT_4
> 00454 #undef TYPENUM_INPUT_4
> 00455 #undef ITEMSIZE_INPUT_4
> 00456 #undef DTYPE_INPUT_5
> 00457 #undef TYPENUM_INPUT_5
> 00458 #undef ITEMSIZE_INPUT_5
> 00459 #undef DTYPE_OUTPUT_0
> 00460 #undef TYPENUM_OUTPUT_0
> 00461 #undef ITEMSIZE_OUTPUT_0
> 00462 #undef APPLY_SPECIFIC
> 00463 #undef CONV_INPLACE
> 00464 #undef CONV_ALGO
> 00465
> 00466   __struct_compiled_op_86feacd077d8749f42b5d82709a80ba3() {
> 00467     // This is only somewhat safe because we:
> 00468     // 1) Are not a virtual class
> 00469     // 2) Do not use any virtual classes in the members
> 00470     // 3) Deal with mostly POD and pointers
> 00471
> 00472     // If this changes, we would have to revise this, but for
> 00473     // now I am tired of chasing segfaults because
> 00474     // initialization code had an error and some pointer has
> 00475     // a junk value.
> 00476     memset(this, 0, sizeof(*this));
> 00477   }
> 00478   ~__struct_compiled_op_86feacd077d8749f42b5d82709a80ba3(void) {
> 00479     cleanup();
> 00480   }
> 00481
> 00482   int init(PyObject* __ERROR, PyObject* storage_V3, PyObject* storage_V5, PyObject* storage_V7, PyObject* storage_V9, PyObject* storage_V11, PyObject* storage_V13, PyObject* storage_V1, PyObject* storage_V15) {
> 00483     Py_XINCREF(storage_V3);
> 00484     Py_XINCREF(storage_V5);
> 00485     Py_XINCREF(storage_V7);
> 00486     Py_XINCREF(storage_V9);
> 00487     Py_XINCREF(storage_V11);
> 00488     Py_XINCREF(storage_V13);
> 00489     Py_XINCREF(storage_V1);
> 00490     Py_XINCREF(storage_V15);
> 00491     this->storage_V3 = storage_V3;
> 00492     this->storage_V5 = storage_V5;
> 00493     this->storage_V7 = storage_V7;
> 00494     this->storage_V9 = storage_V9;
> 00495     this->storage_V11 = storage_V11;
> 00496     this->storage_V13 = storage_V13;
> 00497     this->storage_V1 = storage_V1;
> 00498     this->storage_V15 = storage_V15;
> 00499
> 00500
> 00501
> 00502
> 00503
> 00504
> 00505
> 00506
> 00507
> 00508     py_V15 = PyList_GET_ITEM(storage_V15, 0);
> 00509     {Py_XINCREF(py_V15);}
> 00510
> 00511     if (!PyObject_TypeCheck(py_V15, &PyGpuContextType)) {
> 00512       PyErr_SetString(PyExc_TypeError, "expected a GpuContext");
> 00513       {
> 00514         if (!PyErr_Occurred()) {
> 00515           PyErr_SetString(PyExc_RuntimeError,
> 00516                           "Unexpected error in an Op's C code. "
> 00517                           "No Python exception was set.");
> 00518         }
> 00519         return 15;
> 00520       }
> 00521     }
> 00522
> 00523     V15 = (PyGpuContextObject *)py_V15;
> 00524     Py_INCREF(V15);
> 00525
> 00526
> 00527 #define DTYPE_INPUT_0 npy_float16
> 00528 #define TYPENUM_INPUT_0 23
> 00529 #define ITEMSIZE_INPUT_0 2
> 00530 #define DTYPE_INPUT_1 npy_float16
> 00531 #define TYPENUM_INPUT_1 23
> 00532 #define ITEMSIZE_INPUT_1 2
> 00533 #define DTYPE_INPUT_2 npy_float16
> 00534 #define TYPENUM_INPUT_2 23
> 00535 #define ITEMSIZE_INPUT_2 2
> 00536 #define DTYPE_INPUT_4 npy_float16
> 00537 #define TYPENUM_INPUT_4 23
> 00538 #define ITEMSIZE_INPUT_4 2
> 00539 #define DTYPE_INPUT_5 npy_float16
> 00540 #define TYPENUM_INPUT_5 23
> 00541 #define ITEMSIZE_INPUT_5 2
> 00542 #define DTYPE_OUTPUT_0 npy_float16
> 00543 #define TYPENUM_OUTPUT_0 23
> 00544 #define ITEMSIZE_OUTPUT_0 2
> 00545 #define APPLY_SPECIFIC(str) str##_node_86feacd077d8749f42b5d82709a80ba3_0
> 00546 #define CONV_INPLACE 1
> 00547 #define CONV_ALGO CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
> 00548 #define FAIL { \
> 00549     if (!PyErr_Occurred()) { \
> 00550         PyErr_SetString(PyExc_RuntimeError, \
> 00551             "Unexpected error in an Op's C code. " \
> 00552             "No Python exception was set."); \
> 00553     } \
> 00554     return 17; \
> 00555 }
> 00556 #define PARAMS V15
> 00557
> 00558
> 00559 {
> 00560   // We need to keep a reference here to have it available in the destructor.
> 00561   ctx = PARAMS;
> 00562   Py_INCREF(ctx);
> 00563
> 00564   cuda_enter(PARAMS->ctx);
> 00565   cudnnStatus_t err;
> 00566   APPLY_SPECIFIC(_handle) = NULL;
> 00567   if ((err = cudnnCreate(&APPLY_SPECIFIC(_handle))) != CUDNN_STATUS_SUCCESS) {
> 00568     PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s",
> 00569                  cudnnGetErrorString(err));
> 00570     cuda_exit(PARAMS->ctx);
> 00571     FAIL;
> 00572   }
> 00573   if ((err = cudnnSetStream(APPLY_SPECIFIC(_handle),
> 00574                             cuda_get_stream(PARAMS->ctx))) != CUDNN_STATUS_SUCCESS) {
> 00575     PyErr_Format(PyExc_RuntimeError, "Could not set cudnn stream: %s",
> 00576                  cudnnGetErrorString(err));
> 00577     cuda_exit(PARAMS->ctx);
> 00578     FAIL;
> 00579   }
> 00580   cuda_exit(PARAMS->ctx);
> 00581 }
> 00582
> 00583
> 00584
> 00585 cudnnStatus_t APPLY_SPECIFIC(err);
> 00586 APPLY_SPECIFIC(input) = NULL;
> 00587 APPLY_SPECIFIC(output) = NULL;
> 00588 APPLY_SPECIFIC(kerns) = NULL;
> 00589 if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
> 00590   PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
> 00591                "(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
> 00592   FAIL;
> 00593 }
> 00594 if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
> 00595   PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
> 00596                "(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
> 00597   FAIL;
> 00598 }
> 00599 if ((APPLY_SPECIFIC(err) = cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns))) != CUDNN_STATUS_SUCCESS) {
> 00600   PyErr_Format(PyExc_MemoryError, "could not allocate filter descriptor: %s",
> 00601                cudnnGetErrorString(APPLY_SPECIFIC(err)));
> 00602   FAIL;
> 00603 }
> 00604
> 00605
> 00606
> 00607 #ifdef CHOOSE_ALGO
> 00608 reuse_algo = 0;
> 00609 prev_algo = CONV_ALGO;
> 00610 #ifndef CHOOSE_ONCE
> 00611 memset(prev_img_dims, 0, sizeof(prev_img_dims));
> 00612 memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
> 00613 #endif
> 00614 #endif
> 00615
> 00616
> 00617 #undef FAIL
> 00618 #undef PARAMS
> 00619 #undef DTYPE_INPUT_0
> 00620 #undef TYPENUM_INPUT_0
> 00621 #undef ITEMSIZE_INPUT_0
> 00622 #undef DTYPE_INPUT_1
> 00623 #undef TYPENUM_INPUT_1
> 00624 #undef ITEMSIZE_INPUT_1
> 00625 #undef DTYPE_INPUT_2
> 00626 #undef TYPENUM_INPUT_2
> 00627 #undef ITEMSIZE_INPUT_2
> 00628 #undef DTYPE_INPUT_4
> 00629 #undef TYPENUM_INPUT_4
> 00630 #undef ITEMSIZE_INPUT_4
> 00631 #undef DTYPE_INPUT_5
> 00632 #undef TYPENUM_INPUT_5
> 00633 #undef ITEMSIZE_INPUT_5
> 00634 #undef DTYPE_OUTPUT_0
> 00635 #undef TYPENUM_OUTPUT_0
> 00636 #undef ITEMSIZE_OUTPUT_0
> 00637 #undef APPLY_SPECIFIC
> 00638 #undef CONV_INPLACE
> 00639 #undef CONV_ALGO
> 00640   this->__ERROR = __ERROR;
> 00641   return 0;
> 00642 }
> 00643 void cleanup(void) {
> 00644   __label_1:
> 00645
> 00646   double __DUMMY_1;
> 00647   __label_3:
> 00648
> 00649   double __DUMMY_3;
> 00650   __label_5:
> 00651
> 00652   double __DUMMY_5;
> 00653   __label_7:
> 00654
> 00655   double __DUMMY_7;
> 00656   __label_9:
> 00657
> 00658   double __DUMMY_9;
> 00659   __label_11:
> 00660
> 00661   double __DUMMY_11;
> 00662   __label_13:
> 00663
> 00664   double __DUMMY_13;
> 00665   __label_15:
> 00666   Py_XDECREF(V15); V15 = NULL;
> 00667   {Py_XDECREF(py_V15);}
> 00668
> 00669   double __DUMMY_15;
> 00670   __label_18:
> 00671
> 00672 #define DTYPE_INPUT_0 npy_float16
> 00673 #define TYPENUM_INPUT_0 23
> 00674 #define ITEMSIZE_INPUT_0 2
> 00675 #define DTYPE_INPUT_1 npy_float16
> 00676 #define TYPENUM_INPUT_1 23
> 00677 #define ITEMSIZE_INPUT_1 2
> 00678 #define DTYPE_INPUT_2 npy_float16
> 00679 #define TYPENUM_INPUT_2 23
> 00680 #define ITEMSIZE_INPUT_2 2
> 00681 #define DTYPE_INPUT_4 npy_float16
> 00682 #define TYPENUM_INPUT_4 23
> 00683 #define ITEMSIZE_INPUT_4 2
> 00684 #define DTYPE_INPUT_5 npy_float16
> 00685 #define TYPENUM_INPUT_5 23
> 00686 #define ITEMSIZE_INPUT_5 2
> 00687 #define DTYPE_OUTPUT_0 npy_float16
> 00688 #define TYPENUM_OUTPUT_0 23
> 00689 #define ITEMSIZE_OUTPUT_0 2
> 00690 #define APPLY_SPECIFIC(str) str##_node_86feacd077d8749f42b5d82709a80ba3_0
> 00691 #define CONV_INPLACE 1
> 00692 #define CONV_ALGO CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
> 00693
> 00694
> 00695 cuda_enter(ctx->ctx);
> 00696 cudnnDestroy(APPLY_SPECIFIC(_handle));
> 00697 cuda_exit(ctx->ctx);
> 00698 Py_DECREF((PyObject *)ctx);
> 00699
> 00700
> 00701 if (APPLY_SPECIFIC(input) != NULL)
> 00702   cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input));
> 00703 if (APPLY_SPECIFIC(output) != NULL)
> 00704   cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output));
> 00705 if (APPLY_SPECIFIC(kerns) != NULL)
> 00706   cudnnDestroyFilterDescriptor(APPLY_SPECIFIC(kerns));
> 00707
> 00708 #undef DTYPE_INPUT_0
> 00709 #undef TYPENUM_INPUT_0
> 00710 #undef ITEMSIZE_INPUT_0
> 00711 #undef DTYPE_INPUT_1
> 00712 #undef TYPENUM_INPUT_1
> 00713 #undef ITEMSIZE_INPUT_1
> 00714 #undef DTYPE_INPUT_2
> 00715 #undef TYPENUM_INPUT_2
> 00716 #undef ITEMSIZE_INPUT_2
> 00717 #undef DTYPE_INPUT_4
> 00718 #undef TYPENUM_INPUT_4
> 00719 #undef ITEMSIZE_INPUT_4
> 00720 #undef DTYPE_INPUT_5
> 00721 #undef TYPENUM_INPUT_5
> 00722 #undef ITEMSIZE_INPUT_5
> 00723 #undef DTYPE_OUTPUT_0
> 00724 #undef TYPENUM_OUTPUT_0
> 00725 #undef ITEMSIZE_OUTPUT_0
> 00726 #undef APPLY_SPECIFIC
> 00727 #undef CONV_INPLACE
> 00728 #undef CONV_ALGO
> 00729   double __DUMMY_18;
> 00730
> 00731   Py_XDECREF(this->storage_V3);
> 00732   Py_XDECREF(this->storage_V5);
> 00733   Py_XDECREF(this->storage_V7);
> 00734   Py_XDECREF(this->storage_V9);
> 00735   Py_XDECREF(this->storage_V11);
> 00736   Py_XDECREF(this->storage_V13);
> 00737   Py_XDECREF(this->storage_V1);
> 00738   Py_XDECREF(this->storage_V15);
> 00739 }
> 00740 int run(void) {
> 00741   int __failure = 0;
> 00742
> 00743   PyObject* py_V1;
> 00744
> 00745   PyGpuArrayObject *V1;
> 00746
> 00747   PyObject* py_V3;
> 00748
> 00749   PyGpuArrayObject *V3;
> 00750
> 00751   PyObject* py_V5;
> 00752
> 00753   PyGpuArrayObject *V5;
> 00754
> 00755   PyObject* py_V7;
> 00756
> 00757   PyGpuArrayObject *V7;
> 00758
> 00759   PyObject* py_V9;
> 00760
> 00761   cudnnConvolutionDescriptor_t V9;
> 00762
> 00763   PyObject* py_V11;
> 00764
> 00765   typedef npy_float16 V11_dtype; // Deprecated use dtype_V11 instead.
> 00766   typedef npy_float16 dtype_V11;
> 00767
> 00768   npy_float16 V11;
> 00769
> 00770   PyObject* py_V13;
> 00771
> 00772   typedef npy_float16 V13_dtype; // Deprecated use dtype_V13 instead.
> 00773   typedef npy_float16 dtype_V13;
> 00774
> 00775   npy_float16 V13;
> 00776
> 00777   {
> 00778
> 00779     py_V1 = PyList_GET_ITEM(storage_V1, 0);
> 00780     {Py_XINCREF(py_V1);}
> 00781
> 00782     if (py_V1 == Py_None)
> 00783     {
> 00784       V1 = NULL;
> 00785     }
> 00786     else
> 00787     {
> 00788
> 00789       V1 = NULL;
> 00790       if (py_V1 == Py_None) {
> 00791         PyErr_SetString(PyExc_ValueError, "expected a GpuArray, not None");
> 00792         {
> 00793           __failure = 2;
> 00794           if (!PyErr_Occurred()) {
> 00795             PyErr_SetString(PyExc_RuntimeError,
> 00796                             "Unexpected error in an Op's C code. "
" > 00797 "No Python exception was set."); > 00798 } > 00799 goto __label_2;} > 00800 } > 00801 /* First check if we are the base type exactly (the most > common case), > 00802 then do the full subclass check if needed. */ > 00803 if (py_V1->ob_type != &PyGpuArrayType && > 00804 !PyObject_TypeCheck(py_V1, &PyGpuArrayType)) { > 00805 PyE > -- --- You received this message because you are subscribed to the Google Groups "theano-users" group. To unsubscribe from this group and stop receiving emails from it, send an email to [email protected]. For more options, visit https://groups.google.com/d/optout.
