OK, I'll try.
Thanks.

On Thursday, July 14, 2016 at 11:44:41 PM UTC+2, Arnaud Bergeron wrote:
>
> I can't reproduce your problem using a simple convolution in float16.
>
> Either this is because your code is doing something unexpected or because 
> the problem has been fixed in the development version.
>
> In any case the development version is a much better option for the new 
> backend and float16 so I encourage you to upgrade and try again: 
> http://deeplearning.net/software/theano/install.html#bleeding-edge-install-instructions
> .
>
> 2016-07-14 4:22 GMT-04:00 <[email protected] <javascript:>>:
>
>> Here is .theanorc:
>>
>> [global]
>> floatX = float16
>> device=cuda
>> [cuda] 
>> root = /usr/local/cuda-7.5
>>
>>
>> [nvcc]
>> fastmath=True
>>
>> optimizer = fast_compile
>>
>> On Thursday, July 14, 2016 at 10:19:56 AM UTC+2, [email protected] 
>> wrote:
>>>
>>> Hi Arnaud,
>>> I put _f16_ok = True in dnn.py ( attached).
>>>
>>> This is the error I received:
>>>
>>> Python 2.7.11 |Anaconda custom (64-bit)| (default, Dec  6 2015, 
>>> 18:08:32) 
>>> [GCC 4.4.7 20120313 (Red Hat 4.4.7-1)] on linux2
>>> Type "help", "copyright", "credits" or "license" for more information.
>>> Anaconda is brought to you by Continuum Analytics.
>>> Please check out: http://continuum.io/thanks and https://anaconda.org
>>> >>> import run_multi_conv
>>>
>>> Mapped name None to device cuda: GeForce 840M
>>> WARNING (theano.gof.compilelock): Overriding existing lock by dead 
>>> process '3202' (I am process '3351')
>>> Using cuDNN version 5005 on context None
>>> /home/luca/data/Theano-master/theano/tensor/signal/downsample.py:6: 
>>> UserWarning: downsample module has been moved to the 
>>> theano.tensor.signal.pool module.
>>>   "downsample module has been moved to the theano.tensor.signal.pool 
>>> module.")
>>> >>> 
>>> >>> run_multi_conv.run_experiments()
>>> Disabling C code for Elemwise{mul,no_inplace} due to unsupported float16
>>> Disabling C code for Elemwise{Cast{float32}} due to unsupported float16
>>> Disabling C code for Elemwise{Cast{float16}} due to unsupported float16
>>> Disabling C code for Elemwise{Cast{float16}} due to unsupported float16
>>> Disabling C code for Alloc due to unsupported float16
>>> Disabling C code for Cast{float16} due to unsupported float16
>>> Disabling C code for Cast{float16} due to unsupported float16
>>> Disabling C code for Cast{float16} due to unsupported float16
>>> Disabling C code for Cast{float16} due to unsupported float16
>>> Disabling C code for RandomFunction{binomial} due to unsupported float16
>>> Disabling C code for RandomFunction{binomial} due to unsupported float16
>>> ===============================
>>> 00001    #include <Python.h>
>>> 00002    #include <iostream>
>>> 00003    #include "theano_mod_helper.h"
>>> 00004    #include <gpuarray/array.h>
>>> 00005    #include <gpuarray/kernel.h>
>>> 00006    #include <gpuarray/error.h>
>>> 00007    #include <gpuarray/buffer.h>
>>> 00008    #include <gpuarray/buffer_blas.h>
>>> 00009    #include <numpy/arrayobject.h>
>>> 00010    #include <gpuarray_api.h>
>>> 00011    #include <math.h>
>>> 00012    #include <numpy/arrayscalars.h>
>>> 00013    #include "cudnn.h"
>>> 00014    #include "cudnn_helper.h"
>>> 00015    #include "gpuarray_helper.h"
>>> 00016    #include "gpuarray/types.h"
>>> 00017    #include "gpuarray/array.h"
>>> 00018    #include "gpuarray/util.h"
>>> 00019    #include "gpuarray/ext_cuda.h"
>>> 00020    #include "gpuarray_api.h"
>>> 00021    #include "numpy_compat.h"
>>> 00022    //////////////////////
>>> 00023    ////  Support Code
>>> 00024    //////////////////////
>>> 00025    
>>> 00026    
>>> 00027    
>>> 00028    static int
>>> 00029    c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t 
>>> desc) {
>>> 00030      cudnnDataType_t dt;
>>> 00031      size_t ds;
>>> 00032      switch (var->ga.typecode) {
>>> 00033      case GA_FLOAT:
>>> 00034        dt = CUDNN_DATA_FLOAT;
>>> 00035        break;
>>> 00036      case GA_DOUBLE:
>>> 00037        dt = CUDNN_DATA_DOUBLE;
>>> 00038        break;
>>> 00039    #if CUDNN_VERSION > 3000
>>> 00040      case GA_HALF:
>>> 00041        dt = CUDNN_DATA_HALF;
>>> 00042        break;
>>> 00043    #endif
>>> 00044      default:
>>> 00045        PyErr_SetString(PyExc_TypeError, "Non-float datatype in 
>>> c_set_tensorNd");
>>> 00046        return -1;
>>> 00047      }
>>> 00048      ds = gpuarray_get_elsize(var->ga.typecode);
>>> 00049    
>>> 00050      int strs[5], dims[5], default_stride = 1;
>>> 00051      unsigned int nd = PyGpuArray_NDIM(var);
>>> 00052    
>>> 00053      if (nd > 5) {
>>> 00054        PyErr_SetString(PyExc_TypeError, "Tensor of more than 5d");
>>> 00055        return -1;
>>> 00056      }
>>> 00057    
>>> 00058      for (unsigned int _i = nd; _i > 0; _i--) {
>>> 00059        unsigned int i = _i - 1;
>>> 00060        strs[i] = PyGpuArray_STRIDE(var, i) ?
>>> 00061          PyGpuArray_STRIDE(var, i)/ds : default_stride;
>>> 00062        default_stride *= PyGpuArray_DIM(var, i);
>>> 00063        dims[i] = PyGpuArray_DIM(var, i);
>>> 00064      }
>>> 00065    
>>> 00066      cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, dt, nd, 
>>> dims, strs);
>>> 00067      if (err != CUDNN_STATUS_SUCCESS) {
>>> 00068        PyErr_Format(PyExc_RuntimeError,
>>> 00069             "Could not set tensorNd descriptor: %s",
>>> 00070             cudnnGetErrorString(err));
>>> 00071        return -1;
>>> 00072      }
>>> 00073      return 0;
>>> 00074    }
>>> 00075    
>>> 00076    static int
>>> 00077    c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t 
>>> desc) {
>>> 00078      cudnnDataType_t dt;
>>> 00079      cudnnStatus_t err;
>>> 00080    
>>> 00081      if (!GpuArray_IS_C_CONTIGUOUS(&var->ga)) {
>>> 00082        PyErr_SetString(PyExc_ValueError,
>>> 00083                "Only contiguous filters (kernels) are supported.");
>>> 00084        return -1;
>>> 00085      }
>>> 00086      switch (var->ga.typecode) {
>>> 00087      case GA_FLOAT:
>>> 00088        dt = CUDNN_DATA_FLOAT;
>>> 00089        break;
>>> 00090      case GA_DOUBLE:
>>> 00091        dt = CUDNN_DATA_DOUBLE;
>>> 00092        break;
>>> 00093    #if CUDNN_VERSION > 3000
>>> 00094      case GA_HALF:
>>> 00095        dt = CUDNN_DATA_HALF;
>>> 00096        break;
>>> 00097    #endif
>>> 00098      default:
>>> 00099        PyErr_SetString(PyExc_TypeError, "Non-float datatype in 
>>> c_set_filter");
>>> 00100        return -1;
>>> 00101      }
>>> 00102    
>>> 00103      int dims[5];
>>> 00104      unsigned int nd = PyGpuArray_NDIM(var);
>>> 00105    
>>> 00106      if (nd > 5) {
>>> 00107        PyErr_SetString(PyExc_TypeError, "Tensor of more than 5d");
>>> 00108        return -1;
>>> 00109      }
>>> 00110    
>>> 00111      for (unsigned int _i = nd; _i > 0; _i--) {
>>> 00112        unsigned int i = _i - 1;
>>> 00113        dims[i] = PyGpuArray_DIM(var, i);
>>> 00114      }
>>> 00115    
>>> 00116    #if CUDNN_VERSION >= 5000
>>> 00117        err = cudnnSetFilterNdDescriptor(desc, dt, 
>>> CUDNN_TENSOR_NCHW, nd, dims);
>>> 00118    #else
>>> 00119        err = cudnnSetFilterNdDescriptor(desc, dt, nd, dims);
>>> 00120    #endif
>>> 00121    
>>> 00122      if (err != CUDNN_STATUS_SUCCESS) {
>>> 00123        PyErr_Format(PyExc_RuntimeError,
>>> 00124             "Could not set filter descriptor: %s.",
>>> 00125             cudnnGetErrorString(err));
>>> 00126        return -1;
>>> 00127      }
>>> 00128      return 0;
>>> 00129    }
>>> 00130    
>>> 00131    
>>> 00132    
>>> 00133        namespace {
>>> 00134        struct 
>>> __struct_compiled_op_86feacd077d8749f42b5d82709a80ba3 {
>>> 00135            PyObject* __ERROR;
>>> 00136    
>>> 00137            PyObject* storage_V3;
>>> 00138    PyObject* storage_V5;
>>> 00139    PyObject* storage_V7;
>>> 00140    PyObject* storage_V9;
>>> 00141    PyObject* storage_V11;
>>> 00142    PyObject* storage_V13;
>>> 00143    PyObject* storage_V1;
>>> 00144    PyObject* storage_V15;
>>> 00145            
>>> 00146        PyObject* py_V15;
>>> 00147        PyGpuContextObject *V15;
>>> 00148    #define DTYPE_INPUT_0 npy_float16
>>> 00149    #define TYPENUM_INPUT_0 23
>>> 00150    #define ITEMSIZE_INPUT_0 2
>>> 00151    #define DTYPE_INPUT_1 npy_float16
>>> 00152    #define TYPENUM_INPUT_1 23
>>> 00153    #define ITEMSIZE_INPUT_1 2
>>> 00154    #define DTYPE_INPUT_2 npy_float16
>>> 00155    #define TYPENUM_INPUT_2 23
>>> 00156    #define ITEMSIZE_INPUT_2 2
>>> 00157    #define DTYPE_INPUT_4 npy_float16
>>> 00158    #define TYPENUM_INPUT_4 23
>>> 00159    #define ITEMSIZE_INPUT_4 2
>>> 00160    #define DTYPE_INPUT_5 npy_float16
>>> 00161    #define TYPENUM_INPUT_5 23
>>> 00162    #define ITEMSIZE_INPUT_5 2
>>> 00163    #define DTYPE_OUTPUT_0 npy_float16
>>> 00164    #define TYPENUM_OUTPUT_0 23
>>> 00165    #define ITEMSIZE_OUTPUT_0 2
>>> 00166    #define APPLY_SPECIFIC(str) 
>>> str##_node_86feacd077d8749f42b5d82709a80ba3_0
>>> 00167    #define CONV_INPLACE 1
>>> 00168    #define CONV_ALGO 
>>> CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
>>> 00169    
>>> 00170    
>>> 00171    PyGpuContextObject *ctx;
>>> 00172    cudnnHandle_t APPLY_SPECIFIC(_handle);
>>> 00173    
>>> 00174    
>>> 00175    cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
>>> 00176    cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
>>> 00177    cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns);
>>> 00178    
>>> 00179    
>>> 00180    
>>> 00181    #ifdef CHOOSE_ALGO
>>> 00182    int reuse_algo;
>>> 00183    cudnnConvolutionFwdAlgo_t prev_algo;
>>> 00184    #ifndef CHOOSE_ONCE
>>> 00185    size_t prev_img_dims[5];
>>> 00186    size_t prev_kern_dims[5];
>>> 00187    #endif
>>> 00188    #endif
>>> 00189    
>>> 00190    int
>>> 00191    APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, 
>>> PyGpuArrayObject *kerns,
>>> 00192                             PyGpuArrayObject *om,
>>> 00193                             cudnnConvolutionDescriptor_t desc,
>>> 00194                             double alpha, double beta,
>>> 00195                             PyGpuArrayObject **output,
>>> 00196                             PyGpuContextObject *c) {
>>> 00197      cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
>>> 00198      float af = alpha, bf = beta;
>>> 00199      void *alpha_p;
>>> 00200      void *beta_p;
>>> 00201    
>>> 00202      if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
>>> 00203        PyErr_SetString(PyExc_ValueError,
>>> 00204                "images and kernel must have the same stack size");
>>> 00205        return 1;
>>> 00206      }
>>> 00207    
>>> 00208      if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
>>> 00209        return 1;
>>> 00210      if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
>>> 00211        return 1;
>>> 00212    
>>> 00213      switch (input->ga.typecode) {
>>> 00214      case GA_DOUBLE:
>>> 00215        alpha_p = (void *)&alpha;
>>> 00216        beta_p = (void *)&beta;
>>> 00217        break;
>>> 00218      case GA_FLOAT:
>>> 00219      case GA_HALF:
>>> 00220        alpha_p = (void *)&af;
>>> 00221        beta_p = (void *)&bf;
>>> 00222        break;
>>> 00223      default:
>>> 00224        PyErr_SetString(PyExc_TypeError, "Unsupported type in 
>>> convolution");
>>> 00225        return 1;
>>> 00226      }
>>> 00227    
>>> 00228    #ifdef CONV_INPLACE
>>> 00229      Py_XDECREF(*output);
>>> 00230      *output = om;
>>> 00231      Py_INCREF(*output);
>>> 00232    #else
>>> 00233      if (theano_prep_output(output, PyGpuArray_NDIM(om), 
>>> PyGpuArray_DIMS(om),
>>> 00234                             om->ga.typecode, GA_C_ORDER, c) != 0)
>>> 00235        return 1;
>>> 00236      if (beta != 0.0 && pygpu_move(*output, om))
>>> 00237        return 1;
>>> 00238    #endif
>>> 00239    
>>> 00240      if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1)
>>> 00241        return 1;
>>> 00242    
>>> 00243      cudnnConvolutionFwdAlgo_t algo = CONV_ALGO;
>>> 00244    
>>> 00245      cuda_enter(c->ctx);
>>> 00246    #ifdef CHOOSE_ALGO
>>> 00247    #ifndef CHOOSE_ONCE
>>> 00248      reuse_algo = 1;
>>> 00249      for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
>>> 00250        reuse_algo = (reuse_algo &&
>>> 00251                      PyGpuArray_DIM(input, i) == prev_img_dims[i]);
>>> 00252        reuse_algo = (reuse_algo &&
>>> 00253                      PyGpuArray_DIM(kerns, i) == 
>>> prev_kern_dims[i]);
>>> 00254      }
>>> 00255    #endif
>>> 00256    
>>> 00257      if (!reuse_algo) {
>>> 00258    #ifdef CHOOSE_TIME
>>> 00259        int count;
>>> 00260        cudnnConvolutionFwdAlgoPerf_t choice;
>>> 00261        err = cudnnFindConvolutionForwardAlgorithm(
>>> 00262          APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), 
>>> APPLY_SPECIFIC(kerns),
>>> 00263          desc, APPLY_SPECIFIC(output), 1, &count, &choice);
>>> 00264    
>>> 00265        if (err != CUDNN_STATUS_SUCCESS) {
>>> 00266          PyErr_Format(PyExc_RuntimeError,
>>> 00267                       "error selecting convolution algo: %s",
>>> 00268                       cudnnGetErrorString(err));
>>> 00269          cuda_exit(c->ctx);
>>> 00270          return 1;
>>> 00271        }
>>> 00272        algo = choice.algo;
>>> 00273    #else
>>> 00274        size_t free;
>>> 00275        int err2 = gpucontext_property(c->ctx, 
>>> GA_CTX_PROP_FREE_GMEM, &free);
>>> 00276    
>>> 00277        if (err2 != GA_NO_ERROR) {
>>> 00278          PyErr_Format(PyExc_RuntimeError, "Error when trying to 
>>> find the "
>>> 00279                       "memory information on the GPU");
>>> 00280          cuda_exit(c->ctx);
>>> 00281          return 1;
>>> 00282        }
>>> 00283    
>>> 00284        err = cudnnGetConvolutionForwardAlgorithm(
>>> 00285          APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), 
>>> APPLY_SPECIFIC(kerns),
>>> 00286          desc, APPLY_SPECIFIC(output),
>>> 00287          CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, 
>>> &algo);
>>> 00288        if (err != CUDNN_STATUS_SUCCESS) {
>>> 00289          PyErr_Format(PyExc_RuntimeError,
>>> 00290                       "error selecting convolution algo: %s",
>>> 00291                       cudnnGetErrorString(err));
>>> 00292          cuda_exit(c->ctx);
>>> 00293          return 1;
>>> 00294        }
>>> 00295    #endif
>>> 00296        prev_algo = algo;
>>> 00297      } else {
>>> 00298        algo = prev_algo;
>>> 00299      }
>>> 00300    
>>> 00301    #ifdef CHOOSE_ONCE
>>> 00302      reuse_algo = 1;
>>> 00303    #else
>>> 00304      for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
>>> 00305        prev_img_dims[i] = PyGpuArray_DIM(input, i);
>>> 00306        prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
>>> 00307      }
>>> 00308    #endif
>>> 00309    
>>> 00310    #endif
>>> 00311    
>>> 00312      /* These two algos are not supported for 3d conv */
>>> 00313      if (PyGpuArray_NDIM(input) == 5 &&
>>> 00314          (algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM 
>>> ||
>>> 00315           algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM))
>>> 00316        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
>>> 00317    
>>> 00318      // The FFT implementation does not support strides, 1x1 
>>> filters or inputs
>>> 00319      // with a spatial dimension larger than 1024. The tiled-FFT 
>>> implementation
>>> 00320      // does not support strides.
>>> 00321      // If the chosen implementation is FFT or tiled-FFT, validate 
>>> that it can
>>> 00322      // be used on the current data and default to a safe 
>>> implementation if it
>>> 00323      // can't.
>>> 00324      // The following code is 2d-specific but it is fine as FFT 
>>> and tiled-FFT are
>>> 00325      // defined only for 2d filters
>>> 00326      if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
>>> 00327           algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && 
>>> PyGpuArray_NDIM(input) == 4) {
>>> 00328    
>>> 00329        // Extract the properties of the convolution descriptor
>>> 00330        int nd;
>>> 00331        int pad[2];
>>> 00332        int stride[2];
>>> 00333        int upscale[2];
>>> 00334        cudnnConvolutionMode_t mode;
>>> 00335        cudnnDataType_t data_type;
>>> 00336        err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, 
>>> stride,
>>> 00337                                                 upscale, &mode, 
>>> &data_type);
>>> 00338        if (err != CUDNN_STATUS_SUCCESS) {
>>> 00339          PyErr_Format(PyExc_RuntimeError,
>>> 00340                       "error getting convolution properties: %s",
>>> 00341                       cudnnGetErrorString(err));
>>> 00342          cuda_exit(c->ctx);
>>> 00343          return 1;
>>> 00344        }
>>> 00345    
>>> 00346        if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT)
>>> 00347        {
>>> 00348          if (stride[0] != 1 || stride[1] != 1 ||
>>> 00349              PyGpuArray_DIM(input, 2) > 1024 || 
>>> PyGpuArray_DIM(input, 3) > 1024 ||
>>> 00350              (PyGpuArray_DIM(kerns, 2) == 1 && 
>>> PyGpuArray_DIM(kerns, 3) == 1))
>>> 00351          {
>>> 00352            algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
>>> 00353          }
>>> 00354        }
>>> 00355        else
>>> 00356        {
>>> 00357          // algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
>>> 00358          if (stride[0] != 1 || stride[1] != 1)
>>> 00359          {
>>> 00360            algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
>>> 00361          }
>>> 00362        }
>>> 00363      }
>>> 00364    
>>> 00365      {
>>> 00366        size_t worksize;
>>> 00367        gpudata *workspace;
>>> 00368        err = 
>>> cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
>>> 00369                                                      
>>> APPLY_SPECIFIC(input),
>>> 00370                                                      
>>> APPLY_SPECIFIC(kerns),
>>> 00371                                                      desc,
>>> 00372                                                      
>>> APPLY_SPECIFIC(output),
>>> 00373                                                      algo,
>>> 00374                                                      &worksize);
>>> 00375    
>>> 00376        if (err == CUDNN_STATUS_NOT_SUPPORTED) {
>>> 00377          // Fallback to none algo if not supported
>>> 00378          // TODO: Print a warning
>>> 00379          algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
>>> 00380    
>>> 00381          err = 
>>> cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
>>> 00382                                                        
>>> APPLY_SPECIFIC(input),
>>> 00383                                                        
>>> APPLY_SPECIFIC(kerns),
>>> 00384                                                        desc,
>>> 00385                                                        
>>> APPLY_SPECIFIC(output),
>>> 00386                                                        algo,
>>> 00387                                                        &worksize);
>>> 00388        }
>>> 00389    
>>> 00390        if (err != CUDNN_STATUS_SUCCESS) {
>>> 00391          PyErr_Format(PyExc_RuntimeError,
>>> 00392                       "error getting worksize: %s",
>>> 00393                       cudnnGetErrorString(err));
>>> 00394          cuda_exit(c->ctx);
>>> 00395          return 1;
>>> 00396        }
>>> 00397    
>>> 00398        /*
>>> 00399         * This is less than ideal since we need to free it after 
>>> (which
>>> 00400         * introduces a synchronization point. But we don't have a 
>>> module
>>> 00401         * to place a nice get_work_mem() function in.
>>> 00402         */
>>> 00403        if (worksize != 0) {
>>> 00404          workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, 
>>> NULL);
>>> 00405          if (workspace == NULL) {
>>> 00406            PyErr_SetString(PyExc_RuntimeError,
>>> 00407                            "Could not allocate working memory");
>>> 00408            cuda_exit(c->ctx);
>>> 00409            return 1;
>>> 00410          }
>>> 00411        }
>>> 00412    
>>> 00413        cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
>>> 00414        cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
>>> 00415        cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
>>> 00416    
>>> 00417        err = cudnnConvolutionForward(
>>> 00418          APPLY_SPECIFIC(_handle),
>>> 00419          alpha_p,
>>> 00420          APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
>>> 00421          APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
>>> 00422          desc, algo,
>>> 00423          worksize == 0 ? NULL : *(void **)workspace, worksize,
>>> 00424          beta_p,
>>> 00425          APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output));
>>> 00426    
>>> 00427        if (worksize != 0)
>>> 00428          gpudata_release(workspace);
>>> 00429    
>>> 00430        cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
>>> 00431        cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
>>> 00432        cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
>>> 00433      }
>>> 00434      cuda_exit(c->ctx);
>>> 00435    
>>> 00436      if (err != CUDNN_STATUS_SUCCESS) {
>>> 00437        PyErr_Format(PyExc_RuntimeError, "error doing operation: 
>>> %s",
>>> 00438             cudnnGetErrorString(err));
>>> 00439        return 1;
>>> 00440      }
>>> 00441      return 0;
>>> 00442    }
>>> 00443    
>>> 00444    #undef DTYPE_INPUT_0
>>> 00445    #undef TYPENUM_INPUT_0
>>> 00446    #undef ITEMSIZE_INPUT_0
>>> 00447    #undef DTYPE_INPUT_1
>>> 00448    #undef TYPENUM_INPUT_1
>>> 00449    #undef ITEMSIZE_INPUT_1
>>> 00450    #undef DTYPE_INPUT_2
>>> 00451    #undef TYPENUM_INPUT_2
>>> 00452    #undef ITEMSIZE_INPUT_2
>>> 00453    #undef DTYPE_INPUT_4
>>> 00454    #undef TYPENUM_INPUT_4
>>> 00455    #undef ITEMSIZE_INPUT_4
>>> 00456    #undef DTYPE_INPUT_5
>>> 00457    #undef TYPENUM_INPUT_5
>>> 00458    #undef ITEMSIZE_INPUT_5
>>> 00459    #undef DTYPE_OUTPUT_0
>>> 00460    #undef TYPENUM_OUTPUT_0
>>> 00461    #undef ITEMSIZE_OUTPUT_0
>>> 00462    #undef APPLY_SPECIFIC
>>> 00463    #undef CONV_INPLACE
>>> 00464    #undef CONV_ALGO
>>> 00465    
>>> 00466            __struct_compiled_op_86feacd077d8749f42b5d82709a80ba3() 
>>> {
>>> 00467                // This is only somewhat safe because we:
>>> 00468                //  1) Are not a virtual class
>>> 00469                //  2) Do not use any virtual classes in the members
>>> 00470                //  3) Deal with mostly POD and pointers
>>> 00471    
>>> 00472                // If this changes, we would have to revise this, 
>>> but for
>>> 00473                // now I am tired of chasing segfaults because
>>> 00474                // initialization code had an error and some 
>>> pointer has
>>> 00475                // a junk value.
>>> 00476                memset(this, 0, sizeof(*this));
>>> 00477            }
>>> 00478            
>>> ~__struct_compiled_op_86feacd077d8749f42b5d82709a80ba3(void) {
>>> 00479                cleanup();
>>> 00480            }
>>> 00481    
>>> 00482            int init(PyObject* __ERROR, PyObject* storage_V3, 
>>> PyObject* storage_V5, PyObject* storage_V7, PyObject* storage_V9, PyObject* 
>>> storage_V11, PyObject* storage_V13, PyObject* storage_V1, PyObject* 
>>> storage_V15) {
>>> 00483                Py_XINCREF(storage_V3);
>>> 00484    Py_XINCREF(storage_V5);
>>> 00485    Py_XINCREF(storage_V7);
>>> 00486    Py_XINCREF(storage_V9);
>>> 00487    Py_XINCREF(storage_V11);
>>> 00488    Py_XINCREF(storage_V13);
>>> 00489    Py_XINCREF(storage_V1);
>>> 00490    Py_XINCREF(storage_V15);
>>> 00491                this->storage_V3 = storage_V3;
>>> 00492    this->storage_V5 = storage_V5;
>>> 00493    this->storage_V7 = storage_V7;
>>> 00494    this->storage_V9 = storage_V9;
>>> 00495    this->storage_V11 = storage_V11;
>>> 00496    this->storage_V13 = storage_V13;
>>> 00497    this->storage_V1 = storage_V1;
>>> 00498    this->storage_V15 = storage_V15;
>>> 00499                
>>> 00500    
>>> 00501    
>>> 00502    
>>> 00503    
>>> 00504    
>>> 00505    
>>> 00506    
>>> 00507    
>>> 00508        py_V15 = PyList_GET_ITEM(storage_V15, 0);
>>> 00509        {Py_XINCREF(py_V15);}
>>> 00510        
>>> 00511    if (!PyObject_TypeCheck(py_V15, &PyGpuContextType)) {
>>> 00512      PyErr_SetString(PyExc_TypeError, "expected a GpuContext");
>>> 00513      {
>>> 00514            if (!PyErr_Occurred()) {
>>> 00515                PyErr_SetString(PyExc_RuntimeError,
>>> 00516                    "Unexpected error in an Op's C code. "
>>> 00517                    "No Python exception was set.");
>>> 00518                }
>>> 00519            return 15;
>>> 00520    }
>>> 00521    }
>>> 00522    
>>> 00523    V15 = (PyGpuContextObject *)py_V15;
>>> 00524    Py_INCREF(V15);
>>> 00525    
>>> 00526    
>>> 00527    #define DTYPE_INPUT_0 npy_float16
>>> 00528    #define TYPENUM_INPUT_0 23
>>> 00529    #define ITEMSIZE_INPUT_0 2
>>> 00530    #define DTYPE_INPUT_1 npy_float16
>>> 00531    #define TYPENUM_INPUT_1 23
>>> 00532    #define ITEMSIZE_INPUT_1 2
>>> 00533    #define DTYPE_INPUT_2 npy_float16
>>> 00534    #define TYPENUM_INPUT_2 23
>>> 00535    #define ITEMSIZE_INPUT_2 2
>>> 00536    #define DTYPE_INPUT_4 npy_float16
>>> 00537    #define TYPENUM_INPUT_4 23
>>> 00538    #define ITEMSIZE_INPUT_4 2
>>> 00539    #define DTYPE_INPUT_5 npy_float16
>>> 00540    #define TYPENUM_INPUT_5 23
>>> 00541    #define ITEMSIZE_INPUT_5 2
>>> 00542    #define DTYPE_OUTPUT_0 npy_float16
>>> 00543    #define TYPENUM_OUTPUT_0 23
>>> 00544    #define ITEMSIZE_OUTPUT_0 2
>>> 00545    #define APPLY_SPECIFIC(str) 
>>> str##_node_86feacd077d8749f42b5d82709a80ba3_0
>>> 00546    #define CONV_INPLACE 1
>>> 00547    #define CONV_ALGO 
>>> CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
>>> 00548    #define FAIL { \
>>> 00549            if (!PyErr_Occurred()) { \
>>> 00550                PyErr_SetString(PyExc_RuntimeError, \
>>> 00551                    "Unexpected error in an Op's C code. " \
>>> 00552                    "No Python exception was set."); \
>>> 00553                } \
>>> 00554            return 17; \
>>> 00555    }
>>> 00556    #define PARAMS V15
>>> 00557    
>>> 00558    
>>> 00559    {
>>> 00560      // We need to keep a reference here to have it available in 
>>> the destructor.
>>> 00561      ctx = PARAMS;
>>> 00562      Py_INCREF(ctx);
>>> 00563    
>>> 00564      cuda_enter(PARAMS->ctx);
>>> 00565      cudnnStatus_t err;
>>> 00566      APPLY_SPECIFIC(_handle) = NULL;
>>> 00567      if ((err = cudnnCreate(&APPLY_SPECIFIC(_handle))) != 
>>> CUDNN_STATUS_SUCCESS) {
>>> 00568        PyErr_Format(PyExc_RuntimeError, "could not create cuDNN 
>>> handle: %s",
>>> 00569                     cudnnGetErrorString(err));
>>> 00570        cuda_exit(PARAMS->ctx);
>>> 00571        FAIL;
>>> 00572      }
>>> 00573      if ((err = cudnnSetStream(APPLY_SPECIFIC(_handle),
>>> 00574                                cuda_get_stream(PARAMS->ctx))) != 
>>> CUDNN_STATUS_SUCCESS) {
>>> 00575        PyErr_Format(PyExc_RuntimeError, "Could not set cudnn 
>>> stream: %s",
>>> 00576                     cudnnGetErrorString(err));
>>> 00577        cuda_exit(PARAMS->ctx);
>>> 00578        FAIL;
>>> 00579      }
>>> 00580      cuda_exit(PARAMS->ctx);
>>> 00581    }
>>> 00582    
>>> 00583    
>>> 00584    
>>> 00585    cudnnStatus_t APPLY_SPECIFIC(err);
>>> 00586    APPLY_SPECIFIC(input) = NULL;
>>> 00587    APPLY_SPECIFIC(output) = NULL;
>>> 00588    APPLY_SPECIFIC(kerns) = NULL;
>>> 00589    if ((APPLY_SPECIFIC(err) = 
>>> cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != 
>>> CUDNN_STATUS_SUCCESS) {
>>> 00590      PyErr_Format(PyExc_MemoryError, "could not allocate tensor 
>>> descriptor "
>>> 00591               "(inp): %s", 
>>> cudnnGetErrorString(APPLY_SPECIFIC(err)));
>>> 00592      FAIL;
>>> 00593    }
>>> 00594    if ((APPLY_SPECIFIC(err) = 
>>> cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != 
>>> CUDNN_STATUS_SUCCESS) {
>>> 00595      PyErr_Format(PyExc_MemoryError, "could not allocate tensor 
>>> descriptor "
>>> 00596                   "(out): %s", 
>>> cudnnGetErrorString(APPLY_SPECIFIC(err)));
>>> 00597      FAIL;
>>> 00598    }
>>> 00599    if ((APPLY_SPECIFIC(err) = 
>>> cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns))) != 
>>> CUDNN_STATUS_SUCCESS) {
>>> 00600      PyErr_Format(PyExc_MemoryError, "could not allocate filter 
>>> descriptor: %s", 
>>> 00601               cudnnGetErrorString(APPLY_SPECIFIC(err)));
>>> 00602      FAIL;
>>> 00603    }
>>> 00604    
>>> 00605    
>>> 00606    
>>> 00607    #ifdef CHOOSE_ALGO
>>> 00608    reuse_algo = 0;
>>> 00609    prev_algo = CONV_ALGO;
>>> 00610    #ifndef CHOOSE_ONCE
>>> 00611    memset(prev_img_dims, 0, sizeof(prev_img_dims));
>>> 00612    memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
>>> 00613    #endif
>>> 00614    #endif
>>> 00615    
>>> 00616    
>>> 00617    #undef FAIL
>>> 00618    #undef PARAMS
>>> 00619    #undef DTYPE_INPUT_0
>>> 00620    #undef TYPENUM_INPUT_0
>>> 00621    #undef ITEMSIZE_INPUT_0
>>> 00622    #undef DTYPE_INPUT_1
>>> 00623    #undef TYPENUM_INPUT_1
>>> 00624    #undef ITEMSIZE_INPUT_1
>>> 00625    #undef DTYPE_INPUT_2
>>> 00626    #undef TYPENUM_INPUT_2
>>> 00627    #undef ITEMSIZE_INPUT_2
>>> 00628    #undef DTYPE_INPUT_4
>>> 00629    #undef TYPENUM_INPUT_4
>>> 00630    #undef ITEMSIZE_INPUT_4
>>> 00631    #undef DTYPE_INPUT_5
>>> 00632    #undef TYPENUM_INPUT_5
>>> 00633    #undef ITEMSIZE_INPUT_5
>>> 00634    #undef DTYPE_OUTPUT_0
>>> 00635    #undef TYPENUM_OUTPUT_0
>>> 00636    #undef ITEMSIZE_OUTPUT_0
>>> 00637    #undef APPLY_SPECIFIC
>>> 00638    #undef CONV_INPLACE
>>> 00639    #undef CONV_ALGO
>>> 00640                this->__ERROR = __ERROR;
>>> 00641                return 0;
>>> 00642            }
>>> 00643            void cleanup(void) {
>>> 00644                __label_1:
>>> 00645    
>>> 00646    double __DUMMY_1;
>>> 00647    __label_3:
>>> 00648    
>>> 00649    double __DUMMY_3;
>>> 00650    __label_5:
>>> 00651    
>>> 00652    double __DUMMY_5;
>>> 00653    __label_7:
>>> 00654    
>>> 00655    double __DUMMY_7;
>>> 00656    __label_9:
>>> 00657    
>>> 00658    double __DUMMY_9;
>>> 00659    __label_11:
>>> 00660    
>>> 00661    double __DUMMY_11;
>>> 00662    __label_13:
>>> 00663    
>>> 00664    double __DUMMY_13;
>>> 00665    __label_15:
>>> 00666    Py_XDECREF(V15); V15 = NULL;
>>> 00667        {Py_XDECREF(py_V15);}
>>> 00668        
>>> 00669    double __DUMMY_15;
>>> 00670    __label_18:
>>> 00671    
>>> 00672    #define DTYPE_INPUT_0 npy_float16
>>> 00673    #define TYPENUM_INPUT_0 23
>>> 00674    #define ITEMSIZE_INPUT_0 2
>>> 00675    #define DTYPE_INPUT_1 npy_float16
>>> 00676    #define TYPENUM_INPUT_1 23
>>> 00677    #define ITEMSIZE_INPUT_1 2
>>> 00678    #define DTYPE_INPUT_2 npy_float16
>>> 00679    #define TYPENUM_INPUT_2 23
>>> 00680    #define ITEMSIZE_INPUT_2 2
>>> 00681    #define DTYPE_INPUT_4 npy_float16
>>> 00682    #define TYPENUM_INPUT_4 23
>>> 00683    #define ITEMSIZE_INPUT_4 2
>>> 00684    #define DTYPE_INPUT_5 npy_float16
>>> 00685    #define TYPENUM_INPUT_5 23
>>> 00686    #define ITEMSIZE_INPUT_5 2
>>> 00687    #define DTYPE_OUTPUT_0 npy_float16
>>> 00688    #define TYPENUM_OUTPUT_0 23
>>> 00689    #define ITEMSIZE_OUTPUT_0 2
>>> 00690    #define APPLY_SPECIFIC(str) 
>>> str##_node_86feacd077d8749f42b5d82709a80ba3_0
>>> 00691    #define CONV_INPLACE 1
>>> 00692    #define CONV_ALGO 
>>> CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
>>> 00693    
>>> 00694    
>>> 00695    cuda_enter(ctx->ctx);
>>> 00696    cudnnDestroy(APPLY_SPECIFIC(_handle));
>>> 00697    cuda_exit(ctx->ctx);
>>> 00698    Py_DECREF((PyObject *)ctx);
>>> 00699    
>>> 00700    
>>> 00701    if (APPLY_SPECIFIC(input) != NULL)
>>> 00702      cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input));
>>> 00703    if (APPLY_SPECIFIC(output) != NULL)
>>> 00704      cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output));
>>> 00705    if (APPLY_SPECIFIC(kerns) != NULL)
>>> 00706      cudnnDestroyFilterDescriptor(APPLY_SPECIFIC(kerns));
>>> 00707    
>>> 00708    #undef DTYPE_INPUT_0
>>> 00709    #undef TYPENUM_INPUT_0
>>> 00710    #undef ITEMSIZE_INPUT_0
>>> 00711    #undef DTYPE_INPUT_1
>>> 00712    #undef TYPENUM_INPUT_1
>>> 00713    #undef ITEMSIZE_INPUT_1
>>> 00714    #undef DTYPE_INPUT_2
>>> 00715    #undef TYPENUM_INPUT_2
>>> 00716    #undef ITEMSIZE_INPUT_2
>>> 00717    #undef DTYPE_INPUT_4
>>> 00718    #undef TYPENUM_INPUT_4
>>> 00719    #undef ITEMSIZE_INPUT_4
>>> 00720    #undef DTYPE_INPUT_5
>>> 00721    #undef TYPENUM_INPUT_5
>>> 00722    #undef ITEMSIZE_INPUT_5
>>> 00723    #undef DTYPE_OUTPUT_0
>>> 00724    #undef TYPENUM_OUTPUT_0
>>> 00725    #undef ITEMSIZE_OUTPUT_0
>>> 00726    #undef APPLY_SPECIFIC
>>> 00727    #undef CONV_INPLACE
>>> 00728    #undef CONV_ALGO
>>> 00729    double __DUMMY_18;
>>> 00730    
>>> 00731                Py_XDECREF(this->storage_V3);
>>> 00732    Py_XDECREF(this->storage_V5);
>>> 00733    Py_XDECREF(this->storage_V7);
>>> 00734    Py_XDECREF(this->storage_V9);
>>> 00735    Py_XDECREF(this->storage_V11);
>>> 00736    Py_XDECREF(this->storage_V13);
>>> 00737    Py_XDECREF(this->storage_V1);
>>> 00738    Py_XDECREF(this->storage_V15);
>>> 00739            }
>>> 00740            int run(void) {
>>> 00741                int __failure = 0;
>>> 00742                
>>> 00743        PyObject* py_V1;
>>> 00744        
>>> 00745            PyGpuArrayObject *V1;
>>> 00746            
>>> 00747        PyObject* py_V3;
>>> 00748        
>>> 00749            PyGpuArrayObject *V3;
>>> 00750            
>>> 00751        PyObject* py_V5;
>>> 00752        
>>> 00753            PyGpuArrayObject *V5;
>>> 00754            
>>> 00755        PyObject* py_V7;
>>> 00756        
>>> 00757            PyGpuArrayObject *V7;
>>> 00758            
>>> 00759        PyObject* py_V9;
>>> 00760        
>>> 00761            cudnnConvolutionDescriptor_t V9;
>>> 00762            
>>> 00763        PyObject* py_V11;
>>> 00764        
>>> 00765                    typedef npy_float16 V11_dtype; // Deprecated 
>>> use dtype_V11 instead.
>>> 00766                    typedef npy_float16 dtype_V11;
>>> 00767                
>>> 00768            npy_float16 V11;
>>> 00769            
>>> 00770        PyObject* py_V13;
>>> 00771        
>>> 00772                    typedef npy_float16 V13_dtype; // Deprecated 
>>> use dtype_V13 instead.
>>> 00773                    typedef npy_float16 dtype_V13;
>>> 00774                
>>> 00775            npy_float16 V13;
>>> 00776            
>>> 00777    {
>>> 00778    
>>> 00779        py_V1 = PyList_GET_ITEM(storage_V1, 0);
>>> 00780        {Py_XINCREF(py_V1);}
>>> 00781        
>>> 00782            if (py_V1 == Py_None)
>>> 00783            {
>>> 00784                V1 = NULL;
>>> 00785            }
>>> 00786            else
>>> 00787            {
>>> 00788                
>>> 00789            V1 = NULL;
>>> 00790            if (py_V1 == Py_None) {
>>> 00791                PyErr_SetString(PyExc_ValueError, "expected a 
>>> GpuArray, not None");
>>> 00792                {
>>> 00793            __failure = 2;
>>> 00794            if (!PyErr_Occurred()) {
>>> 00795                PyErr_SetString(PyExc_RuntimeError,
>>> 00796                    "Unexpected error in an Op's C code. "
>>> 00797                    "No Python exception was set.");
>>> 00798                }
>>> 00799            goto __label_2;}
>>> 00800            }
>>> 00801            /* First check if we are the base type exactly (the 
>>> most common case),
>>> 00802               then do the full subclass check if needed. */
>>> 00803            if (py_V1->ob_type != &PyGpuArrayType &&
>>> 00804                !PyObject_TypeCheck(py_V1, &PyGpuArrayType)) {
>>> 00805                PyE
>>>
>> -- 
>>
>> --- 
>> You received this message because you are subscribed to the Google Groups 
>> "theano-users" group.
>> To unsubscribe from this group and stop receiving emails from it, send an 
>> email to [email protected] <javascript:>.
>>
>> For more options, visit https://groups.google.com/d/optout.
>>
>
>

-- 

--- 
You received this message because you are subscribed to the Google Groups 
"theano-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
For more options, visit https://groups.google.com/d/optout.

Reply via email to