I successfully installed PyCUDA by installing the Microsoft Visual C++ Compiler for Python 2.7, but when I execute the test code, a new error comes up. The traceback is too long for me to read and I have no clue what it means. Could anybody offer me some help again, please?
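A minimal way to double-check that PyCUDA itself can see the GPU would be something like this (just a sketch, assuming a standard PyCUDA install; the file name is arbitrary):

# pycuda_check.py -- minimal smoke test for the PyCUDA install
import pycuda.autoinit          # creates a CUDA context on the default device
import pycuda.driver as drv

print drv.Device(0).name()      # should print the GPU name, e.g. "GeForce GTX 770M"
print drv.get_version()         # CUDA driver version as a tuple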
I don't know how to set up the .theanorc.txt file. I copied one from the net and edited it a little, but I don't know how to get all of these things right; installing CUDA 7.5 also didn't work. I really need your help. Have you ever run into this situation?

My environment: Windows 7 64-bit, Visual Studio 2010, Anaconda 2, Theano and PyCUDA installed with pip, mingw and libpython installed with conda. My GPU is a GTX 770M, and all the relevant environment variables are added to PATH. With the CPU the code runs correctly, but after switching the device to the GPU it produces a lot of errors. I am quite stuck. Thank you in advance.

Here is the test code:

from theano import function, config, shared, sandbox
import theano.tensor as T
import numpy
import time

vlen = 10 * 30 * 768  # 10 x #cores x # threads per core
iters = 1000

rng = numpy.random.RandomState(22)
x = shared(numpy.asarray(rng.rand(vlen), config.floatX))
f = function([], T.exp(x))
print f.maker.fgraph.toposort()
t0 = time.time()
for i in xrange(iters):
    r = f()
t1 = time.time()
print 'Looping %d times took' % iters, t1 - t0, 'seconds'
print 'Result is', r
if numpy.any([isinstance(x.op, T.Elemwise) for x in f.maker.fgraph.toposort()]):
    print 'Used the cpu'
else:
    print 'Used the gpu'
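In case it helps anyone spot what is wrong, here is the kind of minimal .theanorc.txt I am aiming for (only a sketch: the VS and CUDA paths below are the default install locations and are assumptions that must be adjusted to the actual machine):

[global]
device = gpu
floatX = float32

[nvcc]
# assumed default VS2010 host-compiler location; adjust to the real install
compiler_bindir = C:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin

[cuda]
# assumed default CUDA 8.0 location; adjust to the real install
root = C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0

The same options can also be set for a single run via the THEANO_FLAGS environment variable, for example THEANO_FLAGS=device=gpu,floatX=float32.

This is the full output when I run the test script with the GPU enabled: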
C:\ProgramData\Anaconda2\python.exe C:/Users/Administrator/Desktop/work/ntm-one-shot-master/tt.py
WARNING (theano.sandbox.cuda): The cuda backend is deprecated and will be removed in the next release (v0.10). Please switch to the gpuarray backend. You can get more information about how to switch at this URL: https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29
Using gpu device 0: GeForce GTX 770M (CNMeM is disabled, cuDNN not available)

1 #include <Python.h>
2 #include <iostream>
3 #include "theano_mod_helper.h"
4 #include "cuda_ndarray.cuh"
5 //////////////////////
6 //// Support Code
7 //////////////////////
8
9 static __global__ void kernel_reduce_ccontig_node_544270fe7a21a748315f83abfe0913cc_0(
10         const unsigned int d0,
11         const float *A,
12         float * Z)
13 {
14     const int threadCount = blockDim.x;
15     const int threadNum = threadIdx.x;
16     extern __shared__ float buf[];
17     float myresult = 0;
18
19     if (warpSize != 32)
20     {
21         return; //TODO: set error code
22     }
23
24     for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
25     {
26         myresult = myresult + A[i0];
27     }
28
29     __syncthreads(); // some kernel do multiple reduction.
30     buf[threadNum] = myresult;
31     __syncthreads();
32
33     // rest of function is handled by one warp
34     if (threadNum < warpSize)
35     {
36         //round up all the partial sums into the first `warpSize` elements
37         for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
38         {
39             myresult = myresult + buf[i];
40         }
41         buf[threadNum] = myresult;
42         /*Comment this optimization as it don't work on Fermi GPU.
43         TODO: find why it don't work or put the GPU compute capability into the version
44         // no sync because only one warp is running
45         if(threadCount >32)
46         {buf[threadNum] = buf[threadNum] + buf[threadNum+16];buf[threadNum] = buf[threadNum] + buf[threadNum+8];buf[threadNum] = buf[threadNum] + buf[threadNum+4];buf[threadNum] = buf[threadNum] + buf[threadNum+2];buf[threadNum] = buf[threadNum] + buf[threadNum+1];
47         if (threadNum == 0)
48         {
49             Z[0] = buf[0];
50         }
51
52         }
53         else */
54         if (threadNum < 16)
55         {
56             //reduce so that threadNum 0 has the reduction of everything
57             if (threadNum + 16 < threadCount) buf[threadNum] = buf[threadNum] + buf[threadNum+16];if (threadNum + 8 < threadCount) buf[threadNum] = buf[threadNum] + buf[threadNum+8];if (threadNum + 4 < threadCount) buf[threadNum] = buf[threadNum] + buf[threadNum+4];if (threadNum + 2 < threadCount) buf[threadNum] = buf[threadNum] + buf[threadNum+2];if (threadNum + 1 < threadCount) buf[threadNum] = buf[threadNum] + buf[threadNum+1];
58             if (threadNum == 0)
59             {
60                 Z[0] = buf[0];
61             }
62         }
63     }
64
65 }
66
67
68 static __global__ void kernel_reduce_1_node_544270fe7a21a748315f83abfe0913cc_0(
69         const unsigned int d0,
70         const float *A, const int sA0,
71         float * Z)
72 {
73     const int threadCount = blockDim.x;
74     const int threadNum = threadIdx.x;
75     extern __shared__ float buf[];
76     float myresult = 0;
77
78     if (warpSize != 32)
79     {
80         return; //TODO: set error code
81     }
82
83     for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
84     {
85         myresult = myresult + A[i0 * sA0];
86     }
87
88     __syncthreads(); // some kernel do multiple reduction.
89     buf[threadNum] = myresult;
90     __syncthreads();
91
92     // rest of function is handled by one warp
93     if (threadNum < warpSize)
94     {
95         //round up all the partial sums into the first `warpSize` elements
96         for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
97         {
98             myresult = myresult + buf[i];
99         }
100         buf[threadNum] = myresult;
101         /*Comment this optimization as it don't work on Fermi GPU.
102         TODO: find why it don't work or put the GPU compute capability into the version
103         // no sync because only one warp is running
104         if(threadCount >32)
105         {buf[threadNum] = buf[threadNum] + buf[threadNum+16];buf[threadNum] = buf[threadNum] + buf[threadNum+8];buf[threadNum] = buf[threadNum] + buf[threadNum+4];buf[threadNum] = buf[threadNum] + buf[threadNum+2];buf[threadNum] = buf[threadNum] + buf[threadNum+1];
106         if (threadNum == 0)
107         {
108             Z[0] = buf[0];
109         }
110
111         }
112         else */
113         if (threadNum < 16)
114         {
115             //reduce so that threadNum 0 has the reduction of everything
116             if (threadNum + 16 < threadCount) buf[threadNum] = buf[threadNum] + buf[threadNum+16];if (threadNum + 8 < threadCount) buf[threadNum] = buf[threadNum] + buf[threadNum+8];if (threadNum + 4 < threadCount) buf[threadNum] = buf[threadNum] + buf[threadNum+4];if (threadNum + 2 < threadCount) buf[threadNum] = buf[threadNum] + buf[threadNum+2];if (threadNum + 1 < threadCount) buf[threadNum] = buf[threadNum] + buf[threadNum+1];
117             if (threadNum == 0)
118             {
119                 Z[0] = buf[0];
120             }
121         }
122     }
123
124 }
125
126
127
128 namespace {
129 struct __struct_compiled_op_544270fe7a21a748315f83abfe0913cc {
130     PyObject* __ERROR;
131
132     PyObject* storage_V3;
133     PyObject* storage_V1;
134
135
136     __struct_compiled_op_544270fe7a21a748315f83abfe0913cc() {
137         // This is only somewhat safe because we:
138         // 1) Are not a virtual class
139         // 2) Do not use any virtual classes in the members
140         // 3) Deal with mostly POD and pointers
141
142         // If this changes, we would have to revise this, but for
143         // now I am tired of chasing segfaults because
144         // initialization code had an error and some pointer has
145         // a junk value.
146         memset(this, 0, sizeof(*this));
147     }
148     ~__struct_compiled_op_544270fe7a21a748315f83abfe0913cc(void) {
149         cleanup();
150     }
151
152     int init(PyObject* __ERROR, PyObject* storage_V3, PyObject* storage_V1) {
153         Py_XINCREF(storage_V3);
154         Py_XINCREF(storage_V1);
155         this->storage_V3 = storage_V3;
156         this->storage_V1 = storage_V1;
157
158
159
160
161         this->__ERROR = __ERROR;
162         return 0;
163     }
164     void cleanup(void) {
165         __label_1:
166
167         double __DUMMY_1;
168         __label_3:
169
170         double __DUMMY_3;
171         __label_6:
172
173         double __DUMMY_6;
174
175         Py_XDECREF(this->storage_V3);
176         Py_XDECREF(this->storage_V1);
177     }
178     int run(void) {
179         int __failure = 0;
180
181         PyObject* py_V1;
182         CudaNdarray * V1;
183         PyObject* py_V3;
184         CudaNdarray * V3;
185         {
186
187             py_V1 = PyList_GET_ITEM(storage_V1, 0);
188             {Py_XINCREF(py_V1);}
189
190             if (py_V1 == Py_None)
191             {
192                 V1 = NULL;
193             }
194             else
195             {
196
197                 assert(py_V1->ob_refcnt >= 2); // There should be at least one ref from the container object,
198                 // and one ref from the local scope.
199
200                 if (CudaNdarray_Check(py_V1))
201                 {
202                     //fprintf(stderr, "c_extract CNDA object w refcnt %p %i\n", py_V1, (py_V1->ob_refcnt));
203                     V1 = (CudaNdarray*)py_V1;
204                     //std::cerr << "c_extract " << V1 << '\n';
205
206
207                     if (V1->nd != 0)
208                     {
209                         PyErr_Format(PyExc_RuntimeError,
210                             "c_extract: Some CudaNdarray has rank %i, it was supposed to have rank 0",
211                             V1->nd);
212                         V1 = NULL;
213                         {
214                             __failure = 2;
215                             if (!PyErr_Occurred()) {
216                                 PyErr_SetString(PyExc_RuntimeError,
217                                     "Unexpected error in an Op's C code. "
218                                     "No Python exception was set.");
219                             }
220                             goto __label_2;};
221                     }
222                     //std::cerr << "c_extract " << V1 << " nd check passed\n";
223
224
225                     assert(V1);
226                     Py_INCREF(py_V1);
227                 }
228                 else if (py_V1 == Py_None)
229                 {
230                     PyErr_SetString(PyExc_TypeError,
231                         "expected a CudaNdarray, not None");
232                     V1 = NULL;
233                     {
234                         __failure = 2;
235                         if (!PyErr_Occurred()) {
236                             PyErr_SetString(PyExc_RuntimeError,
237                                 "Unexpected error in an Op's C code. "
238                                 "No Python exception was set.");
239                         }
240                         goto __label_2;};
241                 }
242                 else
243                 {
244                     //fprintf(stderr, "FAILING c_extract CNDA object w refcnt %p %i\n", py_V1, (py_V1->ob_refcnt));
245                     PyErr_SetString(PyExc_TypeError, "Argument not a CudaNdarray");
246                     V1 = NULL;
247                     {
248                         __failure = 2;
249                         if (!PyErr_Occurred()) {
250                             PyErr_SetString(PyExc_RuntimeError,
251                                 "Unexpected error in an Op's C code. "
252                                 "No Python exception was set.");
253                         }
254                         goto __label_2;};
255                 }
256                 //std::cerr << "c_extract done " << V1 << '\n';
257
258
259             }
260
261             {
262
263                 py_V3 = PyList_GET_ITEM(storage_V3, 0);
264                 {Py_XINCREF(py_V3);}
265
266                 assert(py_V3->ob_refcnt >= 2); // There should be at least one ref from the container object,
267                 // and one ref from the local scope.
268
269                 if (CudaNdarray_Check(py_V3))
270                 {
271                     //fprintf(stderr, "c_extract CNDA object w refcnt %p %i\n", py_V3, (py_V3->ob_refcnt));
272                     V3 = (CudaNdarray*)py_V3;
273                     //std::cerr << "c_extract " << V3 << '\n';
274
275
276                     if (V3->nd != 1)
277                     {
278                         PyErr_Format(PyExc_RuntimeError,
279                             "c_extract: Some CudaNdarray has rank %i, it was supposed to have rank 1",
280                             V3->nd);
281                         V3 = NULL;
282                         {
283                             __failure = 4;
284                             if (!PyErr_Occurred()) {
285                                 PyErr_SetString(PyExc_RuntimeError,
286                                     "Unexpected error in an Op's C code. "
287                                     "No Python exception was set.");
288                             }
289                             goto __label_4;};
290                     }
291                     //std::cerr << "c_extract " << V3 << " nd check passed\n";
292
293
294                     assert(V3);
295                     Py_INCREF(py_V3);
296                 }
297                 else if (py_V3 == Py_None)
298                 {
299                     PyErr_SetString(PyExc_TypeError,
300                         "expected a CudaNdarray, not None");
301                     V3 = NULL;
302                     {
303                         __failure = 4;
304                         if (!PyErr_Occurred()) {
305                             PyErr_SetString(PyExc_RuntimeError,
306                                 "Unexpected error in an Op's C code. "
307                                 "No Python exception was set.");
308                         }
309                         goto __label_4;};
310                 }
311                 else
312                 {
313                     //fprintf(stderr, "FAILING c_extract CNDA object w refcnt %p %i\n", py_V3, (py_V3->ob_refcnt));
314                     PyErr_SetString(PyExc_TypeError, "Argument not a CudaNdarray");
315                     V3 = NULL;
316                     {
317                         __failure = 4;
318                         if (!PyErr_Occurred()) {
319                             PyErr_SetString(PyExc_RuntimeError,
320                                 "Unexpected error in an Op's C code. "
321                                 "No Python exception was set.");
322                         }
323                         goto __label_4;};
324                 }
325                 //std::cerr << "c_extract done " << V3 << '\n';
326
327
328                 {
329                     // Op class GpuCAReduce
330
331                     if (V3->nd != 1)
332                     {
333                         PyErr_Format(PyExc_TypeError,
334                             "required nd=1, got nd=%i", V3->nd);
335                         {
336                             __failure = 5;
337                             if (!PyErr_Occurred()) {
338                                 PyErr_SetString(PyExc_RuntimeError,
339                                     "Unexpected error in an Op's C code. "
" 340 "No Python exception was set."); 341 } 342 goto __label_5;}; 343 } 344 345 346 if ( !V1 347 || (V1->nd != 0) 348 349 350 ) 351 { 352 353 int *new_dims=NULL; 354 355 Py_XDECREF(V1); 356 V1 = (CudaNdarray*) CudaNdarray_NewDims(0, new_dims); 357 if (NULL == V1) 358 { 359 { 360 __failure = 5; 361 if (!PyErr_Occurred()) { 362 PyErr_SetString(PyExc_RuntimeError, 363 "Unexpected error in an Op's C code. " 364 "No Python exception was set."); 365 } 366 goto __label_5;}; 367 } 368 } 369 370 371 if (CudaNdarray_SIZE(V1) && ! CudaNdarray_SIZE(V3)){ 372 cudaMemset(V1->devdata, 0, CudaNdarray_SIZE(V1) * sizeof(float)); 373 } 374 else if (CudaNdarray_SIZE(V1)) 375 { 376 377 if(CudaNdarray_is_c_contiguous( V3)){ 378 379 { 380 if(CudaNdarray_SIZE(V3)==0){ 381 cudaMemset(V1->devdata, 0, CudaNdarray_SIZE(V1) * sizeof(float)); 382 }else{ 383 int verbose = 0; 384 dim3 n_threads( 385 std::min(CudaNdarray_SIZE(V3), 386 (size_t) NUM_VECTOR_OP_THREADS_PER_BLOCK)); 387 dim3 n_blocks(1); 388 if (verbose) printf("running kernel_reduce_ccontig_node_544270fe7a21a748315f83abfe0913cc_0" 389 " n_threads.x=%d, size=%d, ndim=%d\n", 390 n_threads.x,CudaNdarray_SIZE(V3),V3->nd); 391 int n_shared = sizeof(float) * n_threads.x; 392 kernel_reduce_ccontig_node_544270fe7a21a748315f83abfe0913cc_0<<<n_blocks, n_threads, n_shared>>>( 393 CudaNdarray_SIZE(V3), 394 CudaNdarray_DEV_DATA(V3), 395 CudaNdarray_DEV_DATA(V1)); 396 CNDA_THREAD_SYNC; 397 cudaError_t sts = cudaGetLastError(); 398 if (cudaSuccess != sts) 399 { 400 PyErr_Format(PyExc_RuntimeError, 401 "Cuda error: %s: %s." 402 " (grid: %i x %i; block: %i x %i x %i)\n", 403 "kernel_reduce_ccontig_node_544270fe7a21a748315f83abfe0913cc_0", 404 cudaGetErrorString(sts), 405 n_blocks.x, 406 n_blocks.y, 407 n_threads.x, 408 n_threads.y, 409 n_threads.z); 410 { 411 __failure = 5; 412 if (!PyErr_Occurred()) { 413 PyErr_SetString(PyExc_RuntimeError, 414 "Unexpected error in an Op's C code. " 415 "No Python exception was set."); 416 } 417 goto __label_5;}; 418 } 419 } 420 } 421 422 }else{ 423 424 { 425 int verbose = 0; 426 dim3 n_threads( 427 std::min(CudaNdarray_HOST_DIMS(V3)[0], 428 NUM_VECTOR_OP_THREADS_PER_BLOCK)); 429 dim3 n_blocks(1); 430 431 if (verbose) 432 printf("running kernel_reduce_1_node_544270fe7a21a748315f83abfe0913cc_0\n"); 433 int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z; 434 if (verbose>1) 435 printf("n_threads.x=%d, n_threads.y=%d, n_threads.z=%d," 436 " nb_threads=%d, n_blocks.x=%d, n_blocks.y=%d," 437 " nb_block=%d, n_shared=%d, shape=(%d)\n", 438 n_threads.x,n_threads.y,n_threads.z, 439 n_threads.x*n_threads.y*n_threads.z, 440 n_blocks.x,n_blocks.y, 441 n_blocks.x*n_blocks.y, n_shared, CudaNdarray_HOST_DIMS(V3)[0]); 442 kernel_reduce_1_node_544270fe7a21a748315f83abfe0913cc_0<<<n_blocks, n_threads, n_shared>>>( 443 444 445 CudaNdarray_HOST_DIMS(V3)[0], 446 447 448 CudaNdarray_DEV_DATA(V3) 449 450 451 ,CudaNdarray_HOST_STRIDES(V3)[0] 452 453 454 ,CudaNdarray_DEV_DATA(V1) 455 456 457 ); 458 CNDA_THREAD_SYNC; 459 cudaError_t sts = cudaGetLastError(); 460 if (cudaSuccess != sts) 461 { 462 PyErr_Format(PyExc_RuntimeError, 463 "Cuda error: %s: %s." 
464 " (grid: %i x %i; block: %i x %i x %i)" 465 " shape=(%d) \n", 466 "kernel_reduce_1_node_544270fe7a21a748315f83abfe0913cc_0", 467 cudaGetErrorString(sts), 468 n_blocks.x, 469 n_blocks.y, 470 n_threads.x, 471 n_threads.y, 472 n_threads.z, 473 CudaNdarray_HOST_DIMS(V3)[0]); 474 { 475 __failure = 5; 476 if (!PyErr_Occurred()) { 477 PyErr_SetString(PyExc_RuntimeError, 478 "Unexpected error in an Op's C code. " 479 "No Python exception was set."); 480 } 481 goto __label_5;}; 482 } 483 484 485 } 486 487 } 488 489 } 490 491 __label_5: 492 493 double __DUMMY_5; 494 495 } 496 __label_4: 497 498 //std::cerr << "cleanup " << py_V3 << " " << V3 << "\n"; 499 //fprintf(stderr, "c_cleanup CNDA py_object w refcnt %p %i\n", py_V3, (py_V3->ob_refcnt)); 500 if (V3) 501 { 502 //fprintf(stderr, "c_cleanup CNDA cn_object w refcnt %p %i\n", V3, (V3->ob_refcnt)); 503 Py_XDECREF(V3); 504 } 505 //std::cerr << "cleanup done" << py_V3 << "\n"; 506 507 {Py_XDECREF(py_V3);} 508 509 double __DUMMY_4; 510 511 } 512 __label_2: 513 514 if (!__failure) { 515 516 //std::cerr << "sync\n"; 517 if (NULL == V1) { 518 // failure: sync None to storage 519 Py_XDECREF(py_V1); 520 py_V1 = Py_None; 521 Py_INCREF(py_V1); 522 } 523 else 524 { 525 if (py_V1 != (PyObject*)V1) 526 { 527 Py_XDECREF(py_V1); 528 py_V1 = (PyObject*)V1; 529 Py_INCREF(py_V1); 530 } 531 assert(py_V1->ob_refcnt); 532 } 533 534 PyObject* old = PyList_GET_ITEM(storage_V1, 0); 535 {Py_XINCREF(py_V1);} 536 PyList_SET_ITEM(storage_V1, 0, py_V1); 537 {Py_XDECREF(old);} 538 } 539 540 //std::cerr << "cleanup " << py_V1 << " " << V1 << "\n"; 541 //fprintf(stderr, "c_cleanup CNDA py_object w refcnt %p %i\n", py_V1, (py_V1->ob_refcnt)); 542 if (V1) 543 { 544 //fprintf(stderr, "c_cleanup CNDA cn_object w refcnt %p %i\n", V1, (V1->ob_refcnt)); 545 Py_XDECREF(V1); 546 } 547 //std::cerr << "cleanup done" << py_V1 << "\n"; 548 549 {Py_XDECREF(py_V1);} 550 551 double __DUMMY_2; 552 553 } 554 555 556 if (__failure) { 557 // When there is a failure, this code puts the exception 558 // in __ERROR. 559 PyObject* err_type = NULL; 560 PyObject* err_msg = NULL; 561 PyObject* err_traceback = NULL; 562 PyErr_Fetch(&err_type, &err_msg, &err_traceback); 563 if (!err_type) {err_type = Py_None;Py_INCREF(Py_None);} 564 if (!err_msg) {err_msg = Py_None; Py_INCREF(Py_None);} 565 if (!err_traceback) {err_traceback = Py_None; Py_INCREF(Py_None);} 566 PyObject* old_err_type = PyList_GET_ITEM(__ERROR, 0); 567 PyObject* old_err_msg = PyList_GET_ITEM(__ERROR, 1); 568 PyObject* old_err_traceback = PyList_GET_ITEM(__ERROR, 2); 569 PyList_SET_ITEM(__ERROR, 0, err_type); 570 PyList_SET_ITEM(__ERROR, 1, err_msg); 571 PyList_SET_ITEM(__ERROR, 2, err_traceback); 572 {Py_XDECREF(old_err_type);} 573 {Py_XDECREF(old_err_msg);} 574 {Py_XDECREF(old_err_traceback);} 575 } 576 // The failure code is returned to index what code block failed. 
577         return __failure;
578
579     }
580 };
581 }
582
583
584 static int __struct_compiled_op_544270fe7a21a748315f83abfe0913cc_executor(__struct_compiled_op_544270fe7a21a748315f83abfe0913cc* self) {
585     return self->run();
586 }
587
588 static void __struct_compiled_op_544270fe7a21a748315f83abfe0913cc_destructor(void* executor, void* self) {
589     delete ((__struct_compiled_op_544270fe7a21a748315f83abfe0913cc*)self);
590 }
591
592 //////////////////////
593 //// Functions
594 //////////////////////
595 static PyObject * instantiate(PyObject * self, PyObject *argtuple) {
596     assert(PyTuple_Check(argtuple));
597     if (3 != PyTuple_Size(argtuple)){
598         PyErr_Format(PyExc_TypeError, "Wrong number of arguments, expected 3, got %i", (int)PyTuple_Size(argtuple));
599         return NULL;
600     }
601     __struct_compiled_op_544270fe7a21a748315f83abfe0913cc* struct_ptr = new __struct_compiled_op_544270fe7a21a748315f83abfe0913cc();
602     if (struct_ptr->init( PyTuple_GET_ITEM(argtuple, 0),PyTuple_GET_ITEM(argtuple, 1),PyTuple_GET_ITEM(argtuple, 2) ) != 0) {
603         delete struct_ptr;
604         return NULL;
605     }
606     PyObject* thunk = PyCObject_FromVoidPtrAndDesc((void*)(&__struct_compiled_op_544270fe7a21a748315f83abfe0913cc_executor), struct_ptr, __struct_compiled_op_544270fe7a21a748315f83abfe0913cc_destructor);
607     return thunk; }
608
609 //////////////////////
610 //// Module init
611 //////////////////////
612 static PyMethodDef MyMethods[] = {
613     {"instantiate", instantiate, METH_VARARGS, "undocumented"} ,
614     {NULL, NULL, 0, NULL}
615 };
616 PyMODINIT_FUNC init544270fe7a21a748315f83abfe0913cc(void){
617     (void) Py_InitModule("544270fe7a21a748315f83abfe0913cc", MyMethods);
618 }

===============================
c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\math_functions.h(1): error: expected a declaration
c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\cuda_surface_types.h(91): warning: parsing restarts here after previous syntax error
c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\cuda_surface_types.h(94): error: surface is not a template
c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\cuda_surface_types.h(97): error: explicit type is missing ("int" assumed)
c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.h(1): error: expected a declaration
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\include\cuda_runtime.h(1420): warning: parsing restarts here after previous syntax error
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\include\cuda_runtime.h(1882): error: surface is not a template
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\include\cuda_runtime.h(1911): error: surface is not a template
C:\ProgramData\Anaconda2\lib\site-packages\theano\sandbox\cuda\cuda_ndarray.cuh(331): error: identifier "log2" is undefined
mod.cu(14): error: identifier "blockDim" is undefined
mod.cu(15): error: identifier "threadIdx" is undefined
mod.cu(19): error: identifier "warpSize" is undefined
mod.cu(29): error: identifier "__syncthreads" is undefined
mod.cu(73): error: identifier "blockDim" is undefined
mod.cu(74): error: identifier "threadIdx" is undefined
mod.cu(78): error: identifier "warpSize" is undefined
mod.cu(88): error: identifier "__syncthreads" is undefined

15 errors detected in the compilation of "C:/Users/ADMINI~1/AppData/Local/Temp/tmpxft_000054a8_00000000-10_mod.cpp1.ii".
Traceback (most recent call last):
  File "C:/Users/Administrator/Desktop/work/ntm-one-shot-master/tt.py", line 1, in <module>
    from theano import function, config, shared, sandbox
  File "C:\ProgramData\Anaconda2\lib\site-packages\theano\__init__.py", line 116, in <module>
    theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1()
  File "C:\ProgramData\Anaconda2\lib\site-packages\theano\sandbox\cuda\tests\test_driver.py", line 32, in test_nvidia_driver1
    profile=False)
  File "C:\ProgramData\Anaconda2\lib\site-packages\theano\compile\function.py", line 326, in function
    output_keys=output_keys)
  File "C:\ProgramData\Anaconda2\lib\site-packages\theano\compile\pfunc.py", line 486, in pfunc
    output_keys=output_keys)
  File "C:\ProgramData\Anaconda2\lib\site-packages\theano\compile\function_module.py", line 1808, in orig_function
    defaults)
  File "C:\ProgramData\Anaconda2\lib\site-packages\theano\compile\function_module.py", line 1674, in create
nvcc warning : nvcc support for Microsoft Visual Studio 2010 and earlier has been deprecated and is no longer being maintained
mod.cu
['nvcc', '-shared', '-O3', '-arch=sm_30', '-Xlinker', '/DEBUG', '-D HAVE_ROUND', '-m64', '-Xcompiler', '-DCUDA_NDARRAY_CUH=18715462c72ed6afcd7ca5d52813ce90,-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION,/Zi,/MD', '-I"C:\\Users\\Administrator\\AppData\\Local\\Theano\\compiledir_Windows-7-6.1.7601-SP1-Intel64_Family_6_Model_60_Stepping_3_GenuineIntel-2.7.13-64\\cuda_ndarray"', '-I"C:\\ProgramData\\Anaconda2\\lib\\site-packages\\numpy\\core\\include"', '-I"C:\\ProgramData\\Anaconda2\\include"', '-I"C:\\ProgramData\\Anaconda2\\lib\\site-packages\\theano\\gof"', '-I"C:\\ProgramData\\Anaconda2\\lib\\site-packages\\theano\\sandbox\\cuda"', '-L"C:\\Users\\Administrator\\AppData\\Local\\Theano\\compiledir_Windows-7-6.1.7601-SP1-Intel64_Family_6_Model_60_Stepping_3_GenuineIntel-2.7.13-64\\cuda_ndarray"', '-L"C:\\ProgramData\\Anaconda2\\libs"', '-L"C:\\ProgramData\\Anaconda2"', '-o', 'C:\\Users\\Administrator\\AppData\\Local\\Theano\\compiledir_Windows-7-6.1.7601-SP1-Intel64_Family_6_Model_60_Stepping_3_GenuineIntel-2.7.13-64\\tmpmenaes\\544270fe7a21a748315f83abfe0913cc.pyd', 'mod.cu', '-lcudart', '-lcublas', '-lcuda_ndarray', '-lpython27']
    input_storage=input_storage_lists, storage_map=storage_map)
  File "C:\ProgramData\Anaconda2\lib\site-packages\theano\gof\link.py", line 699, in make_thunk
    storage_map=storage_map)[:3]
  File "C:\ProgramData\Anaconda2\lib\site-packages\theano\gof\vm.py", line 1047, in make_all
    impl=impl))
  File "C:\ProgramData\Anaconda2\lib\site-packages\theano\gof\op.py", line 935, in make_thunk
    no_recycling)
  File "C:\ProgramData\Anaconda2\lib\site-packages\theano\gof\op.py", line 839, in make_c_thunk
    output_storage=node_output_storage)
  File "C:\ProgramData\Anaconda2\lib\site-packages\theano\gof\cc.py", line 1190, in make_thunk
    keep_lock=keep_lock)
  File "C:\ProgramData\Anaconda2\lib\site-packages\theano\gof\cc.py", line 1131, in __compile__
    keep_lock=keep_lock)
  File "C:\ProgramData\Anaconda2\lib\site-packages\theano\gof\cc.py", line 1586, in cthunk_factory
    key=key, lnk=self, keep_lock=keep_lock)
  File "C:\ProgramData\Anaconda2\lib\site-packages\theano\gof\cmodule.py", line 1159, in module_from_key
    module = lnk.compile_cmodule(location)
  File "C:\ProgramData\Anaconda2\lib\site-packages\theano\gof\cc.py", line 1489, in compile_cmodule
    preargs=preargs)
  File "C:\ProgramData\Anaconda2\lib\site-packages\theano\sandbox\cuda\nvcc_compiler.py", line 405, in compile_str
    'for cmd', ' '.join(cmd))
Exception: ('The following error happened while compiling the node', GpuCAReduce{add}{1}(<CudaNdarrayType(float32, vector)>), '\n', 'nvcc return status', 2, 'for cmd', 'nvcc -shared -O3 -arch=sm_30 -Xlinker /DEBUG -D HAVE_ROUND -m64 -Xcompiler -DCUDA_NDARRAY_CUH=18715462c72ed6afcd7ca5d52813ce90,-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION,/Zi,/MD -I"C:\\Users\\Administrator\\AppData\\Local\\Theano\\compiledir_Windows-7-6.1.7601-SP1-Intel64_Family_6_Model_60_Stepping_3_GenuineIntel-2.7.13-64\\cuda_ndarray" -I"C:\\ProgramData\\Anaconda2\\lib\\site-packages\\numpy\\core\\include" -I"C:\\ProgramData\\Anaconda2\\include" -I"C:\\ProgramData\\Anaconda2\\lib\\site-packages\\theano\\gof" -I"C:\\ProgramData\\Anaconda2\\lib\\site-packages\\theano\\sandbox\\cuda" -L"C:\\Users\\Administrator\\AppData\\Local\\Theano\\compiledir_Windows-7-6.1.7601-SP1-Intel64_Family_6_Model_60_Stepping_3_GenuineIntel-2.7.13-64\\cuda_ndarray" -L"C:\\ProgramData\\Anaconda2\\libs" -L"C:\\ProgramData\\Anaconda2" -o C:\\Users\\Administrator\\AppData\\Local\\Theano\\compiledir_Windows-7-6.1.7601-SP1-Intel64_Family_6_Model_60_Stepping_3_GenuineIntel-2.7.13-64\\tmpmenaes\\544270fe7a21a748315f83abfe0913cc.pyd mod.cu -lcudart -lcublas -lcuda_ndarray -lpython27', '[GpuCAReduce{add}{1}(<CudaNdarrayType(float32, vector)>)]')

Process finished with exit code 1
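One thing I am wondering about, given the "nvcc support for Microsoft Visual Studio 2010 and earlier has been deprecated" warning above: can nvcc use my VS2010 host compiler at all? A minimal standalone test, completely outside of Theano, might tell. This is only a sketch; test.cu is an arbitrary file name, and -arch=sm_30 matches the GTX 770M:

// test.cu -- minimal check that nvcc and the host compiler can build a kernel
#include <cstdio>

__global__ void add_one(float *x)
{
    // each thread increments one element of x
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    x[i] = x[i] + 1.0f;
}

int main(void)
{
    const int n = 32;
    float h[n];
    for (int i = 0; i < n; ++i) h[i] = (float)i;

    float *d = NULL;
    cudaMalloc((void**)&d, n * sizeof(float));
    cudaMemcpy(d, h, n * sizeof(float), cudaMemcpyHostToDevice);

    add_one<<<1, n>>>(d);   // one block of 32 threads

    cudaMemcpy(h, d, n * sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(d);

    printf("h[0] = %f (expect 1.0)\n", h[0]);
    return 0;
}

built with something like:

nvcc -arch=sm_30 test.cu -o test.exe

If even this fails with the same "expected a declaration" errors inside the CUDA headers, would that mean the problem is the compiler toolchain rather than Theano itself?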