(The example in
https://bugs.launchpad.net/ubuntu/+source/pyopencl/+bug/1354086 no
longer hangs, so the "rapidly allocating and freeing pyopencl objects
doesn't actually free the memory" aspect has evidently been fixed, but
keeping too many objects for the available memory still does hang.

On further investigation, it isn't that simple:

- Create and keep (large_array_test.py): With swap, creates the objects in swap (becoming slow but not actually hanging), then errors out on trying to do arithmetic on them. Without swap, errors out in object creation, but not before the kernel's OOM killer terminates a few pieces of the desktop (according to the log, it thinks my test is only using ~50MB, so it (like gnome-system-monitor) evidently can't see GPU memory use).
- Rapid create-then-free (arraybug_test.py) using the result (naive pyopencl.clmath): Doesn't hang, but may segfault on exit. (Kernel log: python3[8857]: segfault at 20 ip 00007fa5caac3c84 sp 00007ffca782b2a0 error 4 in libpthread-2.21.so[7fa5caaba000+18000] ; I can't be more specific because it doesn't crash under gdb).
- Rapid create-then-free throwing away the result (this would be a bug in a real program, but probably shouldn't hang the entire system): With swap, starts using it, hence becoming slow but not actually hanging (though I haven't taken it to the point of actually filling the swap). Without swap, hangs (at least screen output does: desktop clock frozen, mouse pointer minimally responsive), with disk activity (!); Alt+SysRq+s,u,b restarts, but usually doesn't write the log).
#!/usr/bin/env python3
#Depends: python3-pyopencl python3-numpy
from __future__ import division,print_function
import pyopencl
import pyopencl.tools
import pyopencl.array
import numpy as np
import time
import pyopencl.clmath
import gc
import sys
pad=['']#indentation prefix for the trace output; a one-element list so trace_c can mutate it in place
#Install prefix removed from reported filenames (when present) to keep trace lines short
_SITE_PACKAGES_PREFIX="/home/rnpalmer/.local/lib/python2.7/site-packages/"
def trace_c(frame,event,arg):
    """Print one indented line for every trace event other than 'line'.

    Meant to be installed with sys.settrace(). Each printed line holds the
    current indentation (pad[0]), the function name, the source filename,
    the function's first line number and the event name. If *arg* has a
    ``__name__`` attribute, that name is printed on a following line.

    The indentation shrinks by one space on 'return' (before printing) and
    grows by one space on 'call' (after printing), so nested calls appear
    nested in the output.

    Returns trace_c itself so that tracing continues in the new scope.
    """
    if event=='return':
        pad[0]=pad[0][0:-1]
    if event!='line':#never actually hits c_call,c_return
        filename=frame.f_code.co_filename
        #Only strip the prefix when the file really lives under it; slicing
        #unconditionally would chop (or empty) every other filename
        if filename.startswith(_SITE_PACKAGES_PREFIX):
            filename=filename[len(_SITE_PACKAGES_PREFIX):]
        print(pad[0],frame.f_code.co_name,filename,frame.f_code.co_firstlineno,event)
        #arg is often None or a plain return value; only named objects get an extra line
        try:
            arg_name=arg.__name__
        except AttributeError:
            pass
        else:
            print(arg_name)
    if event=='call':
        pad[0]=pad[0]+' '
    return trace_c
#sys.settrace(trace_c)
#Set-up for the rapid create-then-free reproducer: one context and queue, a
#host array of 10**6 float32 samples, its device copy, and a custom kernel
#(f1) that is only used by the commented-out loop variants.
ctx=pyopencl.create_some_context()
cq0=pyopencl.CommandQueue(ctx)
#mpool=pyopencl.tools.MemoryPool(pyopencl.tools.ImmediateAllocator(cq0))#using this avoids the hang
#randn() takes integer dimensions; a float such as 1e6 is rejected by current NumPy
a=np.random.randn(10**6).astype(np.dtype('float32'))
aCL=pyopencl.array.to_device(cq0,a)#doesn't help: ,allocator=pyopencl.tools.ImmediateAllocator(cq0))
f1=pyopencl.elementwise.ElementwiseKernel(ctx,pyopencl.tools.dtype_to_ctype(aCL.dtype)+" *a,"+pyopencl.tools.dtype_to_ctype(aCL.dtype)+" *b","b[i]=cos(a[i])+sin(a[i])+sqrt(a[i])","cossinsqrt")
bCL=aCL+1
n=0#iteration counter for the loop
#Host-side and device-side evaluation of the same expression
b=np.cos(a)+np.sin(a)+np.sqrt(a)
bCL=pyopencl.clmath.cos(aCL)+pyopencl.clmath.sin(aCL)+pyopencl.clmath.sqrt(aCL)
s=""#the loop runs while this stays empty (i.e. the user just presses Enter)
bCLlist=[]#only used by the "keep every result" loop variant
print("gc:",gc.collect())
#Main reproducer loop: each pass runs exactly one of the allocation variants
#below (all but one are commented out, annotated with the behaviour observed).
#Every 100 passes it prints garbage-collector statistics and waits for the
#user; entering anything other than an empty line ends the loop.
while not s:
    #bCL=aCL._new_with_changes(None,None)#(used by empty_like,etc.)doesn't hang within 10,000
    #bCL=aCL._new_like_me()#(used by +,etc.)doesn't hang within 10,000
    #bCL=pyopencl.array.zeros_like(aCL)#doesn't hang within 10,000
    #bCL=pyopencl.array.empty_like(aCL)#doesn't hang within 5500
    #bCL=pyopencl.array.empty_like(aCL);f1(aCL,bCL).wait()#doesn't hang within 8000
    #bCL=pyopencl.array.empty_like(aCL);f2=pyopencl.elementwise.ElementwiseKernel(ctx,pyopencl.tools.dtype_to_ctype(bCL.dtype)+" *a,"+pyopencl.tools.dtype_to_ctype(bCL.dtype)+" *b","b[i]=cos(a[i])+sin(a[i])+sqrt(a[i])","cossinsqrt");f2(aCL,bCL).wait()#doesn't hang within 5500
    #cCL=pyopencl.array.to_device(cq0,a);bCL=pyopencl.array.to_device(cq0,a);f1(cCL,bCL).wait()#doesn't hang within 5000
    #bCL=pyopencl.array.empty_like(aCL);f1(bCL,aCL).wait()#doesn't hang within 5500
    #bCL=pyopencl.array.to_device(cq0,a)#doesn't hang within 10,000
    #Active variant: fresh device array, then an un-waited device-to-device copy into it
    #hangs after ~700, visible to System Monitor#Xenial: starts swapping (temporary freezes) after ~700, erratic displayed RAM usage, reached ~4500/15GB without a full hang; with swap disabled, hangs after ~700
    bCL=pyopencl.array.to_device(cq0,a)
    pyopencl.enqueue_copy(cq0,bCL.data,aCL.data)
    #bCL=pyopencl.array.to_device(cq0,a);pyopencl.enqueue_copy(cq0,bCL.data,aCL.data).wait()#doesn't hang within 8000
    #bCL=pyopencl.array.to_device(cq0,a);cCL=pyopencl.array.to_device(cq0,a);pyopencl.enqueue_copy(cq0,bCL.data,cCL.data)#hangs after ~350
    #cCL=pyopencl.array.to_device(cq0,a);pyopencl.enqueue_copy(cq0,bCL.data,cCL.data)#hangs after ~700, visible to System Monitor and Valgrind
    #cCL=pyopencl.array.to_device(cq0,a);pyopencl.enqueue_copy(cq0,bCL.data,cCL.data).wait()#doesn't hang within 5000
    #bCL=pyopencl.array.to_device(cq0,a);cCL=pyopencl.array.to_device(cq0,a);bCL=cCL+cCL#hangs after ~350, partly visible to System Monitor and Valgrind
    #bCL=pyopencl.array.to_device(cq0,a);pyopencl.enqueue_copy(cq0,bCL.data,a.data)#doesn't hang within 4000
    #bCL=pyopencl.array.arange(cq0,0,1e6,1,dtype='float32')#doesn't hang within 6000, but not sure if this is correct usage
    #bCL=aCL.copy(cq0)#hangs after ~700, with or without explicit cq0
    #bCL=aCL+1#hangs after ~700#Xenial: starts swapping (temporary freezes) at ~2400, doesn't hang within 5000
    #bCL=aCL+bCL#hangs after ~700, not visible to System Monitor#Xenial: doesn't hang within 10,000, but segfaults on exit after long runs (reproducibly, but not in gdb...)
    #bCL=aCL+aCL#hangs after ~700#Xenial: starts swapping (temporary freezes) after ~4500 (despite low displayed RAM usage) but doesn't hang
    #bCL+=aCL#doesn't hang within 10,000#Xenial: no swapping within 10,000
    #bCL=pyopencl.array.empty_like(aCL);bCL+=aCL#hangs after ~700
    #bCL=pyopencl.array.to_device(cq0,a);bCL+=1#hangs after ~700
    #bCL=pyopencl.clmath.cos(aCL)+pyopencl.clmath.sin(aCL)+pyopencl.clmath.sqrt(aCL)#hangs after ~140, or ~700 with just cos#Xenial: starts swapping (temporary freezes) at ~200, doesn't hang within 300
    #bCL=aCL*aCL*aCL*aCL*aCL*aCL*aCL*aCL*aCL*aCL*aCL*aCL*aCL*aCL*aCL*aCL*aCL*aCL*aCL*aCL#hangs after ~35
    #cq0.finish()#doesn't prevent hang#Xenial: does prevent swapping in clmath, but not enqueue_copy
    #pyopencl.enqueue_barrier(cq0).wait()#prevents hang#Xenial: prevents swapping (clmath and enqueue_copy)
    #bCLlist.append(bCL)#slows down a lot, but doesn't actually hang within 1600
    if not n%100:
        #Collected objects, uncollectable garbage, and tracking/referrer info for both device arrays
        gc_stats=(
            gc.collect(),
            len(gc.garbage),
            gc.is_tracked(aCL),
            gc.is_tracked(bCL),
            len(gc.get_referrers(aCL)),
            len(gc.get_referrers(bCL)),
        )
        print("gc:",*gc_stats)
        #print("refsA:",gc.get_referrers(aCL),"\nrefsB:",gc.get_referrers(bCL))#only the global dictionary
        s=input("Press Enter to continue")
    n+=1
    #Progress counter, flushed at once so it is visible even if the machine then freezes
    print(n,end=' ')
    sys.stdout.flush()
#python: /tmp/buildd/beignet-0.9.3git/src/intel/intel_gpgpu.c:567: intel_gpgpu_check_binded_buf_address: Assertion `gpgpu->binded_buf[i]->offset != 0' failed.
#apt-get install python3-pyopencl beignet-opencl-icd && su test1
#python3 /home/rnpalmer/Debian/builds/stackbuild/arraybug_test.py
#!/usr/bin/env python3
#Depends: python3-pyopencl python3-numpy
from __future__ import division,print_function
import pyopencl
import pyopencl.array
import numpy as np
import time
import pyopencl.clmath
#Create-and-keep reproducer: allocate five large float32 device arrays, pausing
#for the user between the first allocations, then run elementwise kernels over
#them and compare the first 1000 results with the expected values.
ctx=pyopencl.create_some_context()
cq=pyopencl.CommandQueue(ctx)
asize=250*(2**20)#fails above approx. 235 for 2-array, 162 for 3-array, 100 for 5-array, but the exact number varies
#Warning: very large sizes will hang your system, https://bugs.launchpad.net/ubuntu/+source/beignet/+bug/1354086
def _make_array():
    """Return a new device array holding 0..asize-1 as float32."""
    return pyopencl.array.arange(cq,0,asize,1,dtype='float32')
def _make_kernel(arg_names,operation,name):
    """Build an elementwise kernel taking one aCL-typed pointer per name in arg_names."""
    ctype=pyopencl.tools.dtype_to_ctype(aCL.dtype)
    arguments=",".join(ctype+" *"+arg_name for arg_name in arg_names)
    return pyopencl.elementwise.ElementwiseKernel(ctx,arguments,operation,name)
aCL=_make_array()
s=input("Press Enter to continue")
bCL=_make_array()
s=input("Press Enter to continue")
cCL=_make_array()
dCL=_make_array()
eCL=_make_array()
print("CL arrays created")
s=input("Press Enter to continue")
#Expected result: all five arrays start out identical, so both kernels run
#below (c=3a+c, then c=a+b+d+e) should leave c equal to 4*a
ans=aCL[0:1000].get()*4
f2=_make_kernel("ac","c[i]=3*a[i]+c[i]","twoarray")
f3=_make_kernel("abc","c[i]=3*a[i]+b[i]","threearray")
f5=_make_kernel("abcde","c[i]=a[i]+b[i]+d[i]+e[i]","fivearray")
f5b=_make_kernel("abcde","c[i]=4*e[i]","fivearray_usetwo")
f2(aCL,cCL).wait()
#f3(aCL,bCL,cCL).wait()
f5(aCL,bCL,cCL,dCL,eCL).wait()
#f5b(aCL,bCL,cCL,dCL,eCL).wait()
array_len=len(aCL)
first_results=cCL[0:1000].get()
max_error=np.max(np.nan_to_num(np.abs(first_results-ans)))
print("size",array_len," error ",max_error,"first 10 ",ans[0:10],cCL[0:10].get())

_______________________________________________
Beignet mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/beignet

Reply via email to