Hi Eric,

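You'll need to declare the function 'extern "C"'; then it should work.
(With no_extern_c=True the kernel name is C++-mangled, so get_function
cannot find it under its plain name.)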

Andreas

Eric Scheffel <eric.schef...@nottingham.edu.cn> writes:
> Thanks again for making available the cuda wrapper library for python. It's 
> great to use and helps me a lot in my own research. One problem I am facing 
> at the moment is that I am trying to use Danny Ruijters' kernel code for 
> using bicubic interpolation on NVIDIA GPUs, instead of the "built-in" 
> bilinear ones. The code is available at 
> http://www.dannyruijters.nl/cubicinterpolation/
>
> Attached is the kernel source code I am trying to run using pycuda. The 
> problem is that even though it compiles using the "SourceModule" function 
> (using the no_extern_c=True option), I cannot fetch the "compnkcross" kernel 
> method using "get_function". I get the error message:
>
> In [15]: modo.get_function('compnkcross')
> ---------------------------------------------------------------------------
> LogicError                                Traceback (most recent call last)
> <ipython-input-15-e2bfb6bd9c2d> in <module>()
> ----> 1 modo.get_function('compnkcross')
>
> /usr/lib/python2.7/site-packages/pycuda-2013.1.1-py2.7-linux-i686.egg/pycuda/compiler.pyc
>  in get_function(self, name)
>     283
>     284     def get_function(self, name):
> --> 285         return self.module.get_function(name)
>
> LogicError: cuModuleGetFunction failed: not found
>
>
> I CAN fetch the texture references, but that is about it. The original source 
> code for the kernel is distributed over a number of cu-files as well as at 
> least one header file. I have simply taken them and substituted out to create 
> one big monolithic kernel code without external dependencies. Being paranoid 
> about not exactly knowing what pycuda supports I have also converted template 
> kernel functions into specific cases which I need for running my own code, 
> though I suspect that pycuda of course supports template functions.
>
> Thanks,
> Eric
>
> (im Moment im kalten Karlsruhe)
>
>
> ////////////////////////////////////////////////////////////////////////////////
> typedef unsigned int uint;
> typedef unsigned short ushort;
>
> // float functions
> ////////////////////////////////////////////////////////////////////////////////
>
> // lerp
> inline __device__ __host__ float lerp(float a, float b, float t)
> {
>     return a + t*(b-a);
> }
>
> // clamp
> inline __device__ __host__ float clamp(float f, float a, float b)
> {
>     return fmaxf(a, fminf(f, b));
> }
>
> // int2 functions
> ////////////////////////////////////////////////////////////////////////////////
>
> // addition
> inline __host__ __device__ int2 operator+(int2 a, int2 b)
> {
>     return make_int2(a.x + b.x, a.y + b.y);
> }
> inline __host__ __device__ void operator+=(int2 &a, int2 b)
> {
>     a.x += b.x; a.y += b.y;
> }
>
> // subtract
> inline __host__ __device__ int2 operator-(int2 a, int2 b)
> {
>     return make_int2(a.x - b.x, a.y - b.y);
> }
> inline __host__ __device__ void operator-=(int2 &a, int2 b)
> {
>     a.x -= b.x; a.y -= b.y;
> }
>
> // multiply
> inline __host__ __device__ int2 operator*(int2 a, int2 b)
> {
>     return make_int2(a.x * b.x, a.y * b.y);
> }
> inline __host__ __device__ int2 operator*(int2 a, int s)
> {
>     return make_int2(a.x * s, a.y * s);
> }
> inline __host__ __device__ int2 operator*(int s, int2 a)
> {
>     return make_int2(a.x * s, a.y * s);
> }
> inline __host__ __device__ void operator*=(int2 &a, int s)
> {
>     a.x *= s; a.y *= s;
> }
>
> // float2 functions
> ////////////////////////////////////////////////////////////////////////////////
>
> // additional constructors
> inline __host__ __device__ float2 make_float2(float s)
> {
>     return make_float2(s, s);
> }
> inline __host__ __device__ float2 make_float2(int2 a)
> {
>     return make_float2(float(a.x), float(a.y));
> }
>
> // addition
> inline __host__ __device__ float2 operator+(float2 a, float2 b)
> {
>     return make_float2(a.x + b.x, a.y + b.y);
> }
> inline __host__ __device__ void operator+=(float2 &a, float2 b)
> {
>     a.x += b.x; a.y += b.y;
> }
>
> // subtract
> inline __host__ __device__ float2 operator-(float2 a, float2 b)
> {
>     return make_float2(a.x - b.x, a.y - b.y);
> }
> inline __host__ __device__ void operator-=(float2 &a, float2 b)
> {
>     a.x -= b.x; a.y -= b.y;
> }
>
> // multiply
> inline __host__ __device__ float2 operator*(float2 a, float2 b)
> {
>     return make_float2(a.x * b.x, a.y * b.y);
> }
> inline __host__ __device__ float2 operator*(float2 a, float s)
> {
>     return make_float2(a.x * s, a.y * s);
> }
> inline __host__ __device__ float2 operator*(float s, float2 a)
> {
>     return make_float2(a.x * s, a.y * s);
> }
> inline __host__ __device__ void operator*=(float2 &a, float s)
> {
>     a.x *= s; a.y *= s;
> }
>
> // divide
> inline __host__ __device__ float2 operator/(float2 a, float2 b)
> {
>     return make_float2(a.x / b.x, a.y / b.y);
> }
> inline __host__ __device__ float2 operator/(float2 a, float s)
> {
>     float inv = 1.0f / s;
>     return a * inv;
> }
> inline __host__ __device__ float2 operator/(float s, float2 a) //Danny
> {
> //    float inv = 1.0f / s;
> //    return a * inv;
>     return make_float2(s / a.x, s / a.y);
> }
> inline __host__ __device__ void operator/=(float2 &a, float s)
> {
>     float inv = 1.0f / s;
>     a *= inv;
> }
>
> // lerp
> inline __device__ __host__ float2 lerp(float2 a, float2 b, float t)
> {
>     return a + t*(b-a);
> }
>
> // clamp
> inline __device__ __host__ float2 clamp(float2 v, float a, float b)
> {
>     return make_float2(clamp(v.x, a, b), clamp(v.y, a, b));
> }
>
> inline __device__ __host__ float2 clamp(float2 v, float2 a, float2 b)
> {
>     return make_float2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
> }
>
> // dot product
> inline __host__ __device__ float dot(float2 a, float2 b)
> {
>     return a.x * b.x + a.y * b.y;
> }
>
> // length
> inline __host__ __device__ float length(float2 v)
> {
>     return sqrtf(dot(v, v));
> }
>
> // normalize
> inline __host__ __device__ float2 normalize(float2 v)
> {
>     float invLen = 1.0f / sqrtf(dot(v, v));
>     return v * invLen;
> }
>
> // floor
> inline __host__ __device__ float2 floor(const float2 v)
> {
>     return make_float2(floor(v.x), floor(v.y));
> }
>
> // reflect
> inline __host__ __device__ float2 reflect(float2 i, float2 n)
> {
>     return i - 2.0f * n * dot(n,i);
> }
>
> // float3 functions
> ////////////////////////////////////////////////////////////////////////////////
>
> // additional constructors
> inline __host__ __device__ float3 make_float3(float s)
> {
>     return make_float3(s, s, s);
> }
> inline __host__ __device__ float3 make_float3(float2 a)
> {
>     return make_float3(a.x, a.y, 0.0f);
> }
> inline __host__ __device__ float3 make_float3(float2 a, float s)
> {
>     return make_float3(a.x, a.y, s);
> }
> inline __host__ __device__ float3 make_float3(float4 a)
> {
>     return make_float3(a.x, a.y, a.z);  // discards w
> }
> inline __host__ __device__ float3 make_float3(int3 a)
> {
>     return make_float3(float(a.x), float(a.y), float(a.z));
> }
>
> // min
> static __inline__ __host__ __device__ float3 fminf(float3 a, float3 b)
> {
>     return make_float3(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z));
> }
>
> // max
> static __inline__ __host__ __device__ float3 fmaxf(float3 a, float3 b)
> {
>     return make_float3(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z));
> }
>
> // addition
> inline __host__ __device__ float3 operator+(float3 a, float3 b)
> {
>     return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
> }
> inline __host__ __device__ float3 operator+(float3 a, float b)
> {
>     return make_float3(a.x + b, a.y + b, a.z + b);
> }
> inline __host__ __device__ void operator+=(float3 &a, float3 b)
> {
>     a.x += b.x; a.y += b.y; a.z += b.z;
> }
>
> // subtract
> inline __host__ __device__ float3 operator-(float3 a, float3 b)
> {
>     return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
> }
> inline __host__ __device__ float3 operator-(float3 a, float b)
> {
>     return make_float3(a.x - b, a.y - b, a.z - b);
> }
> inline __host__ __device__ void operator-=(float3 &a, float3 b)
> {
>     a.x -= b.x; a.y -= b.y; a.z -= b.z;
> }
>
> // multiply
> inline __host__ __device__ float3 operator*(float3 a, float3 b)
> {
>     return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
> }
> inline __host__ __device__ float3 operator*(float3 a, float s)
> {
>     return make_float3(a.x * s, a.y * s, a.z * s);
> }
> inline __host__ __device__ float3 operator*(float s, float3 a)
> {
>     return make_float3(a.x * s, a.y * s, a.z * s);
> }
> inline __host__ __device__ void operator*=(float3 &a, float s)
> {
>     a.x *= s; a.y *= s; a.z *= s;
> }
>
> // divide
> inline __host__ __device__ float3 operator/(float3 a, float3 b)
> {
>     return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
> }
> inline __host__ __device__ float3 operator/(float3 a, float s)
> {
>     float inv = 1.0f / s;
>     return a * inv;
> }
> inline __host__ __device__ float3 operator/(float s, float3 a) //Danny
> {
> //    float inv = 1.0f / s;
> //    return a * inv;
>     return make_float3(s / a.x, s / a.y, s / a.z);
> }
> inline __host__ __device__ void operator/=(float3 &a, float s)
> {
>     float inv = 1.0f / s;
>     a *= inv;
> }
>
> // lerp
> inline __device__ __host__ float3 lerp(float3 a, float3 b, float t)
> {
>     return a + t*(b-a);
> }
>
> // clamp
> inline __device__ __host__ float3 clamp(float3 v, float a, float b)
> {
>     return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
> }
>
> inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b)
> {
>     return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, 
> a.z, b.z));
> }
>
> // dot product
> inline __host__ __device__ float dot(float3 a, float3 b)
> {
>     return a.x * b.x + a.y * b.y + a.z * b.z;
> }
>
> // cross product
> inline __host__ __device__ float3 cross(float3 a, float3 b)
> {
>     return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - 
> a.y*b.x);
> }
>
> // length
> inline __host__ __device__ float length(float3 v)
> {
>     return sqrtf(dot(v, v));
> }
>
> // normalize
> inline __host__ __device__ float3 normalize(float3 v)
> {
>     float invLen = 1.0f / sqrtf(dot(v, v));
>     return v * invLen;
> }
>
> // floor
> inline __host__ __device__ float3 floor(const float3 v)
> {
>     return make_float3(floor(v.x), floor(v.y), floor(v.z));
> }
>
> // reflect
> inline __host__ __device__ float3 reflect(float3 i, float3 n)
> {
>     return i - 2.0f * n * dot(n,i);
> }
>
> // float4 functions
> ////////////////////////////////////////////////////////////////////////////////
>
> // additional constructors
> inline __host__ __device__ float4 make_float4(float s)
> {
>     return make_float4(s, s, s, s);
> }
> inline __host__ __device__ float4 make_float4(float3 a)
> {
>     return make_float4(a.x, a.y, a.z, 0.0f);
> }
> inline __host__ __device__ float4 make_float4(float3 a, float w)
> {
>     return make_float4(a.x, a.y, a.z, w);
> }
> inline __host__ __device__ float4 make_float4(int4 a)
> {
>     return make_float4(float(a.x), float(a.y), float(a.z), float(a.w));
> }
>
> // min
> static __inline__ __host__ __device__ float4 fminf(float4 a, float4 b)
> {
>     return make_float4(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z), 
> fminf(a.w,b.w));
> }
>
> // max
> static __inline__ __host__ __device__ float4 fmaxf(float4 a, float4 b)
> {
>     return make_float4(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z), 
> fmaxf(a.w,b.w));
> }
>
> // addition
> inline __host__ __device__ float4 operator+(float4 a, float4 b)
> {
>     return make_float4(a.x + b.x, a.y + b.y, a.z + b.z,  a.w + b.w);
> }
> inline __host__ __device__ void operator+=(float4 &a, float4 b)
> {
>     a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
> }
>
> // subtract
> inline __host__ __device__ float4 operator-(float4 a, float4 b)
> {
>     return make_float4(a.x - b.x, a.y - b.y, a.z - b.z,  a.w - b.w);
> }
> inline __host__ __device__ void operator-=(float4 &a, float4 b)
> {
>     a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w;
> }
>
> // multiply
> inline __host__ __device__ float4 operator*(float4 a, float s)
> {
>     return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
> }
> inline __host__ __device__ float4 operator*(float s, float4 a)
> {
>     return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
> }
> inline __host__ __device__ void operator*=(float4 &a, float s)
> {
>     a.x *= s; a.y *= s; a.z *= s; a.w *= s;
> }
>
> // divide
> inline __host__ __device__ float4 operator/(float4 a, float4 b)
> {
>     return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
> }
> inline __host__ __device__ float4 operator/(float4 a, float s)
> {
>     float inv = 1.0f / s;
>     return a * inv;
> }
> inline __host__ __device__ float4 operator/(float s, float4 a) //Danny
> {
> //    float inv = 1.0f / s;
> //    return a * inv;
>     return make_float4(s / a.x, s / a.y, s / a.z, s / a.w);
> }
> inline __host__ __device__ void operator/=(float4 &a, float s)
> {
>     float inv = 1.0f / s;
>     a *= inv;
> }
>
> // lerp
> inline __device__ __host__ float4 lerp(float4 a, float4 b, float t)
> {
>     return a + t*(b-a);
> }
>
> // clamp
> inline __device__ __host__ float4 clamp(float4 v, float a, float b)
> {
>     return make_float4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), 
> clamp(v.w, a, b));
> }
>
> inline __device__ __host__ float4 clamp(float4 v, float4 a, float4 b)
> {
>     return make_float4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, 
> a.z, b.z), clamp(v.w, a.w, b.w));
> }
>
> // dot product
> inline __host__ __device__ float dot(float4 a, float4 b)
> {
>     return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
> }
>
> // length
> inline __host__ __device__ float length(float4 r)
> {
>     return sqrtf(dot(r, r));
> }
>
> // normalize
> inline __host__ __device__ float4 normalize(float4 v)
> {
>     float invLen = 1.0f / sqrtf(dot(v, v));
>     return v * invLen;
> }
>
> // floor
> inline __host__ __device__ float4 floor(const float4 v)
> {
>     return make_float4(floor(v.x), floor(v.y), floor(v.z), floor(v.w));
> }
>
> // int3 functions
> ////////////////////////////////////////////////////////////////////////////////
>
> // additional constructors
> inline __host__ __device__ int3 make_int3(int s)
> {
>     return make_int3(s, s, s);
> }
> inline __host__ __device__ int3 make_int3(float3 a)
> {
>     return make_int3(int(a.x), int(a.y), int(a.z));
> }
>
> // min
> inline __host__ __device__ int3 min(int3 a, int3 b)
> {
>     return make_int3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z));
> }
>
> // max
> inline __host__ __device__ int3 max(int3 a, int3 b)
> {
>     return make_int3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z));
> }
>
> // addition
> inline __host__ __device__ int3 operator+(int3 a, int3 b)
> {
>     return make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
> }
> inline __host__ __device__ void operator+=(int3 &a, int3 b)
> {
>     a.x += b.x; a.y += b.y; a.z += b.z;
> }
>
> // subtract
> inline __host__ __device__ int3 operator-(int3 a, int3 b)
> {
>     return make_int3(a.x - b.x, a.y - b.y, a.z - b.z);
> }
>
> inline __host__ __device__ void operator-=(int3 &a, int3 b)
> {
>     a.x -= b.x; a.y -= b.y; a.z -= b.z;
> }
>
> // multiply
> inline __host__ __device__ int3 operator*(int3 a, int3 b)
> {
>     return make_int3(a.x * b.x, a.y * b.y, a.z * b.z);
> }
> inline __host__ __device__ int3 operator*(int3 a, int s)
> {
>     return make_int3(a.x * s, a.y * s, a.z * s);
> }
> inline __host__ __device__ int3 operator*(int s, int3 a)
> {
>     return make_int3(a.x * s, a.y * s, a.z * s);
> }
> inline __host__ __device__ void operator*=(int3 &a, int s)
> {
>     a.x *= s; a.y *= s; a.z *= s;
> }
>
> // divide
> inline __host__ __device__ int3 operator/(int3 a, int3 b)
> {
>     return make_int3(a.x / b.x, a.y / b.y, a.z / b.z);
> }
> inline __host__ __device__ int3 operator/(int3 a, int s)
> {
>     return make_int3(a.x / s, a.y / s, a.z / s);
> }
> inline __host__ __device__ int3 operator/(int s, int3 a)
> {
>     return make_int3(a.x / s, a.y / s, a.z / s);
> }
> inline __host__ __device__ void operator/=(int3 &a, int s)
> {
>     a.x /= s; a.y /= s; a.z /= s;
> }
>
> // clamp
> inline __device__ __host__ int clamp(int f, int a, int b)
> {
>     return max(a, min(f, b));
> }
>
> inline __device__ __host__ int3 clamp(int3 v, int a, int b)
> {
>     return make_int3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
> }
>
> inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b)
> {
>     return make_int3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, 
> a.z, b.z));
> }
>
>
> // uint3 functions
> ////////////////////////////////////////////////////////////////////////////////
>
> // additional constructors
> inline __host__ __device__ uint3 make_uint3(uint s)
> {
>     return make_uint3(s, s, s);
> }
> inline __host__ __device__ uint3 make_uint3(float3 a)
> {
>     return make_uint3(uint(a.x), uint(a.y), uint(a.z));
> }
>
> // min
> inline __host__ __device__ uint3 min(uint3 a, uint3 b)
> {
>     return make_uint3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z));
> }
>
> // max
> inline __host__ __device__ uint3 max(uint3 a, uint3 b)
> {
>     return make_uint3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z));
> }
>
> // addition
> inline __host__ __device__ uint3 operator+(uint3 a, uint3 b)
> {
>     return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z);
> }
> inline __host__ __device__ void operator+=(uint3 &a, uint3 b)
> {
>     a.x += b.x; a.y += b.y; a.z += b.z;
> }
>
> // subtract
> inline __host__ __device__ uint3 operator-(uint3 a, uint3 b)
> {
>     return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z);
> }
>
> inline __host__ __device__ void operator-=(uint3 &a, uint3 b)
> {
>     a.x -= b.x; a.y -= b.y; a.z -= b.z;
> }
>
> // multiply
> inline __host__ __device__ uint3 operator*(uint3 a, uint3 b)
> {
>     return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z);
> }
> inline __host__ __device__ uint3 operator*(uint3 a, uint s)
> {
>     return make_uint3(a.x * s, a.y * s, a.z * s);
> }
> inline __host__ __device__ uint3 operator*(uint s, uint3 a)
> {
>     return make_uint3(a.x * s, a.y * s, a.z * s);
> }
> inline __host__ __device__ void operator*=(uint3 &a, uint s)
> {
>     a.x *= s; a.y *= s; a.z *= s;
> }
>
> // divide
> inline __host__ __device__ uint3 operator/(uint3 a, uint3 b)
> {
>     return make_uint3(a.x / b.x, a.y / b.y, a.z / b.z);
> }
> inline __host__ __device__ uint3 operator/(uint3 a, uint s)
> {
>     return make_uint3(a.x / s, a.y / s, a.z / s);
> }
> inline __host__ __device__ uint3 operator/(uint s, uint3 a)
> {
>     return make_uint3(a.x / s, a.y / s, a.z / s);
> }
> inline __host__ __device__ void operator/=(uint3 &a, uint s)
> {
>     a.x /= s; a.y /= s; a.z /= s;
> }
>
> // clamp
> inline __device__ __host__ uint clamp(uint f, uint a, uint b)
> {
>     return max(a, min(f, b));
> }
>
> inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b)
> {
>     return make_uint3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
> }
>
> inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b)
> {
>     return make_uint3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, 
> a.z, b.z));
> }
>
>
>
> typedef unsigned int uint;
> typedef unsigned short ushort;
> typedef unsigned char uchar;
> typedef signed char schar;
>
> inline __device__ __host__ uint UMIN(uint a, uint b)
> {
>     return a < b ? a : b;
> }
>
> inline __device__ __host__ uint PowTwoDivider(uint n)
> {
>     if (n == 0) return 0;
>     uint divider = 1;
>     while ((n & divider) == 0) divider <<= 1;
>     return divider;
> }
>
> inline __host__ __device__ float2 operator-(float a, float2 b)
> {
>     return make_float2(a - b.x, a - b.y);
> }
>
> inline __host__ __device__ float3 operator-(float a, float3 b)
> {
>     return make_float3(a - b.x, a - b.y, a - b.z);
> }
>
>
>
> // Cubic B-spline function
> // The 3rd order Maximal Order and Minimum Support function, that is, it is
> // maximally differentiable.
> inline __host__ __device__ float bspline(float t)
> {
>     t = fabs(t);
>     const float a = 2.0f - t;
>
>     if (t < 1.0f) return 2.0f/3.0f - 0.5f*t*t*a;
>     else if (t < 2.0f) return a*a*a / 6.0f;
>     else return 0.0f;
> }
>
> // The first order derivative of the cubic B-spline
> inline __host__ __device__ float bspline_1st_derivative(float t)
> {
>     if (-2.0f < t && t <= -1.0f) return 0.5f*t*t + 2.0f*t + 2.0f;
>     else if (-1.0f < t && t <= 0.0f) return -1.5f*t*t - 2.0f*t;
>     else if ( 0.0f < t && t <= 1.0f) return  1.5f*t*t - 2.0f*t;
>     else if ( 1.0f < t && t <  2.0f) return -0.5f*t*t + 2.0f*t - 2.0f;
>     else return 0.0f;
> }
>
> // The second order derivative of the cubic B-spline
> inline __host__ __device__ float bspline_2nd_derivative(float t)
> {
>     t = fabs(t);
>
>     if (t < 1.0f) return 3.0f*t - 2.0f;
>     else if (t < 2.0f) return 2.0f - t;
>     else return 0.0f;
> }
>
> // Inline calculation of the bspline convolution weights, without conditional
> // statements
> inline __device__ void bspline_weights(float fraction, float& w0, float& w1, 
> float& w2, float& w3)
> {
>     const float one_frac = 1.0f - fraction;
>     const float squared = fraction * fraction;
>     const float one_sqd = one_frac * one_frac;
>
>     w0 = 1.0f/6.0f * one_sqd * one_frac;
>     w1 = 2.0f/3.0f - 0.5f * squared * (2.0f-fraction);
>     w2 = 2.0f/3.0f - 0.5f * one_sqd * (2.0f-one_frac);
>     w3 = 1.0f/6.0f * squared * fraction;
> }
>
> // Inline calculation of the first order derivative bspline convolution
> // weights, without conditional statements
> inline __device__ void bspline_weights_1st_derivative(float fraction, float& 
> w0, float& w1, float& w2, float& w3)
> {
>     const float squared = fraction * fraction;
>
>     w0 = -0.5f * squared + fraction - 0.5f;
>     w1 =  1.5f * squared - 2.0f * fraction;
>     w2 = -1.5f * squared + fraction + 0.5f;
>     w3 =  0.5f * squared;
> }
>
> // Inline calculation of the second order derivative bspline convolution
> // weights, without conditional statements
> inline __device__ void bspline_weights_2nd_derivative(float fraction, float& 
> w0, float& w1, float& w2, float& w3)
> {
>     w0 =  1.0f - fraction;
>     w1 =  3.0f * fraction - 2.0f;
>     w2 = -3.0f * fraction + 1.0f;
>     w3 =  fraction;
> }
>
>
> __device__ float cubicTex2DSimple(texture<float, 2> tex, float x, float y)
> {
>     // transform the coordinate from [0,extent] to [-0.5, extent-0.5]
>     const float2 coord_grid = make_float2(x - 0.5f, y - 0.5f);
>     float2 index = floor(coord_grid);
>     const float2 fraction = coord_grid - index;
>     index.x += 0.5f;  //move from [-0.5, extent-0.5] to [0, extent]
>     index.y += 0.5f;  //move from [-0.5, extent-0.5] to [0, extent]
>
>     float result = 0.0f;
>     for (float y=-1; y < 2.5f; y++)
>     {
>         float bsplineY = bspline(y-fraction.y);
>         float v = index.y + y;
>         for (float x=-1; x < 2.5f; x++)
>         {
>             float bsplineXY = bspline(x-fraction.x) * bsplineY;
>             float u = index.x + x;
>             result += bsplineXY * tex2D(tex, u, v);
>         }
>     }
>     return result;
> }
>
>
> #define CUBICTEX2D cubicTex2DSimple
> texture<float, 2> polbb, polbg, polgb, polgg;
>
> __global__ void compnkcross(int N, int ngrid, int ndeg2, float Theta, float 
> K_min, float K_max, float k_min, float k_max, float K, int asvec, float *y, 
> float *x, int *empstat)
> {
> int col = blockIdx.x*blockDim.x + threadIdx.x;
> if (col < N && empstat[col] == 0 && asvec == 0) {
>   y[col] = CUBICTEX2D(polbb, ((K-K_min)/(K_max-K_min))*ndeg2, 
> powf(((x[col]-k_min)/(k_max-k_min)),(1.0f/Theta))*ngrid);
> }
> if (col < N && empstat[col] == 0 && asvec == 1) {
>   y[col] = CUBICTEX2D(polgb, ((K-K_min)/(K_max-K_min))*ndeg2, 
> powf(((x[col]-k_min)/(k_max-k_min)),(1.0f/Theta))*ngrid);
> }
> if (col < N && empstat[col] == 1 && asvec == 0) {
>   y[col] = CUBICTEX2D(polbg, ((K-K_min)/(K_max-K_min))*ndeg2, 
> powf(((x[col]-k_min)/(k_max-k_min)),(1.0f/Theta))*ngrid);
> }
> if (col < N && empstat[col] == 1 && asvec == 1) {
>   y[col] = CUBICTEX2D(polgg, ((K-K_min)/(K_max-K_min))*ndeg2, 
> powf(((x[col]-k_min)/(k_max-k_min)),(1.0f/Theta))*ngrid);
> }
> }
>
> #undef CUBICTEX2D
>
> --
> Dr. Eric Michael Scheffel
> Assistant Professor in Economics
> Ningbo Nottingham University
> Site: http://www.ericscheffel.com
>
>
>
>
> ---------------------------
> Dr. Eric Michael Scheffel
> Assistant Professor in Economics
> Ningbo Nottingham University
> Site: http://www.ericscheffel.com
>
> This message and any attachment are intended solely for the addressee and may 
> contain confidential information. 
> If you have received this message in error, please send it back to me, and 
> immediately delete it. 
>   
> Please do not use, copy or disclose the information contained in this message 
> or in any attachment.  
>
> Any views or opinions expressed by the author of this email do not 
> necessarily reflect the views of The University of Nottingham Ningbo China.
>
>
> This message has been checked for viruses but the contents of an attachment 
> may still contain software viruses which could damage your computer system:
> you are advised to perform your own checks. 
>
> Email communications with The University of Nottingham Ningbo China may be 
> monitored as permitted by UK and Chinese legislation.
>
> _______________________________________________
> PyCUDA mailing list
> PyCUDA@tiker.net
> http://lists.tiker.net/listinfo/pycuda
>

Attachment: pgpi4NDdvrNLp.pgp
Description: PGP signature

_______________________________________________
PyCUDA mailing list
PyCUDA@tiker.net
http://lists.tiker.net/listinfo/pycuda

Reply via email to