Dear Andreas,

Thanks again for making available the cuda wrapper library for python. It's 
great to use and helps me a lot in my own research. One problem I am facing at 
the moment is that I am trying to use Danny Ruijters' kernel code for using 
bicubic interpolation on nvidia GPU's, instead of the "built-in" bilinear ones. 
The code is available at http://www.dannyruijters.nl/cubicinterpolation/

Attaches is the kernel source code I am trying to run using pycuda. The problem 
is that even though it compiles using the "SourceModule" function (using the 
no_extern_c=True option), I cannot fetch the "compnkcross" kernel method using 
"get_function". I get the error message:

In [15]: modo.get_function('compnkcross')
---------------------------------------------------------------------------
LogicError                                Traceback (most recent call last)
<ipython-input-15-e2bfb6bd9c2d> in <module>()
----> 1 modo.get_function('compnkcross')

/usr/lib/python2.7/site-packages/pycuda-2013.1.1-py2.7-linux-i686.egg/pycuda/compiler.pyc
 in get_function(self, name)
    283
    284     def get_function(self, name):
--> 285         return self.module.get_function(name)

LogicError: cuModuleGetFunction failed: not found


I CAN fetch the texture references, but that is about it. The original source 
code for the kernel is distributed over a number of cu-files as well as at 
least one header file. I have simply taken them and substituted out to create 
one big monolithic kernel code without external dependencies. Being paranoid 
about not exactly knowing what pycuda supports I have also converted template 
kernel functions into specific cases which I need for running my own code, 
though I suspect that pycuda of course supports template functions.

Thanks,
Eric

(im Moment im kalten Karlsruhe)


////////////////////////////////////////////////////////////////////////////////
typedef unsigned int uint;
typedef unsigned short ushort;

// float functions
////////////////////////////////////////////////////////////////////////////////

// lerp
inline __device__ __host__ float lerp(float a, float b, float t)
{
    return a + t*(b-a);
}

// clamp
inline __device__ __host__ float clamp(float f, float a, float b)
{
    return fmaxf(a, fminf(f, b));
}

// int2 functions
////////////////////////////////////////////////////////////////////////////////

// addition
inline __host__ __device__ int2 operator+(int2 a, int2 b)
{
    return make_int2(a.x + b.x, a.y + b.y);
}
inline __host__ __device__ void operator+=(int2 &a, int2 b)
{
    a.x += b.x; a.y += b.y;
}

// subtract
inline __host__ __device__ int2 operator-(int2 a, int2 b)
{
    return make_int2(a.x - b.x, a.y - b.y);
}
inline __host__ __device__ void operator-=(int2 &a, int2 b)
{
    a.x -= b.x; a.y -= b.y;
}

// multiply
inline __host__ __device__ int2 operator*(int2 a, int2 b)
{
    return make_int2(a.x * b.x, a.y * b.y);
}
inline __host__ __device__ int2 operator*(int2 a, int s)
{
    return make_int2(a.x * s, a.y * s);
}
inline __host__ __device__ int2 operator*(int s, int2 a)
{
    return make_int2(a.x * s, a.y * s);
}
inline __host__ __device__ void operator*=(int2 &a, int s)
{
    a.x *= s; a.y *= s;
}

// float2 functions
////////////////////////////////////////////////////////////////////////////////

// additional constructors
inline __host__ __device__ float2 make_float2(float s)
{
    return make_float2(s, s);
}
inline __host__ __device__ float2 make_float2(int2 a)
{
    return make_float2(float(a.x), float(a.y));
}

// addition
inline __host__ __device__ float2 operator+(float2 a, float2 b)
{
    return make_float2(a.x + b.x, a.y + b.y);
}
inline __host__ __device__ void operator+=(float2 &a, float2 b)
{
    a.x += b.x; a.y += b.y;
}

// subtract
inline __host__ __device__ float2 operator-(float2 a, float2 b)
{
    return make_float2(a.x - b.x, a.y - b.y);
}
inline __host__ __device__ void operator-=(float2 &a, float2 b)
{
    a.x -= b.x; a.y -= b.y;
}

// multiply
inline __host__ __device__ float2 operator*(float2 a, float2 b)
{
    return make_float2(a.x * b.x, a.y * b.y);
}
inline __host__ __device__ float2 operator*(float2 a, float s)
{
    return make_float2(a.x * s, a.y * s);
}
inline __host__ __device__ float2 operator*(float s, float2 a)
{
    return make_float2(a.x * s, a.y * s);
}
inline __host__ __device__ void operator*=(float2 &a, float s)
{
    a.x *= s; a.y *= s;
}

// divide
inline __host__ __device__ float2 operator/(float2 a, float2 b)
{
    return make_float2(a.x / b.x, a.y / b.y);
}
inline __host__ __device__ float2 operator/(float2 a, float s)
{
    float inv = 1.0f / s;
    return a * inv;
}
inline __host__ __device__ float2 operator/(float s, float2 a) //Danny
{
//    float inv = 1.0f / s;
//    return a * inv;
    return make_float2(s / a.x, s / a.y);
}
inline __host__ __device__ void operator/=(float2 &a, float s)
{
    float inv = 1.0f / s;
    a *= inv;
}

// lerp
inline __device__ __host__ float2 lerp(float2 a, float2 b, float t)
{
    return a + t*(b-a);
}

// clamp
inline __device__ __host__ float2 clamp(float2 v, float a, float b)
{
    return make_float2(clamp(v.x, a, b), clamp(v.y, a, b));
}

inline __device__ __host__ float2 clamp(float2 v, float2 a, float2 b)
{
    return make_float2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
}

// dot product
inline __host__ __device__ float dot(float2 a, float2 b)
{
    return a.x * b.x + a.y * b.y;
}

// length
inline __host__ __device__ float length(float2 v)
{
    return sqrtf(dot(v, v));
}

// normalize
inline __host__ __device__ float2 normalize(float2 v)
{
    float invLen = 1.0f / sqrtf(dot(v, v));
    return v * invLen;
}

// floor
inline __host__ __device__ float2 floor(const float2 v)
{
    return make_float2(floor(v.x), floor(v.y));
}

// reflect
inline __host__ __device__ float2 reflect(float2 i, float2 n)
{
    return i - 2.0f * n * dot(n,i);
}

// float3 functions
////////////////////////////////////////////////////////////////////////////////

// additional constructors
inline __host__ __device__ float3 make_float3(float s)
{
    return make_float3(s, s, s);
}
inline __host__ __device__ float3 make_float3(float2 a)
{
    return make_float3(a.x, a.y, 0.0f);
}
inline __host__ __device__ float3 make_float3(float2 a, float s)
{
    return make_float3(a.x, a.y, s);
}
inline __host__ __device__ float3 make_float3(float4 a)
{
    return make_float3(a.x, a.y, a.z);  // discards w
}
inline __host__ __device__ float3 make_float3(int3 a)
{
    return make_float3(float(a.x), float(a.y), float(a.z));
}

// min
static __inline__ __host__ __device__ float3 fminf(float3 a, float3 b)
{
    return make_float3(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z));
}

// max
static __inline__ __host__ __device__ float3 fmaxf(float3 a, float3 b)
{
    return make_float3(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z));
}

// addition
inline __host__ __device__ float3 operator+(float3 a, float3 b)
{
    return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
}
inline __host__ __device__ float3 operator+(float3 a, float b)
{
    return make_float3(a.x + b, a.y + b, a.z + b);
}
inline __host__ __device__ void operator+=(float3 &a, float3 b)
{
    a.x += b.x; a.y += b.y; a.z += b.z;
}

// subtract
inline __host__ __device__ float3 operator-(float3 a, float3 b)
{
    return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
}
inline __host__ __device__ float3 operator-(float3 a, float b)
{
    return make_float3(a.x - b, a.y - b, a.z - b);
}
inline __host__ __device__ void operator-=(float3 &a, float3 b)
{
    a.x -= b.x; a.y -= b.y; a.z -= b.z;
}

// multiply
inline __host__ __device__ float3 operator*(float3 a, float3 b)
{
    return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
}
inline __host__ __device__ float3 operator*(float3 a, float s)
{
    return make_float3(a.x * s, a.y * s, a.z * s);
}
inline __host__ __device__ float3 operator*(float s, float3 a)
{
    return make_float3(a.x * s, a.y * s, a.z * s);
}
inline __host__ __device__ void operator*=(float3 &a, float s)
{
    a.x *= s; a.y *= s; a.z *= s;
}

// divide
inline __host__ __device__ float3 operator/(float3 a, float3 b)
{
    return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
}
inline __host__ __device__ float3 operator/(float3 a, float s)
{
    float inv = 1.0f / s;
    return a * inv;
}
inline __host__ __device__ float3 operator/(float s, float3 a) //Danny
{
//    float inv = 1.0f / s;
//    return a * inv;
    return make_float3(s / a.x, s / a.y, s / a.z);
}
inline __host__ __device__ void operator/=(float3 &a, float s)
{
    float inv = 1.0f / s;
    a *= inv;
}

// lerp
inline __device__ __host__ float3 lerp(float3 a, float3 b, float t)
{
    return a + t*(b-a);
}

// clamp
inline __device__ __host__ float3 clamp(float3 v, float a, float b)
{
    return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
}

inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b)
{
    return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, 
a.z, b.z));
}

// dot product
inline __host__ __device__ float dot(float3 a, float3 b)
{
    return a.x * b.x + a.y * b.y + a.z * b.z;
}

// cross product
inline __host__ __device__ float3 cross(float3 a, float3 b)
{
    return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x);
}

// length
inline __host__ __device__ float length(float3 v)
{
    return sqrtf(dot(v, v));
}

// normalize
inline __host__ __device__ float3 normalize(float3 v)
{
    float invLen = 1.0f / sqrtf(dot(v, v));
    return v * invLen;
}

// floor
inline __host__ __device__ float3 floor(const float3 v)
{
    return make_float3(floor(v.x), floor(v.y), floor(v.z));
}

// reflect
inline __host__ __device__ float3 reflect(float3 i, float3 n)
{
    return i - 2.0f * n * dot(n,i);
}

// float4 functions
////////////////////////////////////////////////////////////////////////////////

// additional constructors
inline __host__ __device__ float4 make_float4(float s)
{
    return make_float4(s, s, s, s);
}
inline __host__ __device__ float4 make_float4(float3 a)
{
    return make_float4(a.x, a.y, a.z, 0.0f);
}
inline __host__ __device__ float4 make_float4(float3 a, float w)
{
    return make_float4(a.x, a.y, a.z, w);
}
inline __host__ __device__ float4 make_float4(int4 a)
{
    return make_float4(float(a.x), float(a.y), float(a.z), float(a.w));
}

// min
static __inline__ __host__ __device__ float4 fminf(float4 a, float4 b)
{
    return make_float4(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z), 
fminf(a.w,b.w));
}

// max
static __inline__ __host__ __device__ float4 fmaxf(float4 a, float4 b)
{
    return make_float4(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z), 
fmaxf(a.w,b.w));
}

// addition
inline __host__ __device__ float4 operator+(float4 a, float4 b)
{
    return make_float4(a.x + b.x, a.y + b.y, a.z + b.z,  a.w + b.w);
}
inline __host__ __device__ void operator+=(float4 &a, float4 b)
{
    a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
}

// subtract
inline __host__ __device__ float4 operator-(float4 a, float4 b)
{
    return make_float4(a.x - b.x, a.y - b.y, a.z - b.z,  a.w - b.w);
}
inline __host__ __device__ void operator-=(float4 &a, float4 b)
{
    a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w;
}

// multiply
inline __host__ __device__ float4 operator*(float4 a, float s)
{
    return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
}
inline __host__ __device__ float4 operator*(float s, float4 a)
{
    return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
}
inline __host__ __device__ void operator*=(float4 &a, float s)
{
    a.x *= s; a.y *= s; a.z *= s; a.w *= s;
}

// divide
inline __host__ __device__ float4 operator/(float4 a, float4 b)
{
    return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
}
inline __host__ __device__ float4 operator/(float4 a, float s)
{
    float inv = 1.0f / s;
    return a * inv;
}
inline __host__ __device__ float4 operator/(float s, float4 a) //Danny
{
//    float inv = 1.0f / s;
//    return a * inv;
    return make_float4(s / a.x, s / a.y, s / a.z, s / a.w);
}
inline __host__ __device__ void operator/=(float4 &a, float s)
{
    float inv = 1.0f / s;
    a *= inv;
}

// lerp
inline __device__ __host__ float4 lerp(float4 a, float4 b, float t)
{
    return a + t*(b-a);
}

// clamp
inline __device__ __host__ float4 clamp(float4 v, float a, float b)
{
    return make_float4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), 
clamp(v.w, a, b));
}

inline __device__ __host__ float4 clamp(float4 v, float4 a, float4 b)
{
    return make_float4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, 
a.z, b.z), clamp(v.w, a.w, b.w));
}

// dot product
inline __host__ __device__ float dot(float4 a, float4 b)
{
    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
}

// length
inline __host__ __device__ float length(float4 r)
{
    return sqrtf(dot(r, r));
}

// normalize
inline __host__ __device__ float4 normalize(float4 v)
{
    float invLen = 1.0f / sqrtf(dot(v, v));
    return v * invLen;
}

// floor
inline __host__ __device__ float4 floor(const float4 v)
{
    return make_float4(floor(v.x), floor(v.y), floor(v.z), floor(v.w));
}

// int3 functions
////////////////////////////////////////////////////////////////////////////////

// additional constructors
inline __host__ __device__ int3 make_int3(int s)
{
    return make_int3(s, s, s);
}
inline __host__ __device__ int3 make_int3(float3 a)
{
    return make_int3(int(a.x), int(a.y), int(a.z));
}

// min
inline __host__ __device__ int3 min(int3 a, int3 b)
{
    return make_int3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z));
}

// max
inline __host__ __device__ int3 max(int3 a, int3 b)
{
    return make_int3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z));
}

// addition
inline __host__ __device__ int3 operator+(int3 a, int3 b)
{
    return make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
}
inline __host__ __device__ void operator+=(int3 &a, int3 b)
{
    a.x += b.x; a.y += b.y; a.z += b.z;
}

// subtract
inline __host__ __device__ int3 operator-(int3 a, int3 b)
{
    return make_int3(a.x - b.x, a.y - b.y, a.z - b.z);
}

inline __host__ __device__ void operator-=(int3 &a, int3 b)
{
    a.x -= b.x; a.y -= b.y; a.z -= b.z;
}

// multiply
inline __host__ __device__ int3 operator*(int3 a, int3 b)
{
    return make_int3(a.x * b.x, a.y * b.y, a.z * b.z);
}
inline __host__ __device__ int3 operator*(int3 a, int s)
{
    return make_int3(a.x * s, a.y * s, a.z * s);
}
inline __host__ __device__ int3 operator*(int s, int3 a)
{
    return make_int3(a.x * s, a.y * s, a.z * s);
}
inline __host__ __device__ void operator*=(int3 &a, int s)
{
    a.x *= s; a.y *= s; a.z *= s;
}

// divide
inline __host__ __device__ int3 operator/(int3 a, int3 b)
{
    return make_int3(a.x / b.x, a.y / b.y, a.z / b.z);
}
inline __host__ __device__ int3 operator/(int3 a, int s)
{
    return make_int3(a.x / s, a.y / s, a.z / s);
}
inline __host__ __device__ int3 operator/(int s, int3 a)
{
    return make_int3(a.x / s, a.y / s, a.z / s);
}
inline __host__ __device__ void operator/=(int3 &a, int s)
{
    a.x /= s; a.y /= s; a.z /= s;
}

// clamp
inline __device__ __host__ int clamp(int f, int a, int b)
{
    return max(a, min(f, b));
}

inline __device__ __host__ int3 clamp(int3 v, int a, int b)
{
    return make_int3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
}

inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b)
{
    return make_int3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, 
a.z, b.z));
}


// uint3 functions
////////////////////////////////////////////////////////////////////////////////

// additional constructors
inline __host__ __device__ uint3 make_uint3(uint s)
{
    return make_uint3(s, s, s);
}
inline __host__ __device__ uint3 make_uint3(float3 a)
{
    return make_uint3(uint(a.x), uint(a.y), uint(a.z));
}

// min
inline __host__ __device__ uint3 min(uint3 a, uint3 b)
{
    return make_uint3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z));
}

// max
inline __host__ __device__ uint3 max(uint3 a, uint3 b)
{
    return make_uint3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z));
}

// addition
inline __host__ __device__ uint3 operator+(uint3 a, uint3 b)
{
    return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z);
}
inline __host__ __device__ void operator+=(uint3 &a, uint3 b)
{
    a.x += b.x; a.y += b.y; a.z += b.z;
}

// subtract
inline __host__ __device__ uint3 operator-(uint3 a, uint3 b)
{
    return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z);
}

inline __host__ __device__ void operator-=(uint3 &a, uint3 b)
{
    a.x -= b.x; a.y -= b.y; a.z -= b.z;
}

// multiply
inline __host__ __device__ uint3 operator*(uint3 a, uint3 b)
{
    return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z);
}
inline __host__ __device__ uint3 operator*(uint3 a, uint s)
{
    return make_uint3(a.x * s, a.y * s, a.z * s);
}
inline __host__ __device__ uint3 operator*(uint s, uint3 a)
{
    return make_uint3(a.x * s, a.y * s, a.z * s);
}
inline __host__ __device__ void operator*=(uint3 &a, uint s)
{
    a.x *= s; a.y *= s; a.z *= s;
}

// divide
inline __host__ __device__ uint3 operator/(uint3 a, uint3 b)
{
    return make_uint3(a.x / b.x, a.y / b.y, a.z / b.z);
}
inline __host__ __device__ uint3 operator/(uint3 a, uint s)
{
    return make_uint3(a.x / s, a.y / s, a.z / s);
}
inline __host__ __device__ uint3 operator/(uint s, uint3 a)
{
    return make_uint3(a.x / s, a.y / s, a.z / s);
}
inline __host__ __device__ void operator/=(uint3 &a, uint s)
{
    a.x /= s; a.y /= s; a.z /= s;
}

// clamp
inline __device__ __host__ uint clamp(uint f, uint a, uint b)
{
    return max(a, min(f, b));
}

inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b)
{
    return make_uint3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
}

inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b)
{
    return make_uint3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, 
a.z, b.z));
}



typedef unsigned int uint;
typedef unsigned short ushort;
typedef unsigned char uchar;
typedef signed char schar;

inline __device__ __host__ uint UMIN(uint a, uint b)
{
    return a < b ? a : b;
}

inline __device__ __host__ uint PowTwoDivider(uint n)
{
    if (n == 0) return 0;
    uint divider = 1;
    while ((n & divider) == 0) divider <<= 1;
    return divider;
}

inline __host__ __device__ float2 operator-(float a, float2 b)
{
    return make_float2(a - b.x, a - b.y);
}

inline __host__ __device__ float3 operator-(float a, float3 b)
{
    return make_float3(a - b.x, a - b.y, a - b.z);
}



// Cubic B-spline function
// The 3rd order Maximal Order and Minimum Support function, that it is 
maximally differentiable.
inline __host__ __device__ float bspline(float t)
{
    t = fabs(t);
    const float a = 2.0f - t;

    if (t < 1.0f) return 2.0f/3.0f - 0.5f*t*t*a;
    else if (t < 2.0f) return a*a*a / 6.0f;
    else return 0.0f;
}

// The first order derivative of the cubic B-spline
inline __host__ __device__ float bspline_1st_derivative(float t)
{
    if (-2.0f < t && t <= -1.0f) return 0.5f*t*t + 2.0f*t + 2.0f;
    else if (-1.0f < t && t <= 0.0f) return -1.5f*t*t - 2.0f*t;
    else if ( 0.0f < t && t <= 1.0f) return  1.5f*t*t - 2.0f*t;
    else if ( 1.0f < t && t <  2.0f) return -0.5f*t*t + 2.0f*t - 2.0f;
    else return 0.0f;
}

// The second order derivative of the cubic B-spline
inline __host__ __device__ float bspline_2nd_derivative(float t)
{
    t = fabs(t);

    if (t < 1.0f) return 3.0f*t - 2.0f;
    else if (t < 2.0f) return 2.0f - t;
    else return 0.0f;
}

// Inline calculation of the bspline convolution weights, without conditional 
statements
inline __device__ void bspline_weights(float fraction, float& w0, float& w1, 
float& w2, float& w3)
{
    const float one_frac = 1.0f - fraction;
    const float squared = fraction * fraction;
    const float one_sqd = one_frac * one_frac;

    w0 = 1.0f/6.0f * one_sqd * one_frac;
    w1 = 2.0f/3.0f - 0.5f * squared * (2.0f-fraction);
    w2 = 2.0f/3.0f - 0.5f * one_sqd * (2.0f-one_frac);
    w3 = 1.0f/6.0f * squared * fraction;
}

// Inline calculation of the first order derivative bspline convolution 
weights, without conditional statements
inline __device__ void bspline_weights_1st_derivative(float fraction, float& 
w0, float& w1, float& w2, float& w3)
{
    const float squared = fraction * fraction;

    w0 = -0.5f * squared + fraction - 0.5f;
    w1 =  1.5f * squared - 2.0f * fraction;
    w2 = -1.5f * squared + fraction + 0.5f;
    w3 =  0.5f * squared;
}

// Inline calculation of the second order derivative bspline convolution 
weights, without conditional statements
inline __device__ void bspline_weights_2nd_derivative(float fraction, float& 
w0, float& w1, float& w2, float& w3)
{
    w0 =  1.0f - fraction;
    w1 =  3.0f * fraction - 2.0f;
    w2 = -3.0f * fraction + 1.0f;
    w3 =  fraction;
}


__device__ float cubicTex2DSimple(texture<float, 2> tex, float x, float y)
{
    // transform the coordinate from [0,extent] to [-0.5, extent-0.5]
    const float2 coord_grid = make_float2(x - 0.5f, y - 0.5f);
    float2 index = floor(coord_grid);
    const float2 fraction = coord_grid - index;
    index.x += 0.5f;  //move from [-0.5, extent-0.5] to [0, extent]
    index.y += 0.5f;  //move from [-0.5, extent-0.5] to [0, extent]

    float result = 0.0f;
    for (float y=-1; y < 2.5f; y++)
    {
        float bsplineY = bspline(y-fraction.y);
        float v = index.y + y;
        for (float x=-1; x < 2.5f; x++)
        {
            float bsplineXY = bspline(x-fraction.x) * bsplineY;
            float u = index.x + x;
            result += bsplineXY * tex2D(tex, u, v);
        }
    }
    return result;
}


#define CUBICTEX2D cubicTex2DSimple
texture<float, 2> polbb, polbg, polgb, polgg;

__global__ void compnkcross(int N, int ngrid, int ndeg2, float Theta, float 
K_min, float K_max, float k_min, float k_max, float K, int asvec, float *y, 
float *x, int *empstat)
{
int col = blockIdx.x*blockDim.x + threadIdx.x;
if (col < N && empstat[col] == 0 && asvec == 0) {
  y[col] = CUBICTEX2D(polbb, ((K-K_min)/(K_max-K_min))*ndeg2, 
powf(((x[col]-k_min)/(k_max-k_min)),(1.0f/Theta))*ngrid);
}
if (col < N && empstat[col] == 0 && asvec == 1) {
  y[col] = CUBICTEX2D(polgb, ((K-K_min)/(K_max-K_min))*ndeg2, 
powf(((x[col]-k_min)/(k_max-k_min)),(1.0f/Theta))*ngrid);
}
if (col < N && empstat[col] == 1 && asvec == 0) {
  y[col] = CUBICTEX2D(polbg, ((K-K_min)/(K_max-K_min))*ndeg2, 
powf(((x[col]-k_min)/(k_max-k_min)),(1.0f/Theta))*ngrid);
}
if (col < N && empstat[col] == 1 && asvec == 1) {
  y[col] = CUBICTEX2D(polgg, ((K-K_min)/(K_max-K_min))*ndeg2, 
powf(((x[col]-k_min)/(k_max-k_min)),(1.0f/Theta))*ngrid);
}
}

#undef CUBICTEX2D

--
Dr. Eric Michael Scheffel
Assistant Professor in Economics
Ningbo Nottingham University
Site: http://www.ericscheffel.com




---------------------------
Dr. Eric Michael Scheffel
Assistant Professor in Economics
Ningbo Nottingham University
Site: http://www.ericscheffel.com

This message and any attachment are intended solely for the addressee and may 
contain confidential information. 
If you have received this message in error, please send it back to me, and 
immediately delete it. 
  
Please do not use, copy or disclose the information contained in this message 
or in any attachment.  

Any views or opinions expressed by the author of this email do not necessarily 
reflect the views of The University of Nottingham Ningbo China.


This message has been checked for viruses but the contents of an attachment may 
still contain software viruses which could damage your computer system:
you are advised to perform your own checks. 

Email communications with The University of Nottingham Ningbo China may be 
monitored as permitted by UK and Chinese legislation.

_______________________________________________
PyCUDA mailing list
[email protected]
http://lists.tiker.net/listinfo/pycuda

Reply via email to