[Cython] some advice on this module regarding performance?

Chris Colbert Tue, 29 Sep 2009 07:23:42 -0700

Normally, when I'm hitting a road block with numpy performance, I port
that section to cython and typically see an order of magnitude
increase in speed. (this is without disabling bounds checking) In my
current case, I'm seeing a slowdown of 2x (with boundschecking
disabled) and I'm pretty sure everything in my code is kosher. Would
anyone here mind looking at it and see if there is any glaring
ommision?


Here are the benchmarks:

For the cython version:

In [6]: superquadricfit._err_func_with_grad(guess, pts[:,0], pts[:,1],
pts[:,2], 0.5, False)
Out[6]: 200.0698654738344

In [7]: %timeit superquadricfit._err_func_with_grad(guess, pts[:,0],
pts[:,1], pts[:,2], 0.5, False)
1000 loops, best of 3: 882 µs per loop


and the numpy version:

In [11]: quad._err_func_with_grad(guess, pts[:,0], pts[:,1], pts[:,2],
0.5, False)
Out[11]: 200.0698654738344

In [12]: %timeit quad._err_func_with_grad(guess, pts[:,0], pts[:,1],
pts[:,2], 0.5, False)
1000 loops, best of 3: 438 µs per loop


I appreciate any help anyone can give

Cheers!

Chris

Here is the code:

#####   Cython Code #######

from __future__ import division
import numpy as np
cimport numpy as np

cdef extern from "math.h":
    double cos(double)
    double sin(double)
    double sqrt(double)
    double pow(double, double)

DTYPE = np.float64
ctypedef np.float64_t DTYPE_f64_t

cimport cython
@cython.boundscheck(False)
def _err_func_with_grad(params, np.ndarray[DTYPE_f64_t, ndim=1] xw,
                                np.ndarray[DTYPE_f64_t, ndim=1] yw,
                                np.ndarray[DTYPE_f64_t, ndim=1] zw,
                                double weight,
                                bool calc_grad):

    cdef double a1, a2, a3, e1, e2, phi, theta, psi, px, py, pz
    a1, a2, a3, e1, e2, phi, theta, psi, px, py, pz = params

    cdef double w1, w2, cphi, ctheta, cpsi, sphi, stheta, spsi, nx,
ny, nz, ox, oy, oz, ax, ay, az
    cdef double F, f1, f2, f3, err, insiders_err, temperr, sumerr

    cdef int N
    cdef int i

    cdef double C1, C2, C3

    # world point coordinates
    if len(xw) != len(yw) or len(xw) != len(zw):
        raise ValueError('Point vectors are not of equal length')

    N = len(xw)

    # the weights to place on the errors. w1 for all points
    # w2 for points inside the superquadric
    w2 = weight
    w1 = 1 - w2

    cphi = cos(phi)
    ctheta = cos(theta)
    cpsi = cos(psi)
    sphi = sin(phi)
    stheta = sin(theta)
    spsi = sin(psi)

    nx = cphi * ctheta * cpsi - sphi * spsi
    ny = sphi * ctheta * cpsi + cphi * spsi
    nz = -stheta * cpsi
    ox = -cphi * ctheta * spsi - sphi * cpsi
    oy = -sphi * ctheta * spsi + cphi * cpsi
    oz = stheta * spsi
    ax = cphi * stheta
    ay = sphi * stheta
    az = ctheta

    temperr = 0.0
    err = 0.0
    insiders_err = 0.0
    sumerr = 0.0

    C1 = px * nx - py * ny - pz * nz
    C2 = px * ox - py * oy - pz * oz
    C3 = px * ax - py * ay - pz * az

    for i in range(N):
        f1 = ((nx * xw[i] + ny * yw[i] + nz * zw[i] - C1) / a1)
        f2 = ((ox * xw[i] + oy * yw[i] + oz * zw[i] - C2) / a2)
        f3 = ((ax * xw[i] + ay * yw[i] + az * zw[i] - C3) / a3)


        F = ((f1**2)**(1/e2) + (f2**2)**(1/e2))**(e2/e1) + (f3**2)**(1/e1)

        temperr = (sqrt(a1 * a2 * a3) * (F**(e1) - 1))**2

        if F < 1.0:
            insiders_err += temperr

        err += temperr

    sumerr += w1 * err + w2 * insiders_err

    return sumerr



############ equivalent python code###############

def _err_func_with_grad(self, params, xw, yw, zw, weight, calc_grad):


    params = [float(arg) for arg in params]
    a1, a2, a3, e1, e2, phi, theta, psi, px, py, pz = params

    # world point coordinates
    if len(xw) != len(yw) or len(xw) != len(zw):
        raise ValueError('Point vectors are not of equal length')

    # the weights to place on the errors. w1 for all points
    # w2 for points inside the superquadric
    w2 = weight
    w1 = 1 - w2

    cphi = math.cos(phi)
    ctheta = math.cos(theta)
    cpsi = math.cos(psi)
    sphi = math.sin(phi)
    stheta = math.sin(theta)
    spsi = math.sin(psi)

    nx = cphi * ctheta * cpsi - sphi * spsi
    ny = sphi * ctheta * cpsi + cphi * spsi
    nz = -stheta * cpsi
    ox = -cphi * ctheta * spsi - sphi * cpsi
    oy = -sphi * ctheta * spsi + cphi * cpsi
    oz = stheta * spsi
    ax = cphi * stheta
    ay = sphi * stheta
    az = ctheta

    f1 = ((nx * xw + ny * yw + nz * zw - px * nx - py * ny - pz * nz) / a1)
    f2 = ((ox * xw + oy * yw + oz * zw - px * ox - py * oy - pz * oz) / a2)
    f3 = ((ax * xw + ay * yw + az * zw - px * ax - py * ay - pz * az) / a3)

    F = ((f1**2)**(1/e2) + (f2**2)**(1/e2))**(e2/e1) + (f3**2)**(1/e1)

    # get a mask for the points that lie inside the superquadric
    insiders_mask = F < 1

    # get the error associated only from the insiders
    insiders = F[insiders_mask]
    insiders_err = (math.sqrt(a1 * a2 * a3) * (insiders**(e1) - 1))**2

    # the square error of every point
    err = (math.sqrt(a1 * a2 * a3) * (F**(e1) - 1))**2

    # this is the final quantity to be minimized
    esum = err.sum()
    isum = insiders_err.sum()

    sumerr = w1*esum + w2*isum

    return sumerr
_______________________________________________
Cython-dev mailing list
[email protected]
http://codespeak.net/mailman/listinfo/cython-dev

[Cython] some advice on this module regarding performance?

Reply via email to