Normally, when I'm hitting a road block with numpy performance, I port
that section to cython and typically see an order of magnitude
increase in speed. (this is without disabling bounds checking) In my
current case, I'm seeing a slowdown of 2x (with boundschecking
disabled) and I'm pretty sure everything in my code is kosher. Would
anyone here mind looking at it and see if there is any glaring
ommision?
Here are the benchmarks:
For the cython version:
In [6]: superquadricfit._err_func_with_grad(guess, pts[:,0], pts[:,1],
pts[:,2], 0.5, False)
Out[6]: 200.0698654738344
In [7]: %timeit superquadricfit._err_func_with_grad(guess, pts[:,0],
pts[:,1], pts[:,2], 0.5, False)
1000 loops, best of 3: 882 µs per loop
and the numpy version:
In [11]: quad._err_func_with_grad(guess, pts[:,0], pts[:,1], pts[:,2],
0.5, False)
Out[11]: 200.0698654738344
In [12]: %timeit quad._err_func_with_grad(guess, pts[:,0], pts[:,1],
pts[:,2], 0.5, False)
1000 loops, best of 3: 438 µs per loop
I appreciate any help anyone can give
Cheers!
Chris
Here is the code:
##### Cython Code #######
from __future__ import division
import numpy as np
cimport numpy as np
cdef extern from "math.h":
double cos(double)
double sin(double)
double sqrt(double)
double pow(double, double)
DTYPE = np.float64
ctypedef np.float64_t DTYPE_f64_t
cimport cython
@cython.boundscheck(False)
def _err_func_with_grad(params, np.ndarray[DTYPE_f64_t, ndim=1] xw,
np.ndarray[DTYPE_f64_t, ndim=1] yw,
np.ndarray[DTYPE_f64_t, ndim=1] zw,
double weight,
bool calc_grad):
cdef double a1, a2, a3, e1, e2, phi, theta, psi, px, py, pz
a1, a2, a3, e1, e2, phi, theta, psi, px, py, pz = params
cdef double w1, w2, cphi, ctheta, cpsi, sphi, stheta, spsi, nx,
ny, nz, ox, oy, oz, ax, ay, az
cdef double F, f1, f2, f3, err, insiders_err, temperr, sumerr
cdef int N
cdef int i
cdef double C1, C2, C3
# world point coordinates
if len(xw) != len(yw) or len(xw) != len(zw):
raise ValueError('Point vectors are not of equal length')
N = len(xw)
# the weights to place on the errors. w1 for all points
# w2 for points inside the superquadric
w2 = weight
w1 = 1 - w2
cphi = cos(phi)
ctheta = cos(theta)
cpsi = cos(psi)
sphi = sin(phi)
stheta = sin(theta)
spsi = sin(psi)
nx = cphi * ctheta * cpsi - sphi * spsi
ny = sphi * ctheta * cpsi + cphi * spsi
nz = -stheta * cpsi
ox = -cphi * ctheta * spsi - sphi * cpsi
oy = -sphi * ctheta * spsi + cphi * cpsi
oz = stheta * spsi
ax = cphi * stheta
ay = sphi * stheta
az = ctheta
temperr = 0.0
err = 0.0
insiders_err = 0.0
sumerr = 0.0
C1 = px * nx - py * ny - pz * nz
C2 = px * ox - py * oy - pz * oz
C3 = px * ax - py * ay - pz * az
for i in range(N):
f1 = ((nx * xw[i] + ny * yw[i] + nz * zw[i] - C1) / a1)
f2 = ((ox * xw[i] + oy * yw[i] + oz * zw[i] - C2) / a2)
f3 = ((ax * xw[i] + ay * yw[i] + az * zw[i] - C3) / a3)
F = ((f1**2)**(1/e2) + (f2**2)**(1/e2))**(e2/e1) + (f3**2)**(1/e1)
temperr = (sqrt(a1 * a2 * a3) * (F**(e1) - 1))**2
if F < 1.0:
insiders_err += temperr
err += temperr
sumerr += w1 * err + w2 * insiders_err
return sumerr
############ equivalent python code###############
def _err_func_with_grad(self, params, xw, yw, zw, weight, calc_grad):
params = [float(arg) for arg in params]
a1, a2, a3, e1, e2, phi, theta, psi, px, py, pz = params
# world point coordinates
if len(xw) != len(yw) or len(xw) != len(zw):
raise ValueError('Point vectors are not of equal length')
# the weights to place on the errors. w1 for all points
# w2 for points inside the superquadric
w2 = weight
w1 = 1 - w2
cphi = math.cos(phi)
ctheta = math.cos(theta)
cpsi = math.cos(psi)
sphi = math.sin(phi)
stheta = math.sin(theta)
spsi = math.sin(psi)
nx = cphi * ctheta * cpsi - sphi * spsi
ny = sphi * ctheta * cpsi + cphi * spsi
nz = -stheta * cpsi
ox = -cphi * ctheta * spsi - sphi * cpsi
oy = -sphi * ctheta * spsi + cphi * cpsi
oz = stheta * spsi
ax = cphi * stheta
ay = sphi * stheta
az = ctheta
f1 = ((nx * xw + ny * yw + nz * zw - px * nx - py * ny - pz * nz) / a1)
f2 = ((ox * xw + oy * yw + oz * zw - px * ox - py * oy - pz * oz) / a2)
f3 = ((ax * xw + ay * yw + az * zw - px * ax - py * ay - pz * az) / a3)
F = ((f1**2)**(1/e2) + (f2**2)**(1/e2))**(e2/e1) + (f3**2)**(1/e1)
# get a mask for the points that lie inside the superquadric
insiders_mask = F < 1
# get the error associated only from the insiders
insiders = F[insiders_mask]
insiders_err = (math.sqrt(a1 * a2 * a3) * (insiders**(e1) - 1))**2
# the square error of every point
err = (math.sqrt(a1 * a2 * a3) * (F**(e1) - 1))**2
# this is the final quantity to be minimized
esum = err.sum()
isum = insiders_err.sum()
sumerr = w1*esum + w2*isum
return sumerr
_______________________________________________
Cython-dev mailing list
[email protected]
http://codespeak.net/mailman/listinfo/cython-dev