Chris Colbert wrote:
> Normally, when I'm hitting a road block with numpy performance, I port
> that section to cython and typically see an order of magnitude
> increase in speed. (this is without disabling bounds checking) In my
> current case, I'm seeing a slowdown of 2x (with boundschecking
> disabled) and I'm pretty sure everything in my code is kosher. Would
> anyone here mind looking at it and see if there is any glaring
> ommision?
>
Strange. I take it "cython -a" doesn't reveal anything? Could you just
attach the HTML report so that I can have a (quicker) look?
Dag Sverre
> Here are the benchmarks:
>
> For the cython version:
>
> In [6]: superquadricfit._err_func_with_grad(guess, pts[:,0], pts[:,1],
> pts[:,2], 0.5, False)
> Out[6]: 200.0698654738344
>
> In [7]: %timeit superquadricfit._err_func_with_grad(guess, pts[:,0],
> pts[:,1], pts[:,2], 0.5, False)
> 1000 loops, best of 3: 882 µs per loop
>
>
> and the numpy version:
>
> In [11]: quad._err_func_with_grad(guess, pts[:,0], pts[:,1], pts[:,2],
> 0.5, False)
> Out[11]: 200.0698654738344
>
> In [12]: %timeit quad._err_func_with_grad(guess, pts[:,0], pts[:,1],
> pts[:,2], 0.5, False)
> 1000 loops, best of 3: 438 µs per loop
>
>
> I appreciate any help anyone can give
>
> Cheers!
>
> Chris
>
> Here is the code:
>
> ##### Cython Code #######
>
> from __future__ import division
> import numpy as np
> cimport numpy as np
>
> cdef extern from "math.h":
> double cos(double)
> double sin(double)
> double sqrt(double)
> double pow(double, double)
>
> DTYPE = np.float64
> ctypedef np.float64_t DTYPE_f64_t
>
> cimport cython
> @cython.boundscheck(False)
> def _err_func_with_grad(params, np.ndarray[DTYPE_f64_t, ndim=1] xw,
> np.ndarray[DTYPE_f64_t, ndim=1] yw,
> np.ndarray[DTYPE_f64_t, ndim=1] zw,
> double weight,
> bool calc_grad):
>
> cdef double a1, a2, a3, e1, e2, phi, theta, psi, px, py, pz
> a1, a2, a3, e1, e2, phi, theta, psi, px, py, pz = params
>
> cdef double w1, w2, cphi, ctheta, cpsi, sphi, stheta, spsi, nx,
> ny, nz, ox, oy, oz, ax, ay, az
> cdef double F, f1, f2, f3, err, insiders_err, temperr, sumerr
>
> cdef int N
> cdef int i
>
> cdef double C1, C2, C3
>
> # world point coordinates
> if len(xw) != len(yw) or len(xw) != len(zw):
> raise ValueError('Point vectors are not of equal length')
>
> N = len(xw)
>
> # the weights to place on the errors. w1 for all points
> # w2 for points inside the superquadric
> w2 = weight
> w1 = 1 - w2
>
> cphi = cos(phi)
> ctheta = cos(theta)
> cpsi = cos(psi)
> sphi = sin(phi)
> stheta = sin(theta)
> spsi = sin(psi)
>
> nx = cphi * ctheta * cpsi - sphi * spsi
> ny = sphi * ctheta * cpsi + cphi * spsi
> nz = -stheta * cpsi
> ox = -cphi * ctheta * spsi - sphi * cpsi
> oy = -sphi * ctheta * spsi + cphi * cpsi
> oz = stheta * spsi
> ax = cphi * stheta
> ay = sphi * stheta
> az = ctheta
>
> temperr = 0.0
> err = 0.0
> insiders_err = 0.0
> sumerr = 0.0
>
> C1 = px * nx - py * ny - pz * nz
> C2 = px * ox - py * oy - pz * oz
> C3 = px * ax - py * ay - pz * az
>
> for i in range(N):
> f1 = ((nx * xw[i] + ny * yw[i] + nz * zw[i] - C1) / a1)
> f2 = ((ox * xw[i] + oy * yw[i] + oz * zw[i] - C2) / a2)
> f3 = ((ax * xw[i] + ay * yw[i] + az * zw[i] - C3) / a3)
>
>
> F = ((f1**2)**(1/e2) + (f2**2)**(1/e2))**(e2/e1) + (f3**2)**(1/e1)
>
> temperr = (sqrt(a1 * a2 * a3) * (F**(e1) - 1))**2
>
> if F < 1.0:
> insiders_err += temperr
>
> err += temperr
>
> sumerr += w1 * err + w2 * insiders_err
>
> return sumerr
>
>
>
> ############ equivalent python code###############
>
> def _err_func_with_grad(self, params, xw, yw, zw, weight, calc_grad):
>
>
> params = [float(arg) for arg in params]
> a1, a2, a3, e1, e2, phi, theta, psi, px, py, pz = params
>
> # world point coordinates
> if len(xw) != len(yw) or len(xw) != len(zw):
> raise ValueError('Point vectors are not of equal length')
>
> # the weights to place on the errors. w1 for all points
> # w2 for points inside the superquadric
> w2 = weight
> w1 = 1 - w2
>
> cphi = math.cos(phi)
> ctheta = math.cos(theta)
> cpsi = math.cos(psi)
> sphi = math.sin(phi)
> stheta = math.sin(theta)
> spsi = math.sin(psi)
>
> nx = cphi * ctheta * cpsi - sphi * spsi
> ny = sphi * ctheta * cpsi + cphi * spsi
> nz = -stheta * cpsi
> ox = -cphi * ctheta * spsi - sphi * cpsi
> oy = -sphi * ctheta * spsi + cphi * cpsi
> oz = stheta * spsi
> ax = cphi * stheta
> ay = sphi * stheta
> az = ctheta
>
> f1 = ((nx * xw + ny * yw + nz * zw - px * nx - py * ny - pz * nz) / a1)
> f2 = ((ox * xw + oy * yw + oz * zw - px * ox - py * oy - pz * oz) / a2)
> f3 = ((ax * xw + ay * yw + az * zw - px * ax - py * ay - pz * az) / a3)
>
> F = ((f1**2)**(1/e2) + (f2**2)**(1/e2))**(e2/e1) + (f3**2)**(1/e1)
>
> # get a mask for the points that lie inside the superquadric
> insiders_mask = F < 1
>
> # get the error associated only from the insiders
> insiders = F[insiders_mask]
> insiders_err = (math.sqrt(a1 * a2 * a3) * (insiders**(e1) - 1))**2
>
> # the square error of every point
> err = (math.sqrt(a1 * a2 * a3) * (F**(e1) - 1))**2
>
> # this is the final quantity to be minimized
> esum = err.sum()
> isum = insiders_err.sum()
>
> sumerr = w1*esum + w2*isum
>
> return sumerr
> _______________________________________________
> Cython-dev mailing list
> [email protected]
> http://codespeak.net/mailman/listinfo/cython-dev
>
_______________________________________________
Cython-dev mailing list
[email protected]
http://codespeak.net/mailman/listinfo/cython-dev