I just verified that the loop is consuming 96% of the execution time...
weird... the loop gets converted to very basic C code... I would think
it would be much faster...
perhaps its my compilation step?
my setup.py looks like this:
##### setup.py #########
from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext
sourcefiles = ['superquadricfit.pyx',]
setup(
cmdclass = {'build_ext': build_ext},
ext_modules = [Extension("superquadricfit", sourcefiles)]
)
and for manual compilation I used this:
gcc -shared -pthread -fPIC -fwrapv -O2 -Wall -fno-strict-aliasing
-I/usr/local/lib/python2.6/dist-packages/numpy/core/include/numpy
-I/usr/include/python2.6 -o superquadricfit.so superquadricfit.c
they both yield the same result....
On Tue, Sep 29, 2009 at 4:46 PM, Chris Colbert <[email protected]> wrote:
> Dag,
>
> HTML was too big to attach to the list email, so I sent it to your
> personal email.
>
> Thanks for taking a look!
>
> Chris
>
> On Tue, Sep 29, 2009 at 4:42 PM, Chris Colbert <[email protected]> wrote:
>> Dag,
>>
>> Html is attached, all that garbage at the bottom in commented out via
>> a docstring (i was using it as a porting template)
>>
>> Thanks for taking a look!
>>
>> Chris
>>
>> On Tue, Sep 29, 2009 at 4:37 PM, Dag Sverre Seljebotn
>> <[email protected]> wrote:
>>> Chris Colbert wrote:
>>>> Normally, when I'm hitting a road block with numpy performance, I port
>>>> that section to cython and typically see an order of magnitude
>>>> increase in speed. (this is without disabling bounds checking) In my
>>>> current case, I'm seeing a slowdown of 2x (with boundschecking
>>>> disabled) and I'm pretty sure everything in my code is kosher. Would
>>>> anyone here mind looking at it and see if there is any glaring
>>>> ommision?
>>>>
>>> Strange. I take it "cython -a" doesn't reveal anything? Could you just
>>> attach the HTML report so that I can have a (quicker) look?
>>>
>>> Dag Sverre
>>>> Here are the benchmarks:
>>>>
>>>> For the cython version:
>>>>
>>>> In [6]: superquadricfit._err_func_with_grad(guess, pts[:,0], pts[:,1],
>>>> pts[:,2], 0.5, False)
>>>> Out[6]: 200.0698654738344
>>>>
>>>> In [7]: %timeit superquadricfit._err_func_with_grad(guess, pts[:,0],
>>>> pts[:,1], pts[:,2], 0.5, False)
>>>> 1000 loops, best of 3: 882 µs per loop
>>>>
>>>>
>>>> and the numpy version:
>>>>
>>>> In [11]: quad._err_func_with_grad(guess, pts[:,0], pts[:,1], pts[:,2],
>>>> 0.5, False)
>>>> Out[11]: 200.0698654738344
>>>>
>>>> In [12]: %timeit quad._err_func_with_grad(guess, pts[:,0], pts[:,1],
>>>> pts[:,2], 0.5, False)
>>>> 1000 loops, best of 3: 438 µs per loop
>>>>
>>>>
>>>> I appreciate any help anyone can give
>>>>
>>>> Cheers!
>>>>
>>>> Chris
>>>>
>>>> Here is the code:
>>>>
>>>> ##### Cython Code #######
>>>>
>>>> from __future__ import division
>>>> import numpy as np
>>>> cimport numpy as np
>>>>
>>>> cdef extern from "math.h":
>>>> double cos(double)
>>>> double sin(double)
>>>> double sqrt(double)
>>>> double pow(double, double)
>>>>
>>>> DTYPE = np.float64
>>>> ctypedef np.float64_t DTYPE_f64_t
>>>>
>>>> cimport cython
>>>> @cython.boundscheck(False)
>>>> def _err_func_with_grad(params, np.ndarray[DTYPE_f64_t, ndim=1] xw,
>>>> np.ndarray[DTYPE_f64_t, ndim=1] yw,
>>>> np.ndarray[DTYPE_f64_t, ndim=1] zw,
>>>> double weight,
>>>> bool calc_grad):
>>>>
>>>> cdef double a1, a2, a3, e1, e2, phi, theta, psi, px, py, pz
>>>> a1, a2, a3, e1, e2, phi, theta, psi, px, py, pz = params
>>>>
>>>> cdef double w1, w2, cphi, ctheta, cpsi, sphi, stheta, spsi, nx,
>>>> ny, nz, ox, oy, oz, ax, ay, az
>>>> cdef double F, f1, f2, f3, err, insiders_err, temperr, sumerr
>>>>
>>>> cdef int N
>>>> cdef int i
>>>>
>>>> cdef double C1, C2, C3
>>>>
>>>> # world point coordinates
>>>> if len(xw) != len(yw) or len(xw) != len(zw):
>>>> raise ValueError('Point vectors are not of equal length')
>>>>
>>>> N = len(xw)
>>>>
>>>> # the weights to place on the errors. w1 for all points
>>>> # w2 for points inside the superquadric
>>>> w2 = weight
>>>> w1 = 1 - w2
>>>>
>>>> cphi = cos(phi)
>>>> ctheta = cos(theta)
>>>> cpsi = cos(psi)
>>>> sphi = sin(phi)
>>>> stheta = sin(theta)
>>>> spsi = sin(psi)
>>>>
>>>> nx = cphi * ctheta * cpsi - sphi * spsi
>>>> ny = sphi * ctheta * cpsi + cphi * spsi
>>>> nz = -stheta * cpsi
>>>> ox = -cphi * ctheta * spsi - sphi * cpsi
>>>> oy = -sphi * ctheta * spsi + cphi * cpsi
>>>> oz = stheta * spsi
>>>> ax = cphi * stheta
>>>> ay = sphi * stheta
>>>> az = ctheta
>>>>
>>>> temperr = 0.0
>>>> err = 0.0
>>>> insiders_err = 0.0
>>>> sumerr = 0.0
>>>>
>>>> C1 = px * nx - py * ny - pz * nz
>>>> C2 = px * ox - py * oy - pz * oz
>>>> C3 = px * ax - py * ay - pz * az
>>>>
>>>> for i in range(N):
>>>> f1 = ((nx * xw[i] + ny * yw[i] + nz * zw[i] - C1) / a1)
>>>> f2 = ((ox * xw[i] + oy * yw[i] + oz * zw[i] - C2) / a2)
>>>> f3 = ((ax * xw[i] + ay * yw[i] + az * zw[i] - C3) / a3)
>>>>
>>>>
>>>> F = ((f1**2)**(1/e2) + (f2**2)**(1/e2))**(e2/e1) + (f3**2)**(1/e1)
>>>>
>>>> temperr = (sqrt(a1 * a2 * a3) * (F**(e1) - 1))**2
>>>>
>>>> if F < 1.0:
>>>> insiders_err += temperr
>>>>
>>>> err += temperr
>>>>
>>>> sumerr += w1 * err + w2 * insiders_err
>>>>
>>>> return sumerr
>>>>
>>>>
>>>>
>>>> ############ equivalent python code###############
>>>>
>>>> def _err_func_with_grad(self, params, xw, yw, zw, weight, calc_grad):
>>>>
>>>>
>>>> params = [float(arg) for arg in params]
>>>> a1, a2, a3, e1, e2, phi, theta, psi, px, py, pz = params
>>>>
>>>> # world point coordinates
>>>> if len(xw) != len(yw) or len(xw) != len(zw):
>>>> raise ValueError('Point vectors are not of equal length')
>>>>
>>>> # the weights to place on the errors. w1 for all points
>>>> # w2 for points inside the superquadric
>>>> w2 = weight
>>>> w1 = 1 - w2
>>>>
>>>> cphi = math.cos(phi)
>>>> ctheta = math.cos(theta)
>>>> cpsi = math.cos(psi)
>>>> sphi = math.sin(phi)
>>>> stheta = math.sin(theta)
>>>> spsi = math.sin(psi)
>>>>
>>>> nx = cphi * ctheta * cpsi - sphi * spsi
>>>> ny = sphi * ctheta * cpsi + cphi * spsi
>>>> nz = -stheta * cpsi
>>>> ox = -cphi * ctheta * spsi - sphi * cpsi
>>>> oy = -sphi * ctheta * spsi + cphi * cpsi
>>>> oz = stheta * spsi
>>>> ax = cphi * stheta
>>>> ay = sphi * stheta
>>>> az = ctheta
>>>>
>>>> f1 = ((nx * xw + ny * yw + nz * zw - px * nx - py * ny - pz * nz) / a1)
>>>> f2 = ((ox * xw + oy * yw + oz * zw - px * ox - py * oy - pz * oz) / a2)
>>>> f3 = ((ax * xw + ay * yw + az * zw - px * ax - py * ay - pz * az) / a3)
>>>>
>>>> F = ((f1**2)**(1/e2) + (f2**2)**(1/e2))**(e2/e1) + (f3**2)**(1/e1)
>>>>
>>>> # get a mask for the points that lie inside the superquadric
>>>> insiders_mask = F < 1
>>>>
>>>> # get the error associated only from the insiders
>>>> insiders = F[insiders_mask]
>>>> insiders_err = (math.sqrt(a1 * a2 * a3) * (insiders**(e1) - 1))**2
>>>>
>>>> # the square error of every point
>>>> err = (math.sqrt(a1 * a2 * a3) * (F**(e1) - 1))**2
>>>>
>>>> # this is the final quantity to be minimized
>>>> esum = err.sum()
>>>> isum = insiders_err.sum()
>>>>
>>>> sumerr = w1*esum + w2*isum
>>>>
>>>> return sumerr
>>>> _______________________________________________
>>>> Cython-dev mailing list
>>>> [email protected]
>>>> http://codespeak.net/mailman/listinfo/cython-dev
>>>>
>>>
>>> _______________________________________________
>>> Cython-dev mailing list
>>> [email protected]
>>> http://codespeak.net/mailman/listinfo/cython-dev
>>>
>>
>
_______________________________________________
Cython-dev mailing list
[email protected]
http://codespeak.net/mailman/listinfo/cython-dev