I just verified that the loop is consuming 96% of the execution time...

weird... the loop gets converted to very basic C code... I would think
it would be much faster...

perhaps its my compilation step?

my setup.py looks like this:


##### setup.py #########
from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext

sourcefiles = ['superquadricfit.pyx',]

setup(
    cmdclass = {'build_ext': build_ext},
    ext_modules = [Extension("superquadricfit", sourcefiles)]
)


and for manual compilation I used this:

gcc -shared -pthread -fPIC -fwrapv -O2 -Wall -fno-strict-aliasing
-I/usr/local/lib/python2.6/dist-packages/numpy/core/include/numpy
-I/usr/include/python2.6 -o superquadricfit.so superquadricfit.c


they both yield the same result....




On Tue, Sep 29, 2009 at 4:46 PM, Chris Colbert <[email protected]> wrote:
> Dag,
>
> HTML was too big to attach to the list email, so I sent it to your
> personal email.
>
> Thanks for taking a look!
>
> Chris
>
> On Tue, Sep 29, 2009 at 4:42 PM, Chris Colbert <[email protected]> wrote:
>> Dag,
>>
>> Html is attached, all that garbage at the bottom in commented out via
>> a docstring (i was using it as a porting template)
>>
>> Thanks for taking a look!
>>
>> Chris
>>
>> On Tue, Sep 29, 2009 at 4:37 PM, Dag Sverre Seljebotn
>> <[email protected]> wrote:
>>> Chris Colbert wrote:
>>>> Normally, when I'm hitting a road block with numpy performance, I port
>>>> that section to cython and typically see an order of magnitude
>>>> increase in speed. (this is without disabling bounds checking) In my
>>>> current case, I'm seeing a slowdown of 2x (with boundschecking
>>>> disabled) and I'm pretty sure everything in my code is kosher. Would
>>>> anyone here mind looking at it and see if there is any glaring
>>>> ommision?
>>>>
>>> Strange. I take it "cython -a" doesn't reveal anything? Could you just
>>> attach the HTML report so that I can have a (quicker) look?
>>>
>>> Dag Sverre
>>>> Here are the benchmarks:
>>>>
>>>> For the cython version:
>>>>
>>>> In [6]: superquadricfit._err_func_with_grad(guess, pts[:,0], pts[:,1],
>>>> pts[:,2], 0.5, False)
>>>> Out[6]: 200.0698654738344
>>>>
>>>> In [7]: %timeit superquadricfit._err_func_with_grad(guess, pts[:,0],
>>>> pts[:,1], pts[:,2], 0.5, False)
>>>> 1000 loops, best of 3: 882 µs per loop
>>>>
>>>>
>>>> and the numpy version:
>>>>
>>>> In [11]: quad._err_func_with_grad(guess, pts[:,0], pts[:,1], pts[:,2],
>>>> 0.5, False)
>>>> Out[11]: 200.0698654738344
>>>>
>>>> In [12]: %timeit quad._err_func_with_grad(guess, pts[:,0], pts[:,1],
>>>> pts[:,2], 0.5, False)
>>>> 1000 loops, best of 3: 438 µs per loop
>>>>
>>>>
>>>> I appreciate any help anyone can give
>>>>
>>>> Cheers!
>>>>
>>>> Chris
>>>>
>>>> Here is the code:
>>>>
>>>> #####   Cython Code #######
>>>>
>>>> from __future__ import division
>>>> import numpy as np
>>>> cimport numpy as np
>>>>
>>>> cdef extern from "math.h":
>>>>     double cos(double)
>>>>     double sin(double)
>>>>     double sqrt(double)
>>>>     double pow(double, double)
>>>>
>>>> DTYPE = np.float64
>>>> ctypedef np.float64_t DTYPE_f64_t
>>>>
>>>> cimport cython
>>>> @cython.boundscheck(False)
>>>> def _err_func_with_grad(params, np.ndarray[DTYPE_f64_t, ndim=1] xw,
>>>>                                 np.ndarray[DTYPE_f64_t, ndim=1] yw,
>>>>                                 np.ndarray[DTYPE_f64_t, ndim=1] zw,
>>>>                                 double weight,
>>>>                                 bool calc_grad):
>>>>
>>>>     cdef double a1, a2, a3, e1, e2, phi, theta, psi, px, py, pz
>>>>     a1, a2, a3, e1, e2, phi, theta, psi, px, py, pz = params
>>>>
>>>>     cdef double w1, w2, cphi, ctheta, cpsi, sphi, stheta, spsi, nx,
>>>> ny, nz, ox, oy, oz, ax, ay, az
>>>>     cdef double F, f1, f2, f3, err, insiders_err, temperr, sumerr
>>>>
>>>>     cdef int N
>>>>     cdef int i
>>>>
>>>>     cdef double C1, C2, C3
>>>>
>>>>     # world point coordinates
>>>>     if len(xw) != len(yw) or len(xw) != len(zw):
>>>>         raise ValueError('Point vectors are not of equal length')
>>>>
>>>>     N = len(xw)
>>>>
>>>>     # the weights to place on the errors. w1 for all points
>>>>     # w2 for points inside the superquadric
>>>>     w2 = weight
>>>>     w1 = 1 - w2
>>>>
>>>>     cphi = cos(phi)
>>>>     ctheta = cos(theta)
>>>>     cpsi = cos(psi)
>>>>     sphi = sin(phi)
>>>>     stheta = sin(theta)
>>>>     spsi = sin(psi)
>>>>
>>>>     nx = cphi * ctheta * cpsi - sphi * spsi
>>>>     ny = sphi * ctheta * cpsi + cphi * spsi
>>>>     nz = -stheta * cpsi
>>>>     ox = -cphi * ctheta * spsi - sphi * cpsi
>>>>     oy = -sphi * ctheta * spsi + cphi * cpsi
>>>>     oz = stheta * spsi
>>>>     ax = cphi * stheta
>>>>     ay = sphi * stheta
>>>>     az = ctheta
>>>>
>>>>     temperr = 0.0
>>>>     err = 0.0
>>>>     insiders_err = 0.0
>>>>     sumerr = 0.0
>>>>
>>>>     C1 = px * nx - py * ny - pz * nz
>>>>     C2 = px * ox - py * oy - pz * oz
>>>>     C3 = px * ax - py * ay - pz * az
>>>>
>>>>     for i in range(N):
>>>>         f1 = ((nx * xw[i] + ny * yw[i] + nz * zw[i] - C1) / a1)
>>>>         f2 = ((ox * xw[i] + oy * yw[i] + oz * zw[i] - C2) / a2)
>>>>         f3 = ((ax * xw[i] + ay * yw[i] + az * zw[i] - C3) / a3)
>>>>
>>>>
>>>>         F = ((f1**2)**(1/e2) + (f2**2)**(1/e2))**(e2/e1) + (f3**2)**(1/e1)
>>>>
>>>>         temperr = (sqrt(a1 * a2 * a3) * (F**(e1) - 1))**2
>>>>
>>>>         if F < 1.0:
>>>>             insiders_err += temperr
>>>>
>>>>         err += temperr
>>>>
>>>>     sumerr += w1 * err + w2 * insiders_err
>>>>
>>>>     return sumerr
>>>>
>>>>
>>>>
>>>> ############ equivalent python code###############
>>>>
>>>> def _err_func_with_grad(self, params, xw, yw, zw, weight, calc_grad):
>>>>
>>>>
>>>>     params = [float(arg) for arg in params]
>>>>     a1, a2, a3, e1, e2, phi, theta, psi, px, py, pz = params
>>>>
>>>>     # world point coordinates
>>>>     if len(xw) != len(yw) or len(xw) != len(zw):
>>>>         raise ValueError('Point vectors are not of equal length')
>>>>
>>>>     # the weights to place on the errors. w1 for all points
>>>>     # w2 for points inside the superquadric
>>>>     w2 = weight
>>>>     w1 = 1 - w2
>>>>
>>>>     cphi = math.cos(phi)
>>>>     ctheta = math.cos(theta)
>>>>     cpsi = math.cos(psi)
>>>>     sphi = math.sin(phi)
>>>>     stheta = math.sin(theta)
>>>>     spsi = math.sin(psi)
>>>>
>>>>     nx = cphi * ctheta * cpsi - sphi * spsi
>>>>     ny = sphi * ctheta * cpsi + cphi * spsi
>>>>     nz = -stheta * cpsi
>>>>     ox = -cphi * ctheta * spsi - sphi * cpsi
>>>>     oy = -sphi * ctheta * spsi + cphi * cpsi
>>>>     oz = stheta * spsi
>>>>     ax = cphi * stheta
>>>>     ay = sphi * stheta
>>>>     az = ctheta
>>>>
>>>>     f1 = ((nx * xw + ny * yw + nz * zw - px * nx - py * ny - pz * nz) / a1)
>>>>     f2 = ((ox * xw + oy * yw + oz * zw - px * ox - py * oy - pz * oz) / a2)
>>>>     f3 = ((ax * xw + ay * yw + az * zw - px * ax - py * ay - pz * az) / a3)
>>>>
>>>>     F = ((f1**2)**(1/e2) + (f2**2)**(1/e2))**(e2/e1) + (f3**2)**(1/e1)
>>>>
>>>>     # get a mask for the points that lie inside the superquadric
>>>>     insiders_mask = F < 1
>>>>
>>>>     # get the error associated only from the insiders
>>>>     insiders = F[insiders_mask]
>>>>     insiders_err = (math.sqrt(a1 * a2 * a3) * (insiders**(e1) - 1))**2
>>>>
>>>>     # the square error of every point
>>>>     err = (math.sqrt(a1 * a2 * a3) * (F**(e1) - 1))**2
>>>>
>>>>     # this is the final quantity to be minimized
>>>>     esum = err.sum()
>>>>     isum = insiders_err.sum()
>>>>
>>>>     sumerr = w1*esum + w2*isum
>>>>
>>>>     return sumerr
>>>> _______________________________________________
>>>> Cython-dev mailing list
>>>> [email protected]
>>>> http://codespeak.net/mailman/listinfo/cython-dev
>>>>
>>>
>>> _______________________________________________
>>> Cython-dev mailing list
>>> [email protected]
>>> http://codespeak.net/mailman/listinfo/cython-dev
>>>
>>
>
_______________________________________________
Cython-dev mailing list
[email protected]
http://codespeak.net/mailman/listinfo/cython-dev

Reply via email to