Here is a fully developed no-delete (OR ZERO KEYS) integer keyed table in
Pure Cython using NumPy as a backing store allocator for the table space.

Included is enough to benchmark it relative to Python's.  In my timing,
it is about 3X faster when the table is "right sized", but almost 5X
faster when you need to do significant growing up of tables.

It is neither pretty nor general, nor even a very good example code for
Cython (or even C).  It needs 4 byte ints and 4 byte floats (with the size
equality mattering), and it does (almost) all indexing via under the hood
C-isms.  The ambitious Cythoner could probably convert most if not all
of that to Buffer API mojo.  Since exact integer keys seemed necessary,
it does not even print out correctly unless you do under the covers casts
{ though convert to floats for keys would be trivial but stat failing for
  keys > 2**23 or so }.  Indeed, about the only thing about the table
that IS general is which of the 3 array slots in a row holds the KEY field.

All I really wanted to do was exhibit that you only really need two 13-line
functions slot() and registerAdd() to have a usable hash table system,
though perhaps not a user friendly one.  I've seen a great many people
go to herculean efforts to import that 26 lines of logic from elsewhere
and still be unrewarded by the back-end overheads and layering.  (In this
case it would be all kinds of malloc/free cycles...).

Anyway, with right-sized tables that fit in L1 it hits about 37 cycles
per count update (including the overhead of looping over the input keys)
on my i7.  The hash part alone is probably about 30 cycles.  It is
possible that a better hash function/multiplier choice could do a
little bit better, though I expect 15 cycles or so to be a hard floor.

Chuck

# cython --embed u4Dict.pyx && gcc -g3 -O9 -combine -fipa-cp -march=native 
-mfpmath=sse,387 -fno-strict-aliasing -I/usr/include/python2.6 
-I/usr/lib64/python2.6/site-packages/numpy/core/include u4Dict.c -L/usr/lib64 
-L/usr/lib -lpython2.6 -lutil -lpthread -ldl -o u4Dict
-------------------------- u4Dict.pyx --------------------------
cimport numpy as np
import  numpy as np
cdef extern from "string.h" nogil:
  void  *memcpy(void*, void*, unsigned long)

ctypedef unsigned int u4_t

cdef class u4Dict:
    cdef np.ndarray table                           # table space
    cdef u4_t       key                             # column index of u4 key
    cdef u4_t       lgSz                            # log_2(number of rows)
    cdef u4_t       pop                             # number of members

    def __init__(u4Dict o, u4_t key, u4_t lgSz):
        o.lgSz  = max(1, lgSz)
        o.table = np.zeros(( (1 << o.lgSz), 3), 'f')
        o.pop   = 0

    cdef u4_t slot(u4Dict o, u4_t key, u4_t *missing):
        cdef u4_t h    = 0xe7cadfcc46f7c2cb * key           # primary hash
        cdef u4_t i    = ( h >> (32 - o.lgSz)) & ~1             # Hi bits(even)
        cdef u4_t incr = ((h >> 1) & ((1 << o.lgSz) - 1)) | 1   # Lo bits (odd)
        cdef u4_t MASK = (1 << o.lgSz) - 1                  # Fast mod tab sz

        while (<u4_t*>o.table.data)[3*i + o.key] != 0:      # 0 ==> UNUSED SLOT
            if (<u4_t*>o.table.data)[3*i + o.key] == key:   # FOUND; yield slot
                missing[0] = 0
                return i
            i = (i + incr) & MASK                   # incr mod table size
        missing[0] = 1                              # NOT FOUND
        return i                                    # yield insert slot

    cdef void registerAdd(u4Dict o):                # update pop; maybe resize
        cdef u4_t    i, j, dum
        cdef u4Dict  n
        o.pop += 1
        if o.pop * 8  >  7 << o.lgSz:
            n = u4Dict(o.key, o.lgSz + 1)
            for 0 <= i < (1 << o.lgSz):
                if (<u4_t*>o.table.data)[3*i + o.key] != 0:
                    j = n.slot((<u4_t*>o.table.data)[3 * i + o.key], &dum)
                    memcpy(&(<u4_t*>n.table.data)[3 * j],
                           &(<u4_t*>o.table.data)[3 * i], 3 * 4)
            o.table = n.table
            o.lgSz += 1

from time import time
from sys  import stdin, argv

def build(np.ndarray keys):
    cdef u4_t        i, j, k, nKey = len(keys), missing, KEY=0, CNT=1, PROB=2
    cdef u4Dict      idict = u4Dict(KEY, 1)
    cdef double      t0 = time()
    cdef np.ndarray  row

    for 0 <= j < nKey:
        k = (<u4_t*>keys.data)[j]
        i = idict.slot(k, &missing)
        if missing:                         # insert
            (<u4_t *>idict.table.data)[3*i + KEY] = k
            (<float*>idict.table.data)[3*i + CNT] = 1.0
            idict.registerAdd()
        else:                               # update
            (<float*>idict.table.data)[3*i + CNT] += 1.0
    cdef double t1 = time()
    if len(argv) > 1:
        for row in idict.table[ idict.table[ : , KEY] != 0 ]:
            print row[CNT], (<u4_t*>row.data)[KEY]
    print (t1 - t0)*1e6 / len(keys), 'microseconds/key'

def Build(np.ndarray keys):     # Python dicts are 2.5..4.5X slower than u4Dict
    cdef u4_t        i, j, k, nKey = len(keys), missing, KEY=0, CNT=1, PROB=2
    cdef dict        idict = {}
    cdef double      t0 = time()
    cdef np.ndarray  row

    for 0 <= j < nKey:
        k = (<u4_t*>keys.data)[j]
        try:
            idict[k][CNT] += 1.0
        except:
            idict[k] = [1, 0]
    cdef double t1 = time()
    if len(argv) > 1:
        for row in idict.table[ idict.table[ : , KEY] != 0 ]:
            print row[CNT], (<u4_t*>row.data)[KEY]
    print (t1 - t0)*1e6 / len(keys), 'microseconds/key'

stuff = np.array([ int(line) for line in stdin ], 'I')
build(stuff)
Build(stuff)
_______________________________________________
Cython-dev mailing list
[email protected]
http://codespeak.net/mailman/listinfo/cython-dev

Reply via email to