Stefan Behnel wrote:
> Could you provide complete code examples for the two functions you use?
Sure... I was fiddling with an experimental re-write (or clean-up) of
SciPy's ckdtree. Here it is.
Sturla
# Rewrite by Sturla Molden, Oct 2009
# Changes are public domain.
# Copyright Anne M. Archibald 2008
# Released under the scipy license
import numpy as np
cimport numpy as np
cimport cython
import kdtree
# Cython trick: keeping np.inf in a C-level double means comparisons
# against it compile down to plain C double comparisons
cdef double infinity = np.inf
# annoying hack to get the address of the
# first element in a row, &a[i,0] doesn't work
cdef inline double *rowd(np.ndarray a, Py_ssize_t i):
    return <double*> (a.data + a.strides[0]*i)

cdef inline Py_ssize_t *rowi(np.ndarray a, Py_ssize_t i):
    return <Py_ssize_t*> (a.data + a.strides[0]*i)
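# (The byte arithmetic is done on the char* data pointer before the cast,
# so rowd(a, i) points at the same element as &a[i,0] would for a
# C-contiguous array of doubles; rowi does the same for Py_ssize_t data.)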
# Maximum number of dimensions allowed
DEF MAXDIM = 32
# Tree node: split_dim == -1 marks a leaf, in which case start_idx and
# end_idx delimit the node's slice of the indices array; inner nodes
# store the splitting dimension and value plus the two children.
cdef class Node:
    cdef Py_ssize_t split_dim
    cdef double split
    cdef Py_ssize_t n_points
    cdef Py_ssize_t start_idx
    cdef Py_ssize_t end_idx
    cdef Node less
    cdef Node greater
cdef class NodeInfo:
cdef Node node
cdef double side_distances[MAXDIM]
pass
# priority queue
cdef class HeapItem:
    # readonly so that Heap.push/pop can read .priority through plain
    # Python attribute access on items fetched from the untyped list
    cdef readonly double priority
def __init__(HeapItem self, double priority):
self.priority = priority
cdef class IndexItem(HeapItem):
cdef Py_ssize_t index
def __init__(HeapItem self, double priority, Py_ssize_t index):
self.priority = priority
self.index = index
cdef class NodeInfoItem(HeapItem):
cdef NodeInfo info
def __init__(HeapItem self, double priority, NodeInfo info):
self.priority = priority
self.info = info
cdef class Heap:
cdef Py_ssize_t n, space
cdef list heap
def __cinit__(Heap self, int init_space=12):
self.heap = [0]*init_space
self.space = init_space
self.n = 0
cdef object push(Heap self, HeapItem item):
cdef Py_ssize_t i
cdef HeapItem t
i = self.n
self.n += 1
# We need to do this because the Python C API
# has no fast version of list.pop, only list.append,
# otherwise we could just have appended
        if i < self.space:
self.heap[i] = item
else:
self.heap.append(item)
self.space += 1
while i>0 and self.heap[i].priority < self.heap[(i-1)//2].priority:
t = self.heap[(i-1)//2]
self.heap[(i-1)//2] = self.heap[i]
self.heap[i] = t
i = (i-1)//2
return None # for propagating exceptions
cdef HeapItem peek(Heap self):
return self.heap[0]
cdef HeapItem pop(Heap self):
cdef HeapItem it, t
cdef Py_ssize_t i, j, k, l
it = self.heap[0]
self.heap[0] = self.heap[self.n-1]
self.n -= 1
# Since Python C API has no way of popping off a list,
# we must fake this with slicing.
if self.n < self.space//4 and self.space>40: #FIXME: magic number
self.heap = self.heap[ :self.space//2+1 ]
self.space = self.space//2+1
i=0
j=1
k=2
while ((j<self.n and
self.heap[i].priority > self.heap[j].priority or
k<self.n and
self.heap[i].priority > self.heap[k].priority)):
if k<self.n and self.heap[j].priority>self.heap[k].priority:
l = k
else:
l = j
t = self.heap[l]
self.heap[l] = self.heap[i]
self.heap[i] = t
i = l
j = 2*i+1
k = 2*i+2
return it
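# Usage note: Heap is a binary min-heap on HeapItem.priority; push() sifts
# the new item up, pop() moves the last item to the root and sifts it down.
# The query code below pushes IndexItem(-distance**p, index) entries, so the
# root of the "neighbors" heap is always the farthest of the k candidates
# found so far.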
# utility functions
cdef inline double dmax(double x, double y):
if x > y:
return x
else:
return y
cdef inline double dabs(double x):
if x > 0:
return x
else:
return -x
cdef inline double _distance_p(double *x, double *y, double p, int k,
                               double upperbound):
"""Compute the distance between x and y
Computes the Minkowski p-distance to the power p between two points.
If the distance**p is larger than upperbound, then any number larger
than upperbound may be returned (the calculation is truncated).
"""
cdef int i
cdef double r
r = 0
if p==infinity:
for i in range(k):
r = dmax(r,dabs(x[i]-y[i]))
if r>upperbound:
return r
elif p==1:
for i in range(k):
r += dabs(x[i]-y[i])
if r>upperbound:
return r
else:
for i in range(k):
r += dabs(x[i]-y[i])**p
if r>upperbound:
return r
return r
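# (So for p=2 this returns the *squared* euclidean distance; for
# 1 < p < infinity the p-th root is only taken when the query code fills in
# its output array, and the upperbound truncation lets the brute-force scan
# over a leaf bail out early.)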
cdef class cKDTree:
"""kd-tree for quick nearest-neighbor lookup
This class provides an index into a set of k-dimensional points
which can be used to rapidly look up the nearest neighbors of any
point.
The algorithm used is described in Maneewongvatana and Mount 1999.
The general idea is that the kd-tree is a binary trie, each of whose
nodes represents an axis-aligned hyperrectangle. Each node specifies
an axis and splits the set of points based on whether their coordinate
along that axis is greater than or less than a particular value.
During construction, the axis and splitting point are chosen by the
"sliding midpoint" rule, which ensures that the cells do not all
become long and thin.
The tree can be queried for the r closest neighbors of any given point
(optionally returning only those within some maximum distance of the
point). It can also be queried, with a substantial gain in efficiency,
for the r approximate closest neighbors.
For large dimensions (20 is already large) do not expect this to run
significantly faster than brute force. High-dimensional nearest-neighbor
queries are a substantial open problem in computer science.
"""
cdef Node tree
cdef object data
cdef Py_ssize_t n, m
cdef Py_ssize_t leafsize
cdef double maxes[MAXDIM]
cdef double mins[MAXDIM]
cdef object indices
@cython.boundscheck(False)
@cython.wraparound(False)
def __init__(cKDTree self, object _data, Py_ssize_t leafsize=10):
"""Construct a kd-tree.
Parameters:
===========
data : array-like, shape (n,m)
            The n data points of dimension m to be indexed. This array is
not copied unless this is necessary to produce a contiguous
array of doubles, and so modifying this data will result in
bogus results.
leafsize : positive integer
The number of points at which the algorithm switches over to
brute-force.
"""
        cdef np.ndarray[double, ndim=2, mode="c", readonly=True, restrict=True] data
cdef double tmp
cdef Py_ssize_t i,j
self.data = np.ascontiguousarray(_data,dtype=np.float)
self.n, self.m = np.shape(self.data)
if self.m > MAXDIM:
            raise ValueError('Data exceeds maximum number of dimensions (%d).'
                             % int(MAXDIM))
self.leafsize = leafsize
if self.leafsize < 1:
raise ValueError("leafsize must be at least 1")
# find maximum and minimum along dimensions
data = self.data
for i in range(self.m):
tmp = data[0,i]
self.maxes[i] = tmp
self.mins[i] = tmp
for j in range(1,self.n):
for i in range(self.m):
tmp = data[j,i]
if tmp > self.maxes[i]:
self.maxes[i] = tmp
if tmp < self.mins[i]:
self.mins[i] = tmp
        # dtype=np.intp matches the Py_ssize_t buffer declarations below
        self.indices = np.ascontiguousarray(np.arange(self.n, dtype=np.intp))
self.tree = self.__build(0, self.n, self.maxes, self.mins)
@cython.boundscheck(False)
@cython.wraparound(False)
cdef Node __build(cKDTree self, Py_ssize_t start_idx, Py_ssize_t end_idx,
double *maxes, double *mins):
        cdef np.ndarray[double, ndim=2, mode="c", readonly=True, restrict=True] data
cdef np.ndarray[Py_ssize_t, ndim=1, mode="c", restrict=True] indices
cdef Node node, inode
cdef double mids[MAXDIM]
cdef Py_ssize_t i, j, t, p, q, d, m
cdef double size, split, minval, maxval
m = self.m
n = self.n
# unbox ndarrays into local scope
data = self.data
indices = self.indices
if end_idx-start_idx <= self.leafsize:
node = Node()
node.split_dim = -1
node.start_idx = start_idx
node.end_idx = end_idx
return node
else:
d = 0
size = 0
for i in range(self.m):
if maxes[i] - mins[i] > size:
d = i
size = maxes[i] - mins[i]
maxval = maxes[d]
minval = mins[d]
if maxval == minval:
# all points are identical; warn user?
node = Node()
node.split_dim = -1
node.start_idx = start_idx
node.end_idx = end_idx
return node
split = (maxval+minval)/2
p = start_idx
q = end_idx-1
while p <= q:
if data[indices[p],d] < split:
p += 1
elif data[indices[q],d] >= split:
q -= 1
else:
t = indices[p]
indices[p] = indices[q]
indices[q] = t
p += 1
q -= 1
# slide midpoint if necessary
if p == start_idx:
# no points less than split
j = start_idx
split = data[indices[j],d]
for i in range(start_idx+1, end_idx):
if data[indices[i],d] < split:
j = i
split = data[indices[j],d]
t = indices[start_idx]
indices[start_idx] = indices[j]
indices[j] = t
p = start_idx + 1
q = start_idx
elif p == end_idx:
# no points greater than split
j = end_idx-1
split = data[indices[j],d]
for i in range(start_idx, end_idx-1):
if data[indices[i],d]>split:
j = i
split = data[indices[j],d]
t = indices[end_idx-1]
indices[end_idx-1] = indices[j]
indices[j] = t
p = end_idx - 1
q = end_idx - 2
# construct new node representation
inode = Node()
for i in range(m):
mids[i] = maxes[i]
mids[d] = split
inode.less = self.__build(start_idx, p, <double*> mids, mins)
for i in range(self.m):
mids[i] = mins[i]
mids[d] = split
inode.greater = self.__build(p, end_idx, maxes, mids)
inode.split_dim = d
inode.split = split
return inode
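    # Sliding midpoint in short: split at the middle of the cell's longest
    # side; if the points happen to lie entirely on one side of that midpoint
    # (the cell bounds come from earlier splits, not from the points), slide
    # the split onto the nearest data point so neither child is empty. E.g.
    # a cell spanning [0, 10] whose points are 0.5, 1.0, 2.0 slides the split
    # from 5.0 to 2.0, putting that single point alone in the "greater" child.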
@cython.boundscheck(False)
@cython.wraparound(False)
cdef __query(cKDTree self,
double *result_distances,
Py_ssize_t *result_indices,
double *x,
int k,
double eps,
double p,
double distance_upper_bound):
        cdef np.ndarray[double, ndim=2, mode="c", readonly=True, restrict=True] data
        cdef np.ndarray[Py_ssize_t, ndim=1, mode="c", readonly=True, restrict=True] indices
cdef Heap q
cdef Heap neighbors
cdef Py_ssize_t i, j
cdef double t
cdef NodeInfo info1
cdef NodeInfo info2
cdef double d
cdef double epsfac
cdef double min_distance
cdef double far_min_distance
        cdef NodeInfoItem it1, it2
cdef IndexItem neighbor
cdef Node node
cdef Node inode
cdef Node near
cdef Node far
cdef double* side_distances
# unbox ndarrays into local scope
data = self.data
indices = self.indices
# priority queue for chasing nodes
# entries are:
# minimum distance between the cell and the target
# distances between the nearest side of the cell and the target
# the head node of the cell
q = Heap()
# priority queue for the nearest neighbors
# furthest known neighbor first
# entries are (-distance**p, i)
neighbors = Heap(k)
# set up first nodeinfo
info1 = NodeInfo()
info1.node = self.tree
for i in range(self.m):
info1.side_distances[i] = 0
t = x[i] - self.maxes[i]
if t > info1.side_distances[i]:
info1.side_distances[i] = t
else:
t = self.mins[i] - x[i]
if t > info1.side_distances[i]:
info1.side_distances[i] = t
if (p != 1) and (p != infinity):
info1.side_distances[i] = info1.side_distances[i]**p
# compute first distance
min_distance = 0.
for i in range(self.m):
if p == infinity:
min_distance = dmax(min_distance, info1.side_distances[i])
else:
min_distance += info1.side_distances[i]
# fiddle approximation factor
if eps == 0:
epsfac = 1
elif p == infinity:
epsfac = 1/(1+eps)
else:
epsfac = 1/(1+eps)**p
# internally we represent all distances as distance**p
if (p != infinity) and (distance_upper_bound != infinity):
distance_upper_bound = distance_upper_bound**p
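        # (A cell is pruned when min_distance > distance_upper_bound*epsfac,
        # i.e. when its nearest corner lies further away than the current
        # k-th distance divided by (1+eps); so the k-th distance we return
        # can exceed the true k-th nearest distance by at most a factor
        # (1+eps), which is the guarantee documented in query() below.)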
while True:
if info1.node.split_dim == -1:
node = info1.node
# brute-force
for i in range(node.start_idx, node.end_idx):
d = _distance_p(&data[indices[i],0], x, p, self.m,
distance_upper_bound) ### does not fail here
if d < distance_upper_bound:
# replace furthest neighbor
if neighbors.n == k: neighbors.pop()
neighbors.push( IndexItem(-d, indices[i]) )
# adjust upper bound for efficiency
if neighbors.n==k:
distance_upper_bound = - neighbors.peek().priority
# done with this node, get another
if q.n == 0:
# no more nodes to visit
break
else:
it1 = q.pop()
info1 = it1.info
min_distance = it1.priority
else:
inode = info1.node
# we don't push cells that are too far onto the queue at all,
# but since the distance_upper_bound decreases, we might get
# here even if the cell's too far
if min_distance > distance_upper_bound * epsfac:
# since this is the nearest cell, we're done, bail out
break
# set up children for searching
if x[inode.split_dim] < inode.split:
near = inode.less
far = inode.greater
else:
near = inode.greater
far = inode.less
# near child is at the same distance as the current node
# we're going here next, so no point pushing it on the queue
# no need to recompute the distance or the side_distances
info1.node = near
# far child is further by an amount depending only
# on the split value; compute its distance and side_distances
# and push it on the queue if it's near enough
                info2 = NodeInfo()
info2.node = far
it2 = NodeInfoItem(-1, info2)
# most side distances unchanged
for i in range(self.m):
info2.side_distances[i] = info1.side_distances[i]
# one side distance changes
# we can adjust the minimum distance without recomputing
if p == infinity:
# we never use side_distances in the l_infinity case
                    # info2.side_distances[inode.split_dim] = dabs(inode.split-x[inode.split_dim])
                    far_min_distance = dmax(min_distance,
                                            dabs(inode.split - x[inode.split_dim]))
elif p == 1:
                    info2.side_distances[inode.split_dim] = dabs(inode.split - x[inode.split_dim])
                    far_min_distance = (min_distance
                                        - info1.side_distances[inode.split_dim]
                                        + info2.side_distances[inode.split_dim])
else:
                    info2.side_distances[inode.split_dim] = dabs(inode.split - x[inode.split_dim])**p
                    far_min_distance = (min_distance
                                        - info1.side_distances[inode.split_dim]
                                        + info2.side_distances[inode.split_dim])
it2.priority = far_min_distance
# far child might be too far, if so, don't bother pushing it
if far_min_distance <= distance_upper_bound*epsfac:
q.push(it2)
# fill output arrays with sorted neighbors
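        # (popping the min-heap of (-distance**p) items yields the farthest
        # neighbor first, so the arrays are filled back-to-front and end up
        # sorted by increasing distance)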
for i in range(neighbors.n-1,-1,-1):
neighbor = neighbors.pop()
result_indices[i] = neighbor.index
if p==1 or p==infinity:
result_distances[i] = -neighbor.priority
else:
result_distances[i] = (-neighbor.priority)**(1./p)
@cython.boundscheck(False)
@cython.wraparound(False)
    cdef object query(cKDTree self, object x, int k=1, double eps=0,
                      double p=2, double distance_upper_bound=infinity):
"""query the kd-tree for nearest neighbors
Parameters:
===========
x : array-like, last dimension self.m
An array of points to query.
k : integer
The number of nearest neighbors to return.
eps : nonnegative float
Return approximate nearest neighbors; the kth returned value
is guaranteed to be no further than (1+eps) times the
distance to the real kth nearest neighbor.
p : float, 1<=p<=infinity
Which Minkowski p-norm to use.
1 is the sum-of-absolute-values "Manhattan" distance
2 is the usual Euclidean distance
infinity is the maximum-coordinate-difference distance
distance_upper_bound : nonnegative float
Return only neighbors within this distance. This is used to prune
tree searches, so if you are doing a series of nearest-neighbor
queries, it may help to supply the distance to the nearest neighbor
of the most recent point.
Returns:
========
d : array of floats
The distances to the nearest neighbors.
If x has shape tuple+(self.m,), then d has shape tuple+(k,).
Missing neighbors are indicated with infinite distances.
i : array of integers
The locations of the neighbors in self.data.
If x has shape tuple+(self.m,), then i has shape tuple+(k,).
Missing neighbors are indicated with self.n.
"""
        cdef np.ndarray[double, ndim=2, mode="c", readonly=True, restrict=True] xx
cdef np.ndarray[Py_ssize_t, ndim=2, mode="c", readonly=False] ii
cdef np.ndarray[double, ndim=2, mode="c", readonly=False] dd
cdef Py_ssize_t c, n
x = np.asarray(x).astype(np.float)
if np.shape(x)[-1] != self.m:
            raise ValueError("x must consist of vectors of length %d "
                             "but has shape %s" % (self.m, np.shape(x)))
if p<1:
raise ValueError("Only p-norms with 1<=p<=infinity permitted")
if len(x.shape)==1:
single = True
x = x[np.newaxis,:]
else:
single = False
retshape = np.shape(x)[:-1]
n = np.prod(retshape)
xx = np.reshape(x,(n,self.m))
dd = np.empty((n,k),dtype=np.float)
dd.fill(infinity)
        # dtype must match the Py_ssize_t buffer declaration of ii
        ii = np.empty((n,k), dtype=np.intp)
ii.fill(self.n)
if single:
            self.__query( rowd(dd,0), rowi(ii,0), rowd(xx,0), k, eps, p,
                          distance_upper_bound )
            #self.__query( &(dd[0,0]), &(ii[0,0]), &(xx[0,0]), k, eps, p,
            #              distance_upper_bound ) ### fails here, change to cdef and it works
if k==1:
return dd[0,0], ii[0,0]
else:
return dd[0], ii[0]
else:
for i in range(n):
                self.__query( rowd(dd,i), rowi(ii,i), rowd(xx,i), k, eps, p,
                              distance_upper_bound )
                # self.__query( &(dd[i,0]), &(ii[i,0]), &(xx[i,0]), k, eps, p,
                #               distance_upper_bound ) ### fails here, change to cdef and it works
# periodically give up the GIL to keep
# the interpreter responsive
                if not (i % 100):
                    with nogil: pass
            if k==1:
                return (np.reshape(dd[...,0], retshape),
                        np.reshape(ii[...,0], retshape))
            else:
                return (np.reshape(dd, retshape+(k,)),
                        np.reshape(ii, retshape+(k,)))
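
For what it's worth, I'd expect a small pure-Python test script for this to
look roughly like the sketch below (untested; "ckdtree_rewrite" is just a
placeholder for whatever the compiled module ends up being called, and query
would of course have to be a def or cpdef method to be callable from Python):

import numpy as np
import ckdtree_rewrite   # placeholder name for the compiled extension

# index 1000 random points in 3 dimensions
data = np.random.rand(1000, 3)
tree = ckdtree_rewrite.cKDTree(data, leafsize=10)

# exact 5 nearest neighbours of a single query point:
# d has shape (5,), i holds row indices into data
d, i = tree.query(np.array([0.5, 0.5, 0.5]), k=5)

# approximate (eps=0.1) single nearest neighbour for a batch of 50 points,
# using the Manhattan (p=1) metric; d and i have shape (50,)
d, i = tree.query(np.random.rand(50, 3), k=1, eps=0.1, p=1)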