Hi, I spent some time a while ago on an histogram function for numpy. It uses digitize and bincount instead of sorting the data. If I remember right, it was significantly faster than numpy's histogram, but I don't know how it will behave with very large data sets.
I attached the file if you want to take a look, or if you me the benchmark, I'll add it to it and report the results. Cheers, David 2006/12/14, eric jones <[EMAIL PROTECTED]>:
Rick White wrote: > Just so we don't get too smug about the speed, if I do this in IDL on > the same machine it is 10 times faster (0.28 seconds instead of 4 > seconds). I'm sure the IDL version uses the much faster approach of > just sweeping through the array once, incrementing counts in the > appropriate bins. It only handles equal-sized bins, so it is not as > general as the numpy version -- but equal-sized bins is a very common > case. I'd still like to see a C version of histogram (which I guess > would need to be a ufunc) go into the core numpy. > Yes, this gets rid of the search, and indices can just be caluclated from offsets. I've attached a modified weaved histogram that takes this approach. Running the snippet below on my machine takes .118 sec for the evenly binned weave algorithm and 0.385 sec for Rick's algorithm on 5 million elements. That is close to 4x faster (but not 10x...), so there is indeed some speed to be gained for the common special case. I don't know if the code I wrote has a 2x gain left in it, but I've spent zero time optimizing it. I'd bet it can be improved substantially. eric ### test_weave_even_histogram.py from numpy import arange, product, sum, zeros, uint8 from numpy.random import randint import weave_even_histogram import time shape = 1000,1000,5 size = product(shape) data = randint(0,256,size).astype(uint8) bins = arange(256+1) print 'type:', data.dtype print 'millions of elements:', size/1e6 bin_start = 0 bin_size = 1 bin_count = 256 t1 = time.clock() res = weave_even_histogram.histogram(data, bin_start, bin_size, bin_count) t2 = time.clock() print 'sec (evenly spaced):', t2-t1, sum(res) print res > Rick > _______________________________________________ > Numpy-discussion mailing list > Numpy-discussion@scipy.org > http://projects.scipy.org/mailman/listinfo/numpy-discussion > > _______________________________________________ Numpy-discussion mailing list Numpy-discussion@scipy.org http://projects.scipy.org/mailman/listinfo/numpy-discussion
# License: Scipy compatible # Author: David Huard, 2006 from numpy import * def histogram(a, bins=10, range=None, normed=False, weights=None, axis=None): """histogram(a, bins=10, range=None, normed=False, weights=None, axis=None) -> H, dict Return the distribution of sample. Parameters ---------- a: Array sample. bins: Number of bins, or an array of bin edges, in which case the range is not used. range: Lower and upper bin edges, default: [min, max]. normed: Boolean, if False, return the number of samples in each bin, if True, return the density. weights: Sample weights. The weights are normed only if normed is True. Should weights.sum() not equal len(a), the total bin count will not be equal to the number of samples. axis: Specifies the dimension along which the histogram is computed. Defaults to None, which aggregates the entire sample array. Output ------ H: The number of samples in each bin. If normed is True, H is a frequency distribution. dict{ 'edges': The bin edges, including the rightmost edge. 'upper': Upper outliers. 'lower': Lower outliers. 'bincenters': Center of bins. } Examples -------- x = random.rand(100,10) H, Dict = histogram(x, bins=10, range=[0,1], normed=True) H2, Dict = histogram(x, bins=10, range=[0,1], normed=True, axis=0) See also: histogramnd """ a = asarray(a) if axis is None: a = atleast_1d(a.ravel()) axis = 0 # Bin edges. if not iterable(bins): if range is None: range = (a.min(), a.max()) mn, mx = [mi+0.0 for mi in range] if mn == mx: mn -= 0.5 mx += 0.5 edges = linspace(mn, mx, bins+1, endpoint=True) else: edges = asarray(bins, float) dedges = diff(edges) decimal = int(-log10(dedges.min())+6) bincenters = edges[:-1] + dedges/2. # apply_along_axis accepts only one array input, but we need to pass the # weights along with the sample. The strategy here is to concatenate the # weights array along axis, so the passed array contains [sample, weights]. # The array is then split back in __hist1d. if weights is not None: aw = concatenate((a, weights), axis) weighted = True else: aw = a weighted = False count = apply_along_axis(__hist1d, axis, aw, edges, decimal, weighted, normed) # Outlier count upper = count.take(array([-1]), axis) lower = count.take(array([0]), axis) # Non-outlier count core = a.ndim*[slice(None)] core[axis] = slice(1, -1) hist = count[core] if normed: normalize = lambda x: atleast_1d(x/(x*dedges).sum()) hist = apply_along_axis(normalize, axis, hist) return hist, {'edges':edges, 'lower':lower, 'upper':upper, \ 'bincenters':bincenters} def __hist1d(aw, edges, decimal, weighted, normed): """Internal routine to compute the 1d histogram. aw: sample, [weights] edges: bin edges decimal: approximation to put values lying on the rightmost edge in the last bin. weighted: Means that the weights are appended to array a. Return the bin count or frequency if normed. """ nbin = edges.shape[0]+1 if weighted: count = zeros(nbin, dtype=float) a,w = hsplit(aw,2) if normed: w = w/w.mean() else: a = aw count = zeros(nbin, dtype=int) w = None binindex = digitize(a, edges) # Values that fall on an edge are put in the right bin. # For the rightmost bin, we want values equal to the right # edge to be counted in the last bin, and not as an outlier. on_edge = where(around(a,decimal) == around(edges[-1], decimal))[0] binindex[on_edge] -= 1 # Count the number of identical indices. flatcount = bincount(binindex, w) # Place the count in the histogram array. i = arange(len(flatcount)) count[i] = flatcount return count from numpy.testing import * class test_histogram(NumpyTestCase): def check_simple(self): n=100 v=rand(n) (a,b)=histogram(v) #check if the sum of the bins equals the number of samples assert(sum(a,axis=0)==n) #check that the bin counts are evenly spaced when the data is from a linear function (a,b)=histogram(linspace(0,10,100)) assert(all(a==10)) #Check the construction of the bin array a, b = histogram(v, bins=4, range=[.2,.8]) assert(all(b['edges']==linspace(.2, .8, 5))) #Check the number of outliers assert((v<.2).sum() == b['lower']) assert((v>.8).sum() == b['upper']) #Check the normalization bins = [0,.5,.75,1] a,b = histogram(v, bins, normed=True) assert_almost_equal((a*diff(bins)).sum(), 1) def check_axis(self): n,m = 100,20 v = rand(n,m) a,b = histogram(v, bins=5) # Check dimension is reduced (axis=None). assert(a.ndim == 1) #Check total number of count is equal to the number of samples. assert(a.sum() == n*m) a,b = histogram(v, bins = 7, axis=0) # Check shape of new array is ok. assert(a.ndim == 2) assert_array_equal(a.shape,[7, m]) # Check normalization is consistent a,b = histogram(v, bins = 7, axis=0, normed=True) assert_array_almost_equal((a.T*diff(b['edges'])).sum(1), ones((m))) a,b = histogram(v, bins = 7, axis=1, normed=True) assert_array_equal(a.shape, [n,7]) assert_array_almost_equal((a*diff(b['edges'])).sum(1), ones((n))) # Check results are consistent with 1d estimate a1, b1 = histogram(v[0,:], bins=b['edges'], normed=True) assert_array_equal(a1, a[0,:]) def check_weights(self): # Check weights = constant gives the same answer as no weights. v = rand(100) w = ones(100)*5 a,b = histogram(v) na,nb = histogram(v, normed=True) wa,wb = histogram(v, weights=w) nwa,nwb = histogram(v, weights=w, normed=True) assert_array_equal(a*5, wa) assert_array_equal(na, nwa) # Check weights are properly applied. v = linspace(0,10,10) w = concatenate((zeros(5), ones(5))) wa,wb = histogram(v, bins=arange(11),weights=w) assert_array_almost_equal(wa, w) if __name__ == "__main__": NumpyTest().run()
_______________________________________________ Numpy-discussion mailing list Numpy-discussion@scipy.org http://projects.scipy.org/mailman/listinfo/numpy-discussion