Here it is. Have fun.
Suggestions are welcome!
David
"""Class constructor to compute weighted 1-D histogram.
Suggestions are welcome !
David
Usage: Histogram(data, bins=10, range=None, weights=None, normed=False)
Input parameters
----------------------
data: Input array
bins: Number of bins or the bin array(overides the range).
range: A tuple of two values defining the lower and upper ends of the bin span.
If no argument is given, defaults to (data.min(), data.max()).
weights: Array of weights stating the importance of each data. This array must have the same shape as data.
Methods
-----------
add_data(values, weights): Add values array to existing data and update bin count. This does not modify the histogram bining.
optimize_binning(method): Chooses an optimal number of bin. Available methods are : Freedman, Scott.
score(percentile): Returns interpolated value at given percentile.
cdf(x, method): Return interpolated cdf at x.
Attributes
-------------
freq: The resulting bin count. If normed is true, this is a frequency. If normed is False, then freq is simply the number of data falling into each bin.
cum_freq: The cumulative bin count.
data: Array of data.
weights: Array of weights.
s_data: Array of sorted data.
range : Bin limits : (min, max)
Nbins : Number of bins
bin : The bin array (Nbin + 1)
normed: Normalization factor. Setting it to True will normed the density to 1.
weighted : Boolean indicating whether or not the data is weighted.
"""
# License: MIT
# Author: David Huard <david.huard at gmail.com>
# Changes log
# 2006, June 2: Completed object interface
#       May   : Small corrections to docstrings
#       April : Modified to use objects
#       March : Creation
# To do
# Clean up the normed object in order to avoid calling __compute_cum_freq when it is changed.
# Add a simple pdf method
# Add 2D support

from numpy import atleast_1d, linspace, ones, sort, concatenate, diff, array, size, std
from scipy.interpolate import interp1d


class Histogram(object):
    """Class to compute a weighted 1-D histogram.

    Usage: Histogram(data, bins=10, range=None, weights=None, normed=False)

    Input parameters
    ----------------
    data:    Input array.
    bins:    Number of bins or the bin array (overrides the range).
    range:   A tuple of two values defining the lower and upper ends of the bin span.
             If no argument is given, defaults to (data.min(), data.max()).
    weights: Array of weights stating the importance of each datum.
             This array must have the same shape as data.

    Methods
    -------
    add_data(values, weights): Add a values array to the existing data and update
        the bin count. This does not modify the histogram binning.
    optimize_binning(method): Choose an optimal number of bins. Available methods
        are: Freedman, Scott.
    score(percentile): Return the interpolated value at the given percentile.
    cdf(x, method): Return the interpolated cdf at x.

    Attributes
    ----------
    freq:     The resulting bin count. If normed is True, this is a frequency.
              If normed is False, freq is simply the number of data falling into each bin.
    cum_freq: The cumulative bin count.
    data:     Array of data.
    weights:  Array of weights.
    s_data:   Array of sorted data.
    range:    Bin limits: (min, max).
    Nbins:    Number of bins.
    bins:     The bin edge array (Nbins + 1 values).
    normed:   Normalization factor. Setting it to True normalizes the density to 1.
    weighted: Boolean indicating whether or not the data is weighted.
    """

    def __init__(self, data, bins=10, range=None, weights=None, normed=False):
        self.__Init = True
        self.__normed = normed
        self.data = data        # data must be set before weights and range
        self.weights = weights
        self.range = range
        if size(bins) == 1:
            self.Nbins = bins
        else:
            self.bins = bins
            if range is not None:
                print 'The bin array assignment superseded the range assignment.'
        self.__compute()
        self.__empirical_cdf()
        self.__Init = False

    def __delete_value(self):
        """Delete method."""
        raise TypeError, 'Cannot delete attribute.'

    def __set_value(self):
        raise TypeError, 'Cannot set attribute.'

    # ---- range property ----
    def range():
        doc = """A tuple of two values defining the lower and upper ends of the bin
        span. If no argument is given, defaults to (data.min(), data.max())."""
        def fget(self):
            return self.__range
        def fset(self, value):
            if value is None:
                self.__range = array((self.__data.min(), self.__data.max()*(1 + 1E-6)))
            else:
                value = atleast_1d(value)
                if len(value) != 2:
                    raise TypeError, 'Range must have two elements (min, max).'
                self.__range = value
            if self.__Init is False:
                self.__compute()
        return locals()
    range = property(**range())

    # ---- bins property ----
    def bins():
        doc = "Bin edge array. An (n+1) array for n bins. Setting it updates the count."
        def fset(self, bin_array):
            """Set method for the bins attribute."""
            if size(bin_array) < 2:
                raise ValueError, 'Bin array must contain at least two positions.'
            self.__bins = atleast_1d(bin_array)
            self.range = (self.__bins[0], self.__bins[-1])
            if self.__Init is False:
                self.__compute()
        def fget(self):
            """Get method for the bins attribute."""
            return self.__bins
        return locals()
    bins = property(**bins())

    # ---- Nbins property ----
    def Nbins():
        doc = "Number of bins. Setting this value updates the bin count."
        def fset(self, value):
            """Set method for Nbins."""
            if size(value) != 1:
                raise TypeError, 'Nbins must be a scalar.'
            else:
                self.__Nbins = value
                self.__bins = linspace(self.__range[0], self.__range[1], self.__Nbins + 1)
            if self.__Init is False:
                self.__compute()
        def fget(self):
            """Get method for Nbins."""
            return self.__Nbins
        return locals()
    Nbins = property(**Nbins())

    # ---- weighted property ----
    def weighted():
        doc = "Boolean specifying whether or not the data is weighted."
        def fget(self):
            return self.__weighted
        def fset(self, value):
            if (value is True) or (value is False):
                self.__weighted = value
                if self.__Init is False:
                    self.__compute()
            else:
                raise TypeError, 'weighted must be a boolean (True/False).'
        return locals()
    weighted = property(**weighted())

    # ---- normed property ----
    def normed():
        doc = """Value specifying whether or not the frequency and cumulative
        frequency should be normalized.
        If normed is True: the area under the frequency curve is 1.
        If normed is False or 0: freq is the number of values in each bin.
        If normed is a scalar: the bin count is scaled so that the area under
        the curve is equal to normed."""
        def fget(self):
            return self.__normed
        def fset(self, value):
            if size(value) != 1:
                raise TypeError, 'normed must be a boolean (True/False) or a scalar.'
            if value:
                self.__normed = value * 1.
            else:
                self.__normed = value
            if self.__Init is False:
                self.__compute()
        return locals()
    normed = property(**normed())

    # ---- weights property ----
    def weights():
        doc = """Array of weights defining the relative importance of each datum."""
        def fset(self, value):
            """Assign weights to the data already present in the class attribute."""
            if value is None:
                self.weighted = False
                self.__weights = value
            else:
                value = atleast_1d(value)
                if size(value) == size(self.__data):
                    self.weighted = True
                    self.__weights = value
                    if not self.normed:
                        self.normed = True
                else:
                    raise TypeError, 'The size of the weight array (%d) must be identical to the data size (%d).' % (size(value), size(self.__data))
            if self.__Init is False:
                self.__compute()
        def fget(self):
            if self.__weights is None:
                print 'No weights are yet assigned.'
            else:
                return self.__weights
        def fdel(self):
            self.__weighted = False
            self.__weights = None
        return locals()
    weights = property(**weights())

    # ---- data property ----
    def data():
        doc = "Array of values to bin."
        def fget(self):
            return self.__data
        def fset(self, data):
            self.__data = atleast_1d(data)
            if self.__Init is False:
                self.__compute()
        return locals()
    data = property(**data())

    # ---- freq property ----
    def freq():
        doc = "Bin count (bin frequency if normalized)."
        def fget(self):
            return self.__freq
        return locals()
    freq = property(**freq())

    # ---- cum_freq property ----
    def cum_freq():
        doc = "Cumulative bin count (cumulative bin frequency if normalized)."
        def fget(self):
            return self.__cum_freq
        return locals()
    cum_freq = property(**cum_freq())

    def add_data(self, data, weights=None):
        """Add data to the data and weight attributes and update the bin count."""
        data = atleast_1d(data)
        N_old = size(self.__data)
        self.__data = concatenate((self.__data, data))
        if weights is not None:
            self.weighted = True
            if self.__weights is None:
                # The data already stored was unweighted; give it unit weights.
                self.__weights = ones(N_old, dtype=float)
            self.__weights = concatenate((self.__weights, weights))
        self.__compute()

    def __compute_cum_freq(self):
        """Sort the data and store it internally.
        Put cum_freq in the class attributes."""
        if not self.weighted:
            sd = sort(self.data)
            n = sd.searchsorted(self.bins)
            cf = n - n[0]
            self.__cum_freq = cf[1:]
        else:
            # Sort the data and carry the weights along.
            order = self.data.argsort()
            sd = self.data[order]
            sw = self.weights[order]
            n = sd.searchsorted(self.bins)
            cw = concatenate((atleast_1d(0), sw.cumsum()))
            self.__cum_freq = cw[n[1:]]
            self.__sorted_weights = sw
        self.__sorted_data = sd

    def __compute(self):
        """Update the bin count."""
        self.__compute_cum_freq()
        self.__freq = concatenate((atleast_1d(self.__cum_freq[0]), diff(self.__cum_freq)))
        if self.__normed:
            self.__normalize()

    def __normalize(self):
        """Normalize the frequency count so that the area under the curve is norm.
        Normalize the cumulative frequency so that the largest value is norm."""
        N = sum(self.__freq * diff(self.__bins))
        self.__freq = self.__freq / N * self.__normed
        self.__cum_freq = self.__cum_freq * 1. / self.__cum_freq[-1] * self.__normed

    def optimize_binning(self, method='Freedman'):
        """Find the optimal number of bins and update the bin count accordingly.
        Available methods: Freedman, Scott."""
        N = len(self.data)
        if method == 'Freedman':
            IQR = self.score(75) - self.score(25)   # Interquartile range (75% - 25%)
            width = 2 * IQR * N**(-1. / 3)
        elif method == 'Scott':
            width = 3.49 * std(self.__data) * N**(-1. / 3)
        else:
            raise ValueError, 'Unknown method. Choose among Freedman and Scott.'
        self.Nbins = int(self.range.ptp() / width)

    def __empirical_cdf(self, method='Weibull'):
        """Compute the cdf and define an interpolation instance.
        The cdf data and the interpolation instance are stored in a dictionary."""
        N = len(self.__data)
        i = linspace(1, N, N)
        self.__cdf = {}
        self.__interp_cdf = {}
        self.__interp_score = {}
        if method == 'Weibull':
            self.__cdf[method] = i / (N + 1)
        elif method == 'Hazen':
            self.__cdf[method] = (i - 0.5) / N
        elif method == 'California':
            self.__cdf[method] = (i - 1) / N
        elif method == 'Chegodayev':
            self.__cdf[method] = (i - .3) / (N + .4)
        elif method == 'Cunnane':
            self.__cdf[method] = (i - .4) / (N + .2)
        elif method == 'Gringorten':
            self.__cdf[method] = (i - .44) / (N + .12)
        else:
            raise ValueError, 'Unknown method. Choose among Weibull, Hazen, Chegodayev, Cunnane, Gringorten and California.'
        self.__interp_cdf[method] = interp1d(self.__sorted_data, self.__cdf[method])
        self.__interp_score[method] = interp1d(self.__cdf[method], self.__sorted_data)

    def cdf(self, x, method='Weibull'):
        """Return the cdf interpolated from an empirical cdf.
        Available methods to compute the cdf are Weibull, Hazen, Chegodayev,
        Cunnane, Gringorten and California."""
        if method not in self.__interp_cdf.keys():
            self.__empirical_cdf(method)
        return self.__interp_cdf[method](x)

    def score(self, percent, method='Weibull'):
        """Return the score at the given percentile.
        Available methods to compute the cdf are Weibull, Hazen, Chegodayev,
        Cunnane, Gringorten and California.
        The score is simply the inverted cdf."""
        # Do some checking
        percent = atleast_1d(percent)
        if (percent < 0).any():
            raise ValueError, 'Percent must be positive.'
        elif (percent > 100).any():
            raise ValueError, 'Percent must be smaller than 100.'
        if method not in self.__interp_score.keys():
            self.__empirical_cdf(method)
        # The empirical cdf is stored on a [0, 1] scale, hence the division by 100.
        return self.__interp_score[method](percent / 100.)