Here it is. Have fun.
Suggestions are welcome!
David
"""Class constructor to compute weighted 1-D histogram.
Suggestions are welcome !
David
Usage: Histogram(data, bins=10, range=None, weights=None, normed=False)
Input parameters
----------------------
data: Input array
bins: Number of bins or the bin array(overides the range).
range: A tuple of two values defining the lower and upper ends of the bin span.
If no argument is given, defaults to (data.min(), data.max()).
weights: Array of weights stating the importance of each data. This array must have the same shape as data.
Methods
-----------
add_data(values, weights): Add values array to existing data and update bin count. This does not modify the histogram bining.
optimize_binning(method): Chooses an optimal number of bin. Available methods are : Freedman, Scott.
score(percentile): Returns interpolated value at given percentile.
cdf(x, method): Return interpolated cdf at x.
Attributes
-------------
freq: The resulting bin count. If normed is true, this is a frequency. If normed is False, then freq is simply the number of data falling into each bin.
cum_freq: The cumulative bin count.
data: Array of data.
weights: Array of weights.
s_data: Array of sorted data.
range : Bin limits : (min, max)
Nbins : Number of bins
bin : The bin array (Nbin + 1)
normed: Normalization factor. Setting it to True will normed the density to 1.
weighted : Boolean indicating whether or not the data is weighted.
"""
Input parameters
----------------------
data: Input array
bins: Number of bins or the bin array(overides the range).
range: A tuple of two values defining the lower and upper ends of the bin span.
If no argument is given, defaults to (data.min(), data.max()).
weights: Array of weights stating the importance of each data. This array must have the same shape as data.
Methods
-----------
add_data(values, weights): Add values array to existing data and update bin count. This does not modify the histogram bining.
optimize_binning(method): Chooses an optimal number of bin. Available methods are : Freedman, Scott.
score(percentile): Returns interpolated value at given percentile.
cdf(x, method): Return interpolated cdf at x.
Attributes
-------------
freq: The resulting bin count. If normed is true, this is a frequency. If normed is False, then freq is simply the number of data falling into each bin.
cum_freq: The cumulative bin count.
data: Array of data.
weights: Array of weights.
s_data: Array of sorted data.
range : Bin limits : (min, max)
Nbins : Number of bins
bin : The bin array (Nbin + 1)
normed: Normalization factor. Setting it to True will normed the density to 1.
weighted : Boolean indicating whether or not the data is weighted.
"""
# License: MIT
# Author: David Huard <david.huard at gmail.com>
# Changes log
# 2006, June 2: Completed object interface
#       May: Small corrections to docstrings
#       April: Modified to use objects
#       March: Creation
# To do
# Clean up the normed object in order to avoid calling __compute_cum_freq when it is changed.
# Add a simple pdf method
# Add 2D support
from numpy import atleast_1d, linspace, ones, sort, concatenate, diff, array, size, std
from scipy.interpolate import interp1d
class Histogram(object):
"""Class constructor to compute weighted 1-D histogram.
Usage: Histogram(data, bins=10, range=None, weights=None, normed=False)
    Input parameters
    ----------------
    data: Input array.
    bins: Number of bins or the bin array (overrides the range).
    range: A tuple of two values defining the lower and upper ends of the bin span.
        If no argument is given, defaults to (data.min(), data.max()).
    weights: Array of weights stating the importance of each datum. This array must have the same shape as data.
    Methods
    -------
    add_data(values, weights): Add a values array to the existing data and update the bin count. This does not modify the histogram binning.
    optimize_binning(method): Choose an optimal number of bins. Available methods are: Freedman, Scott.
    score(percentile): Return the interpolated value at the given percentile.
    cdf(x, method): Return the interpolated cdf at x.
    Attributes
    ----------
    freq: The resulting bin count. If normed is True, this is a frequency. If normed is False, freq is simply the number of data falling into each bin.
    cum_freq: The cumulative bin count.
    data: Array of data.
    weights: Array of weights.
    s_data: Array of sorted data.
    range: Bin limits (min, max).
    Nbins: Number of bins.
    bins: The bin edge array (Nbins + 1 values).
    normed: Normalization factor. Setting it to True normalizes the density to 1.
    weighted: Boolean indicating whether or not the data is weighted.
"""
    def __init__(self, data, bins=10, range=None, weights=None, normed=False):
        self.__Init = True
        self.data = data
        self.__normed = normed
        self.weights = weights
        self.range = range
        if size(bins) == 1:
            self.Nbins = bins
        else:
            self.bins = bins
            if range is not None:
                print 'The bin array assignment supersedes the range assignment.'
        self.__compute()
        self.__empirical_cdf()
        self.__Init = False
def __delete_value(self):
"""Delete method."""
raise TypeError, 'Cannot delete attribute.'
def __set_value(self):
raise TypeError, 'Cannot set attribute.'
def range(): #---- range object ----
doc = """A tuple of two values defining the lower and upper ends of the bin span.
If no argument is given, defaults to (data.min(), data.max())."""
def fget(self):
return self.__range
def fset(self, value):
if value is None:
self.__range = array((self.__data.min(), self.__data.max()*(1+1E-6)))
else:
value = atleast_1d(value)
if len(value) != 2:
raise TypeError, 'Range must have two elements (min, max).'
self.__range = value
if self.__Init is False:
self.__compute()
return locals()
range = property(**range())
def bins(): #---- bins object ----
doc = "Bin array. A (n) array for n-1 bins. Calling this method updates the count."
def fset(self, bin_array):
"""Set method for bin attribute."""
            if size(bin_array) < 2:
                raise TypeError, 'Bin array must contain at least two positions.'
            self.__bins = atleast_1d(bin_array)
            self.range = (self.__bins[0], self.__bins[-1])
if self.__Init is False:
self.__compute()
def fget(self):
"""Get method for bin attribute."""
return self.__bins
return locals()
bins = property(**bins())
def Nbins(): #---- Nbins object ----
doc = "Number of bins. Setting this value updates the bin count."
def fset(self, value):
"""Set method for Nbins."""
if (size(value) != 1):
                raise TypeError, 'Nbins must be a scalar.'
else:
self.__Nbins = value
self.__bins = linspace(self.__range[0], self.__range[1], self.__Nbins+1)
if self.__Init is False:
self.__compute()
def fget(self):
"""Get method for Nbins."""
return self.__Nbins
return locals()
Nbins = property(**Nbins())
def weighted(): #---- weighted object ----
doc = "Boolean specifying whether or not the data is weighted."
def fget(self):
return self.__weighted
def fset(self, value):
if (value == True) or (value == False):
self.__weighted = value
if self.__Init is False:
self.__compute()
else:
raise TypeError, 'weighted must be a boolean (True/False).'
return locals()
weighted = property(**weighted())
def normed(): #---- normed object ----
doc = """
Value specifying whether or not the frequency and cumulative frequency should be normalized.
If normed is True: norm=1
If normed if False or 0: freq = number of values in each bin.
If normed is a scalar: the bin count is scaled so that the area under the curve is equal to normed.
"""
def fget(self):
return self.__normed
        def fset(self, value):
            if size(value) != 1:
                raise TypeError, 'normed must be a boolean (True/False) or a scalar.'
            if value:
                self.__normed = value*1.
            else:
                self.__normed = value
            if self.__Init is False:
                self.__compute()
return locals()
normed = property(**normed())
def weights(): #---- weights object ----
doc = """Array of weights defining the relative importance of each datum."""
        def fset(self, value):
            """Assigns weights to the data already present in the class attribute."""
            if value is None:
                self.weighted = False
                self.__weights = value
            else:
                value = atleast_1d(value)
                if size(value) == size(self.__data):
                    self.__weights = value
                    self.weighted = True
                    if not self.normed:
                        self.normed = True
                else:
                    raise TypeError, 'The size of the weight array (%d) must be identical to the data size (%d).' % (size(value), size(self.__data))
            if self.__Init is False:
                self.__compute()
def fget(self):
if self.__weights is None:
print 'No weights are yet assigned.'
else:
return self.__weights
def fdel(self):
self.__weighted = False
self.__weights = None
return locals()
    weights = property(**weights())
def data(): #---- data object ----
doc = "Array of values to bin."
def fget(self):
return self.__data
def fset(self, data):
self.__data = atleast_1d(data)
if self.__Init is False:
self.__compute()
return locals()
data = property(**data())
def freq(): #-----freq object-----
doc = "Bin count (bin frequency if normalized)."
def fget(self):
return self.__freq
return locals()
freq = property(**freq())
    def cum_freq(): #----- cum_freq object -----
doc = "Cumulative bin count (cumulative bin frequency if normalized)."
def fget(self):
return self.__cum_freq
return locals()
cum_freq = property(**cum_freq())
    def add_data(self, data, weights=None):
        """Adds data to the data and weight attributes and updates the bin count."""
        data = atleast_1d(data)
        if weights is not None:
            if self.__weights is None:
                # Data already stored without weights gets a unit weight.
                self.__weights = ones(len(self.__data), dtype=float)
            self.__weights = concatenate((self.__weights, atleast_1d(weights)))
            self.__weighted = True
        self.__data = concatenate((self.__data, data))
        self.__compute()
def __compute_cum_freq(self):
"""Sort the data and store it internally. Puts cum_freq in the class attributes. """
if not self.weighted:
sd = sort(self.data)
n = sd.searchsorted(self.bins)
cf = n-n[0]
self.__cum_freq = cf[1:]
        else:
            # Sort the data and carry the weights along so the pairs stay aligned.
            order = self.data.argsort()
            sd = self.data[order]
            sw = self.weights[order]
            n = sd.searchsorted(self.bins)
            cw = concatenate((atleast_1d(0), sw.cumsum()))
            self.__cum_freq = cw[n[1:]] - cw[n[0]]
            self.__sorted_weights = sw
        self.__sorted_data = sd
def __compute(self):
"""Updates the bin count."""
self.__compute_cum_freq()
self.__freq = concatenate((atleast_1d(self.__cum_freq[0]), diff(self.__cum_freq)))
if self.__normed:
self.__normalize()
def __normalize(self):
"""Normalizes the frequency count so that the area under the curve is norm.
Normalizes the cumulative frequency so that the largest value is norm."""
N = sum(self.__freq*diff(self.__bins))
self.__freq = self.__freq/N*self.__normed
self.__cum_freq = self.__cum_freq*1./self.__cum_freq[-1]*self.__normed
def optimize_binning(self, method='Freedman'):
"""Find the optimal number of bins and update the bin count accordingly.
        Available methods: Freedman, Scott.
"""
        N = len(self.data)
        if method == 'Freedman':
            IQR = self.score(75) - self.score(25)   # Interquartile range (75% - 25%)
            width = 2 * IQR * N**(-1./3)
        elif method == 'Scott':
            width = 3.49 * std(self.__data) * N**(-1./3)
        self.Nbins = int(self.range.ptp()/width)
def __empirical_cdf(self, method='Weibull'):
"""Compute the cdf and define an interpolation instance.
        The cdf data and the interpolation instance are stored in a dictionary.
"""
N = len(self.__data)
i = linspace(1,N, N)
self.__cdf = {}
self.__interp_cdf={}
self.__interp_score={}
if method == 'Weibull':
self.__cdf[method] = i/(N+1)
elif method == 'Hazen':
self.__cdf[method] = (i-0.5)/N
elif method == 'California':
self.__cdf[method] = (i-1)/N
elif method == 'Chegodayev':
self.__cdf[method] = (i-.3)/(N+.4)
elif method == 'Cunnane':
self.__cdf[method] = (i-.4)/(N+.2)
elif method == 'Gringorten':
self.__cdf[method] = (i-.44)/(N+.12)
        else:
            raise ValueError, 'Unknown method. Choose among Weibull, Hazen, Chegodayev, Cunnane, Gringorten and California.'
self.__interp_cdf[method] = interp1d(self.__sorted_data, self.__cdf[method])
self.__interp_score[method] = interp1d(self.__cdf[method], self.__sorted_data)
def cdf(self, x, method='Weibull'):
"""Returns the cdf interpolated from an empirical cdf.
Available methods to compute cdf are Weibull, Hazen, Chegodayev, Cunnane, Gringorten and California."""
if method not in self.__interp_cdf.keys():
self.__empirical_cdf(method)
return self.__interp_cdf[method](x)
def score(self, percent, method='Weibull'):
"""Returns the score at given percentile.
Available methods to compute cdf are Weibull, Hazen, Chegodayev, Cunnane, Gringorten and California.
The score is simply the inverted cdf."""
        # Check the input.
        percent = atleast_1d(percent)
        if (percent < 0).any():
            raise ValueError, 'Percent cannot be negative.'
        elif (percent > 100).any():
            raise ValueError, 'Percent must be at most 100.'
        if method not in self.__interp_score.keys():
            self.__empirical_cdf(method)
        return self.__interp_score[method](percent/100.)
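# A minimal usage sketch of the class above; the data are arbitrary (normal samples)
# and the calls only exercise the public interface described in the docstring.
if __name__ == '__main__':
    from numpy.random import randn
    h = Histogram(randn(1000), bins=20, normed=True)
    print h.bins                   # Nbins + 1 bin edges
    print h.freq                   # normalized bin count
    print h.score(50)              # interpolated median
    print h.cdf(0.)                # empirical cdf at x = 0
    h.add_data(randn(500))         # append new values and update the count
    h.optimize_binning('Scott')    # rebin using Scott's rule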
