Here it is. Have fun.
Suggestions are welcome!
David
"""Class constructor to compute weighted 1-D histogram.
Suggestions are welcome !
David
Usage: Histogram(data, bins=10, range=None, weights=None, normed=False)
Input parameters
----------------------
data: Input array
bins: Number of bins or the bin array(overides the range).
range: A tuple of two values defining the lower and upper ends of the bin span.
If no argument is given, defaults to (data.min(), data.max()).
weights: Array of weights stating the importance of each data. This array must have the same shape as data.
Methods
-----------
add_data(values, weights): Add values array to existing data and update bin count. This does not modify the histogram bining.
optimize_binning(method): Chooses an optimal number of bin. Available methods are : Freedman, Scott.
score(percentile): Returns interpolated value at given percentile.
cdf(x, method): Return interpolated cdf at x.
Attributes
-------------
freq: The resulting bin count. If normed is true, this is a frequency. If normed is False, then freq is simply the number of data falling into each bin.
cum_freq: The cumulative bin count.
data: Array of data.
weights: Array of weights.
s_data: Array of sorted data.
range : Bin limits : (min, max)
Nbins : Number of bins
bin : The bin array (Nbin + 1)
normed: Normalization factor. Setting it to True will normed the density to 1.
weighted : Boolean indicating whether or not the data is weighted.
"""
Input parameters
----------------------
data: Input array
bins: Number of bins or the bin array(overides the range).
range: A tuple of two values defining the lower and upper ends of the bin span.
If no argument is given, defaults to (data.min(), data.max()).
weights: Array of weights stating the importance of each data. This array must have the same shape as data.
Methods
-----------
add_data(values, weights): Add values array to existing data and update bin count. This does not modify the histogram bining.
optimize_binning(method): Chooses an optimal number of bin. Available methods are : Freedman, Scott.
score(percentile): Returns interpolated value at given percentile.
cdf(x, method): Return interpolated cdf at x.
Attributes
-------------
freq: The resulting bin count. If normed is true, this is a frequency. If normed is False, then freq is simply the number of data falling into each bin.
cum_freq: The cumulative bin count.
data: Array of data.
weights: Array of weights.
s_data: Array of sorted data.
range : Bin limits : (min, max)
Nbins : Number of bins
bin : The bin array (Nbin + 1)
normed: Normalization factor. Setting it to True will normed the density to 1.
weighted : Boolean indicating whether or not the data is weighted.
"""
# License: MIT
# Author: David Huard <david.huard at gmail.com>
# Changes log
# 2006, June 2: Completed object interface
#       May: Small corrections to docstrings
#       April: Modified to use objects
#       March: Creation
# To do
# Clean up the normed object in order to avoid calling __compute_cum_freq when it is changed.
# Add a simple pdf method
# Add 2D support
from numpy import atleast_1d, linspace, ones, sort, concatenate, diff, array, size, std
from scipy.interpolate import interp1d
class Histogram(object):
"""Class constructor to compute weighted 1-D histogram.
Usage: Histogram(data, bins=10, range=None, weights=None, normed=False)
    Input parameters
    ----------------
    data: Input array.
    bins: Number of bins or the bin array (overrides the range).
    range: A tuple of two values defining the lower and upper ends of the bin span.
        If no argument is given, defaults to (data.min(), data.max()).
    weights: Array of weights stating the importance of each datum. This array must have the same shape as data.
    Methods
    -------
    add_data(values, weights): Add a values array to the existing data and update the bin count. This does not modify the histogram binning.
    optimize_binning(method): Choose an optimal number of bins. Available methods are: Freedman, Scott.
    score(percentile): Return the interpolated value at the given percentile.
    cdf(x, method): Return the interpolated cdf at x.
    Attributes
    ----------
    freq: The resulting bin count. If normed is True, this is a frequency. If normed is False, freq is simply the number of data falling into each bin.
    cum_freq: The cumulative bin count.
    data: Array of data.
    weights: Array of weights.
    s_data: Array of sorted data.
    range: Bin limits (min, max).
    Nbins: Number of bins.
    bins: The bin edge array (Nbins + 1 values).
    normed: Normalization factor. Setting it to True normalizes the density to 1.
    weighted: Boolean indicating whether or not the data is weighted.
"""
    def __init__(self, data, bins=10, range=None, weights=None, normed=False):
        self.__Init = True
        self.data = data
        self.__normed = normed
        self.weights = weights
        self.range = range
        if size(bins) == 1:
            self.Nbins = bins
        else:
            self.bins = bins
            if range is not None:
                print 'The bin array assignment supersedes the range assignment.'
        self.__compute()
        self.__empirical_cdf()
        self.__Init = False
def __delete_value(self):
"""Delete method."""
raise TypeError, 'Cannot delete attribute.'
def __set_value(self):
raise TypeError, 'Cannot set attribute.'
def range(): #---- range object ----
doc = """A tuple of two values defining the lower and upper ends of the bin span.
If no argument is given, defaults to (data.min(), data.max())."""
def fget(self):
return self.__range
def fset(self, value):
if value is None:
self.__range = array((self.__data.min(), self.__data.max()*(1+1E-6)))
else:
value = atleast_1d(value)
if len(value) != 2:
raise TypeError, 'Range must have two elements (min, max).'
self.__range = value
if self.__Init is False:
self.__compute()
return locals()
range = property(**range())
def bins(): #---- bins object ----
doc = "Bin array. A (n) array for n-1 bins. Calling this method updates the count."
def fset(self, bin_array):
"""Set method for bin attribute."""
            if size(bin_array) < 2:
                raise TypeError, 'Bin array must contain at least two positions.'
            self.__bins = atleast_1d(bin_array)
            self.range = (self.__bins[0], self.__bins[-1])
if self.__Init is False:
self.__compute()
def fget(self):
"""Get method for bin attribute."""
return self.__bins
return locals()
bins = property(**bins())
def Nbins(): #---- Nbins object ----
doc = "Number of bins. Setting this value updates the bin count."
def fset(self, value):
"""Set method for Nbins."""
if (size(value) != 1):
                raise TypeError, 'Nbins must be a scalar.'
else:
self.__Nbins = value
self.__bins = linspace(self.__range[0], self.__range[1], self.__Nbins+1)
if self.__Init is False:
self.__compute()
def fget(self):
"""Get method for Nbins."""
return self.__Nbins
return locals()
Nbins = property(**Nbins())
def weighted(): #---- weighted object ----
doc = "Boolean specifying whether or not the data is weighted."
def fget(self):
return self.__weighted
def fset(self, value):
if (value == True) or (value == False):
self.__weighted = value
if self.__Init is False:
self.__compute()
else:
raise TypeError, 'weighted must be a boolean (True/False).'
return locals()
weighted = property(**weighted())
def normed(): #---- normed object ----
doc = """
Value specifying whether or not the frequency and cumulative frequency should be normalized.
If normed is True: norm=1
If normed if False or 0: freq = number of values in each bin.
If normed is a scalar: the bin count is scaled so that the area under the curve is equal to normed.
"""
def fget(self):
return self.__normed
        def fset(self, value):
            if size(value) != 1:
                raise TypeError, 'normed must be a boolean (True/False) or a scalar.'
            if value:
                self.__normed = value*1.
            else:
                self.__normed = value
            if self.__Init is False:
                self.__compute()
return locals()
normed = property(**normed())
def weights(): #---- weights object ----
doc = """Array of weights defining the relative importance of each datum."""
        def fset(self, value):
            """Assigns weights to the data already present in the class attribute."""
            if value is None:
                self.weighted = False
                self.__weights = value
            else:
                value = atleast_1d(value)
                if size(value) == size(self.__data):
                    self.__weights = value
                    self.weighted = True
                    if not self.normed:
                        self.normed = True
                else:
                    raise TypeError, 'The size of the weight array (%d) must be identical to the data size (%d).' % (size(value), size(self.__data))
            if self.__Init is False:
                self.__compute()
def fget(self):
if self.__weights is None:
print 'No weights are yet assigned.'
else:
return self.__weights
def fdel(self):
self.__weighted = False
self.__weights = None
return locals()
    weights = property(**weights())
def data(): #---- data object ----
doc = "Array of values to bin."
def fget(self):
return self.__data
def fset(self, data):
self.__data = atleast_1d(data)
if self.__Init is False:
self.__compute()
return locals()
data = property(**data())
def freq(): #-----freq object-----
doc = "Bin count (bin frequency if normalized)."
def fget(self):
return self.__freq
return locals()
freq = property(**freq())
    def cum_freq(): #----- cum_freq object -----
doc = "Cumulative bin count (cumulative bin frequency if normalized)."
def fget(self):
return self.__cum_freq
return locals()
cum_freq = property(**cum_freq())
    def add_data(self, data, weights=None):
        """Adds data to the data and weight attributes and updates the bin count."""
        data = atleast_1d(data)
        if weights is not None:
            if self.__weights is None:
                # Data already stored without weights gets a unit weight.
                self.__weights = ones(len(self.__data), dtype=float)
            self.__weights = concatenate((self.__weights, atleast_1d(weights)))
            self.__weighted = True
        self.__data = concatenate((self.__data, data))
        self.__compute()
def __compute_cum_freq(self):
"""Sort the data and store it internally. Puts cum_freq in the class attributes. """
if not self.weighted:
sd = sort(self.data)
n = sd.searchsorted(self.bins)
cf = n-n[0]
self.__cum_freq = cf[1:]
        else:
            # Sort the data and carry the weights along so the pairs stay aligned.
            order = self.data.argsort()
            sd = self.data[order]
            sw = self.weights[order]
            n = sd.searchsorted(self.bins)
            cw = concatenate((atleast_1d(0), sw.cumsum()))
            self.__cum_freq = cw[n[1:]] - cw[n[0]]
            self.__sorted_weights = sw
        self.__sorted_data = sd
def __compute(self):
"""Updates the bin count."""
self.__compute_cum_freq()
self.__freq = concatenate((atleast_1d(self.__cum_freq[0]), diff(self.__cum_freq)))
if self.__normed:
self.__normalize()
def __normalize(self):
"""Normalizes the frequency count so that the area under the curve is norm.
Normalizes the cumulative frequency so that the largest value is norm."""
N = sum(self.__freq*diff(self.__bins))
self.__freq = self.__freq/N*self.__normed
self.__cum_freq = self.__cum_freq*1./self.__cum_freq[-1]*self.__normed
def optimize_binning(self, method='Freedman'):
"""Find the optimal number of bins and update the bin count accordingly.
        Available methods: Freedman, Scott.
"""
        N = len(self.data)
        if method == 'Freedman':
            IQR = self.score(75) - self.score(25)   # Interquartile range (75% - 25%)
            width = 2 * IQR * N**(-1./3)
        elif method == 'Scott':
            width = 3.49 * std(self.__data) * N**(-1./3)
        self.Nbins = int(self.range.ptp()/width)
def __empirical_cdf(self, method='Weibull'):
"""Compute the cdf and define an interpolation instance.
        The cdf data and the interpolation instance are stored in a dictionary.
"""
N = len(self.__data)
i = linspace(1,N, N)
self.__cdf = {}
self.__interp_cdf={}
self.__interp_score={}
if method == 'Weibull':
self.__cdf[method] = i/(N+1)
elif method == 'Hazen':
self.__cdf[method] = (i-0.5)/N
elif method == 'California':
self.__cdf[method] = (i-1)/N
elif method == 'Chegodayev':
self.__cdf[method] = (i-.3)/(N+.4)
elif method == 'Cunnane':
self.__cdf[method] = (i-.4)/(N+.2)
elif method == 'Gringorten':
self.__cdf[method] = (i-.44)/(N+.12)
        else:
            raise ValueError, 'Unknown method. Choose among Weibull, Hazen, Chegodayev, Cunnane, Gringorten and California.'
self.__interp_cdf[method] = interp1d(self.__sorted_data, self.__cdf[method])
self.__interp_score[method] = interp1d(self.__cdf[method], self.__sorted_data)
def cdf(self, x, method='Weibull'):
"""Returns the cdf interpolated from an empirical cdf.
Available methods to compute cdf are Weibull, Hazen, Chegodayev, Cunnane, Gringorten and California."""
if method not in self.__interp_cdf.keys():
self.__empirical_cdf(method)
return self.__interp_cdf[method](x)
def score(self, percent, method='Weibull'):
"""Returns the score at given percentile.
Available methods to compute cdf are Weibull, Hazen, Chegodayev, Cunnane, Gringorten and California.
The score is simply the inverted cdf."""
        # Check the input.
        percent = atleast_1d(percent)
        if (percent < 0).any():
            raise ValueError, 'Percent cannot be negative.'
        elif (percent > 100).any():
            raise ValueError, 'Percent must be at most 100.'
        if method not in self.__interp_score.keys():
            self.__empirical_cdf(method)
        return self.__interp_score[method](percent/100.)
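# A minimal usage sketch of the class above; the data are arbitrary (normal samples)
# and the calls only exercise the public interface described in the docstring.
if __name__ == '__main__':
    from numpy.random import randn
    h = Histogram(randn(1000), bins=20, normed=True)
    print h.bins                   # Nbins + 1 bin edges
    print h.freq                   # normalized bin count
    print h.score(50)              # interpolated median
    print h.cdf(0.)                # empirical cdf at x = 0
    h.add_data(randn(500))         # append new values and update the count
    h.optimize_binning('Scott')    # rebin using Scott's rule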
