[Scikit-learn-general] DPGMM applied to 1-dimensional vector and variance problem

Johan Mazel Thu, 05 May 2016 00:50:50 -0700

Hi
I am trying to use the DPGMM technique (
http://scikit-learn.org/stable/modules/generated/sklearn.mixture.DPGMM.html)
to find classes of data in 1-dimensional vectors.


The extracted variance/standard devaition seems a bit off: either
underestimated or overestimated compared to dpcluster (
https://github.com/teodor-moldovan/dpcluster).
I wrote a small script that give two examples and attached it to this mail.
Please tell me if I am doing anything wrong.

Thank you very much for your time.

Regards,
Johan

import dpcluster as dp
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

import itertools

from scipy import linalg
from sklearn import mixture


def NIWparam2Nparam(vdp, minClusterIPRatio=0.05):
    """
    Convert Gaussian Normal-Inverse-Wishart parameters to the usual Gaussian
    parameters (i.e. mean, standard deviation)
    :vdp: Variational Dirichlet Process obtained from dpgmm
    :minClusterIPRatio: Ignore distributions standing for a ratio of IPs lower
    than minClusterIPRatio
    """

    nbIPs = float(np.sum(vdp.cluster_sizes()))
    mus, Sgs, k, nu = vdp.distr.prior.nat2usual(vdp.cluster_parameters()[
        vdp.cluster_sizes() > (minClusterIPRatio * nbIPs), :])[0]
        
    Sgs = Sgs / (k + 1 + 1)[:, np.newaxis, np.newaxis]
    
    res = np.zeros( (2, len(mus)) )
    for i, (mu, Sg) in enumerate(zip(mus, Sgs)):
        w, V = np.linalg.eig(Sg)
        V = np.array(np.matrix(V) * np.matrix(np.diag(np.sqrt(w))))
        V = V[0]
        res[:, i] = (mu[0], V[0])
        
    return res



def generate_data(n, m1, s1, m2, s2):
    np.random.seed(0)

    data_1 = s1 * np.random.randn(n, 1) + np.array([m1])
    data_2 = s2 * np.random.randn(2*n, 1) + np.array([m2])

    data = np.r_[ data_1, data_2 ]

    data_rs = np.array(data).reshape(-1, 1)

    return data_rs


def apply_dpcluster(data):
    vdp = dp.algorithms.VDP(dp.distributions.GaussianNIW(1), k=10, w = 1, tol=1e-6, max_iters=10000)

    stats = vdp.distr.sufficient_stats(data)
    vdp.batch_learn(stats)

   
    tau = vdp.cluster_parameters()
    (mean_l, std_l, k_l, nu_l), wtf_1, wtf_2 = vdp.distr.prior.nat2usual(tau)
    prop_l = [ k / float(len(data)) for k in k_l ]
    
    mean_l, std_l = NIWparam2Nparam(vdp, minClusterIPRatio=0.00)

    return (mean_l, std_l, prop_l)


def apply_scikit(data):
    dpgmm = mixture.DPGMM(n_components=10, covariance_type='full', alpha=1, tol=1e-6, n_iter=10000)
    dpgmm.fit(data)

    mean_l = dpgmm.means_
    std_l = [ np.sqrt(linalg.eigh(covar)[0][0]) for covar in dpgmm._get_covars() ]
    
    c = dpgmm.predict(data)

    nb_point_l = [ len([ class_indice for class_indice in c if class_indice == i ]) for i, mean in enumerate(mean_l) ]
    prop_l = [ nb_point / float(len(data)) for nb_point in nb_point_l ]

    return (mean_l, std_l, prop_l)
    


data_1 = generate_data(1000, 1, 2, 10, 2)
data_2 = generate_data(1000, 1, 1, 10, 1)


    
color_iter = itertools.cycle(['r', 'b', 'c', 'm'])
linestyle_iter = itertools.cycle(['-', '--'])

num_bins = 50




fig = plt.figure()  



plt.subplot(2, 1, 1)

plt.title('Histogram and CDF with c1: mu=1, sigma=2 and c2: mu=10, sigma=2')
            
n, bins, patches = plt.hist(data_1, num_bins, facecolor='green', alpha=0.1, normed=True)


mean_l, std_l, prop_l = apply_dpcluster(data_1)

for i, (mean, std, prop, linestyle) in enumerate(zip(mean_l, std_l, prop_l, linestyle_iter)):
    if prop > 0.05:
        y_normpdf = mpl.mlab.normpdf(bins, mean, std)
        y_normpdf_mod = [ pdf * prop for pdf in y_normpdf ]
        plt.plot(bins, y_normpdf_mod, 'b', linestyle=linestyle)

linestyle_iter.next()

mean_l, std_l, prop_l = apply_scikit(data_2)

for i, (mean, std, prop, linestyle) in enumerate(zip(mean_l, std_l, prop_l, linestyle_iter)):
    if prop > 0.05:
        y_normpdf = mpl.mlab.normpdf(bins, mean, std)
        y_normpdf_mod = [ pdf * prop for pdf in y_normpdf ]
        plt.plot(bins, y_normpdf_mod, 'r', linestyle=linestyle)

plt.legend([ "dpcluster c2", "dpcluster c1", "scikit c2", "scikit c1" ], loc=0, ncol=2)

linestyle_iter.next()



plt.subplot(2, 1, 2)


plt.title('Histogram and CDF with c1: mu=1, sigma=1 and c2: mu=10, sigma=1')

n, bins, patches = plt.hist(data_2, num_bins, facecolor='green', alpha=0.1, normed=True)


mean_l, std_l, prop_l = apply_dpcluster(data_2)

for i, (mean, std, prop, linestyle) in enumerate(zip(mean_l, std_l, prop_l, linestyle_iter)):
    if prop > 0.05:
        y_normpdf = mpl.mlab.normpdf(bins, mean, std)
        y_normpdf_mod = [ pdf * prop for pdf in y_normpdf ]
        plt.plot(bins, y_normpdf_mod, 'b', linestyle=linestyle)

linestyle_iter.next()

mean_l, std_l, prop_l = apply_scikit(data_2)

for i, (mean, std, prop, linestyle) in enumerate(zip(mean_l, std_l, prop_l, linestyle_iter)):
    if prop > 0.05:
        y_normpdf = mpl.mlab.normpdf(bins, mean, std)
        y_normpdf_mod = [ pdf * prop for pdf in y_normpdf ]
        plt.plot(bins, y_normpdf_mod, 'r', linestyle=linestyle)

plt.legend([ "dpcluster c2", "dpcluster c1", "scikit c2", "scikit c1" ], loc=0, ncol=2)

linestyle_iter.next()





        
figure_name = "hist_with_dpcluster_scikit.eps"
plt.savefig(figure_name, format='eps')

plt.close('all')

------------------------------------------------------------------------------
Find and fix application performance issues faster with Applications Manager
Applications Manager provides deep performance insights into multiple tiers of
your business applications. It resolves application problems quickly and
reduces your MTTR. Get your free trial!
https://ad.doubleclick.net/ddm/clk/302982198;130105516;z

_______________________________________________
Scikit-learn-general mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/scikit-learn-general

[Scikit-learn-general] DPGMM applied to 1-dimensional vector and variance problem

Reply via email to