Hi
I am trying to use the DPGMM technique (
http://scikit-learn.org/stable/modules/generated/sklearn.mixture.DPGMM.html)
to find classes of data in 1-dimensional vectors.
The extracted variance/standard devaition seems a bit off: either
underestimated or overestimated compared to dpcluster (
https://github.com/teodor-moldovan/dpcluster).
I wrote a small script that give two examples and attached it to this mail.
Please tell me if I am doing anything wrong.
Thank you very much for your time.
Regards,
Johan
import dpcluster as dp
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import itertools
from scipy import linalg
from sklearn import mixture
def NIWparam2Nparam(vdp, minClusterIPRatio=0.05):
"""
Convert Gaussian Normal-Inverse-Wishart parameters to the usual Gaussian
parameters (i.e. mean, standard deviation)
:vdp: Variational Dirichlet Process obtained from dpgmm
:minClusterIPRatio: Ignore distributions standing for a ratio of IPs lower
than minClusterIPRatio
"""
nbIPs = float(np.sum(vdp.cluster_sizes()))
mus, Sgs, k, nu = vdp.distr.prior.nat2usual(vdp.cluster_parameters()[
vdp.cluster_sizes() > (minClusterIPRatio * nbIPs), :])[0]
Sgs = Sgs / (k + 1 + 1)[:, np.newaxis, np.newaxis]
res = np.zeros( (2, len(mus)) )
for i, (mu, Sg) in enumerate(zip(mus, Sgs)):
w, V = np.linalg.eig(Sg)
V = np.array(np.matrix(V) * np.matrix(np.diag(np.sqrt(w))))
V = V[0]
res[:, i] = (mu[0], V[0])
return res
def generate_data(n, m1, s1, m2, s2):
np.random.seed(0)
data_1 = s1 * np.random.randn(n, 1) + np.array([m1])
data_2 = s2 * np.random.randn(2*n, 1) + np.array([m2])
data = np.r_[ data_1, data_2 ]
data_rs = np.array(data).reshape(-1, 1)
return data_rs
def apply_dpcluster(data):
vdp = dp.algorithms.VDP(dp.distributions.GaussianNIW(1), k=10, w = 1, tol=1e-6, max_iters=10000)
stats = vdp.distr.sufficient_stats(data)
vdp.batch_learn(stats)
tau = vdp.cluster_parameters()
(mean_l, std_l, k_l, nu_l), wtf_1, wtf_2 = vdp.distr.prior.nat2usual(tau)
prop_l = [ k / float(len(data)) for k in k_l ]
mean_l, std_l = NIWparam2Nparam(vdp, minClusterIPRatio=0.00)
return (mean_l, std_l, prop_l)
def apply_scikit(data):
dpgmm = mixture.DPGMM(n_components=10, covariance_type='full', alpha=1, tol=1e-6, n_iter=10000)
dpgmm.fit(data)
mean_l = dpgmm.means_
std_l = [ np.sqrt(linalg.eigh(covar)[0][0]) for covar in dpgmm._get_covars() ]
c = dpgmm.predict(data)
nb_point_l = [ len([ class_indice for class_indice in c if class_indice == i ]) for i, mean in enumerate(mean_l) ]
prop_l = [ nb_point / float(len(data)) for nb_point in nb_point_l ]
return (mean_l, std_l, prop_l)
data_1 = generate_data(1000, 1, 2, 10, 2)
data_2 = generate_data(1000, 1, 1, 10, 1)
color_iter = itertools.cycle(['r', 'b', 'c', 'm'])
linestyle_iter = itertools.cycle(['-', '--'])
num_bins = 50
fig = plt.figure()
plt.subplot(2, 1, 1)
plt.title('Histogram and CDF with c1: mu=1, sigma=2 and c2: mu=10, sigma=2')
n, bins, patches = plt.hist(data_1, num_bins, facecolor='green', alpha=0.1, normed=True)
mean_l, std_l, prop_l = apply_dpcluster(data_1)
for i, (mean, std, prop, linestyle) in enumerate(zip(mean_l, std_l, prop_l, linestyle_iter)):
if prop > 0.05:
y_normpdf = mpl.mlab.normpdf(bins, mean, std)
y_normpdf_mod = [ pdf * prop for pdf in y_normpdf ]
plt.plot(bins, y_normpdf_mod, 'b', linestyle=linestyle)
linestyle_iter.next()
mean_l, std_l, prop_l = apply_scikit(data_2)
for i, (mean, std, prop, linestyle) in enumerate(zip(mean_l, std_l, prop_l, linestyle_iter)):
if prop > 0.05:
y_normpdf = mpl.mlab.normpdf(bins, mean, std)
y_normpdf_mod = [ pdf * prop for pdf in y_normpdf ]
plt.plot(bins, y_normpdf_mod, 'r', linestyle=linestyle)
plt.legend([ "dpcluster c2", "dpcluster c1", "scikit c2", "scikit c1" ], loc=0, ncol=2)
linestyle_iter.next()
plt.subplot(2, 1, 2)
plt.title('Histogram and CDF with c1: mu=1, sigma=1 and c2: mu=10, sigma=1')
n, bins, patches = plt.hist(data_2, num_bins, facecolor='green', alpha=0.1, normed=True)
mean_l, std_l, prop_l = apply_dpcluster(data_2)
for i, (mean, std, prop, linestyle) in enumerate(zip(mean_l, std_l, prop_l, linestyle_iter)):
if prop > 0.05:
y_normpdf = mpl.mlab.normpdf(bins, mean, std)
y_normpdf_mod = [ pdf * prop for pdf in y_normpdf ]
plt.plot(bins, y_normpdf_mod, 'b', linestyle=linestyle)
linestyle_iter.next()
mean_l, std_l, prop_l = apply_scikit(data_2)
for i, (mean, std, prop, linestyle) in enumerate(zip(mean_l, std_l, prop_l, linestyle_iter)):
if prop > 0.05:
y_normpdf = mpl.mlab.normpdf(bins, mean, std)
y_normpdf_mod = [ pdf * prop for pdf in y_normpdf ]
plt.plot(bins, y_normpdf_mod, 'r', linestyle=linestyle)
plt.legend([ "dpcluster c2", "dpcluster c1", "scikit c2", "scikit c1" ], loc=0, ncol=2)
linestyle_iter.next()
figure_name = "hist_with_dpcluster_scikit.eps"
plt.savefig(figure_name, format='eps')
plt.close('all')
------------------------------------------------------------------------------
Find and fix application performance issues faster with Applications Manager
Applications Manager provides deep performance insights into multiple tiers of
your business applications. It resolves application problems quickly and
reduces your MTTR. Get your free trial!
https://ad.doubleclick.net/ddm/clk/302982198;130105516;z
_______________________________________________
Scikit-learn-general mailing list
Scikit-learn-general@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/scikit-learn-general