Hi Jeff,
I tried to run the example for Dirichlet process clustering on the synthetic
control data. I tried to set k = 5 for the number of clusters. However, after
the first state, I always end up with only a single cluster. Essentially, for
any number of iterations greater than 2, there is always one model with all
samples assigned to it. Any idea why?
Additionally, what's the best way to figure out cluster assignments on a test
set given the model?
Thanks.
- jerry
Command:
hadoop jar examples/target/mahout-examples-0.3-SNAPSHOT.job
org.apache.mahout.clustering.syntheticcontrol.dirichlet.Job --maxIter 3 -k 5
Output:
sample[0]=
sample[1]= m0(527)nm{n=527 m=[29.91, 30.99, 32.08, 32.48, 32.04, 31.36, 30.27,
29.05, 28.16, 28.16, 27.92, 28.85, 29.86, 30.73, 31.07, 31.45, 31.30, 30.83,
30.39, 29.85, 29.71, 29.03, 29.17, 29.65, 29.69, 30.03, 30.61, 30.53, 30.66,
30.61, 30.00, 30.29, 30.22, 30.33, 29.89, 30.14, 29.73, 29.69, 29.29, 29.63,
29.74, 29.90, 30.34, 30.33, 30.17, 30.68, 30.48, 30.14, 30.17, 29.84, 29.60,
29.42, 29.52, 29.58, 29.93, 30.23, 30.32, 30.08, 30.07, 30.19, ] sd=10.29},
m1(24)nm{n=24 m=[30.11, 29.08, 29.74, 30.78, 31.00, 29.19, 30.32, 29.58, 32.11,
28.30, 30.84, 31.58, 29.55, 31.57, 29.76, 28.74, 30.62, 30.77, 28.26, 30.34,
30.37, 29.99, 30.40, 29.25, 30.34, 29.44, 29.60, 30.14, 30.74, 28.89, 28.77,
30.09, 31.48, 29.67, 30.41, 29.71, 28.84, 30.94, 29.15, 30.52, 30.90, 30.81,
30.62, 29.03, 30.15, 28.98, 29.31, 30.89, 30.24, 30.93, 29.62, 29.40, 30.75,
29.96, 29.20, 29.42, 30.54, 28.96, 29.14, 30.91, ] sd=3.31}, m2(1)nm{n=1
m=[34.56, 35.50, 35.70, 24.69, 26.72, 29.56, 35.98, 33.22, 30.46, 34.71, 31.30,
27.79, 30.69, 29.01, 30.45, 26.65, 25.13, 24.33, 24.76, 29.19, 29.41, 34.65,
24.55, 34.29, 35.65, 27.11, 26.88, 24.27, 25.91, 33.84, 30.68, 35.34, 27.10,
30.66, 28.98, 32.05, 31.93, 25.44, 34.23, 31.35, 25.31, 34.49, 30.31, 25.37,
24.90, 28.54, 27.66, 28.28, 26.39, 32.40, 30.71, 24.88, 26.75, 26.43, 34.04,
25.96, 28.24, 26.45, 24.84, 32.17, ] sd=0.00}, m3(39)nm{n=39 m=[31.07, 31.11,
31.16, 30.63, 29.44, 31.00, 29.92, 29.87, 30.47, 30.17, 29.28, 29.11, 30.16,
29.14, 29.82, 29.62, 28.82, 29.92, 30.81, 30.39, 28.98, 30.23, 29.66, 30.29,
30.41, 30.36, 29.13, 29.93, 28.88, 29.86, 31.30, 29.80, 29.24, 29.69, 30.33,
29.70, 30.01, 30.54, 29.73, 28.76, 29.12, 31.08, 29.37, 29.95, 29.57, 30.29,
30.17, 29.68, 30.21, 30.50, 31.79, 31.14, 30.38, 29.46, 30.67, 30.98, 29.30,
29.95, 30.11, 29.89, ] sd=3.42}, m4(9)nm{n=9 m=[31.50, 29.89, 29.70, 29.40,
27.83, 27.77, 30.04, 30.11, 27.86, 29.77, 29.19, 30.88, 29.93, 28.00, 31.38,
29.89, 30.30, 28.39, 31.32, 30.12, 29.93, 29.82, 31.64, 30.74, 28.24, 27.82,
30.39, 30.53, 30.52, 31.72, 31.60, 28.06, 28.78, 28.15, 30.23, 31.21, 30.85,
26.93, 31.35, 30.22, 29.07, 28.78, 29.45, 30.95, 30.44, 29.21, 33.02, 28.94,
28.88, 26.63, 29.67, 33.16, 29.15, 28.66, 30.33, 30.65, 30.13, 31.19, 29.08,
27.55, ] sd=3.06},
sample[2]= m0(1127)nm{n=600 m=[30.02, 30.91, 31.90, 32.23, 31.76, 31.19, 30.26,
29.15, 28.46, 28.33, 28.15, 29.00, 29.87, 30.61, 30.94, 31.19, 31.09, 30.72,
30.33, 29.91, 29.69, 29.17, 29.28, 29.70, 29.75, 29.99, 30.46, 30.47, 30.54,
30.51, 30.06, 30.23, 30.18, 30.23, 29.94, 30.11, 29.73, 29.75, 29.35, 29.62,
29.73, 30.01, 30.28, 30.25, 30.13, 30.56, 30.45, 30.12, 30.15, 29.88, 29.75,
29.58, 29.61, 29.56, 29.96, 30.25, 30.26, 30.04, 30.01, 30.16, ] sd=9.74},