Author: gsingers
Date: Thu Nov 3 04:01:49 2011
New Revision: 1196934
URL: http://svn.apache.org/viewvc?rev=1196934&view=rev
Log:
MAHOUT-867: hook the ability to run basic evaluations into the cluster dumper
Modified:
mahout/trunk/examples/bin/build-reuters.sh
mahout/trunk/integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
Modified: mahout/trunk/examples/bin/build-reuters.sh
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-reuters.sh?rev=1196934&r1=1196933&r2=1196934&view=diff
==============================================================================
--- mahout/trunk/examples/bin/build-reuters.sh (original)
+++ mahout/trunk/examples/bin/build-reuters.sh Thu Nov 3 04:01:49 2011
@@ -100,12 +100,13 @@ if [ "x$clustertype" == "xkmeans" ]; the
-c ${WORK_DIR}/reuters-kmeans-clusters \
-o ${WORK_DIR}/reuters-kmeans \
-dm org.apache.mahout.common.distance.CosineDistanceMeasure \
- -x 10 -k 20 -ow \
+ -x 10 -k 20 -ow --clustering \
&& \
$MAHOUT clusterdump \
-s ${WORK_DIR}/reuters-kmeans/clusters-*-final \
-d ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/dictionary.file-0 \
- -dt sequencefile -b 100 -n 20
+ -dt sequencefile -b 100 -n 20 --evaluate -dm
org.apache.mahout.common.distance.CosineDistanceMeasure \
+ --pointsDir ${WORK_DIR}/reuters-kmeans/clusteredPoints
elif [ "x$clustertype" == "xfuzzykmeans" ]; then
$MAHOUT seq2sparse \
-i ${WORK_DIR}/reuters-out-seqdir/ \
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java?rev=1196934&r1=1196933&r2=1196934&view=diff
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java
(original)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java
Thu Nov 3 04:01:49 2011
@@ -71,6 +71,7 @@ public final class RepresentativePointsD
public int run(String[] args) throws ClassNotFoundException, IOException,
InterruptedException {
addInputOption();
addOutputOption();
+ addOption("clusteredPoints", "cp", "The path to the clustered points",
true);
addOption(DefaultOptionCreator.distanceMeasureOption().create());
addOption(DefaultOptionCreator.maxIterationsOption().create());
addOption(DefaultOptionCreator.methodOption().create());
@@ -85,8 +86,8 @@ public final class RepresentativePointsD
boolean runSequential =
getOption(DefaultOptionCreator.METHOD_OPTION).equalsIgnoreCase(
DefaultOptionCreator.SEQUENTIAL_METHOD);
DistanceMeasure measure = ClassUtils.instantiateAs(distanceMeasureClass,
DistanceMeasure.class);
-
- run(getConf(), input, null, output, measure, maxIterations, runSequential);
+ Path clusteredPoints = new Path(getOption("clusteredPoints"));
+ run(getConf(), input, clusteredPoints, output, measure, maxIterations,
runSequential);
return 0;
}
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java?rev=1196934&r1=1196933&r2=1196934&view=diff
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
(original)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
Thu Nov 3 04:01:49 2011
@@ -27,8 +27,16 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.WeightedVectorWritable;
+import org.apache.mahout.clustering.cdbw.CDbwEvaluator;
+import org.apache.mahout.clustering.evaluation.ClusterEvaluator;
+import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver;
+import org.apache.mahout.clustering.evaluation.RepresentativePointsMapper;
import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
@@ -51,6 +59,8 @@ import java.util.TreeMap;
public final class ClusterDumper extends AbstractJob {
+ protected DistanceMeasure measure;
+
public enum OUTPUT_FORMAT {
TEXT,
CSV,
@@ -64,6 +74,7 @@ public final class ClusterDumper extends
public static final String NUM_WORDS_OPTION = "numWords";
public static final String SUBSTRING_OPTION = "substring";
public static final String SEQ_FILE_DIR_OPTION = "seqFileDir";
+ public static final String EVALUATE_CLUSTERS = "evaluate";
public static final String OUTPUT_FORMAT_OPT = "outputFormat";
@@ -77,6 +88,7 @@ public final class ClusterDumper extends
private int numTopFeatures = 10;
private Map<Integer, List<WeightedVectorWritable>> clusterIdToPoints;
private OUTPUT_FORMAT outputFormat = OUTPUT_FORMAT.TEXT;
+ private boolean runEvaluation;
public ClusterDumper(Path seqFileDir, Path pointsDir) {
this.seqFileDir = seqFileDir;
@@ -104,6 +116,8 @@ public final class ClusterDumper extends
+ "If specified, then the program will output the points
associated with a cluster");
addOption(DICTIONARY_OPTION, "d", "The dictionary file");
addOption(DICTIONARY_TYPE_OPTION, "dt", "The dictionary file type
(text|sequencefile)", "text");
+ addOption(buildOption(EVALUATE_CLUSTERS, "e", "Run ClusterEvaluator and
CDbwEvaluator over the input. The output will be appended to the rest of the
output at the end.", false, false, null));
+ addOption(DefaultOptionCreator.distanceMeasureOption().create());
if (parseArguments(args) == null) {
return -1;
}
@@ -127,12 +141,15 @@ public final class ClusterDumper extends
if (hasOption(OUTPUT_FORMAT_OPT)) {
outputFormat = OUTPUT_FORMAT.valueOf(getOption(OUTPUT_FORMAT_OPT));
}
+ runEvaluation = hasOption(EVALUATE_CLUSTERS);
+ String distanceMeasureClass =
getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
+ measure = ClassUtils.instantiateAs(distanceMeasureClass,
DistanceMeasure.class);
init();
printClusters(null);
return 0;
}
- public void printClusters(String[] dictionary) throws IOException {
+ public void printClusters(String[] dictionary) throws Exception {
Configuration conf = new Configuration();
if (this.termDictionary != null) {
@@ -165,6 +182,28 @@ public final class ClusterDumper extends
long numWritten = clusterWriter.write(new
SequenceFileDirValueIterable<Cluster>(new Path(seqFileDir, "part-*"),
PathType.GLOB, conf));
writer.flush();
+ if (runEvaluation){
+ HadoopUtil.delete(conf, new Path("tmp/representative"));
+ int numIters = 5;
+ RepresentativePointsDriver.main(new String[]{
+ "--input", seqFileDir.toString(),
+ "--output", "tmp/representative",
+ "--clusteredPoints", pointsDir.toString(),
+ "--distanceMeasure", measure.getClass().getName(),
+ "--maxIter", String.valueOf(numIters)//
+ });
+ conf.set(RepresentativePointsDriver.DISTANCE_MEASURE_KEY,
measure.getClass().getName());
+ conf.set(RepresentativePointsDriver.STATE_IN_KEY,
"tmp/representative/representativePoints-" + numIters);
+ ClusterEvaluator ce = new ClusterEvaluator(conf, seqFileDir);
+ writer.append("\n");
+ writer.append("Inter-Cluster Density:
").append(String.valueOf(ce.interClusterDensity())).append("\n");
+ writer.append("Intra-Cluster Density:
").append(String.valueOf(ce.intraClusterDensity())).append("\n");
+ CDbwEvaluator cdbw = new CDbwEvaluator(conf, seqFileDir);
+ writer.append("CDbw Inter-Cluster Density:
").append(String.valueOf(cdbw.interClusterDensity())).append("\n");
+ writer.append("CDbw Intra-Cluster Density:
").append(String.valueOf(cdbw.intraClusterDensity())).append("\n");
+ writer.append("CDbw Separation:
").append(String.valueOf(cdbw.separation())).append("\n");
+ writer.flush();
+ }
log.info("Wrote {} clusters", numWritten);
} finally {
if (shouldClose) {