Author: sslavic
Date: Sun Nov 3 17:48:57 2013
New Revision: 1538406
URL: http://svn.apache.org/r1538406
Log:
MAHOUT-1343: JSON output format support in cluster dumper
Added:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/JsonClusterWriter.java
(with props)
Modified:
mahout/trunk/CHANGELOG
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
Modified: mahout/trunk/CHANGELOG
URL:
http://svn.apache.org/viewvc/mahout/trunk/CHANGELOG?rev=1538406&r1=1538405&r2=1538406&view=diff
==============================================================================
--- mahout/trunk/CHANGELOG (original)
+++ mahout/trunk/CHANGELOG Sun Nov 3 17:48:57 2013
@@ -2,6 +2,8 @@ Mahout Change Log
Release 0.9 - unreleased
+ MAHOUT-1343: JSON output format support in cluster dumper (Telvis Calhoun via sslavic)
+
MAHOUT-1333: Fixed examples bin directory permissions in distribution
archives (Mike Percy via sslavic)
MAHOUT-1313: Fixed unwanted integral division bug in RowSimilarityJob
downsampling code where precision should have been retained (sslavic)
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java?rev=1538406&r1=1538405&r2=1538406&view=diff
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
(original)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
Sun Nov 3 17:48:57 2013
@@ -63,6 +63,7 @@ public final class ClusterDumper extends
TEXT,
CSV,
GRAPH_ML,
+ JSON,
}
public static final String DICTIONARY_TYPE_OPTION = "dictionaryType";
@@ -104,7 +105,7 @@ public final class ClusterDumper extends
public int run(String[] args) throws Exception {
addInputOption();
addOutputOption();
- addOption(OUTPUT_FORMAT_OPT, "of", "The optional output format for the results. Options: TEXT, CSV or GRAPH_ML",
+ addOption(OUTPUT_FORMAT_OPT, "of", "The optional output format for the results. Options: TEXT, CSV, JSON or GRAPH_ML",
"TEXT");
addOption(SUBSTRING_OPTION, "b", "The number of chars of the asFormatString() to print");
addOption(NUM_WORDS_OPTION, "n", "The number of top terms to print");
@@ -239,12 +240,22 @@ public final class ClusterDumper extends
case GRAPH_ML:
result = new GraphMLClusterWriter(writer, clusterIdToPoints, measure, numTopFeatures, dictionary, subString);
break;
+ case JSON:
+ result = new JsonClusterWriter(writer, clusterIdToPoints, measure, numTopFeatures, dictionary);
+ break;
default:
throw new IllegalStateException("Unknown outputformat: " + outputFormat);
}
return result;
}
+  /**
+   * Overrides the output format this dumper uses; provided as a convenience
+   * for tests.
+   *
+   * @param of the output format to use from now on
+   */
+  public void setOutputFormat(OUTPUT_FORMAT of) {
+    this.outputFormat = of;
+  }
+
private void init() {
if (this.pointsDir != null) {
Configuration conf = new Configuration();
Added:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/JsonClusterWriter.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/JsonClusterWriter.java?rev=1538406&view=auto
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/JsonClusterWriter.java
(added)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/JsonClusterWriter.java
Sun Nov 3 17:48:57 2013
@@ -0,0 +1,177 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.utils.clustering;
+
+import java.io.IOException;
+import java.io.Writer;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+import org.apache.mahout.clustering.AbstractCluster;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.classify.WeightedVectorWritable;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.math.NamedVector;
+import org.apache.mahout.math.Vector;
+import org.codehaus.jackson.map.ObjectMapper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Dump cluster info to JSON formatted lines. Heavily inspired by
+ * ClusterDumperWriter.java and CSVClusterWriter.java
+ *
+ */
+public class JsonClusterWriter extends AbstractClusterWriter {
+  private static final Logger log = LoggerFactory.getLogger(JsonClusterWriter.class);
+  // Characters of Vector.asFormatString() that are replaced by '_' when the
+  // formatted vector is used as a display name.
+  private static final Pattern VEC_PATTERN = Pattern.compile("\\{|\\:|\\,|\\}");
+
+  private final String[] dictionary;
+  private final int numTopFeatures;
+  private final ObjectMapper jxn;
+
+  /**
+   * @param writer            destination for the JSON lines
+   * @param clusterIdToPoints points keyed by the id of the cluster they belong to
+   * @param measure           distance measure handed to the base writer
+   * @param numTopFeatures    maximum number of top terms emitted per cluster
+   * @param dictionary        term dictionary indexed by vector index; may be
+   *                          {@code null}, in which case no top terms are emitted
+   */
+  public JsonClusterWriter(Writer writer,
+      Map<Integer, List<WeightedVectorWritable>> clusterIdToPoints,
+      DistanceMeasure measure, int numTopFeatures, String[] dictionary) {
+    super(writer, clusterIdToPoints, measure);
+    this.numTopFeatures = numTopFeatures;
+    this.dictionary = dictionary;
+    this.jxn = new ObjectMapper();
+  }
+
+  /**
+   * Collects the cluster's top terms, id, formatted representation and points
+   * into a map and writes it as a single JSON formatted line.
+   *
+   * @param clusterWritable wrapper around the cluster to dump
+   * @throws IOException if serializing or writing the line fails
+   */
+  @Override
+  public void write(ClusterWritable clusterWritable) throws IOException {
+    Map<String, Object> res = new HashMap<String, Object>();
+    Cluster cluster = clusterWritable.getValue();
+
+    // get top terms (empty list when no dictionary was supplied)
+    List<Object> topTerms = getTopFeaturesList(cluster.getCenter(), dictionary,
+        numTopFeatures);
+    res.put("top_terms", topTerms);
+
+    // get human-readable cluster representation
+    String fmtStr = cluster.asFormatString(dictionary);
+    res.put("cluster_id", cluster.getId());
+    res.put("cluster", fmtStr);
+
+    // get points
+    List<Object> points = getPoints(cluster, dictionary);
+    res.put("points", points);
+
+    // write JSON, one cluster per line
+    Writer writer = getWriter();
+    writer.write(jxn.writeValueAsString(res) + "\n");
+  }
+
+  /**
+   * Builds the list of top terms of a vector, ordered by descending weight.
+   * Each entry is a map with the keys {@code "term"} and {@code "weight"}.
+   * Only the {@code numTerms} heaviest non-zero elements are considered; an
+   * element whose index has no dictionary entry is logged and skipped.
+   *
+   * @param vector     vector whose non-zero elements are ranked
+   * @param dictionary term dictionary indexed by vector index; when
+   *                   {@code null} an empty list is returned
+   * @param numTerms   maximum number of elements to consider
+   * @return the top term entries, possibly empty, never {@code null}
+   */
+  public List<Object> getTopFeaturesList(Vector vector, String[] dictionary,
+      int numTerms) {
+    List<Object> topTerms = Lists.newLinkedList();
+    if (dictionary == null) {
+      // Without a dictionary there is nothing to translate indices into;
+      // previously this case failed with a NullPointerException.
+      return topTerms;
+    }
+
+    List<TermIndexWeight> vectorTerms = Lists.newArrayList();
+    for (Vector.Element elt : vector.nonZeroes()) {
+      vectorTerms.add(new TermIndexWeight(elt.index(), elt.get()));
+    }
+
+    // Sort results in reverse order (i.e. weight in descending order)
+    Collections.sort(vectorTerms, new Comparator<TermIndexWeight>() {
+      @Override
+      public int compare(TermIndexWeight one, TermIndexWeight two) {
+        return Double.compare(two.weight, one.weight);
+      }
+    });
+
+    for (int i = 0; i < vectorTerms.size() && i < numTerms; i++) {
+      TermIndexWeight candidate = vectorTerms.get(i);
+      int index = candidate.index;
+      // An index past the end of the dictionary is treated like a missing
+      // entry instead of raising ArrayIndexOutOfBoundsException.
+      String dictTerm = index < dictionary.length ? dictionary[index] : null;
+      if (dictTerm == null) {
+        log.error("Dictionary entry missing for {}", index);
+        continue;
+      }
+      Map<String, Object> termEntry = new HashMap<String, Object>();
+      termEntry.put("term", dictTerm);
+      termEntry.put("weight", candidate.weight);
+      topTerms.add(termEntry);
+    }
+
+    return topTerms;
+  }
+
+  /**
+   * Builds the list of points assigned to a cluster. Each entry is a map with
+   * the keys {@code "vector_name"}, {@code "weight"} (the weight rendered as a
+   * string) and {@code "point"} (the formatted vector).
+   *
+   * @param cluster    cluster whose id is used to look up its points
+   * @param dictionary dictionary passed on to the vector formatter
+   * @return the point entries; empty when no points are known for the cluster
+   */
+  public List<Object> getPoints(Cluster cluster, String[] dictionary) {
+    List<Object> vectorObjs = Lists.newLinkedList();
+    List<WeightedVectorWritable> points = getClusterIdToPoints().get(
+        cluster.getId());
+
+    if (points != null) {
+      for (WeightedVectorWritable point : points) {
+        Map<String, Object> entry = new HashMap<String, Object>();
+        Vector theVec = point.getVector();
+        if (theVec instanceof NamedVector) {
+          entry.put("vector_name", ((NamedVector) theVec).getName());
+        } else {
+          String vecStr = theVec.asFormatString();
+          // do some basic manipulations for display
+          vecStr = VEC_PATTERN.matcher(vecStr).replaceAll("_");
+          entry.put("vector_name", vecStr);
+        }
+        entry.put("weight", String.valueOf(point.getWeight()));
+        entry.put("point", AbstractCluster.formatVector(theVec, dictionary));
+        vectorObjs.add(entry);
+      }
+    }
+    return vectorObjs;
+  }
+
+  /**
+   * Pairs a vector index with its weight so terms can be sorted by weight.
+   */
+  private static final class TermIndexWeight {
+    private final int index;
+    private final double weight;
+
+    TermIndexWeight(int index, double weight) {
+      this.index = index;
+      this.weight = weight;
+    }
+  }
+
+}
Propchange:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/JsonClusterWriter.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Modified:
mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java?rev=1538406&r1=1538405&r2=1538406&view=diff
==============================================================================
---
mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
(original)
+++
mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
Sun Nov 3 17:48:57 2013
@@ -201,6 +201,25 @@ public final class TestClusterDumper ext
output, 10), new Path(kmeansOutput, "clusteredPoints"));
clusterDumper.printClusters(termDictionary);
}
+
+  /**
+   * Runs Canopy followed by k-means on the test data and dumps the resulting
+   * clusters through the JSON output format.
+   */
+  @Test
+  public void testJsonClusterDumper() throws Exception {
+    DistanceMeasure distance = new EuclideanDistanceMeasure();
+
+    // Canopy first, to prime the k-means canopies
+    Path outputDir = getTestTempDirPath("output");
+    Configuration configuration = getConfiguration();
+    CanopyDriver.run(configuration, getTestTempDirPath("testdata"), outputDir,
+        distance, 8, 4, false, 0.0, true);
+
+    // then k-means, seeded from the canopy result
+    Path kmeansDir = new Path(outputDir, "kmeans");
+    Path kmeansInput = getTestTempDirPath("testdata");
+    Path initialClusters = new Path(outputDir, "clusters-0-final");
+    KMeansDriver.run(configuration, kmeansInput, initialClusters, kmeansDir,
+        distance, 0.001, 10, true, 0.0, false);
+
+    // finally dump the clusters as JSON
+    Path finalClusters = finalClusterPath(configuration, outputDir, 10);
+    Path clusteredPoints = new Path(kmeansDir, "clusteredPoints");
+    ClusterDumper dumper = new ClusterDumper(finalClusters, clusteredPoints);
+    dumper.setOutputFormat(ClusterDumper.OUTPUT_FORMAT.JSON);
+    dumper.printClusters(termDictionary);
+  }
@Test
public void testFuzzyKmeans() throws Exception {