svn commit: r1538406 - in /mahout/trunk: ./ integration/src/main/java/org/apache/mahout/utils/clustering/ integration/src/test/java/org/apache/mahout/clustering/

sslavic Sun, 03 Nov 2013 09:50:00 -0800

Author: sslavic
Date: Sun Nov  3 17:48:57 2013
New Revision: 1538406

URL: http://svn.apache.org/r1538406
Log:
MAHOUT-1343: JSON output format support in cluster dumper


Added:
    
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/JsonClusterWriter.java
   (with props)
Modified:
    mahout/trunk/CHANGELOG
    
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
    
mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java

Modified: mahout/trunk/CHANGELOG
URL: 
http://svn.apache.org/viewvc/mahout/trunk/CHANGELOG?rev=1538406&r1=1538405&r2=1538406&view=diff
==============================================================================
--- mahout/trunk/CHANGELOG (original)
+++ mahout/trunk/CHANGELOG Sun Nov  3 17:48:57 2013
@@ -2,6 +2,8 @@ Mahout Change Log
 
 Release 0.9 - unreleased
 
+  MAHOUT-1343: JSON output format support in cluster dumper (Telvis Calhoun via sslavic)
+
   MAHOUT-1333: Fixed examples bin directory permissions in distribution archives (Mike Percy via sslavic)
 
   MAHOUT-1313: Fixed unwanted integral division bug in RowSimilarityJob downsampling code where precision should have been retained (sslavic) 

Modified: 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java?rev=1538406&r1=1538405&r2=1538406&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java Sun Nov  3 17:48:57 2013
@@ -63,6 +63,7 @@ public final class ClusterDumper extends
     TEXT,
     CSV,
     GRAPH_ML,
+    JSON,
   }
 
   public static final String DICTIONARY_TYPE_OPTION = "dictionaryType";
@@ -104,7 +105,7 @@ public final class ClusterDumper extends
   public int run(String[] args) throws Exception {
     addInputOption();
     addOutputOption();
-    addOption(OUTPUT_FORMAT_OPT, "of", "The optional output format for the results.  Options: TEXT, CSV or GRAPH_ML",
+    addOption(OUTPUT_FORMAT_OPT, "of", "The optional output format for the results.  Options: TEXT, CSV, JSON or GRAPH_ML",
         "TEXT");
     addOption(SUBSTRING_OPTION, "b", "The number of chars of the asFormatString() to print");
     addOption(NUM_WORDS_OPTION, "n", "The number of top terms to print");
@@ -239,12 +240,22 @@ public final class ClusterDumper extends
       case GRAPH_ML:
         result = new GraphMLClusterWriter(writer, clusterIdToPoints, measure, numTopFeatures, dictionary, subString);
         break;
+      case JSON:
+        result = new JsonClusterWriter(writer, clusterIdToPoints, measure, numTopFeatures, dictionary);
+        break;
       default:
         throw new IllegalStateException("Unknown outputformat: " + outputFormat);
     }
     return result;
   }
 
+  /**
+   * Convenience function to set the output format during testing.
+   */
+  public void setOutputFormat(OUTPUT_FORMAT of) {
+    outputFormat = of;
+  }
+
   private void init() {
     if (this.pointsDir != null) {
       Configuration conf = new Configuration();

Added: 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/JsonClusterWriter.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/JsonClusterWriter.java?rev=1538406&view=auto
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/JsonClusterWriter.java (added)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/JsonClusterWriter.java Sun Nov  3 17:48:57 2013
@@ -0,0 +1,177 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.utils.clustering;
+
+import java.io.IOException;
+import java.io.Writer;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+import org.apache.mahout.clustering.AbstractCluster;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.classify.WeightedVectorWritable;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.math.NamedVector;
+import org.apache.mahout.math.Vector;
+import org.codehaus.jackson.map.ObjectMapper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Dump cluster info to JSON formatted lines. Heavily inspired by
+ * ClusterDumperWriter.java and CSVClusterWriter.java
+ *
+ */
+public class JsonClusterWriter extends AbstractClusterWriter {
+  private final String[] dictionary;
+  private final int numTopFeatures;
+  private final ObjectMapper jxn;
+
+  private static final Logger log = LoggerFactory
+      .getLogger(JsonClusterWriter.class);
+  private static final Pattern VEC_PATTERN = Pattern.compile("\\{|\\:|\\,|\\}");
+
+  public JsonClusterWriter(Writer writer,
+      Map<Integer, List<WeightedVectorWritable>> clusterIdToPoints,
+      DistanceMeasure measure, int numTopFeatures, String[] dictionary) {
+    super(writer, clusterIdToPoints, measure);
+    this.numTopFeatures = numTopFeatures;
+    this.dictionary = dictionary;
+    jxn = new ObjectMapper();
+  }
+
+  /**
+   * Generate HashMap with cluster info and write as a single JSON formatted
+   * line
+   */
+  @Override
+  public void write(ClusterWritable clusterWritable) throws IOException {
+    HashMap<String, Object> res = new HashMap<String, Object>();
+
+    // get top terms
+    List<Object> topTerms = getTopFeaturesList(clusterWritable.getValue()
+        .getCenter(), dictionary, numTopFeatures);
+    res.put("top_terms", topTerms);
+
+    // get human-readable cluster representation
+    Cluster cluster = clusterWritable.getValue();
+    String fmtStr = cluster.asFormatString(dictionary);
+    res.put("cluster_id", cluster.getId());
+    res.put("cluster", fmtStr);
+
+    // get points
+    List<Object> points = getPoints(cluster, dictionary);
+    res.put("points", points);
+
+    // write JSON
+    Writer writer = getWriter();
+    writer.write(jxn.writeValueAsString(res) + "\n");
+  }
+
+  /**
+   * Create a List of HashMaps containing top terms information
+   *
+   * @return List<Object>
+   */
+  public List<Object> getTopFeaturesList(Vector vector, String[] dictionary,
+      int numTerms) {
+
+    List<TermIndexWeight> vectorTerms = Lists.newArrayList();
+
+    for (Vector.Element elt : vector.nonZeroes()) {
+      vectorTerms.add(new TermIndexWeight(elt.index(), elt.get()));
+    }
+
+    // Sort results in reverse order (i.e. weight in descending order)
+    Collections.sort(vectorTerms, new Comparator<TermIndexWeight>() {
+      @Override
+      public int compare(TermIndexWeight one, TermIndexWeight two) {
+        return Double.compare(two.weight, one.weight);
+      }
+    });
+
+    List<Object> topTerms = Lists.newLinkedList();
+
+    for (int i = 0; i < vectorTerms.size() && i < numTerms; i++) {
+      int index = vectorTerms.get(i).index;
+      String dictTerm = dictionary[index];
+      if (dictTerm == null) {
+        log.error("Dictionary entry missing for {}", index);
+        continue;
+      }
+      HashMap<String, Object> term_entry = new HashMap<String, Object>();
+      term_entry.put("term", dictTerm);
+      term_entry.put("weight", vectorTerms.get(i).weight);
+      topTerms.add(term_entry);
+    }
+
+    return topTerms;
+  }
+
+  /**
+   * Create a List of HashMaps containing Vector point information
+   *
+   * @return List<Object>
+   */
+  public List<Object> getPoints(Cluster cluster, String[] dictionary) {
+    List<Object> vectorObjs = Lists.newLinkedList();
+    List<WeightedVectorWritable> points = getClusterIdToPoints().get(
+        cluster.getId());
+
+    if (points != null) {
+      for (WeightedVectorWritable point : points) {
+        HashMap<String, Object> entry = new HashMap<String, Object>();
+        Vector theVec = point.getVector();
+        if (theVec instanceof NamedVector) {
+          entry.put("vector_name", ((NamedVector) theVec).getName());
+        } else {
+          String vecStr = theVec.asFormatString();
+          // do some basic manipulations for display
+          vecStr = VEC_PATTERN.matcher(vecStr).replaceAll("_");
+          entry.put("vector_name", vecStr);
+        }
+        entry.put("weight", String.valueOf(point.getWeight()));
+        entry.put("point",
+            AbstractCluster.formatVector(point.getVector(), dictionary));
+        vectorObjs.add(entry);
+      }
+    }
+    return vectorObjs;
+  }
+
+  /**
+   * Convenience class for sorting terms
+   *
+   */
+  private static class TermIndexWeight {
+    private final int index;
+    private final double weight;
+
+    TermIndexWeight(int index, double weight) {
+      this.index = index;
+      this.weight = weight;
+    }
+  }
+
+}

Propchange: 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/JsonClusterWriter.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: 
mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java?rev=1538406&r1=1538405&r2=1538406&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java Sun Nov  3 17:48:57 2013
@@ -201,6 +201,25 @@ public final class TestClusterDumper ext
         output, 10), new Path(kmeansOutput, "clusteredPoints"));
     clusterDumper.printClusters(termDictionary);
   }
+
+  @Test
+  public void testJsonClusterDumper() throws Exception {
+    DistanceMeasure measure = new EuclideanDistanceMeasure();
+    // now run the Canopy job to prime kMeans canopies
+    Path output = getTestTempDirPath("output");
+    Configuration conf = getConfiguration();
+    CanopyDriver.run(conf, getTestTempDirPath("testdata"), output, measure, 8,
+        4, false, 0.0, true);
+    // now run the KMeans job
+    Path kmeansOutput = new Path(output, "kmeans");
+    KMeansDriver.run(conf, getTestTempDirPath("testdata"), new Path(output,
+        "clusters-0-final"), kmeansOutput, measure, 0.001, 10, true, 0.0, false);
+    // run ClusterDumper
+    ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
+        output, 10), new Path(kmeansOutput, "clusteredPoints"));
+    clusterDumper.setOutputFormat(ClusterDumper.OUTPUT_FORMAT.JSON);
+    clusterDumper.printClusters(termDictionary);
+  }
   
   @Test
   public void testFuzzyKmeans() throws Exception {

svn commit: r1538406 - in /mahout/trunk: ./ integration/src/main/java/org/apache/mahout/utils/clustering/ integration/src/test/java/org/apache/mahout/clustering/

Reply via email to