Author: edwardyoon
Date: Tue Nov 26 08:46:30 2013
New Revision: 1545568
URL: http://svn.apache.org/r1545568
Log:
HAMA-821: Fix bugs in KMeans example and make output more readable
Modified:
hama/trunk/CHANGES.txt
hama/trunk/examples/src/main/java/org/apache/hama/examples/Kmeans.java
hama/trunk/ml/src/main/java/org/apache/hama/ml/kmeans/KMeansBSP.java
hama/trunk/ml/src/test/java/org/apache/hama/ml/kmeans/TestKMeansBSP.java
Modified: hama/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hama/trunk/CHANGES.txt?rev=1545568&r1=1545567&r2=1545568&view=diff
==============================================================================
--- hama/trunk/CHANGES.txt (original)
+++ hama/trunk/CHANGES.txt Tue Nov 26 08:46:30 2013
@@ -8,6 +8,7 @@ Release 0.7.0 (unreleased changes)
BUG FIXES
+ HAMA-821: Fix bugs in KMeans example and make output more readable (edwardyoon)
IMPROVEMENTS
Modified: hama/trunk/examples/src/main/java/org/apache/hama/examples/Kmeans.java
URL: http://svn.apache.org/viewvc/hama/trunk/examples/src/main/java/org/apache/hama/examples/Kmeans.java?rev=1545568&r1=1545567&r2=1545568&view=diff
==============================================================================
--- hama/trunk/examples/src/main/java/org/apache/hama/examples/Kmeans.java (original)
+++ hama/trunk/examples/src/main/java/org/apache/hama/examples/Kmeans.java Tue Nov 26 08:46:30 2013
@@ -17,6 +17,8 @@
*/
package org.apache.hama.examples;
+import java.util.List;
+
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
@@ -54,7 +56,7 @@ import org.apache.hama.ml.kmeans.KMeansB
public class Kmeans {
public static void main(String[] args) throws Exception {
- if (args.length < 4 || args.length != 7) {
+ if (args.length < 4 || (args.length > 4 && args.length != 7)) {
System.out
.println("USAGE: <INPUT_PATH> <OUTPUT_PATH> <MAXITERATIONS> <K (how many centers)> -g [<COUNT> <DIMENSION OF VECTORS>]");
return;
@@ -85,12 +87,18 @@ public class Kmeans {
KMeansBSP.prepareInput(count, k, dimension, conf, in, center, out, fs);
} else {
KMeansBSP.prepareInputText(k, conf, in, center, out, fs);
- in = new Path(args[0], "textinput/in.seq");
+ in = new Path(in.getParent(), "textinput/in.seq");
}
BSPJob job = KMeansBSP.createJob(conf, in, out, true);
// just submit the job
job.waitForCompletion(true);
+
+ List<String> results = KMeansBSP.readOutput(conf, out, fs, 5);
+ for (String line : results) {
+ System.out.println(line);
+ }
+ System.out.println("...");
}
}
Modified: hama/trunk/ml/src/main/java/org/apache/hama/ml/kmeans/KMeansBSP.java
URL: http://svn.apache.org/viewvc/hama/trunk/ml/src/main/java/org/apache/hama/ml/kmeans/KMeansBSP.java?rev=1545568&r1=1545567&r2=1545568&view=diff
==============================================================================
--- hama/trunk/ml/src/main/java/org/apache/hama/ml/kmeans/KMeansBSP.java (original)
+++ hama/trunk/ml/src/main/java/org/apache/hama/ml/kmeans/KMeansBSP.java Tue Nov 26 08:46:30 2013
@@ -28,6 +28,7 @@ import java.util.Random;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
@@ -106,17 +107,16 @@ public final class KMeansBSP
"Centers file must contain at least a single center!");
this.centers = centers.toArray(new DoubleVector[centers.size()]);
-
String distanceClass = peer.getConfiguration().get(DISTANCE_MEASURE_CLASS);
if (distanceClass != null) {
try {
distanceMeasurer = ReflectionUtils.newInstance(distanceClass);
} catch (ClassNotFoundException e) {
- throw new RuntimeException(new StringBuilder("Wrong DistanceMeasurer implementation ").
- append(distanceClass).append(" provided").toString());
+ throw new RuntimeException(new StringBuilder(
+ "Wrong DistanceMeasurer implementation ").append(distanceClass)
+ .append(" provided").toString());
}
- }
- else {
+ } else {
distanceMeasurer = new EuclidianDistance();
}
@@ -177,7 +177,8 @@ public final class KMeansBSP
for (int i = 0; i < msgCenters.length; i++) {
final DoubleVector oldCenter = centers[i];
if (msgCenters[i] != null) {
- double calculateError = oldCenter.subtractUnsafe(msgCenters[i]).abs().sum();
+ double calculateError = oldCenter.subtractUnsafe(msgCenters[i]).abs()
+ .sum();
if (calculateError > 0.0d) {
centers[i] = msgCenters[i];
convergedCounter++;
@@ -366,12 +367,13 @@ public final class KMeansBSP
}
/**
- * Reads the centers outputted from the clustering job.
+ * Reads the cluster centers.
*
* @return an index on the key dimension, and a cluster center on the value.
*/
- public static HashMap<Integer, DoubleVector> readOutput(Configuration conf,
- Path out, Path centerPath, FileSystem fs) throws IOException {
+ public static HashMap<Integer, DoubleVector> readClusterCenters(
+ Configuration conf, Path out, Path centerPath, FileSystem fs)
+ throws IOException {
HashMap<Integer, DoubleVector> centerMap = new HashMap<Integer,
DoubleVector>();
SequenceFile.Reader centerReader = new SequenceFile.Reader(fs, centerPath,
conf);
@@ -385,6 +387,37 @@ public final class KMeansBSP
}
/**
+ * Reads output. The list of output records can be restricted to maxlines.
+ *
+ * @param conf
+ * @param outPath
+ * @param fs
+ * @param maxlines
+ * @return the list of output records
+ * @throws IOException
+ */
+ public static List<String> readOutput(Configuration conf, Path outPath,
+ FileSystem fs, int maxlines) throws IOException {
+ List<String> output = new ArrayList<String>();
+
+ FileStatus[] globStatus = fs.globStatus(new Path(outPath + "/part-*"));
+ for (FileStatus fts : globStatus) {
+ BufferedReader reader = new BufferedReader(new InputStreamReader(
+ fs.open(fts.getPath())));
+ String line = null;
+ while ((line = reader.readLine()) != null) {
+ String[] split = line.split("\t");
+ output.add(split[1] + " belongs to cluster " + split[0]);
+
+ if (output.size() >= maxlines)
+ return output;
+ }
+ }
+
+ return output;
+ }
+
+ /**
* Reads input text files and writes it to a sequencefile.
*/
public static Path prepareInputText(int k, Configuration conf, Path txtIn,
@@ -396,7 +429,7 @@ public final class KMeansBSP
} else {
in = new Path(txtIn, "textinput/in.seq");
}
-
+
if (fs.exists(out))
fs.delete(out, true);
@@ -428,8 +461,8 @@ public final class KMeansBSP
VectorWritable vector = new VectorWritable(vec);
dataWriter.append(vector, value);
if (k > i) {
- assert centerWriter != null;
- centerWriter.append(vector, value);
+ assert centerWriter != null;
+ centerWriter.append(vector, value);
} else {
if (centerWriter != null) {
centerWriter.close();
Modified: hama/trunk/ml/src/test/java/org/apache/hama/ml/kmeans/TestKMeansBSP.java
URL: http://svn.apache.org/viewvc/hama/trunk/ml/src/test/java/org/apache/hama/ml/kmeans/TestKMeansBSP.java?rev=1545568&r1=1545567&r2=1545568&view=diff
==============================================================================
--- hama/trunk/ml/src/test/java/org/apache/hama/ml/kmeans/TestKMeansBSP.java (original)
+++ hama/trunk/ml/src/test/java/org/apache/hama/ml/kmeans/TestKMeansBSP.java Tue Nov 26 08:46:30 2013
@@ -72,8 +72,8 @@ public class TestKMeansBSP extends TestC
assertEquals(true, result);
- HashMap<Integer, DoubleVector> centerMap = KMeansBSP.readOutput(conf,
- out, centerOut, fs);
+ HashMap<Integer, DoubleVector> centerMap = KMeansBSP.readClusterCenters(
+ conf, out, centerOut, fs);
System.out.println(centerMap);
assertEquals(1, centerMap.size());
DoubleVector doubleVector = centerMap.get(0);