Author: srowen
Date: Wed Apr 6 22:02:48 2011
New Revision: 1089646
URL: http://svn.apache.org/viewvc?rev=1089646&view=rev
Log:
MAHOUT-510 part 2: take out more JSON and remove the defunct output selection option
in a few jobs
Added:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/DelimitedTermInfoWriter.java
- copied, changed from r1089224,
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterTermInfoWriter.java
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/TextualVectorWriter.java
- copied, changed from r1089224,
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java
Removed:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterTermInfoWriter.java
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/ModelSerializer.java
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/TestClusterInterface.java
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/ModelSerializer.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/ModelSerializer.java?rev=1089646&r1=1089645&r2=1089646&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/ModelSerializer.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/ModelSerializer.java
Wed Apr 6 22:02:48 2011
@@ -26,7 +26,7 @@ import java.io.IOException;
import java.io.InputStream;
/**
- * Provides the ability to store SGD model-related objects as JSON.
+ * Provides the ability to store SGD model-related objects as binary files.
*/
public final class ModelSerializer {
Modified:
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/TestClusterInterface.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/TestClusterInterface.java?rev=1089646&r1=1089645&r2=1089646&view=diff
==============================================================================
---
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/TestClusterInterface.java
(original)
+++
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/TestClusterInterface.java
Wed Apr 6 22:02:48 2011
@@ -84,14 +84,6 @@ public final class TestClusterInterface
}
@Test
- public void testDirichletNormalModelClusterAsJsonString() {
- double[] d = { 1.1, 2.2, 3.3 };
- Vector m = new DenseVector(d);
- NormalModel model = new NormalModel(5, m, 0.75);
- Cluster cluster = new DirichletCluster(model, 35.0);
- }
-
- @Test
public void testDirichletAsymmetricSampledNormalModelClusterAsFormatString()
{
double[] d = { 1.1, 2.2, 3.3 };
Vector m = new DenseVector(d);
@@ -102,14 +94,6 @@ public final class TestClusterInterface
}
@Test
- public void testDirichletAsymmetricSampledNormalModelClusterAsJsonString() {
- double[] d = { 1.1, 2.2, 3.3 };
- Vector m = new DenseVector(d);
- AsymmetricSampledNormalModel model = new AsymmetricSampledNormalModel(5,
m, m);
- Cluster cluster = new DirichletCluster(model, 35.0);
- }
-
- @Test
public void testDirichletL1ModelClusterAsFormatString() {
double[] d = { 1.1, 2.2, 3.3 };
Vector m = new DenseVector(d);
@@ -120,14 +104,6 @@ public final class TestClusterInterface
}
@Test
- public void testDirichletL1ModelClusterAsJsonString() {
- double[] d = { 1.1, 2.2, 3.3 };
- Vector m = new DenseVector(d);
- L1Model model = new L1Model(5, m);
- Cluster cluster = new DirichletCluster(model, 35.0);
- }
-
- @Test
public void testCanopyAsFormatString() {
double[] d = { 1.1, 2.2, 3.3 };
Vector m = new DenseVector(d);
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java?rev=1089646&r1=1089645&r2=1089646&view=diff
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
(original)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
Wed Apr 6 22:02:48 2011
@@ -54,7 +54,7 @@ public final class VectorHelper {
boolean namesAsComments,
Appendable bldr) throws IOException {
if (namesAsComments && vector instanceof NamedVector){
- bldr.append("#").append(((NamedVector)vector).getName()).append('\n');
+ bldr.append('#').append(((NamedVector) vector).getName()).append('\n');
}
Iterator<Vector.Element> iter = vector.iterator();
boolean first = true;
@@ -62,46 +62,13 @@ public final class VectorHelper {
if (first) {
first = false;
} else {
- bldr.append(",");
+ bldr.append(',');
}
Vector.Element elt = iter.next();
bldr.append(String.valueOf(elt.get()));
}
bldr.append('\n');
}
-
-
- /**
- * @return a String from a vector that fills in the values with the
appropriate value from a dictionary where
- * each the ith entry is the term for the ith vector cell.
- */
- public static String vectorToJSONString(Vector vector, String[] dictionary) {
- StringBuilder bldr = new StringBuilder(2048);
-
- if (vector instanceof NamedVector) {
- bldr.append("name: ").append(((NamedVector)
vector).getName()).append('\t');
- }
-
- bldr.append("elts: {");
- Iterator<Vector.Element> iter = vector.iterateNonZero();
- boolean first = true;
- while (iter.hasNext()) {
- if (first) {
- first = false;
- } else {
- bldr.append(", ");
- }
- Vector.Element elt = iter.next();
- if (dictionary != null) {
- bldr.append(dictionary[elt.index()]);
- } else {
- bldr.append(String.valueOf(elt.index()));
- }
- bldr.append(':').append(String.valueOf(elt.get()));
- }
- return bldr.append('}').toString();
- }
-
/**
* Read in a dictionary file. Format is:
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java?rev=1089646&r1=1089645&r2=1089646&view=diff
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java
(original)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java
Wed Apr 6 22:02:48 2011
@@ -43,7 +43,6 @@ import org.apache.hadoop.io.SequenceFile
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.utils.vectors.io.JWriterVectorWriter;
import org.apache.mahout.utils.vectors.io.SequenceFileVectorWriter;
import org.apache.mahout.utils.vectors.io.VectorWriter;
import org.slf4j.Logger;
@@ -85,15 +84,10 @@ public final class Driver {
abuilder.withName("delimiter").withMinimum(1).withMaximum(1).create()).withDescription(
"The delimiter for outputing the
dictionary").withShortName("l").create();
- Option outWriterOpt =
obuilder.withLongName("outputWriter").withRequired(false).withArgument(
-
abuilder.withName("outputWriter").withMinimum(1).withMaximum(1).create()).withDescription(
- "The VectorWriter to use, either seq (SequenceFileVectorWriter -
default) or"
- + "file (Writes to a File)").withShortName("e").create();
-
Option helpOpt = obuilder.withLongName("help").withDescription("Print out
help").withShortName("h")
.create();
Group group =
gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(maxOpt)
-
.withOption(helpOpt).withOption(dictOutOpt).withOption(outWriterOpt).withOption(delimiterOpt)
+ .withOption(helpOpt).withOption(dictOutOpt).withOption(delimiterOpt)
.create();
try {
Parser parser = new Parser();
@@ -116,10 +110,7 @@ public final class Driver {
}
String outDir = cmdLine.getValue(outputOpt).toString();
log.info("Output Dir: {}", outDir);
- String outWriter = null;
- if (cmdLine.hasOption(outWriterOpt)) {
- outWriter = cmdLine.getValue(outWriterOpt).toString();
- }
+
String delimiter = cmdLine.hasOption(delimiterOpt) ?
cmdLine.getValue(delimiterOpt).toString() : "\t";
File dictOut = new File(cmdLine.getValue(dictOutOpt).toString());
ARFFModel model = new MapBackedARFFModel();
@@ -132,10 +123,10 @@ public final class Driver {
});
for (File file : files) {
- writeFile(outWriter, outDir, file, maxDocs, model);
+ writeFile(outDir, file, maxDocs, model);
}
} else {
- writeFile(outWriter, outDir, input, maxDocs, model);
+ writeFile(outDir, input, maxDocs, model);
}
log.info("Dictionary Output file: {}", dictOut);
Map<String,Integer> labels = model.getLabelBindings();
@@ -159,25 +150,14 @@ public final class Driver {
}
}
- private static void writeFile(String outWriter, String outDir, File file,
- long maxDocs, ARFFModel arffModel) throws
IOException {
+ private static void writeFile(String outDir, File file, long maxDocs,
ARFFModel arffModel) throws IOException {
log.info("Converting File: {}", file);
ARFFModel model = new MapBackedARFFModel(arffModel.getWords(),
arffModel.getWordCount() + 1, arffModel
.getNominalMap());
Iterable<Vector> iteratable = new ARFFVectorIterable(file, model);
String outFile = outDir + '/' + file.getName() + ".mvc";
- VectorWriter vectorWriter;
- if (outWriter == null) {
- vectorWriter = getSeqFileWriter(outFile);
- } else {
- if ("file".equals(outWriter)) {
- vectorWriter = new JWriterVectorWriter(
- new OutputStreamWriter(new FileOutputStream(new File(outFile)),
Charset.forName("UTF-8")));
- } else {
- vectorWriter = getSeqFileWriter(outFile);
- }
- }
+ VectorWriter vectorWriter = getSeqFileWriter(outFile);
try {
long numDocs = vectorWriter.write(iteratable, maxDocs);
log.info("Wrote: {} vectors", numDocs);
Copied:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/DelimitedTermInfoWriter.java
(from r1089224,
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterTermInfoWriter.java)
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/DelimitedTermInfoWriter.java?p2=mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/DelimitedTermInfoWriter.java&p1=mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterTermInfoWriter.java&r1=1089224&r2=1089646&rev=1089646&view=diff
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterTermInfoWriter.java
(original)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/DelimitedTermInfoWriter.java
Wed Apr 6 22:02:48 2011
@@ -24,15 +24,15 @@ import org.apache.mahout.utils.vectors.T
import org.apache.mahout.utils.vectors.TermInfo;
/**
- * Write ther TermInfo out to a {@link java.io.Writer}
+ * Write {@link TermInfo} to a {@link Writer} in a textual, delimited format
with header.
*/
-public class JWriterTermInfoWriter implements TermInfoWriter {
+public class DelimitedTermInfoWriter implements TermInfoWriter {
private final Writer writer;
private final String delimiter;
private final String field;
- public JWriterTermInfoWriter(Writer writer, String delimiter, String field) {
+ public DelimitedTermInfoWriter(Writer writer, String delimiter, String
field) {
this.writer = writer;
this.delimiter = delimiter;
this.field = field;
@@ -44,9 +44,9 @@ public class JWriterTermInfoWriter imple
Iterator<TermEntry> entIter = ti.getAllEntries();
writer.write(String.valueOf(ti.totalTerms(field)));
- writer.write("\n");
+ writer.write('\n');
writer.write("#term" + delimiter + "doc freq" + delimiter + "idx");
- writer.write("\n");
+ writer.write('\n');
while (entIter.hasNext()) {
TermEntry entry = entIter.next();
writer.write(entry.getTerm());
@@ -54,7 +54,7 @@ public class JWriterTermInfoWriter imple
writer.write(String.valueOf(entry.getDocFreq()));
writer.write(delimiter);
writer.write(String.valueOf(entry.getTermIdx()));
- writer.write("\n");
+ writer.write('\n');
}
writer.flush();
writer.close();
Copied:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/TextualVectorWriter.java
(from r1089224,
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java)
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/TextualVectorWriter.java?p2=mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/TextualVectorWriter.java&p1=mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java&r1=1089224&r2=1089646&rev=1089646&view=diff
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java
(original)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/TextualVectorWriter.java
Wed Apr 6 22:02:48 2011
@@ -23,14 +23,20 @@ import java.io.Writer;
import org.apache.mahout.math.Vector;
/**
- * Write out the vectors to any {@link java.io.Writer} using {@link
org.apache.mahout.math.Vector#asFormatString()}.
+ * Write out the vectors to any {@link Writer} using {@link
Vector#asFormatString()},
+ * one per line by default.
*/
-public class JWriterVectorWriter implements VectorWriter {
- protected final Writer writer;
+public class TextualVectorWriter implements VectorWriter {
+
+ private final Writer writer;
- public JWriterVectorWriter(Writer writer) {
+ public TextualVectorWriter(Writer writer) {
this.writer = writer;
}
+
+ protected Writer getWriter() {
+ return writer;
+ }
@Override
public long write(Iterable<Vector> iterable) throws IOException {
@@ -40,25 +46,20 @@ public class JWriterVectorWriter impleme
@Override
public long write(Iterable<Vector> iterable, long maxDocs) throws
IOException {
long result = 0;
-
for (Vector vector : iterable) {
if (result >= maxDocs) {
break;
}
- formatVector(vector);
+ write(vector);
result++;
}
return result;
}
- protected void formatVector(Vector vector) throws IOException {
- writer.write(vector.asFormatString());
- writer.write('\n');
- }
-
@Override
public void write(Vector vector) throws IOException {
- formatVector(vector);
+ writer.write(vector.asFormatString());
+ writer.write('\n');
}
@Override
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java?rev=1089646&r1=1089645&r2=1089646&view=diff
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
(original)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
Wed Apr 6 22:02:48 2011
@@ -43,8 +43,7 @@ import org.apache.lucene.store.FSDirecto
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.utils.vectors.TermInfo;
-import org.apache.mahout.utils.vectors.io.JWriterTermInfoWriter;
-import org.apache.mahout.utils.vectors.io.JWriterVectorWriter;
+import org.apache.mahout.utils.vectors.io.DelimitedTermInfoWriter;
import org.apache.mahout.utils.vectors.io.SequenceFileVectorWriter;
import org.apache.mahout.utils.vectors.io.VectorWriter;
import org.apache.mahout.vectorizer.TF;
@@ -103,12 +102,6 @@ public final class Driver {
"The maximum number of vectors to output. If not specified, then it
will loop over all docs")
.withShortName("m").create();
- Option outWriterOpt =
obuilder.withLongName("outputWriter").withRequired(false).withArgument(
-
abuilder.withName("outputWriter").withMinimum(1).withMaximum(1).create()).withDescription(
- "The VectorWriter to use, either seq "
- + "(SequenceFileVectorWriter - default) or file (Writes to a File)")
- .withShortName("e").create();
-
Option minDFOpt =
obuilder.withLongName("minDF").withRequired(false).withArgument(
abuilder.withName("minDF").withMinimum(1).withMaximum(1).create()).withDescription(
"The minimum document frequency. Default is
1").withShortName("md").create();
@@ -123,7 +116,7 @@ public final class Driver {
Group group =
gbuilder.withName("Options").withOption(inputOpt).withOption(idFieldOpt).withOption(
outputOpt).withOption(delimiterOpt).withOption(helpOpt).withOption(fieldOpt).withOption(maxOpt)
-
.withOption(dictOutOpt).withOption(powerOpt).withOption(outWriterOpt).withOption(maxDFPercentOpt)
+
.withOption(dictOutOpt).withOption(powerOpt).withOption(maxDFPercentOpt)
.withOption(weightOpt).withOption(minDFOpt).create();
try {
@@ -209,18 +202,7 @@ public final class Driver {
String outFile = cmdLine.getValue(outputOpt).toString();
log.info("Output File: {}", outFile);
- VectorWriter vectorWriter;
- if (cmdLine.hasOption(outWriterOpt)) {
- String outWriter = cmdLine.getValue(outWriterOpt).toString();
- if ("file".equals(outWriter)) {
- Writer writer = new OutputStreamWriter(new FileOutputStream(new
File(outFile)), Charset.forName("UTF8"));
- vectorWriter = new JWriterVectorWriter(writer);
- } else {
- vectorWriter = getSeqFileWriter(outFile);
- }
- } else {
- vectorWriter = getSeqFileWriter(outFile);
- }
+ VectorWriter vectorWriter = getSeqFileWriter(outFile);
long numDocs = vectorWriter.write(iterable, maxDocs);
vectorWriter.close();
@@ -231,7 +213,7 @@ public final class Driver {
File dictOutFile = new File(cmdLine.getValue(dictOutOpt).toString());
log.info("Dictionary Output file: {}", dictOutFile);
Writer writer = new OutputStreamWriter(new
FileOutputStream(dictOutFile), Charset.forName("UTF8"));
- JWriterTermInfoWriter tiWriter = new JWriterTermInfoWriter(writer,
delimiter, field);
+ DelimitedTermInfoWriter tiWriter = new DelimitedTermInfoWriter(writer,
delimiter, field);
tiWriter.write(termInfo);
tiWriter.close();
writer.close();
Modified:
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java?rev=1089646&r1=1089645&r2=1089646&view=diff
==============================================================================
---
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java
(original)
+++
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java
Wed Apr 6 22:02:48 2011
@@ -20,7 +20,7 @@ import org.apache.mahout.math.Vector;
import org.apache.mahout.utils.MahoutTestCase;
import org.apache.mahout.utils.vectors.RandomVectorIterable;
import org.apache.mahout.utils.vectors.VectorHelper;
-import org.apache.mahout.utils.vectors.io.JWriterVectorWriter;
+import org.apache.mahout.utils.vectors.io.TextualVectorWriter;
import org.junit.Test;
import java.io.IOException;
@@ -30,15 +30,14 @@ import java.io.StringWriter;
public class CSVVectorIterableTest extends MahoutTestCase {
@Test
- public void test() throws Exception {
+ public void testCount() throws Exception {
StringWriter sWriter = new StringWriter();
- JWriterVectorWriter jwvw = new JWriterVectorWriter(sWriter) {
-
+ TextualVectorWriter jwvw = new TextualVectorWriter(sWriter) {
@Override
- protected void formatVector(Vector vector) throws IOException {
+ public void write(Vector vector) throws IOException {
String vecStr = VectorHelper.vectorToCSVString(vector, false);
- writer.write(vecStr);
+ getWriter().write(vecStr);
}
};
Iterable<Vector> iter = new RandomVectorIterable(50);
Modified:
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java?rev=1089646&r1=1089645&r2=1089646&view=diff
==============================================================================
---
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java
(original)
+++
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java
Wed Apr 6 22:02:48 2011
@@ -52,9 +52,9 @@ public final class VectorWriterTest exte
}
@Test
- public void test() throws Exception {
+ public void testTextOutputSize() throws Exception {
StringWriter strWriter = new StringWriter();
- VectorWriter writer = new JWriterVectorWriter(strWriter);
+ VectorWriter writer = new TextualVectorWriter(strWriter);
Collection<Vector> vectors = new ArrayList<Vector>();
vectors.add(new DenseVector(new double[]{0.3, 1.5, 4.5}));
vectors.add(new DenseVector(new double[]{1.3, 1.5, 3.5}));