Author: smarthi
Date: Sun May 12 05:25:54 2013
New Revision: 1481471
URL: http://svn.apache.org/r1481471
Log:
Mahout-1199: Improve javadoc comments of mahout-integration
Modified:
mahout/trunk/CHANGELOG
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWrapper.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWriter.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/IOWriterWrapper.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/WrappedWriter.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/TermInfo.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
Modified: mahout/trunk/CHANGELOG
URL:
http://svn.apache.org/viewvc/mahout/trunk/CHANGELOG?rev=1481471&r1=1481470&r2=1481471&view=diff
==============================================================================
--- mahout/trunk/CHANGELOG (original)
+++ mahout/trunk/CHANGELOG Sun May 12 05:25:54 2013
@@ -2,6 +2,8 @@ Mahout Change Log
Release 0.8 - unreleased
+ MAHOUT-1199: Improve javadoc comments of mahout-integration (Angel Martinez
Gonzalez via smarthi)
+
MAHOUT-1162: Adding BallKMeans and StreamingKMeans clustering algorithms
(dfilimon)
MAHOUT-1205: ParallelALSFactorizationJob should leverage the distributed
cache (ssc)
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java?rev=1481471&r1=1481470&r2=1481471&view=diff
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java
(original)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java
Sun May 12 05:25:54 2013
@@ -22,6 +22,10 @@ import java.nio.charset.Charset;
import java.util.Map;
import java.util.regex.Pattern;
+/**
+ * Configuration options to be used by {@link MailProcessor}. Includes options
controlling the exact output format
+ * and which mail fields are included (body, to, from, subject, etc.)
+ */
public class MailOptions {
public static final String FROM = "FROM";
@@ -58,6 +62,9 @@ public class MailOptions {
return outputDir;
}
+ /**
+ * Sets the output directory where sequence files will be written.
+ */
public void setOutputDir(String outputDir) {
this.outputDir = outputDir;
}
@@ -66,6 +73,10 @@ public class MailOptions {
return prefix;
}
+ /**
+ * Sets the prefix that is combined with the archive name and with message
ids to create {@code SequenceFile} keys.
+ * @param prefix The name of the directory containing the mail archive is
commonly used.
+ */
public void setPrefix(String prefix) {
this.prefix = prefix;
}
@@ -74,6 +85,9 @@ public class MailOptions {
return chunkSize;
}
+ /**
+ * Sets the size of each generated sequence file, in Megabytes.
+ */
public void setChunkSize(int chunkSize) {
this.chunkSize = chunkSize;
}
@@ -82,6 +96,9 @@ public class MailOptions {
return charset;
}
+ /**
+ * Sets the encoding of the input
+ */
public void setCharset(Charset charset) {
this.charset = charset;
}
@@ -90,6 +107,9 @@ public class MailOptions {
return separator;
}
+ /**
+ * Sets the separator to use in the output between metadata items (to, from,
etc.).
+ */
public void setSeparator(String separator) {
this.separator = separator;
}
@@ -98,6 +118,9 @@ public class MailOptions {
return bodySeparator;
}
+ /**
+ * Sets the separator to use in the output between lines in the body, the
default is "\n".
+ */
public void setBodySeparator(String bodySeparator) {
this.bodySeparator = bodySeparator;
}
@@ -106,6 +129,9 @@ public class MailOptions {
return includeBody;
}
+ /**
+ * Sets whether mail bodies are included in the output
+ */
public void setIncludeBody(boolean includeBody) {
this.includeBody = includeBody;
}
@@ -114,6 +140,10 @@ public class MailOptions {
return patternsToMatch;
}
+ /**
+ * Sets the list of patterns to be applied in the given order to extract
metadata fields (to, from, subject, etc.)
+ * from the input
+ */
public void setPatternsToMatch(Pattern[] patternsToMatch) {
this.patternsToMatch = patternsToMatch;
}
@@ -136,7 +166,7 @@ public class MailOptions {
/**
*
- * @param stripQuotedText if true, then strip off quoted text, such as lines
starting with | or >
+ * Sets whether quoted text such as lines starting with | or > is stripped
off.
*/
public void setStripQuotedText(boolean stripQuotedText) {
this.stripQuotedText = stripQuotedText;
@@ -147,10 +177,8 @@ public class MailOptions {
}
/**
+ * Sets the {@link java.util.regex.Pattern} to use to identify lines that
are quoted text. Default is | and >
* @see #setStripQuotedText(boolean)
- *
- * @param quotedTextPattern The {@link java.util.regex.Pattern} to use to
identify lines that are quoted text.
- * Default is | and >
*/
public void setQuotedTextPattern(Pattern quotedTextPattern) {
this.quotedTextPattern = quotedTextPattern;
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java?rev=1481471&r1=1481470&r2=1481471&view=diff
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java
(original)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java
Sun May 12 05:25:54 2013
@@ -32,6 +32,11 @@ import java.io.Writer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+/**
+ * Converts an mbox mail archive into a group of Hadoop Sequence Files with
equal size. The archive may optionally be gzipped or zipped.
+ * @see org.apache.mahout.text.SequenceFilesFromMailArchives
+ *
+ */
public class MailProcessor {
private static final Pattern MESSAGE_START = Pattern.compile("^From
\\S+@\\S.*\\d{4}$", Pattern.CASE_INSENSITIVE);
@@ -49,18 +54,31 @@ public class MailProcessor {
private static final Logger log =
LoggerFactory.getLogger(MailProcessor.class);
+ /**
+ * Creates a {@code MailProcessor} that does not write to sequence files,
but to a single text file.
+ * This constructor is for debugging and testing purposes.
+ */
public MailProcessor(MailOptions options, String prefix, Writer writer) {
this.writer = new IOWriterWrapper(writer);
this.options = options;
this.prefix = prefix;
}
+ /**
+ * This is the main constructor of {@code MailProcessor}.
+ */
public MailProcessor(MailOptions options, String prefix, ChunkedWriter
writer) {
this.writer = new ChunkedWrapper(writer);
this.options = options;
this.prefix = prefix;
}
+ /**
+ * Parses one complete mail archive, writing output to the {@code writer}
constructor parameter.
+ * @param mboxFile mail archive to parse
+ * @return number of parsed mails
+ * @throws IOException
+ */
public long parseMboxLineByLine(File mboxFile) throws IOException {
long messageCount = 0;
try {
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWrapper.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWrapper.java?rev=1481471&r1=1481470&r2=1481471&view=diff
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWrapper.java
(original)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWrapper.java
Sun May 12 05:25:54 2013
@@ -19,6 +19,9 @@ package org.apache.mahout.utils.io;
import java.io.IOException;
+/**
+ * {@link ChunkedWriter} based implementation of the {@link WrappedWriter}
interface.
+ */
public class ChunkedWrapper implements WrappedWriter {
private final ChunkedWriter writer;
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWriter.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWriter.java?rev=1481471&r1=1481470&r2=1481471&view=diff
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWriter.java
(original)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWriter.java
Sun May 12 05:25:54 2013
@@ -26,6 +26,11 @@ import org.apache.hadoop.io.Text;
import java.io.Closeable;
import java.io.IOException;
+/**
+ * Writes data split into multiple Hadoop sequence files of approximately equal
size. The data must consist
+ * of key-value pairs, both of them of String type. All sequence files are
created in the same
+ * directory and named "chunk-0", "chunk-1", etc.
+ */
public final class ChunkedWriter implements Closeable {
private final int maxChunkSizeInBytes;
@@ -36,6 +41,12 @@ public final class ChunkedWriter impleme
private final FileSystem fs;
private final Configuration conf;
+ /**
+ * @param conf needed by Hadoop to know what filesystem implementation to
use.
+ * @param chunkSizeInMB approximate size of each file, in Megabytes.
+ * @param output directory where the sequence files will be created.
+ * @throws IOException
+ */
public ChunkedWriter(Configuration conf, int chunkSizeInMB, Path output)
throws IOException {
this.output = output;
this.conf = conf;
@@ -52,6 +63,7 @@ public final class ChunkedWriter impleme
return new Path(output, "chunk-" + chunkID);
}
+ /** Writes a new key-value pair, creating a new sequence file if necessary.*/
public void write(String key, String value) throws IOException {
if (currentChunkSize > maxChunkSizeInBytes) {
Closeables.closeQuietly(writer);
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/IOWriterWrapper.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/IOWriterWrapper.java?rev=1481471&r1=1481470&r2=1481471&view=diff
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/IOWriterWrapper.java
(original)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/IOWriterWrapper.java
Sun May 12 05:25:54 2013
@@ -19,7 +19,9 @@ package org.apache.mahout.utils.io;
import java.io.IOException;
import java.io.Writer;
-
+/**
+ * Implementation of the {@link WrappedWriter} interface based on {@link
java.io.Writer}.
+ */
public class IOWriterWrapper implements WrappedWriter {
private final Writer writer;
@@ -28,6 +30,9 @@ public class IOWriterWrapper implements
this.writer = writer;
}
+ /** Writes a new key and value, separating them with one space. The value
must end with a
+ * new line or some other delimiter, as it is not automatically added by
this method
+ */
@Override
public void write(String key, String value) throws IOException {
writer.write(key + ' ' + value);
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/WrappedWriter.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/WrappedWriter.java?rev=1481471&r1=1481470&r2=1481471&view=diff
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/WrappedWriter.java
(original)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/WrappedWriter.java
Sun May 12 05:25:54 2013
@@ -25,6 +25,7 @@ import java.io.IOException;
*/
public interface WrappedWriter extends Closeable {
+ /** Writes a new key-value pair.*/
void write(String key, String value) throws IOException;
}
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java?rev=1481471&r1=1481470&r2=1481471&view=diff
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java
(original)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java
Sun May 12 05:25:54 2013
@@ -37,6 +37,18 @@ import org.slf4j.LoggerFactory;
import java.util.List;
import java.util.Map;
+/**
+ * Converts a vector representation of documents into a {@code document x
term} matrix.
+ * The input data is in {@code SequenceFile<Text,VectorWritable>} format (as
generated by
+ * {@link org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles
SparseVectorsFromSequenceFiles}
+ * or by {@link org.apache.mahout.vectorizer.EncodedVectorsFromSequenceFiles
EncodedVectorsFromSequenceFiles})
+ * and generates the following two files as output:
+ * <ul><li>A file called "matrix" in {@code
SequenceFile<IntWritable,VectorWritable>} format.</li>
+ * <li>A file called "docIndex" in {@code SequenceFile<IntWritable,Text>}
format.</li></ul>
+ * The input file is the join of the two output files on the generated int
key.
+ * In other words, {@code RowIdJob} replaces the document text ids by integers.
+ * The original document text ids can still be retrieved from the "docIndex".
+ */
public class RowIdJob extends AbstractJob {
private static final Logger log = LoggerFactory.getLogger(RowIdJob.class);
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java?rev=1481471&r1=1481470&r2=1481471&view=diff
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java
(original)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java
Sun May 12 05:25:54 2013
@@ -17,6 +17,9 @@
package org.apache.mahout.utils.vectors;
+/**
+ * Each entry in a {@link TermInfo} dictionary. Contains information about a
term.
+ */
public class TermEntry {
private final String term;
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/TermInfo.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/TermInfo.java?rev=1481471&r1=1481470&r2=1481471&view=diff
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/TermInfo.java
(original)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/TermInfo.java
Sun May 12 05:25:54 2013
@@ -19,6 +19,10 @@ package org.apache.mahout.utils.vectors;
import java.util.Iterator;
+/**
+ * Contains the term dictionary information associated with a vectorized
collection of text documents
+ *
+ */
public interface TermInfo {
int totalTerms(String field);
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java?rev=1481471&r1=1481470&r2=1481471&view=diff
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
(original)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
Sun May 12 05:25:54 2013
@@ -44,6 +44,7 @@ import java.util.List;
import java.util.Comparator;
import java.util.regex.Pattern;
+/** Static utility methods related to vectors. */
public final class VectorHelper {
private static final Pattern TAB_PATTERN = Pattern.compile("\t");