Author: smarthi
Date: Sun May 12 05:25:54 2013
New Revision: 1481471

URL: http://svn.apache.org/r1481471
Log:
Mahout-1199: Improve javadoc comments of mahout-integration

Modified:
    mahout/trunk/CHANGELOG
    
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java
    
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java
    
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWrapper.java
    
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWriter.java
    
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/IOWriterWrapper.java
    
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/WrappedWriter.java
    
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java
    
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java
    
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/TermInfo.java
    
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java

Modified: mahout/trunk/CHANGELOG
URL: 
http://svn.apache.org/viewvc/mahout/trunk/CHANGELOG?rev=1481471&r1=1481470&r2=1481471&view=diff
==============================================================================
--- mahout/trunk/CHANGELOG (original)
+++ mahout/trunk/CHANGELOG Sun May 12 05:25:54 2013
@@ -2,6 +2,8 @@ Mahout Change Log
 
 Release 0.8 - unreleased
 
+  MAHOUT-1199: Improve javadoc comments of mahout-integration (Angel Martinez 
Gonzalez via smarthi)  
+
   MAHOUT-1162: Adding BallKMeans and StreamingKMeans clustering algorithms 
(dfilimon)
 
   MAHOUT-1205: ParallelALSFactorizationJob should leverage the distributed 
cache (ssc)

Modified: 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java?rev=1481471&r1=1481470&r2=1481471&view=diff
==============================================================================
--- 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java
 (original)
+++ 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java
 Sun May 12 05:25:54 2013
@@ -22,6 +22,10 @@ import java.nio.charset.Charset;
 import java.util.Map;
 import java.util.regex.Pattern;
 
+/**
+ * Configuration options to be used by {@link MailProcessor}. Includes options 
controlling the exact output format 
+ * and which mail fields are included (body, to, from, subject, etc.)
+ */
 public class MailOptions {
 
   public static final String FROM = "FROM";
@@ -58,6 +62,9 @@ public class MailOptions {
     return outputDir;
   }
 
+  /**
+   * Sets the output directory where sequence files will be written.
+   */
   public void setOutputDir(String outputDir) {
     this.outputDir = outputDir;
   }
@@ -66,6 +73,10 @@ public class MailOptions {
     return prefix;
   }
 
+  /**
+   * Sets the prefix that is combined with the archive name and with message 
ids to create {@code SequenceFile} keys. 
+   * @param prefix The name of the directory containing the mail archive is 
commonly used.
+   */
   public void setPrefix(String prefix) {
     this.prefix = prefix;
   }
@@ -74,6 +85,9 @@ public class MailOptions {
     return chunkSize;
   }
 
+  /**
+   * Sets the size of each generated sequence file, in Megabytes.
+   */
   public void setChunkSize(int chunkSize) {
     this.chunkSize = chunkSize;
   }
@@ -82,6 +96,9 @@ public class MailOptions {
     return charset;
   }
 
+  /**
+   * Sets the encoding of the input
+   */
   public void setCharset(Charset charset) {
     this.charset = charset;
   }
@@ -90,6 +107,9 @@ public class MailOptions {
     return separator;
   }
 
+  /**
+   * Sets the separator to use in the output between metadata items (to, from, 
etc.).
+   */
   public void setSeparator(String separator) {
     this.separator = separator;
   }
@@ -98,6 +118,9 @@ public class MailOptions {
     return bodySeparator;
   }
 
+  /**
+   * Sets the separator to use in the output between lines in the body, the 
default is "\n".
+   */
   public void setBodySeparator(String bodySeparator) {
     this.bodySeparator = bodySeparator;
   }
@@ -106,6 +129,9 @@ public class MailOptions {
     return includeBody;
   }
 
+  /**
+   * Sets whether mail bodies are included in the output
+   */
   public void setIncludeBody(boolean includeBody) {
     this.includeBody = includeBody;
   }
@@ -114,6 +140,10 @@ public class MailOptions {
     return patternsToMatch;
   }
 
+  /**
+   * Sets the list of patterns to be applied in the given order to extract 
metadata fields (to, from, subject, etc.)
+   *  from the input 
+   */
   public void setPatternsToMatch(Pattern[] patternsToMatch) {
     this.patternsToMatch = patternsToMatch;
   }
@@ -136,7 +166,7 @@ public class MailOptions {
 
   /**
    *
-   * @param stripQuotedText if true, then strip off quoted text, such as lines 
starting with | or >
+   * Sets whether quoted text such as lines starting with | or > is stripped 
off.
    */
   public void setStripQuotedText(boolean stripQuotedText) {
     this.stripQuotedText = stripQuotedText;
@@ -147,10 +177,8 @@ public class MailOptions {
   }
 
   /**
+   * Sets the {@link java.util.regex.Pattern} to use to identify lines that 
are quoted text. Default is | and >
    * @see #setStripQuotedText(boolean)
-   *
-   * @param quotedTextPattern The {@link java.util.regex.Pattern} to use to 
identify lines that are quoted text.
-   *                          Default is | and >
    */
   public void setQuotedTextPattern(Pattern quotedTextPattern) {
     this.quotedTextPattern = quotedTextPattern;

Modified: 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java?rev=1481471&r1=1481470&r2=1481471&view=diff
==============================================================================
--- 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java
 (original)
+++ 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java
 Sun May 12 05:25:54 2013
@@ -32,6 +32,11 @@ import java.io.Writer;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+/**
+ * Converts an mbox mail archive into a group of Hadoop Sequence Files with 
equal size. The archive may optionally be gzipped or zipped.
+ * @see org.apache.mahout.text.SequenceFilesFromMailArchives
+ *
+ */
 public class MailProcessor {
 
   private static final Pattern MESSAGE_START = Pattern.compile("^From 
\\S+@\\S.*\\d{4}$", Pattern.CASE_INSENSITIVE);
@@ -49,18 +54,31 @@ public class MailProcessor {
 
   private static final Logger log = 
LoggerFactory.getLogger(MailProcessor.class);
 
+  /**
+   * Creates a {@code MailProcessor} that does not write to sequence files, 
but to a single text file.
+   * This constructor is for debugging and testing purposes.
+   */
   public MailProcessor(MailOptions options, String prefix, Writer writer) {
     this.writer = new IOWriterWrapper(writer);
     this.options = options;
     this.prefix = prefix;
   }
 
+  /**
+   * This is the main constructor of {@code MailProcessor}.
+   */
   public MailProcessor(MailOptions options, String prefix, ChunkedWriter 
writer) {
     this.writer = new ChunkedWrapper(writer);
     this.options = options;
     this.prefix = prefix;
   }
 
+  /**
+   * Parses one complete mail archive, writing output to the {@code writer} 
constructor parameter.
+   * @param mboxFile  mail archive to parse
+   * @return number of parsed mails
+   * @throws IOException
+   */
   public long parseMboxLineByLine(File mboxFile) throws IOException {
     long messageCount = 0;
     try {

Modified: 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWrapper.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWrapper.java?rev=1481471&r1=1481470&r2=1481471&view=diff
==============================================================================
--- 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWrapper.java
 (original)
+++ 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWrapper.java
 Sun May 12 05:25:54 2013
@@ -19,6 +19,9 @@ package org.apache.mahout.utils.io;
 
 import java.io.IOException;
 
+/**
+ * {@link ChunkedWriter} based implementation of the {@link WrappedWriter} 
interface.
+ */
 public class ChunkedWrapper implements WrappedWriter {
 
   private final ChunkedWriter writer;

Modified: 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWriter.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWriter.java?rev=1481471&r1=1481470&r2=1481471&view=diff
==============================================================================
--- 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWriter.java
 (original)
+++ 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWriter.java
 Sun May 12 05:25:54 2013
@@ -26,6 +26,11 @@ import org.apache.hadoop.io.Text;
 import java.io.Closeable;
 import java.io.IOException;
 
+/**
+ * Writes data split into multiple Hadoop sequence files of approximately equal 
size. The data must consist
+ * of key-value pairs, both of them of String type. All sequence files are 
created in the same
+ * directory and named "chunk-0", "chunk-1", etc. 
+ */
 public final class ChunkedWriter implements Closeable {
 
   private final int maxChunkSizeInBytes;
@@ -36,6 +41,12 @@ public final class ChunkedWriter impleme
   private final FileSystem fs;
   private final Configuration conf;
 
+  /** 
+   * @param conf    needed by Hadoop to know what filesystem implementation to 
use.
+   * @param chunkSizeInMB approximate size of each file, in Megabytes.
+   * @param output        directory where the sequence files will be created.
+   * @throws IOException
+   */
   public ChunkedWriter(Configuration conf, int chunkSizeInMB, Path output) 
throws IOException {
     this.output = output;
     this.conf = conf;
@@ -52,6 +63,7 @@ public final class ChunkedWriter impleme
     return new Path(output, "chunk-" + chunkID);
   }
 
+  /** Writes a new key-value pair, creating a new sequence file if necessary.*/
   public void write(String key, String value) throws IOException {
     if (currentChunkSize > maxChunkSizeInBytes) {
       Closeables.closeQuietly(writer);

Modified: 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/IOWriterWrapper.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/IOWriterWrapper.java?rev=1481471&r1=1481470&r2=1481471&view=diff
==============================================================================
--- 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/IOWriterWrapper.java
 (original)
+++ 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/IOWriterWrapper.java
 Sun May 12 05:25:54 2013
@@ -19,7 +19,9 @@ package org.apache.mahout.utils.io;
 
 import java.io.IOException;
 import java.io.Writer;
-
+/**
+ * Implementation of the {@link WrappedWriter} interface based on {@link 
java.io.Writer}.
+ */
 public class IOWriterWrapper implements WrappedWriter {
 
   private final Writer writer;
@@ -28,6 +30,9 @@ public class IOWriterWrapper implements 
     this.writer = writer;
   }
 
+  /** Writes a new key and value, separating them with one space. The value 
must end with a
+   * new line or some other delimiter, as it is not automatically added by 
this method 
+   */
   @Override
   public void write(String key, String value) throws IOException {
     writer.write(key + ' ' + value);

Modified: 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/WrappedWriter.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/WrappedWriter.java?rev=1481471&r1=1481470&r2=1481471&view=diff
==============================================================================
--- 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/WrappedWriter.java
 (original)
+++ 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/WrappedWriter.java
 Sun May 12 05:25:54 2013
@@ -25,6 +25,7 @@ import java.io.IOException;
  */
 public interface WrappedWriter extends Closeable {
 
+  /** Writes a new key-value pair.*/
   void write(String key, String value) throws IOException;
 
 }

Modified: 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java?rev=1481471&r1=1481470&r2=1481471&view=diff
==============================================================================
--- 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java
 (original)
+++ 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java
 Sun May 12 05:25:54 2013
@@ -37,6 +37,18 @@ import org.slf4j.LoggerFactory;
 import java.util.List;
 import java.util.Map;
 
+/**
+ * Converts a vector representation of documents into a {@code document x 
term} matrix.
+ * The input data is in {@code SequenceFile<Text,VectorWritable>} format (as 
generated by 
+ * {@link org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles 
SparseVectorsFromSequenceFiles}
+ * or by {@link org.apache.mahout.vectorizer.EncodedVectorsFromSequenceFiles 
EncodedVectorsFromSequenceFiles})
+ * and generates the following two files as output:
+ * <ul><li>A file called "matrix" in {@code 
SequenceFile<IntWritable,VectorWritable>} format.</li>
+ * <li>A file called "docIndex" in {@code SequenceFile<IntWritable,Text>} 
format.</li></ul>
+ * The input file is the join of the two output files on the generated int 
key. 
+ * In other words, {@code RowIdJob} replaces the document text ids by integers.
+ * The original document text ids can still be retrieved from the "docIndex".
+ */
 public class RowIdJob extends AbstractJob {
   private static final Logger log = LoggerFactory.getLogger(RowIdJob.class);
 

Modified: 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java?rev=1481471&r1=1481470&r2=1481471&view=diff
==============================================================================
--- 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java
 (original)
+++ 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java
 Sun May 12 05:25:54 2013
@@ -17,6 +17,9 @@
 
 package org.apache.mahout.utils.vectors;
 
+/**
+ * Each entry in a {@link TermInfo} dictionary. Contains information about a 
term.
+ */
 public class TermEntry {
 
   private final String term;

Modified: 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/TermInfo.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/TermInfo.java?rev=1481471&r1=1481470&r2=1481471&view=diff
==============================================================================
--- 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/TermInfo.java
 (original)
+++ 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/TermInfo.java
 Sun May 12 05:25:54 2013
@@ -19,6 +19,10 @@ package org.apache.mahout.utils.vectors;
 
 import java.util.Iterator;
 
+/**
+ * Contains the term dictionary information associated with a vectorized 
collection of text documents
+ *
+ */
 public interface TermInfo {
   
   int totalTerms(String field);

Modified: 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java?rev=1481471&r1=1481470&r2=1481471&view=diff
==============================================================================
--- 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
 (original)
+++ 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
 Sun May 12 05:25:54 2013
@@ -44,6 +44,7 @@ import java.util.List;
 import java.util.Comparator;
 import java.util.regex.Pattern;
 
+/** Static utility methods related to vectors. */
 public final class VectorHelper {
 
   private static final Pattern TAB_PATTERN = Pattern.compile("\t");


Reply via email to