Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java?rev=1460571&r1=1460570&r2=1460571&view=diff ============================================================================== --- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java (original) +++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java Mon Mar 25 09:50:22 2013 @@ -26,7 +26,6 @@ import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapred.Utils.OutputFileUtils.OutputFilesFilter; import org.apache.hadoop.util.ToolRunner; @@ -48,7 +47,7 @@ import java.util.Iterator; import java.util.Set; /** - * Can read in a {@link SequenceFile} of {@link Vector}s and dump + * Can read in a {@link org.apache.hadoop.io.SequenceFile} of {@link Vector}s and dump * out the results using {@link Vector#asFormatString()} to either the console or to a * file. */ @@ -76,10 +75,13 @@ public final class VectorDumper extends addOption("printKey", "p", "Print out the key as well, delimited by tab (or the value if useKey is true", false); addOption("dictionary", "d", "The dictionary file.", false); addOption("dictionaryType", "dt", "The dictionary file type (text|seqfile)", false); - addOption("csv", "c", "Output the Vector as CSV. Otherwise it substitutes in the terms for vector cell entries", false); - addOption("namesAsComments", "n", "If using CSV output, optionally add a comment line for each NamedVector (if the vector is one) printing out the name", false); + addOption("csv", "c", "Output the Vector as CSV. Otherwise it substitutes in the terms for vector cell entries", + false); + addOption("namesAsComments", "n", "If using CSV output, optionally add a comment line for each NamedVector " + + "(if the vector is one) printing out the name", false); addOption("nameOnly", "N", "Use the name as the value for each NamedVector (skip other vectors)", false); - addOption("sortVectors", "sort", "Sort output key/value pairs of the vector entries in abs magnitude descending order", false); + addOption("sortVectors", "sort", "Sort output key/value pairs of the vector entries in abs magnitude " + + "descending order", false); addOption("quiet", "q", "Print only file contents", false); addOption("sizeOnly", "sz", "Dump only the size of the vector", false); addOption("numItems", "ni", "Output at most <n> vecors", false);
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java?rev=1460571&r1=1460570&r2=1460571&view=diff ============================================================================== --- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java (original) +++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java Mon Mar 25 09:50:22 2013 @@ -23,7 +23,6 @@ import com.google.common.collect.Lists; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.apache.lucene.util.PriorityQueue; import org.apache.mahout.common.Pair; @@ -170,7 +169,7 @@ public final class VectorHelper { } /** - * Read a dictionary in {@link SequenceFile} generated by + * Read a dictionary in {@link org.apache.hadoop.io.SequenceFile} generated by * {@link org.apache.mahout.vectorizer.DictionaryVectorizer} * * @param filePattern <PATH TO DICTIONARY>/dictionary.file-* @@ -217,7 +216,7 @@ public final class VectorHelper { return result; } - private static class TDoublePQ<T> extends PriorityQueue<Pair<T, Double>> { + private static final class TDoublePQ<T> extends PriorityQueue<Pair<T, Double>> { private final T sentinel; private TDoublePQ(T sentinel, int size) { @@ -226,8 +225,7 @@ public final class VectorHelper { } @Override - protected boolean lessThan(Pair<T, Double> a, - Pair<T, Double> b) { + protected boolean lessThan(Pair<T, Double> a, Pair<T, Double> b) { return a.getSecond().compareTo(b.getSecond()) < 0; } Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java?rev=1460571&r1=1460570&r2=1460571&view=diff ============================================================================== --- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java (original) +++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java Mon Mar 25 09:50:22 2013 @@ -29,7 +29,8 @@ import org.apache.mahout.math.Vector; /** * Iterates a CSV file and produces {@link org.apache.mahout.math.Vector}. * <br/> - * The Iterator returned throws {@link UnsupportedOperationException} for the {@link java.util.Iterator#remove()} method. + * The Iterator returned throws {@link UnsupportedOperationException} for the {@link java.util.Iterator#remove()} + * method. * <p/> * Assumes DenseVector for now, but in the future may have the option of mapping columns to sparse format * <p/> Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java?rev=1460571&r1=1460570&r2=1460571&view=diff ============================================================================== --- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java (original) +++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java Mon Mar 25 09:50:22 2013 @@ -50,15 +50,16 @@ public abstract class AbstractLuceneIter protected long nextLogRecord = bump.increment(); protected int skippedErrorMessages; - public AbstractLuceneIterator(TermInfo terminfo, double normPower, IndexReader indexReader, Weight weight, double maxPercentErrorDocs, String field) { - this.terminfo = terminfo; - this.normPower = normPower; - this.indexReader = indexReader; - - this.weight = weight; - this.nextDocId = 0; - this.maxErrorDocs = (int) (maxPercentErrorDocs * indexReader.numDocs()); - this.field = field; + public AbstractLuceneIterator(TermInfo terminfo, double normPower, IndexReader indexReader, Weight weight, + double maxPercentErrorDocs, String field) { + this.terminfo = terminfo; + this.normPower = normPower; + this.indexReader = indexReader; + + this.weight = weight; + this.nextDocId = 0; + this.maxErrorDocs = (int) (maxPercentErrorDocs * indexReader.numDocs()); + this.field = field; } /** @@ -93,7 +94,8 @@ public abstract class AbstractLuceneIter numErrorDocs++; if (numErrorDocs >= maxErrorDocs) { log.error("There are too many documents that do not have a term vector for {}", field); - throw new IllegalStateException("There are too many documents that do not have a term vector for " + field); + throw new IllegalStateException("There are too many documents that do not have a term vector for " + + field); } if (numErrorDocs >= nextLogRecord) { if (skippedErrorMessages == 0) { Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java?rev=1460571&r1=1460570&r2=1460571&view=diff ============================================================================== --- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java (original) +++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java Mon Mar 25 09:50:22 2013 @@ -198,7 +198,8 @@ public class ClusterLabels { DocsEnum docsEnum = MultiFields.getTermDocsEnum(reader, null, contentField, term); int docID; while ((docID = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { - if (liveDocs != null && !liveDocs.get(docID)) { //check to see if we don't have an deletions (null) or if document is live + //check to see if we don't have an deletions (null) or if document is live + if (liveDocs != null && !liveDocs.get(docID)) { // document is deleted... termBitset.set(docsEnum.docID()); } @@ -243,9 +244,9 @@ public class ClusterLabels { OpenBitSet bitset = new OpenBitSet(numDocs); - Set<String> idFieldSelector= null; - if(idField !=null){ - idFieldSelector= new TreeSet<String>(); + Set<String> idFieldSelector = null; + if (idField != null) { + idFieldSelector = new TreeSet<String>(); idFieldSelector.add(idField); } Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java?rev=1460571&r1=1460570&r2=1460571&view=diff ============================================================================== --- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java (original) +++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java Mon Mar 25 09:50:22 2013 @@ -98,7 +98,8 @@ public final class Driver { LuceneIterable iterable; if (norm == LuceneIterable.NO_NORMALIZING) { - iterable = new LuceneIterable(reader, idField, field, termInfo,weight, LuceneIterable.NO_NORMALIZING, maxPercentErrorDocs); + iterable = new LuceneIterable(reader, idField, field, termInfo,weight, LuceneIterable.NO_NORMALIZING, + maxPercentErrorDocs); } else { iterable = new LuceneIterable(reader, idField, field, termInfo,weight, norm, maxPercentErrorDocs); } @@ -181,8 +182,8 @@ public final class Driver { Option maxPercentErrorDocsOpt = obuilder.withLongName("maxPercentErrorDocs").withRequired(false).withArgument( abuilder.withName("maxPercentErrorDocs").withMinimum(1).withMaximum(1).create()).withDescription( "The max percentage of docs that can have a null term vector. These are noise document and can occur if the " - + "analyzer used strips out all terms in the target field. This percentage is expressed as a value between 0 and 1. " + - "The default is 0.").withShortName("err").create(); + + "analyzer used strips out all terms in the target field. This percentage is expressed as a value " + + "between 0 and 1. The default is 0.").withShortName("err").create(); Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h") .create(); Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java?rev=1460571&r1=1460570&r2=1460571&view=diff ============================================================================== --- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java (original) +++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java Mon Mar 25 09:50:22 2013 @@ -43,7 +43,8 @@ public final class LuceneIterable implem this(reader, idField, field, terminfo, weight, NO_NORMALIZING); } - public LuceneIterable(IndexReader indexReader, String idField, String field, TermInfo terminfo, Weight weight, double normPower) { + public LuceneIterable(IndexReader indexReader, String idField, String field, TermInfo terminfo, Weight weight, + double normPower) { this(indexReader, idField, field, terminfo, weight, normPower, 0); } Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java?rev=1460571&r1=1460570&r2=1460571&view=diff ============================================================================== --- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java (original) +++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java Mon Mar 25 09:50:22 2013 @@ -19,18 +19,17 @@ package org.apache.mahout.utils.vectors. import com.google.common.base.Preconditions; import org.apache.lucene.index.IndexReader; -import org.apache.mahout.math.Vector; import org.apache.mahout.utils.vectors.TermInfo; import org.apache.mahout.vectorizer.Weight; import java.io.IOException; -import java.util.Iterator; import java.util.Set; import java.util.TreeSet; /** - * An {@link Iterator} over {@link Vector}s that uses a Lucene index as the source for creating the - * {@link Vector}s. The field used to create the vectors currently must have term vectors stored for it. + * An {@link java.util.Iterator} over {@link org.apache.mahout.math.Vector}s that uses a Lucene index as the source + * for creating the {@link org.apache.mahout.math.Vector}s. The field used to create the vectors currently must have + * term vectors stored for it. */ public class LuceneIterator extends AbstractLuceneIterator { protected final Set<String> idFieldSelector; @@ -63,7 +62,8 @@ public class LuceneIterator extends Abst * @param weight weight * @param normPower the normalization value. Must be nonnegative, or {@link LuceneIterable#NO_NORMALIZING} * @param maxPercentErrorDocs most documents that will be tolerated without a term freq vector. In [0,1]. - * @see #LuceneIterator(org.apache.lucene.index.IndexReader, String, String, org.apache.mahout.utils.vectors.TermInfo, org.apache.mahout.vectorizer.Weight, double) + * @see #LuceneIterator(org.apache.lucene.index.IndexReader, String, String, org.apache.mahout.utils.vectors.TermInfo, + * org.apache.mahout.vectorizer.Weight, double) */ public LuceneIterator(IndexReader indexReader, String idField,
