Modified: mahout/trunk/examples/src/test/java/org/apache/mahout/classifier/bayes/SplitBayesInputTest.java URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/test/java/org/apache/mahout/classifier/bayes/SplitBayesInputTest.java?rev=1090865&r1=1090864&r2=1090865&view=diff ============================================================================== --- mahout/trunk/examples/src/test/java/org/apache/mahout/classifier/bayes/SplitBayesInputTest.java (original) +++ mahout/trunk/examples/src/test/java/org/apache/mahout/classifier/bayes/SplitBayesInputTest.java Sun Apr 10 18:30:05 2011 @@ -23,6 +23,7 @@ import java.io.OutputStreamWriter; import java.io.Writer; import java.nio.charset.Charset; +import com.google.common.base.Charsets; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -54,7 +55,7 @@ public final class SplitBayesInputTest e countMap = new OpenObjectIntHashMap<String>(); - charset = Charset.forName("UTF-8"); + charset = Charsets.UTF_8; tempInputFile = getTestTempFilePath("bayesinputfile"); tempTrainingDirectory = getTestTempDirPath("bayestrain"); tempTestDirectory = getTestTempDirPath("bayestest"); @@ -79,7 +80,7 @@ public final class SplitBayesInputTest e writer = new BufferedWriter( new OutputStreamWriter( - fs.create(new Path(tempInputDirectory, currentLabel)), Charset.forName("UTF-8"))); + fs.create(new Path(tempInputDirectory, currentLabel)), Charsets.UTF_8)); } countMap.adjustOrPutValue(currentLabel, 1, 1); writer.write(currentLabel + '\t' + entry[1] + '\n'); @@ -89,7 +90,7 @@ public final class SplitBayesInputTest e private void writeSingleInputFile() throws IOException { BufferedWriter writer = new BufferedWriter( - new OutputStreamWriter(fs.create(tempInputFile), Charset.forName("UTF-8"))); + new OutputStreamWriter(fs.create(tempInputFile), Charsets.UTF_8)); for (String[] entry : ClassifierData.DATA) { writer.write(entry[0] + '\t' + entry[1] + '\n'); }
Modified: mahout/trunk/math/src/main/java/org/apache/mahout/math/DenseVector.java URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/DenseVector.java?rev=1090865&r1=1090864&r2=1090865&view=diff ============================================================================== --- mahout/trunk/math/src/main/java/org/apache/mahout/math/DenseVector.java (original) +++ mahout/trunk/math/src/main/java/org/apache/mahout/math/DenseVector.java Sun Apr 10 18:30:05 2011 @@ -19,8 +19,8 @@ package org.apache.mahout.math; import java.util.Arrays; import java.util.Iterator; -import java.util.NoSuchElementException; +import com.google.common.collect.AbstractIterator; import org.apache.mahout.math.function.DoubleDoubleFunction; import org.apache.mahout.math.function.PlusMult; @@ -262,45 +262,28 @@ public class DenseVector extends Abstrac } - private final class NonDefaultIterator implements Iterator<Element> { + private final class NonDefaultIterator extends AbstractIterator<Element> { private final DenseElement element = new DenseElement(); private int index = 0; - private NonDefaultIterator() { - goToNext(); - } - - private void goToNext() { + @Override + protected Element computeNext() { while (index < size() && values[index] == 0.0) { index++; } - } - - @Override - public boolean hasNext() { - return index < size(); - } - - @Override - public Element next() { - if (index >= size()) { - throw new NoSuchElementException(); - } else { + if (index < size()) { element.index = index; index++; - goToNext(); return element; + } else { + return endOfData(); } } - @Override - public void remove() { - throw new UnsupportedOperationException(); - } } - private final class AllIterator implements Iterator<Element> { + private final class AllIterator extends AbstractIterator<Element> { private final DenseElement element = new DenseElement(); @@ -309,24 +292,15 @@ public class DenseVector extends Abstrac } @Override - public boolean hasNext() { - return element.index + 1 < size(); - } - - @Override - public Element next() { - if (element.index + 1 >= size()) { - throw new NoSuchElementException(); - } else { + protected Element computeNext() { + if (element.index + 1 < size()) { element.index++; return element; + } else { + return endOfData(); } } - @Override - public void remove() { - throw new UnsupportedOperationException(); - } } private final class DenseElement implements Element { Modified: mahout/trunk/math/src/main/java/org/apache/mahout/math/RandomAccessSparseVector.java URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/RandomAccessSparseVector.java?rev=1090865&r1=1090864&r2=1090865&view=diff ============================================================================== --- mahout/trunk/math/src/main/java/org/apache/mahout/math/RandomAccessSparseVector.java (original) +++ mahout/trunk/math/src/main/java/org/apache/mahout/math/RandomAccessSparseVector.java Sun Apr 10 18:30:05 2011 @@ -18,8 +18,8 @@ package org.apache.mahout.math; import java.util.Iterator; -import java.util.NoSuchElementException; +import com.google.common.collect.AbstractIterator; import org.apache.mahout.math.function.IntDoubleProcedure; import org.apache.mahout.math.list.IntArrayList; import org.apache.mahout.math.map.OpenIntDoubleHashMap; @@ -214,7 +214,7 @@ public class RandomAccessSparseVector ex } } - private final class NonDefaultIterator implements Iterator<Element> { + private final class NonDefaultIterator extends AbstractIterator<Element> { private final RandomAccessElement element = new RandomAccessElement(); private final IntArrayList indices = new IntArrayList(); @@ -225,28 +225,18 @@ public class RandomAccessSparseVector ex } @Override - public boolean hasNext() { - return offset < indices.size(); - } - - @Override - public Element next() { + protected Element computeNext() { if (offset >= indices.size()) { - throw new NoSuchElementException(); - } else { - element.index = indices.get(offset); - offset++; - return element; + return endOfData(); } + element.index = indices.get(offset); + offset++; + return element; } - @Override - public void remove() { - throw new UnsupportedOperationException(); - } } - private final class AllIterator implements Iterator<Element> { + private final class AllIterator extends AbstractIterator<Element> { private final RandomAccessElement element = new RandomAccessElement(); @@ -255,24 +245,15 @@ public class RandomAccessSparseVector ex } @Override - public boolean hasNext() { - return element.index + 1 < size(); - } - - @Override - public Element next() { - if (element.index + 1 >= size()) { - throw new NoSuchElementException(); - } else { + protected Element computeNext() { + if (element.index + 1 < size()) { element.index++; return element; + } else { + return endOfData(); } } - @Override - public void remove() { - throw new UnsupportedOperationException(); - } } private final class RandomAccessElement implements Element { Modified: mahout/trunk/math/src/main/java/org/apache/mahout/math/SequentialAccessSparseVector.java URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/SequentialAccessSparseVector.java?rev=1090865&r1=1090864&r2=1090865&view=diff ============================================================================== --- mahout/trunk/math/src/main/java/org/apache/mahout/math/SequentialAccessSparseVector.java (original) +++ mahout/trunk/math/src/main/java/org/apache/mahout/math/SequentialAccessSparseVector.java Sun Apr 10 18:30:05 2011 @@ -17,10 +17,10 @@ package org.apache.mahout.math; +import com.google.common.collect.AbstractIterator; import org.apache.mahout.math.function.Functions; import java.util.Iterator; -import java.util.NoSuchElementException; /** * <p> @@ -249,54 +249,36 @@ public class SequentialAccessSparseVecto } - private final class NonDefaultIterator implements Iterator<Element> { + private final class NonDefaultIterator extends AbstractIterator<Element> { private final NonDefaultElement element = new NonDefaultElement(); @Override - public boolean hasNext() { + protected Element computeNext() { int numMappings = values.getNumMappings(); - return numMappings > 0 && element.getNextOffset() < numMappings; - } - - @Override - public Element next() { - if (!hasNext()) { - throw new NoSuchElementException(); + if (numMappings <= 0 || element.getNextOffset() >= numMappings) { + return endOfData(); } element.advanceOffset(); return element; } - @Override - public void remove() { - throw new UnsupportedOperationException(); - } } - private final class AllIterator implements Iterator<Element> { + private final class AllIterator extends AbstractIterator<Element> { private final AllElement element = new AllElement(); @Override - public boolean hasNext() { + protected Element computeNext() { int numMappings = values.getNumMappings(); - return numMappings > 0 && element.getNextIndex() <= values.getIndices()[numMappings - 1]; - } - - @Override - public Element next() { - if (!hasNext()) { - throw new NoSuchElementException(); + if (numMappings <= 0 || element.getNextIndex() > values.getIndices()[numMappings - 1]) { + return endOfData(); } element.advanceIndex(); return element; } - @Override - public void remove() { - throw new UnsupportedOperationException(); - } } private final class NonDefaultElement implements Element { Modified: mahout/trunk/math/src/main/java/org/apache/mahout/math/VectorView.java URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/VectorView.java?rev=1090865&r1=1090864&r2=1090865&view=diff ============================================================================== --- mahout/trunk/math/src/main/java/org/apache/mahout/math/VectorView.java (original) +++ mahout/trunk/math/src/main/java/org/apache/mahout/math/VectorView.java Sun Apr 10 18:30:05 2011 @@ -18,7 +18,8 @@ package org.apache.mahout.math; import java.util.Iterator; -import java.util.NoSuchElementException; + +import com.google.common.collect.AbstractIterator; /** Implements subset view of a Vector */ public class VectorView extends AbstractVector { @@ -108,127 +109,74 @@ public class VectorView extends Abstract return new AllIterator(); } - public final class NonZeroIterator implements Iterator<Element> { + public final class NonZeroIterator extends AbstractIterator<Element> { private final Iterator<Element> it; - private Element el; - private NonZeroIterator() { it = vector.iterateNonZero(); - buffer(); } - private void buffer() { + @Override + protected Element computeNext() { while (it.hasNext()) { - el = it.next(); + Element el = it.next(); if (isInView(el.index()) && el.get() != 0) { - final Element decorated = vector.getElement(el.index()); - el = new Element() { - @Override - public double get() { - return decorated.get(); - } - - @Override - public int index() { - return decorated.index() - offset; - } - - @Override - public void set(double value) { - decorated.set(value); - } - }; - return; + Element decorated = vector.getElement(el.index()); + return new DecoratorElement(decorated); } } - el = null; // No element was found + return endOfData(); } - @Override - public Element next() { - if (!hasNext()) { - throw new NoSuchElementException(); - } - Element buffer = el; - buffer(); - return buffer; - } - - @Override - public boolean hasNext() { - return el != null; - } - - /** @throws UnsupportedOperationException all the time. method not implemented. */ - @Override - public void remove() { - throw new UnsupportedOperationException(); - } } - public final class AllIterator implements Iterator<Element> { + public final class AllIterator extends AbstractIterator<Element> { private final Iterator<Element> it; - private Element el; - private AllIterator() { it = vector.iterator(); - buffer(); } - private void buffer() { + @Override + protected Element computeNext() { while (it.hasNext()) { - el = it.next(); + Element el = it.next(); if (isInView(el.index())) { - final Element decorated = vector.getElement(el.index()); - el = new Element() { - @Override - public double get() { - return decorated.get(); - } - - @Override - public int index() { - return decorated.index() - offset; - } - - @Override - public void set(double value) { - decorated.set(value); - } - }; - return; + Element decorated = vector.getElement(el.index()); + return new DecoratorElement(decorated); } } - el = null; // No element was found + return endOfData(); // No element was found + } + + } + + private class DecoratorElement implements Element { + + private final Element decorated; + + private DecoratorElement(Element decorated) { + this.decorated = decorated; } @Override - public Element next() { - if (!hasNext()) { - throw new NoSuchElementException(); - } - Element buffer = el; - buffer(); - return buffer; + public double get() { + return decorated.get(); } @Override - public boolean hasNext() { - return el != null; + public int index() { + return decorated.index() - offset; } - /** @throws UnsupportedOperationException all the time. method not implemented. */ @Override - public void remove() { - throw new UnsupportedOperationException(); + public void set(double value) { + decorated.set(value); } } - @Override public double dot(Vector x) { if (size() != x.size()) { Modified: mahout/trunk/math/src/test/java/org/apache/mahout/math/jet/random/NegativeBinomialTest.java URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/test/java/org/apache/mahout/math/jet/random/NegativeBinomialTest.java?rev=1090865&r1=1090864&r2=1090865&view=diff ============================================================================== --- mahout/trunk/math/src/test/java/org/apache/mahout/math/jet/random/NegativeBinomialTest.java (original) +++ mahout/trunk/math/src/test/java/org/apache/mahout/math/jet/random/NegativeBinomialTest.java Sun Apr 10 18:30:05 2011 @@ -17,6 +17,7 @@ package org.apache.mahout.math.jet.random; +import com.google.common.base.Charsets; import com.google.common.base.Splitter; import com.google.common.collect.Iterables; import com.google.common.io.CharStreams; @@ -27,7 +28,6 @@ import org.apache.mahout.math.MahoutTest import org.junit.Test; import java.io.InputStreamReader; -import java.nio.charset.Charset; public final class NegativeBinomialTest extends MahoutTestCase { @@ -37,7 +37,7 @@ public final class NegativeBinomialTest @Test public void testDistributionFunctions() throws Exception { InputSupplier<InputStreamReader> input = - Resources.newReaderSupplier(Resources.getResource("negative-binomial-test-data.csv"), Charset.forName("UTF-8")); + Resources.newReaderSupplier(Resources.getResource("negative-binomial-test-data.csv"), Charsets.UTF_8); boolean header = true; for (String line : CharStreams.readLines(input)) { if (header) { Modified: mahout/trunk/math/src/test/java/org/apache/mahout/math/jet/stat/GammaTest.java URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/test/java/org/apache/mahout/math/jet/stat/GammaTest.java?rev=1090865&r1=1090864&r2=1090865&view=diff ============================================================================== --- mahout/trunk/math/src/test/java/org/apache/mahout/math/jet/stat/GammaTest.java (original) +++ mahout/trunk/math/src/test/java/org/apache/mahout/math/jet/stat/GammaTest.java Sun Apr 10 18:30:05 2011 @@ -17,6 +17,7 @@ package org.apache.mahout.math.jet.stat; +import com.google.common.base.Charsets; import com.google.common.base.Splitter; import com.google.common.collect.Iterables; import com.google.common.io.CharStreams; @@ -28,7 +29,6 @@ import org.junit.Test; import java.io.IOException; import java.io.InputStreamReader; -import java.nio.charset.Charset; import java.util.Random; public final class GammaTest extends MahoutTestCase { @@ -116,7 +116,7 @@ public final class GammaTest extends Mah Splitter onComma = Splitter.on(",").trimResults(); InputSupplier<InputStreamReader> input = - Resources.newReaderSupplier(Resources.getResource("beta-test-data.csv"), Charset.forName("UTF-8")); + Resources.newReaderSupplier(Resources.getResource("beta-test-data.csv"), Charsets.UTF_8); boolean header = true; for (String line : CharStreams.readLines(input)) { if (header) { Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/benchmark/VectorBenchmarks.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/benchmark/VectorBenchmarks.java?rev=1090865&r1=1090864&r2=1090865&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/benchmark/VectorBenchmarks.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/benchmark/VectorBenchmarks.java Sun Apr 10 18:30:05 2011 @@ -46,7 +46,6 @@ import org.apache.hadoop.io.SequenceFile import org.apache.hadoop.io.Writable; import org.apache.mahout.common.CommandLineUtil; import org.apache.mahout.common.RandomUtils; -import org.apache.mahout.common.Summarizable; import org.apache.mahout.common.TimingStatistics; import org.apache.mahout.common.commandline.DefaultOptionCreator; import org.apache.mahout.common.distance.CosineDistanceMeasure; @@ -65,7 +64,7 @@ import org.apache.mahout.math.VectorWrit import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class VectorBenchmarks implements Summarizable { +public class VectorBenchmarks { private static final Logger log = LoggerFactory.getLogger(VectorBenchmarks.class); private static final Pattern TAB_PATTERN = Pattern.compile("\t"); @@ -784,7 +783,7 @@ public class VectorBenchmarks implements mark.closestCentroidBenchmark(new ManhattanDistanceMeasure()); mark.closestCentroidBenchmark(new TanimotoDistanceMeasure()); - log.info("\n{}", mark.summarize()); + log.info("\n{}", mark); } catch (OptionException e) { CommandLineUtil.printHelp(group); } @@ -792,7 +791,7 @@ public class VectorBenchmarks implements } @Override - public String summarize() { + public String toString() { int pad = 24; StringBuilder sb = new StringBuilder(1000); sb.append(StringUtils.rightPad("BenchMarks", pad)); Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java?rev=1090865&r1=1090864&r2=1090865&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java Sun Apr 10 18:30:05 2011 @@ -22,7 +22,6 @@ import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import java.io.Writer; -import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; @@ -31,6 +30,7 @@ import java.util.List; import java.util.PriorityQueue; import java.util.Queue; +import com.google.common.base.Charsets; import org.apache.commons.cli2.CommandLine; import org.apache.commons.cli2.Group; import org.apache.commons.cli2.Option; @@ -229,7 +229,7 @@ public final class LDAPrintTopics { for (int i = 0; i < topWords.size(); ++i) { List<String> topK = topWords.get(i); Writer writer = new OutputStreamWriter( - new FileOutputStream(new File(output, "topic-" + i)), Charset.forName("UTF-8")); + new FileOutputStream(new File(output, "topic-" + i)), Charsets.UTF_8); try { writer.write("Topic " + i + '\n'); writer.write("===========\n"); Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java?rev=1090865&r1=1090864&r2=1090865&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java Sun Apr 10 18:30:05 2011 @@ -21,8 +21,8 @@ import java.io.File; import java.io.FileOutputStream; import java.io.OutputStreamWriter; import java.io.Writer; -import java.nio.charset.Charset; +import com.google.common.base.Charsets; import org.apache.commons.cli2.CommandLine; import org.apache.commons.cli2.Group; import org.apache.commons.cli2.Option; @@ -86,7 +86,7 @@ public final class SequenceFileDumper { Writer writer; if (cmdLine.hasOption(outputOpt)) { writer = new OutputStreamWriter( - new FileOutputStream(new File(cmdLine.getValue(outputOpt).toString())), Charset.forName("UTF-8")); + new FileOutputStream(new File(cmdLine.getValue(outputOpt).toString())), Charsets.UTF_8); } else { writer = new OutputStreamWriter(System.out); } Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java?rev=1090865&r1=1090864&r2=1090865&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java Sun Apr 10 18:30:05 2011 @@ -22,7 +22,6 @@ import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import java.io.Writer; -import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; @@ -33,6 +32,7 @@ import java.util.List; import java.util.Map; import java.util.TreeMap; +import com.google.common.base.Charsets; import org.apache.commons.lang.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; @@ -139,7 +139,7 @@ public final class ClusterDumper extends if (this.outputFile == null) { writer = new OutputStreamWriter(System.out); } else { - writer = new OutputStreamWriter(new FileOutputStream(new File(this.outputFile)), Charset.forName("UTF-8")); + writer = new OutputStreamWriter(new FileOutputStream(new File(this.outputFile)), Charsets.UTF_8); } try { for (Cluster value : Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java?rev=1090865&r1=1090864&r2=1090865&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java Sun Apr 10 18:30:05 2011 @@ -17,6 +17,7 @@ package org.apache.mahout.utils.vectors; +import com.google.common.base.Charsets; import org.apache.commons.cli2.CommandLine; import org.apache.commons.cli2.Group; import org.apache.commons.cli2.Option; @@ -42,7 +43,6 @@ import java.io.File; import java.io.FileOutputStream; import java.io.OutputStreamWriter; import java.io.Writer; -import java.nio.charset.Charset; /** * Can read in a {@link SequenceFile} of {@link Vector}s and dump @@ -132,7 +132,7 @@ public final class VectorDumper { Writer writer; if (cmdLine.hasOption(outputOpt)) { writer = new OutputStreamWriter( - new FileOutputStream(new File(cmdLine.getValue(outputOpt).toString())), Charset.forName("UTF-8")); + new FileOutputStream(new File(cmdLine.getValue(outputOpt).toString())), Charsets.UTF_8); } else { writer = new OutputStreamWriter(System.out); } Added: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java?rev=1090865&view=auto ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java (added) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java Sun Apr 10 18:30:05 2011 @@ -0,0 +1,81 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.utils.vectors.arff; + +import java.io.BufferedReader; +import java.io.IOException; +import java.util.regex.Pattern; + +import com.google.common.collect.AbstractIterator; +import org.apache.mahout.common.IOUtils; +import org.apache.mahout.math.DenseVector; +import org.apache.mahout.math.RandomAccessSparseVector; +import org.apache.mahout.math.Vector; + +final class ARFFIterator extends AbstractIterator<Vector> { + + private static final Pattern COMMA_PATTERN = Pattern.compile(","); + private static final Pattern SPACE_PATTERN = Pattern.compile(" "); + + private final BufferedReader reader; + private final ARFFModel model; + + ARFFIterator(BufferedReader reader, ARFFModel model) { + this.reader = reader; + this.model = model; + } + + @Override + protected Vector computeNext() { + String line; + try { + while ((line = reader.readLine()) != null) { + line = line.trim(); + if (line.length() > 0 && !line.startsWith(ARFFModel.ARFF_COMMENT)) { + break; + } + } + } catch (IOException ioe) { + throw new IllegalStateException(ioe); + } + if (line == null) { + IOUtils.quietClose(reader); + return endOfData(); + } + Vector result; + if (line.startsWith(ARFFModel.ARFF_SPARSE)) { + line = line.substring(1, line.length() - 1); + String[] splits = COMMA_PATTERN.split(line); + result = new RandomAccessSparseVector(model.getLabelSize()); + for (String split : splits) { + String[] data = SPACE_PATTERN.split(split); // first is index, second is + int idx = Integer.parseInt(data[0]); + result.setQuick(idx, model.getValue(data[1], idx)); + } + } else { + result = new DenseVector(model.getLabelSize()); + String[] splits = COMMA_PATTERN.split(line); + for (int i = 0; i < splits.length; i++) { + result.setQuick(i, model.getValue(splits[i], i)); + } + } + //result.setLabelBindings(labelBindings); + return result; + } + +} Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java?rev=1090865&r1=1090864&r2=1090865&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java Sun Apr 10 18:30:05 2011 @@ -29,12 +29,9 @@ import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.Iterator; import java.util.Locale; -import java.util.NoSuchElementException; import java.util.regex.Pattern; -import org.apache.mahout.common.IOUtils; -import org.apache.mahout.math.DenseVector; -import org.apache.mahout.math.RandomAccessSparseVector; +import com.google.common.base.Charsets; import org.apache.mahout.math.Vector; /** @@ -55,12 +52,12 @@ public class ARFFVectorIterable implemen private static final Pattern COMMA_PATTERN = Pattern.compile(","); private static final Pattern SPACE_PATTERN = Pattern.compile(" "); - + private final BufferedReader buff; private final ARFFModel model; public ARFFVectorIterable(File file, ARFFModel model) throws IOException { - this(file, Charset.forName("UTF-8"), model); + this(file, Charsets.UTF_8, model); } public ARFFVectorIterable(File file, Charset encoding, ARFFModel model) throws IOException { @@ -82,7 +79,6 @@ public class ARFFVectorIterable implemen int labelNumber = 0; String line; - //boolean inData = false; // TODO not used? while ((line = buff.readLine()) != null) { line = line.trim(); String lower = line.toLowerCase(Locale.ENGLISH); @@ -145,72 +141,9 @@ public class ARFFVectorIterable implemen @Override public Iterator<Vector> iterator() { - return new ARFFIterator(); + return new ARFFIterator(buff, model); } - - private final class ARFFIterator implements Iterator<Vector> { - - private String line; - - private ARFFIterator() { - goToNext(); - } - - private void goToNext() { - line = null; - try { - while ((line = buff.readLine()) != null) { - line = line.trim(); - if (line.length() > 0 && !line.startsWith(ARFFModel.ARFF_COMMENT)) { - break; - } - } - } catch (IOException e) { - line = null; - } - if (line == null) { - IOUtils.quietClose(buff); - } - } - - @Override - public boolean hasNext() { - return line != null; - } - - @Override - public Vector next() { - if (line == null) { - throw new NoSuchElementException(); - } - Vector result; - if (line.startsWith(ARFFModel.ARFF_SPARSE)) { - line = line.substring(1, line.length() - 1); - String[] splits = ARFFVectorIterable.COMMA_PATTERN.split(line); - result = new RandomAccessSparseVector(model.getLabelSize()); - for (String split : splits) { - String[] data = ARFFVectorIterable.SPACE_PATTERN.split(split); // first is index, second is - int idx = Integer.parseInt(data[0]); - result.setQuick(idx, model.getValue(data[1], idx)); - } - } else { - result = new DenseVector(model.getLabelSize()); - String[] splits = ARFFVectorIterable.COMMA_PATTERN.split(line); - for (int i = 0; i < splits.length; i++) { - result.setQuick(i, model.getValue(splits[i], i)); - } - } - //result.setLabelBindings(labelBindings); - goToNext(); - return result; - } - - @Override - public void remove() { - throw new UnsupportedOperationException("remove not supported"); - } - } - + /** * Returns info about the ARFF content that was parsed. * @return the model Added: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java?rev=1090865&view=auto ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java (added) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java Sun Apr 10 18:30:05 2011 @@ -0,0 +1,68 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.utils.vectors.csv; + +import java.io.IOException; +import java.io.Reader; + +import com.google.common.collect.AbstractIterator; +import org.apache.commons.csv.CSVParser; +import org.apache.commons.csv.CSVStrategy; +import org.apache.mahout.math.DenseVector; +import org.apache.mahout.math.Vector; + +/** + * Iterates a CSV file and produces {@link org.apache.mahout.math.Vector}. + * <br/> + * The Iterator returned throws {@link UnsupportedOperationException} for the {@link java.util.Iterator#remove()} method. + * <p/> + * Assumes DenseVector for now, but in the future may have the option of mapping columns to sparse format + * <p/> + * The Iterator is not thread-safe. + */ +public class CSVVectorIterator extends AbstractIterator<Vector> { + + private final CSVParser parser; + + public CSVVectorIterator(Reader reader) { + parser = new CSVParser(reader); + } + + public CSVVectorIterator(Reader reader, CSVStrategy strategy) { + parser = new CSVParser(reader, strategy); + } + + @Override + protected Vector computeNext() { + String[] line; + try { + line = parser.getLine(); + } catch (IOException e) { + throw new IllegalStateException(e); + } + if (line == null) { + return endOfData(); + } + Vector result = new DenseVector(line.length); + for (int i = 0; i < line.length; i++) { + result.setQuick(i, Double.parseDouble(line[i])); + } + return result; + } + +} Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java?rev=1090865&r1=1090864&r2=1090865&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java Sun Apr 10 18:30:05 2011 @@ -22,7 +22,6 @@ import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import java.io.Writer; -import java.nio.charset.Charset; import java.util.Collection; import java.util.Collections; import java.util.HashSet; @@ -31,6 +30,7 @@ import java.util.LinkedList; import java.util.List; import java.util.Map; +import com.google.common.base.Charsets; import org.apache.commons.cli2.CommandLine; import org.apache.commons.cli2.Group; import org.apache.commons.cli2.Option; @@ -105,7 +105,7 @@ public class ClusterLabels { if (this.output == null) { writer = new OutputStreamWriter(System.out); } else { - writer = new OutputStreamWriter(new FileOutputStream(new File(this.output)), Charset.forName("UTF-8")); + writer = new OutputStreamWriter(new FileOutputStream(new File(this.output)), Charsets.UTF_8); } try { for (Map.Entry<Integer, List<WeightedVectorWritable>> integerListEntry : clusterIdToPoints.entrySet()) { Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java?rev=1090865&r1=1090864&r2=1090865&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java Sun Apr 10 18:30:05 2011 @@ -20,9 +20,9 @@ package org.apache.mahout.utils.vectors. import java.io.IOException; import java.util.Collections; import java.util.Iterator; -import java.util.NoSuchElementException; import com.google.common.base.Preconditions; +import com.google.common.collect.AbstractIterator; import org.apache.lucene.document.FieldSelector; import org.apache.lucene.document.SetBasedFieldSelector; import org.apache.lucene.index.IndexReader; @@ -35,7 +35,7 @@ import org.apache.mahout.math.Vector; * An {@link Iterator} over {@link Vector}s that uses a Lucene index as the source for creating the * {@link Vector}s. The field used to create the vectors currently must have term vectors stored for it. */ -public final class LuceneIterator implements Iterator<Vector> { +public final class LuceneIterator extends AbstractIterator<Vector> { private final IndexReader indexReader; private final String field; @@ -44,13 +44,11 @@ public final class LuceneIterator implem private final VectorMapper mapper; private final double normPower; private final TermDocs termDocs; - private Vector current; - private boolean available; /** * Produce a LuceneIterable that can create the Vector plus normalize it. * - * @param indexReader {@link org.apache.lucene.index.IndexReader} to read the documents from. + * @param indexReader {@link IndexReader} to read the documents from. * @param idField field containing the id. May be null. * @param field field to use for the Vector * @param mapper {@link VectorMapper} for creating {@link Vector}s from Lucene's TermVectors. @@ -72,71 +70,44 @@ public final class LuceneIterator implem this.normPower = normPower; // term docs(null) is a better way of iterating all the docs in Lucene this.termDocs = indexReader.termDocs(null); - current = null; - available = false; - } - - private void readVector() throws IOException { - available = termDocs.next(); - if (!available) { - current = null; - return; - } - int doc = termDocs.doc(); - TermFreqVector termFreqVector = indexReader.getTermFreqVector(doc, field); - if (termFreqVector == null) { - throw new IllegalStateException("Field '" + field + "' does not have term vectors"); - } - - indexReader.getTermFreqVector(doc, field, mapper); - mapper.setDocumentNumber(doc); - Vector result = mapper.getVector(); - if (result == null) { - // TODO is this right? last version would produce null in the iteration in this case, though it - // seems like that may not be desirable - current = null; - return; - } - String name; - if (idField != null) { - name = indexReader.document(doc, idFieldSelector).get(idField); - } else { - name = String.valueOf(doc); - } - if (normPower == LuceneIterable.NO_NORMALIZING) { - result = new NamedVector(result, name); - } else { - result = new NamedVector(result.normalize(normPower), name); - } - current = result; } @Override - public boolean hasNext() { - if (!available) { - try { - readVector(); - } catch (IOException e) { - throw new IllegalStateException(e); + protected Vector computeNext() { + try { + if (!termDocs.next()) { + return endOfData(); } - } - return available; - } - @Override - public Vector next() { - if (!hasNext()) { - throw new NoSuchElementException(); - } - Vector next = current; - current = null; - available = false; - return next; - } + int doc = termDocs.doc(); + TermFreqVector termFreqVector = indexReader.getTermFreqVector(doc, field); + if (termFreqVector == null) { + throw new IllegalStateException("Field '" + field + "' does not have term vectors"); + } - @Override - public void remove() { - throw new UnsupportedOperationException(); + indexReader.getTermFreqVector(doc, field, mapper); + mapper.setDocumentNumber(doc); + Vector result = mapper.getVector(); + if (result == null) { + // TODO is this right? last version would produce null in the iteration in this case, though it + // seems like that may not be desirable + return null; + } + String name; + if (idField != null) { + name = indexReader.document(doc, idFieldSelector).get(idField); + } else { + name = String.valueOf(doc); + } + if (normPower == LuceneIterable.NO_NORMALIZING) { + result = new NamedVector(result, name); + } else { + result = new NamedVector(result.normalize(normPower), name); + } + return result; + } catch (IOException ioe) { + throw new IllegalStateException(ioe); + } } } Modified: mahout/trunk/utils/src/test/java/org/apache/mahout/text/TestSequenceFilesFromDirectory.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/text/TestSequenceFilesFromDirectory.java?rev=1090865&r1=1090864&r2=1090865&view=diff ============================================================================== --- mahout/trunk/utils/src/test/java/org/apache/mahout/text/TestSequenceFilesFromDirectory.java (original) +++ mahout/trunk/utils/src/test/java/org/apache/mahout/text/TestSequenceFilesFromDirectory.java Sun Apr 10 18:30:05 2011 @@ -19,11 +19,10 @@ package org.apache.mahout.text; import java.io.IOException; import java.io.OutputStreamWriter; -import java.nio.charset.Charset; import java.util.HashMap; -import java.util.Locale; import java.util.Map; +import com.google.common.base.Charsets; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; @@ -37,7 +36,6 @@ import org.junit.Test; public final class TestSequenceFilesFromDirectory extends MahoutTestCase { - private static final Charset UTF8 = Charset.forName("UTF-8"); private static final String[][] DATA1 = { {"test1", "This is the first text."}, {"test2", "This is the second text."}, @@ -71,7 +69,7 @@ public final class TestSequenceFilesFrom SequenceFilesFromDirectory.main(new String[] {"--input", inputDir.toString(), "--output", outputDir.toString(), "--chunkSize", "64", "--charset", - UTF8.displayName(Locale.ENGLISH), "--keyPrefix", prefix}); + Charsets.UTF_8.name(), "--keyPrefix", prefix}); // check output chunk files checkChunkFiles(conf, outputDir, DATA1, prefix, ParserType.TEXT); @@ -100,7 +98,7 @@ public final class TestSequenceFilesFrom int keyColumn = 0; int valueColumn = 1; SequenceFilesFromCsvFilter.main(new String[] {"--input", inputDir.toString(), - "--output", outputDir.toString(), "--charset", UTF8.name(), + "--output", outputDir.toString(), "--charset", Charsets.UTF_8.name(), "--chunkSize", Integer.toString(chunkSizeInMB), "--keyPrefix", prefix, "--keyColumn", Integer.toString(keyColumn), "--valueColumn", Integer.toString(valueColumn)}); @@ -112,7 +110,7 @@ public final class TestSequenceFilesFrom private static void createFilesFromArrays(Configuration conf, Path inputDir, String[][] data) throws IOException { FileSystem fs = FileSystem.get(conf); for (String[] aData : data) { - OutputStreamWriter osw = new OutputStreamWriter(fs.create(new Path(inputDir, aData[0])), UTF8); + OutputStreamWriter osw = new OutputStreamWriter(fs.create(new Path(inputDir, aData[0])), Charsets.UTF_8); osw.write(aData[1]); osw.close(); } Modified: mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/RandomVectorIterable.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/RandomVectorIterable.java?rev=1090865&r1=1090864&r2=1090865&view=diff ============================================================================== --- mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/RandomVectorIterable.java (original) +++ mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/RandomVectorIterable.java Sun Apr 10 18:30:05 2011 @@ -21,7 +21,10 @@ import java.util.Iterator; import java.util.NoSuchElementException; import java.util.Random; +import com.google.common.base.Function; +import com.google.common.collect.Iterators; import org.apache.mahout.common.RandomUtils; +import org.apache.mahout.common.iterator.CountingIterator; import org.apache.mahout.math.DenseVector; import org.apache.mahout.math.RandomAccessSparseVector; import org.apache.mahout.math.Vector; @@ -49,36 +52,23 @@ public final class RandomVectorIterable @Override public Iterator<Vector> iterator() { - return new VectIterator(); - } - - private class VectIterator implements Iterator<Vector>{ - private int count; - private final Random random = RandomUtils.getRandom(); - @Override - public boolean hasNext() { - return count < numItems; - } - - @Override - public Vector next() { - if (!hasNext()) { - throw new NoSuchElementException(); - } - Vector result = type == VectorType.SPARSE ? new RandomAccessSparseVector(numItems) : new DenseVector(numItems); - result.assign(new DoubleFunction(){ - @Override - public double apply(double arg1) { - return random.nextDouble(); - } - }); - count++; - return result; - } - - @Override - public void remove() { - throw new UnsupportedOperationException(); - } + return Iterators.transform( + new CountingIterator(numItems), + new Function<Integer, Vector>() { + private final Random random = RandomUtils.getRandom(); + @Override + public Vector apply(Integer dummy) { + Vector result = + type == VectorType.SPARSE ? new RandomAccessSparseVector(numItems) : new DenseVector(numItems); + result.assign(new DoubleFunction(){ + @Override + public double apply(double ignored) { + return random.nextDouble(); + } + }); + return result; + } + }); } + } Copied: mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIteratorTest.java (from r1090549, mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java) URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIteratorTest.java?p2=mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIteratorTest.java&p1=mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java&r1=1090549&r2=1090865&rev=1090865&view=diff ============================================================================== --- mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java (original) +++ mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIteratorTest.java Sun Apr 10 18:30:05 2011 @@ -1,4 +1,3 @@ -package org.apache.mahout.utils.vectors.csv; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -16,6 +15,8 @@ package org.apache.mahout.utils.vectors. * limitations under the License. */ +package org.apache.mahout.utils.vectors.csv; + import org.apache.mahout.math.Vector; import org.apache.mahout.utils.MahoutTestCase; import org.apache.mahout.utils.vectors.RandomVectorIterable; @@ -26,8 +27,9 @@ import org.junit.Test; import java.io.IOException; import java.io.StringReader; import java.io.StringWriter; +import java.util.Iterator; -public class CSVVectorIterableTest extends MahoutTestCase { +public class CSVVectorIteratorTest extends MahoutTestCase { @Test public void testCount() throws Exception { @@ -43,10 +45,10 @@ public class CSVVectorIterableTest exten Iterable<Vector> iter = new RandomVectorIterable(50); jwvw.write(iter); jwvw.close(); - Iterable<Vector> csvIter = new CSVVectorIterable(new StringReader(sWriter.getBuffer().toString())); + Iterator<Vector> csvIter = new CSVVectorIterator(new StringReader(sWriter.getBuffer().toString())); int count = 0; - for (Vector vector : csvIter) { - //System.out.println("Vec: " + vector); + while (csvIter.hasNext()) { + csvIter.next(); count++; } assertEquals(50, count);
