No problem. It was too late to comment :) I will tackle it in one of the 622 patches :)
On Sat, Mar 26, 2011 at 4:03 AM, Grant Ingersoll <[email protected]> wrote: > Ah, OK. Good to know. Hadn't followed that one. Feel free to change as > appropriate or I can. > > On Mar 25, 2011, at 9:12 PM, Dmitriy Lyubimov wrote: > >> That would be a typicall change i am trying to fix with 622: >> http://svn.apache.org/viewvc/mahout/trunk/utils/pom.xml?view=diff&r1=1085396&r2=1085397&pathrev=1085397 >> >> >> >> On Fri, Mar 25, 2011 at 7:28 AM, <[email protected]> wrote: >>> Author: gsingers >>> Date: Fri Mar 25 14:28:12 2011 >>> New Revision: 1085397 >>> >>> URL: http://svn.apache.org/viewvc?rev=1085397&view=rev >>> Log: >>> MAHOUT-548: add in some CSV support for creating vectors, as well as a few >>> other fixes for working with vectors >>> >>> Added: >>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/ >>> >>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java >>> >>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java >>> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/ >>> >>> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java >>> >>> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java >>> Modified: >>> mahout/trunk/utils/pom.xml >>> >>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java >>> >>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java >>> >>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java >>> >>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java >>> >>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java >>> >>> Modified: mahout/trunk/utils/pom.xml >>> URL: >>> 
http://svn.apache.org/viewvc/mahout/trunk/utils/pom.xml?rev=1085397&r1=1085396&r2=1085397&view=diff >>> ============================================================================== >>> --- mahout/trunk/utils/pom.xml (original) >>> +++ mahout/trunk/utils/pom.xml Fri Mar 25 14:28:12 2011 >>> @@ -142,6 +142,11 @@ >>> <type>test-jar</type> >>> <scope>test</scope> >>> </dependency> >>> + <dependency> >>> + <groupId>org.apache.solr</groupId> >>> + <artifactId>solr-commons-csv</artifactId> >>> + <version>1.4.1</version> >>> + </dependency> >>> >>> <dependency> >>> <groupId>junit</groupId> >>> >>> Modified: >>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java >>> URL: >>> http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java?rev=1085397&r1=1085396&r2=1085397&view=diff >>> ============================================================================== >>> --- >>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java >>> (original) >>> +++ >>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java >>> Fri Mar 25 14:28:12 2011 >>> @@ -77,16 +77,22 @@ public final class VectorDumper { >>> Option dictTypeOpt = >>> obuilder.withLongName("dictionaryType").withRequired(false).withArgument( >>> >>> abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1).create()).withDescription( >>> "The dictionary file type >>> (text|sequencefile)").withShortName("dt").create(); >>> - Option centroidJSonOpt = >>> obuilder.withLongName("json").withRequired(false).withDescription( >>> - "Output the centroid as JSON. Otherwise it substitutes in the >>> terms for vector cell entries") >>> + Option jsonOpt = >>> obuilder.withLongName("json").withRequired(false).withDescription( >>> + "Output the Vector as JSON. 
Otherwise it substitutes in the >>> terms for vector cell entries") >>> .withShortName("j").create(); >>> + Option csvOpt = >>> obuilder.withLongName("csv").withRequired(false).withDescription( >>> + "Output the Vector as CSV. Otherwise it substitutes in the >>> terms for vector cell entries") >>> + .withShortName("c").create(); >>> + Option namesAsCommentsOpt = >>> obuilder.withLongName("namesAsComments").withRequired(false).withDescription( >>> + "If using CSV output, optionally add a comment line for each >>> NamedVector (if the vector is one) printing out the name") >>> + .withShortName("n").create(); >>> Option sizeOpt = obuilder.withLongName("sizeOnly").withRequired(false). >>> withDescription("Dump only the size of the >>> vector").withShortName("sz").create(); >>> Option helpOpt = obuilder.withLongName("help").withDescription("Print >>> out help").withShortName("h") >>> .create(); >>> >>> Group group = >>> gbuilder.withName("Options").withOption(seqOpt).withOption(outputOpt).withOption( >>> - >>> dictTypeOpt).withOption(dictOpt).withOption(centroidJSonOpt).withOption(vectorAsKeyOpt).withOption( >>> + >>> dictTypeOpt).withOption(dictOpt).withOption(csvOpt).withOption(vectorAsKeyOpt).withOption( >>> printKeyOpt).withOption(sizeOpt).create(); >>> >>> try { >>> @@ -122,10 +128,12 @@ public final class VectorDumper { >>> throw new OptionException(dictTypeOpt); >>> } >>> } >>> - boolean useJSON = cmdLine.hasOption(centroidJSonOpt); >>> + boolean useJSON = cmdLine.hasOption(jsonOpt); >>> + boolean useCSV = cmdLine.hasOption(csvOpt); >>> + >>> boolean sizeOnly = cmdLine.hasOption(sizeOpt); >>> SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, >>> conf); >>> - >>> + boolean namesAsComments = cmdLine.hasOption(namesAsCommentsOpt); >>> Writable keyWritable = >>> reader.getKeyClass().asSubclass(Writable.class).newInstance(); >>> Writable valueWritable = >>> reader.getValueClass().asSubclass(Writable.class).newInstance(); >>> boolean transposeKeyValue = 
cmdLine.hasOption(vectorAsKeyOpt); >>> @@ -140,6 +148,16 @@ public final class VectorDumper { >>> try { >>> boolean printKey = cmdLine.hasOption(printKeyOpt); >>> long i = 0; >>> + if (useCSV && dictionary != null){ >>> + writer.write("#"); >>> + for (int j = 0; j < dictionary.length; j++) { >>> + writer.write(dictionary[j]); >>> + if (j < dictionary.length - 1){ >>> + writer.write(','); >>> + } >>> + } >>> + writer.write('\n'); >>> + } >>> while (reader.next(keyWritable, valueWritable)) { >>> if (printKey) { >>> Writable notTheVectorWritable = transposeKeyValue ? >>> valueWritable : keyWritable; >>> @@ -159,7 +177,14 @@ public final class VectorDumper { >>> writer.write(String.valueOf(vector.size())); >>> writer.write('\n'); >>> } else { >>> - String fmtStr = useJSON ? vector.asFormatString() : >>> VectorHelper.vectorToString(vector, dictionary); >>> + String fmtStr; >>> + if (useJSON){ >>> + fmtStr = VectorHelper.vectorToJSONString(vector, >>> dictionary); >>> + } else if (useCSV){ >>> + fmtStr = VectorHelper.vectorToCSVString(vector, >>> namesAsComments); >>> + } else { >>> + fmtStr = vector.asFormatString(); >>> + } >>> writer.write(fmtStr); >>> writer.write('\n'); >>> } >>> >>> Modified: >>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java >>> URL: >>> http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java?rev=1085397&r1=1085396&r2=1085397&view=diff >>> ============================================================================== >>> --- >>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java >>> (original) >>> +++ >>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java >>> Fri Mar 25 14:28:12 2011 >>> @@ -40,14 +40,45 @@ import org.apache.mahout.math.map.OpenOb >>> public final class VectorHelper { >>> >>> private static final Pattern TAB_PATTERN = Pattern.compile("\t"); >>> + >>> >>> private VectorHelper() 
{ } >>> - >>> + >>> + public static String vectorToCSVString(Vector vector, boolean >>> namesAsComments){ >>> + StringBuilder bldr = new StringBuilder(2048); >>> + try { >>> + vectorToCSVString(vector, namesAsComments, bldr); >>> + } catch (IOException e) { >>> + throw new RuntimeException(e); >>> + } >>> + return bldr.toString(); >>> + } >>> + >>> + public static void vectorToCSVString(Vector vector, boolean >>> namesAsComments, >>> + Appendable bldr) throws IOException >>> { >>> + if (namesAsComments && vector instanceof NamedVector){ >>> + >>> bldr.append("#").append(((NamedVector)vector).getName()).append('\n'); >>> + } >>> + Iterator<Vector.Element> iter = vector.iterator(); >>> + boolean first = true; >>> + while (iter.hasNext()) { >>> + if (first) { >>> + first = false; >>> + } else { >>> + bldr.append(","); >>> + } >>> + Vector.Element elt = iter.next(); >>> + bldr.append(String.valueOf(elt.get())); >>> + } >>> + bldr.append('\n'); >>> + } >>> + >>> + >>> /** >>> * @return a String from a vector that fills in the values with the >>> appropriate value from a dictionary where >>> * each the ith entry is the term for the ith vector cell. >>> */ >>> - public static String vectorToString(Vector vector, String[] dictionary) { >>> + public static String vectorToJSONString(Vector vector, String[] >>> dictionary) { >>> StringBuilder bldr = new StringBuilder(2048); >>> >>> if (vector instanceof NamedVector) { >>> @@ -67,12 +98,13 @@ public final class VectorHelper { >>> if (dictionary != null) { >>> bldr.append(dictionary[elt.index()]); >>> } else { >>> - bldr.append(elt.index()); >>> + bldr.append(String.valueOf(elt.index())); >>> } >>> - bldr.append(':').append(elt.get()); >>> + bldr.append(':').append(String.valueOf(elt.get())); >>> } >>> return bldr.append('}').toString(); >>> } >>> + >>> >>> /** >>> * Read in a dictionary file. 
Format is: >>> >>> Added: >>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java >>> URL: >>> http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java?rev=1085397&view=auto >>> ============================================================================== >>> --- >>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java >>> (added) >>> +++ >>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java >>> Fri Mar 25 14:28:12 2011 >>> @@ -0,0 +1,94 @@ >>> +package org.apache.mahout.utils.vectors.csv; >>> +/** >>> + * Licensed to the Apache Software Foundation (ASF) under one or more >>> + * contributor license agreements. See the NOTICE file distributed with >>> + * this work for additional information regarding copyright ownership. >>> + * The ASF licenses this file to You under the Apache License, Version 2.0 >>> + * (the "License"); you may not use this file except in compliance with >>> + * the License. You may obtain a copy of the License at >>> + * >>> + * http://www.apache.org/licenses/LICENSE-2.0 >>> + * >>> + * Unless required by applicable law or agreed to in writing, software >>> + * distributed under the License is distributed on an "AS IS" BASIS, >>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. >>> + * See the License for the specific language governing permissions and >>> + * limitations under the License. >>> + */ >>> + >>> +import org.apache.commons.csv.CSVParser; >>> +import org.apache.commons.csv.CSVStrategy; >>> +import org.apache.mahout.math.DenseVector; >>> +import org.apache.mahout.math.Vector; >>> + >>> +import java.io.BufferedReader; >>> +import java.io.IOException; >>> +import java.io.Reader; >>> +import java.util.Iterator; >>> + >>> + >>> +/** >>> + * Iterates a CSV file and produces {@link org.apache.mahout.math.Vector}. 
>>> + * <br/> >>> + * The Iterator returned throws {@link UnsupportedOperationException} for >>> the {@link java.util.Iterator#remove()} method. >>> + * <p/> >>> + * Assumes DenseVector for now, but in the future may have the option of >>> mapping columns to sparse format >>> + * <p/> >>> + * The Iterator is not thread-safe. >>> + * >>> + * >>> + **/ >>> +public class CSVVectorIterable implements Iterable<Vector> { >>> + protected CSVParser parser; >>> + protected String [] line; >>> + >>> + public CSVVectorIterable(Reader reader) throws IOException { >>> + parser = new CSVParser(reader); >>> + line = parser.getLine(); >>> + } >>> + >>> + public CSVVectorIterable(Reader reader, CSVStrategy strategy) throws >>> IOException { >>> + parser = new CSVParser(reader, strategy); >>> + line = parser.getLine(); >>> + } >>> + >>> + >>> + @Override >>> + public Iterator<Vector> iterator() { >>> + return new CSVIterator(); >>> + } >>> + >>> + private class CSVIterator implements Iterator<Vector>{ >>> + >>> + >>> + public CSVIterator() { >>> + } >>> + >>> + @Override >>> + public boolean hasNext() { >>> + return line != null; >>> + } >>> + >>> + @Override >>> + public Vector next() { >>> + >>> + Vector result = null; >>> + result = new DenseVector(line.length); >>> + for (int i = 0; i < line.length; i++) { >>> + result.setQuick(i, Double.parseDouble(line[i])); >>> + } >>> + //move the line forward >>> + try { >>> + line = parser.getLine(); >>> + } catch (IOException e) { >>> + throw new RuntimeException(e); >>> + } >>> + return result; >>> + } >>> + >>> + @Override >>> + public void remove() { >>> + throw new UnsupportedOperationException(); >>> + } >>> + } >>> +} >>> >>> Modified: >>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java >>> URL: >>> http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java?rev=1085397&r1=1085396&r2=1085397&view=diff >>> 
============================================================================== >>> --- >>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java >>> (original) >>> +++ >>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java >>> Fri Mar 25 14:28:12 2011 >>> @@ -25,8 +25,8 @@ import org.apache.mahout.math.Vector; >>> /** >>> * Write out the vectors to any {@link java.io.Writer} using {@link >>> org.apache.mahout.math.Vector#asFormatString()}. >>> */ >>> -public class JWriterVectorWriter implements VectorWriter { >>> - private final Writer writer; >>> +public class JWriterVectorWriter extends VectorWriter { >>> + protected final Writer writer; >>> >>> public JWriterVectorWriter(Writer writer) { >>> this.writer = writer; >>> @@ -45,14 +45,22 @@ public class JWriterVectorWriter impleme >>> if (result >= maxDocs) { >>> break; >>> } >>> - writer.write(vector.asFormatString()); >>> - writer.write('\n'); >>> - >>> + formatVector(vector); >>> result++; >>> } >>> return result; >>> } >>> - >>> + >>> + protected void formatVector(Vector vector) throws IOException { >>> + writer.write(vector.asFormatString()); >>> + writer.write('\n'); >>> + } >>> + >>> + @Override >>> + public void write(Vector vector) throws IOException { >>> + formatVector(vector); >>> + } >>> + >>> @Override >>> public void close() throws IOException { >>> writer.flush(); >>> >>> Added: >>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java >>> URL: >>> http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java?rev=1085397&view=auto >>> ============================================================================== >>> --- >>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java >>> (added) >>> +++ >>> 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java >>> Fri Mar 25 14:28:12 2011 >>> @@ -0,0 +1,105 @@ >>> +package org.apache.mahout.utils.vectors.io; >>> +/** >>> + * Licensed to the Apache Software Foundation (ASF) under one or more >>> + * contributor license agreements. See the NOTICE file distributed with >>> + * this work for additional information regarding copyright ownership. >>> + * The ASF licenses this file to You under the Apache License, Version 2.0 >>> + * (the "License"); you may not use this file except in compliance with >>> + * the License. You may obtain a copy of the License at >>> + * >>> + * http://www.apache.org/licenses/LICENSE-2.0 >>> + * >>> + * Unless required by applicable law or agreed to in writing, software >>> + * distributed under the License is distributed on an "AS IS" BASIS, >>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. >>> + * See the License for the specific language governing permissions and >>> + * limitations under the License. >>> + */ >>> + >>> +import org.apache.hadoop.conf.Configuration; >>> +import org.apache.hadoop.fs.ContentSummary; >>> +import org.apache.hadoop.fs.FileSystem; >>> +import org.apache.hadoop.fs.Path; >>> +import org.apache.hadoop.io.SequenceFile; >>> +import org.apache.hadoop.io.Writable; >>> +import org.apache.mahout.math.Vector; >>> +import org.apache.mahout.math.VectorWritable; >>> + >>> +import java.io.IOException; >>> +import java.util.Iterator; >>> + >>> + >>> +/** >>> + * Given a Sequence File containing vectors (actually, {@link >>> org.apache.mahout.math.VectorWritable}, iterate over it. 
>>> + * >>> + **/ >>> +public class SequenceFileVectorIterable implements Iterable<Vector>{ >>> + protected SequenceFile.Reader reader; >>> + protected long fileLen; >>> + protected Writable keyWritable; >>> + protected Writable valueWritable; >>> + protected boolean useKey; >>> + >>> + /** >>> + * Construct the Iterable >>> + * @param fs The {@link org.apache.hadoop.fs.FileSystem} containing the >>> {@link org.apache.hadoop.io.SequenceFile} >>> + * @param file The {@link org.apache.hadoop.fs.Path} containing the file >>> + * @param conf The {@link org.apache.hadoop.conf.Configuration} to use >>> + * @param useKey If true, use the key as the {@link >>> org.apache.mahout.math.VectorWritable}, otherwise use the value >>> + * @throws IllegalAccessException >>> + * @throws InstantiationException >>> + * @throws IOException >>> + */ >>> + public SequenceFileVectorIterable(FileSystem fs, Path file, >>> Configuration conf, boolean useKey) throws IllegalAccessException, >>> InstantiationException, IOException { >>> + this.reader = new SequenceFile.Reader(fs, file, conf); >>> + ContentSummary summary = fs.getContentSummary(file); >>> + fileLen = summary.getLength(); >>> + this.useKey = useKey; >>> + keyWritable = >>> reader.getKeyClass().asSubclass(Writable.class).newInstance(); >>> + valueWritable = >>> reader.getValueClass().asSubclass(Writable.class).newInstance(); >>> + } >>> + >>> + /** >>> + * The Iterator returned does not support remove() >>> + * @return The {@link java.util.Iterator} >>> + */ >>> + public Iterator<Vector> iterator() { >>> + return new SFIterator(); >>> + >>> + } >>> + >>> + private final class SFIterator implements Iterator<Vector>{ >>> + @Override >>> + public boolean hasNext() { >>> + //TODO: is this legitimate? 
We can't call next here since it breaks >>> the iterator contract >>> + try { >>> + return reader.getPosition() < fileLen; >>> + } catch (IOException e) { >>> + return false; >>> + } >>> + } >>> + >>> + @Override >>> + public Vector next() { >>> + Vector result = null; >>> + boolean valid = false; >>> + try { >>> + valid = reader.next(keyWritable, valueWritable); >>> + if (valid){ >>> + result = ((VectorWritable) (useKey ? keyWritable : >>> valueWritable)).get(); >>> + } >>> + } catch (IOException e) { >>> + throw new RuntimeException(e); >>> + } >>> + >>> + return result; >>> + } >>> + >>> + /** >>> + * Not supported >>> + */ >>> + public void remove() { >>> + throw new UnsupportedOperationException(); >>> + } >>> + } >>> +} >>> >>> Modified: >>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java >>> URL: >>> http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java?rev=1085397&r1=1085396&r2=1085397&view=diff >>> ============================================================================== >>> --- >>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java >>> (original) >>> +++ >>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java >>> Fri Mar 25 14:28:12 2011 >>> @@ -30,16 +30,16 @@ import org.apache.mahout.math.VectorWrit >>> * >>> * Closes the writer when done >>> */ >>> -public class SequenceFileVectorWriter implements VectorWriter { >>> +public class SequenceFileVectorWriter extends VectorWriter { >>> private final SequenceFile.Writer writer; >>> - >>> + long recNum = 0; >>> public SequenceFileVectorWriter(SequenceFile.Writer writer) { >>> this.writer = writer; >>> } >>> >>> @Override >>> public long write(Iterable<Vector> iterable, long maxDocs) throws >>> IOException { >>> - long recNum = 0; >>> + >>> for (Vector point : iterable) { >>> if (recNum >= 
maxDocs) { >>> break; >>> @@ -51,7 +51,13 @@ public class SequenceFileVectorWriter im >>> } >>> return recNum; >>> } >>> - >>> + >>> + @Override >>> + public void write(Vector vector) throws IOException { >>> + writer.append(new LongWritable(recNum++), new VectorWritable(vector)); >>> + >>> + } >>> + >>> @Override >>> public long write(Iterable<Vector> iterable) throws IOException { >>> return write(iterable, Long.MAX_VALUE); >>> >>> Modified: >>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java >>> URL: >>> http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java?rev=1085397&r1=1085396&r2=1085397&view=diff >>> ============================================================================== >>> --- >>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java >>> (original) >>> +++ >>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java >>> Fri Mar 25 14:28:12 2011 >>> @@ -21,7 +21,7 @@ import java.io.IOException; >>> >>> import org.apache.mahout.math.Vector; >>> >>> -public interface VectorWriter { >>> +public abstract class VectorWriter { >>> /** >>> * Write all values in the Iterable to the output >>> * @param iterable The {@link Iterable} to loop over >>> @@ -29,7 +29,15 @@ public interface VectorWriter { >>> * @throws IOException if there was a problem writing >>> * >>> */ >>> - long write(Iterable<Vector> iterable) throws IOException; >>> + public abstract long write(Iterable<Vector> iterable) throws IOException; >>> + >>> + /** >>> + * Write out a vector >>> + * >>> + * @param vector The {@link org.apache.mahout.math.Vector} to write >>> + * @throws IOException >>> + */ >>> + public abstract void write(Vector vector) throws IOException; >>> >>> /** >>> * Write the first <code>maxDocs</code> to the output. 
>>> @@ -38,12 +46,12 @@ public interface VectorWriter { >>> * @return The number of docs written >>> * @throws IOException if there was a problem writing >>> */ >>> - long write(Iterable<Vector> iterable, long maxDocs) throws IOException; >>> + public abstract long write(Iterable<Vector> iterable, long maxDocs) >>> throws IOException; >>> >>> /** >>> * Close any internally held resources. If external Writers are passed >>> in, the implementation should indicate >>> * whether it also closes them >>> * @throws IOException if there was an issue closing the item >>> */ >>> - void close() throws IOException; >>> + public abstract void close() throws IOException; >>> } >>> >>> Added: >>> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java >>> URL: >>> http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java?rev=1085397&view=auto >>> ============================================================================== >>> --- >>> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java >>> (added) >>> +++ >>> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java >>> Fri Mar 25 14:28:12 2011 >>> @@ -0,0 +1,60 @@ >>> +package org.apache.mahout.utils.vectors.csv; >>> +/** >>> + * Licensed to the Apache Software Foundation (ASF) under one or more >>> + * contributor license agreements. See the NOTICE file distributed with >>> + * this work for additional information regarding copyright ownership. >>> + * The ASF licenses this file to You under the Apache License, Version 2.0 >>> + * (the "License"); you may not use this file except in compliance with >>> + * the License. 
You may obtain a copy of the License at >>> + * >>> + * http://www.apache.org/licenses/LICENSE-2.0 >>> + * >>> + * Unless required by applicable law or agreed to in writing, software >>> + * distributed under the License is distributed on an "AS IS" BASIS, >>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. >>> + * See the License for the specific language governing permissions and >>> + * limitations under the License. >>> + */ >>> + >>> +import org.apache.mahout.math.Vector; >>> +import org.apache.mahout.utils.MahoutTestCase; >>> +import org.apache.mahout.utils.vectors.RandomVectorIterable; >>> +import org.apache.mahout.utils.vectors.VectorHelper; >>> +import org.apache.mahout.utils.vectors.io.JWriterVectorWriter; >>> +import org.junit.Test; >>> + >>> +import java.io.IOException; >>> +import java.io.StringReader; >>> +import java.io.StringWriter; >>> + >>> + >>> +/** >>> + * >>> + * >>> + **/ >>> +public class CSVVectorIterableTest extends MahoutTestCase { >>> + >>> + >>> + @Test >>> + public void test() throws Exception { >>> + >>> + StringWriter sWriter = new StringWriter(); >>> + JWriterVectorWriter jwvw = new JWriterVectorWriter(sWriter) { >>> + >>> + protected void formatVector(Vector vector) throws IOException { >>> + String vecStr = VectorHelper.vectorToCSVString(vector, false); >>> + writer.write(vecStr); >>> + } >>> + }; >>> + Iterable<Vector> iter = new RandomVectorIterable(50); >>> + jwvw.write(iter); >>> + jwvw.close(); >>> + CSVVectorIterable csvIter = new CSVVectorIterable(new >>> StringReader(sWriter.getBuffer().toString())); >>> + int count = 0; >>> + for (Vector vector : csvIter) { >>> + //System.out.println("Vec: " + vector); >>> + count++; >>> + } >>> + assertEquals(50, count); >>> + } >>> +} >>> >>> Added: >>> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java >>> URL: >>> 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java?rev=1085397&view=auto >>> ============================================================================== >>> --- >>> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java >>> (added) >>> +++ >>> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java >>> Fri Mar 25 14:28:12 2011 >>> @@ -0,0 +1,39 @@ >>> +package org.apache.mahout.utils.vectors.io; >>> + >>> +import org.apache.hadoop.conf.Configuration; >>> +import org.apache.hadoop.fs.FileSystem; >>> +import org.apache.hadoop.fs.Path; >>> +import org.apache.hadoop.io.LongWritable; >>> +import org.apache.hadoop.io.SequenceFile; >>> +import org.apache.mahout.math.Vector; >>> +import org.apache.mahout.math.VectorWritable; >>> +import org.apache.mahout.utils.MahoutTestCase; >>> +import org.apache.mahout.utils.vectors.RandomVectorIterable; >>> +import org.junit.Test; >>> + >>> + >>> +/** >>> + * >>> + * >>> + **/ >>> +public class SequenceFileVectorIterableTest extends MahoutTestCase { >>> + >>> + >>> + @Test >>> + public void testSFVI() throws Exception { >>> + Path path = getTestTempFilePath("sfvw"); >>> + Configuration conf = new Configuration(); >>> + FileSystem fs = FileSystem.get(conf); >>> + SequenceFile.Writer seqWriter = new SequenceFile.Writer(fs, conf, >>> path, LongWritable.class, VectorWritable.class); >>> + SequenceFileVectorWriter writer = new >>> SequenceFileVectorWriter(seqWriter); >>> + Iterable<Vector> iter = new RandomVectorIterable(50); >>> + writer.write(iter); >>> + writer.close(); >>> + SequenceFileVectorIterable sfVIter = new >>> SequenceFileVectorIterable(fs, path, conf, false); >>> + int count = 0; >>> + for (Vector vector : sfVIter) { >>> + count++; >>> + } >>> + assertEquals(50, count); >>> + } >>> +} >>> >>> >>> > > -------------------------- > Grant Ingersoll > 
http://www.lucidimagination.com/ > > Search the Lucene ecosystem docs using Solr/Lucene: > http://www.lucidimagination.com/search > >
