No prob. It was too late to comment :) I will tackle it in one of the 622 patches :)

On Sat, Mar 26, 2011 at 4:03 AM, Grant Ingersoll <[email protected]> wrote:
> Ah, OK.  Good to know.  Hadn't followed that one.   Feel free to change as 
> appropriate or I can.
>
> On Mar 25, 2011, at 9:12 PM, Dmitriy Lyubimov wrote:
>
>> That would be a typical change I am trying to fix with 622:
>> http://svn.apache.org/viewvc/mahout/trunk/utils/pom.xml?view=diff&r1=1085396&r2=1085397&pathrev=1085397
>>
>>
>>
>> On Fri, Mar 25, 2011 at 7:28 AM,  <[email protected]> wrote:
>>> Author: gsingers
>>> Date: Fri Mar 25 14:28:12 2011
>>> New Revision: 1085397
>>>
>>> URL: http://svn.apache.org/viewvc?rev=1085397&view=rev
>>> Log:
>>> MAHOUT-548: add in some CSV support for creating vectors, as well as a few 
>>> other fixes for working with vectors
>>>
>>> Added:
>>>    mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/
>>>    
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java
>>>    
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java
>>>    mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/
>>>    
>>> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java
>>>    
>>> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java
>>> Modified:
>>>    mahout/trunk/utils/pom.xml
>>>    
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
>>>    
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
>>>    
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java
>>>    
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java
>>>    
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java
>>>
>>> Modified: mahout/trunk/utils/pom.xml
>>> URL: 
>>> http://svn.apache.org/viewvc/mahout/trunk/utils/pom.xml?rev=1085397&r1=1085396&r2=1085397&view=diff
>>> ==============================================================================
>>> --- mahout/trunk/utils/pom.xml (original)
>>> +++ mahout/trunk/utils/pom.xml Fri Mar 25 14:28:12 2011
>>> @@ -142,6 +142,11 @@
>>>       <type>test-jar</type>
>>>       <scope>test</scope>
>>>     </dependency>
>>> +    <dependency>
>>> +      <groupId>org.apache.solr</groupId>
>>> +      <artifactId>solr-commons-csv</artifactId>
>>> +      <version>1.4.1</version>
>>> +    </dependency>
>>>
>>>     <dependency>
>>>       <groupId>junit</groupId>
>>>
>>> Modified: 
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
>>> URL: 
>>> http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java?rev=1085397&r1=1085396&r2=1085397&view=diff
>>> ==============================================================================
>>> --- 
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
>>>  (original)
>>> +++ 
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
>>>  Fri Mar 25 14:28:12 2011
>>> @@ -77,16 +77,22 @@ public final class VectorDumper {
>>>     Option dictTypeOpt = 
>>> obuilder.withLongName("dictionaryType").withRequired(false).withArgument(
>>>             
>>> abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1).create()).withDescription(
>>>             "The dictionary file type 
>>> (text|sequencefile)").withShortName("dt").create();
>>> -    Option centroidJSonOpt = 
>>> obuilder.withLongName("json").withRequired(false).withDescription(
>>> -            "Output the centroid as JSON.  Otherwise it substitutes in the 
>>> terms for vector cell entries")
>>> +    Option jsonOpt = 
>>> obuilder.withLongName("json").withRequired(false).withDescription(
>>> +            "Output the Vector as JSON.  Otherwise it substitutes in the 
>>> terms for vector cell entries")
>>>             .withShortName("j").create();
>>> +    Option csvOpt = 
>>> obuilder.withLongName("csv").withRequired(false).withDescription(
>>> +            "Output the Vector as CSV.  Otherwise it substitutes in the 
>>> terms for vector cell entries")
>>> +            .withShortName("c").create();
>>> +    Option namesAsCommentsOpt = 
>>> obuilder.withLongName("namesAsComments").withRequired(false).withDescription(
>>> +            "If using CSV output, optionally add a comment line for each 
>>> NamedVector (if the vector is one) printing out the name")
>>> +            .withShortName("n").create();
>>>     Option sizeOpt = obuilder.withLongName("sizeOnly").withRequired(false).
>>>             withDescription("Dump only the size of the 
>>> vector").withShortName("sz").create();
>>>     Option helpOpt = obuilder.withLongName("help").withDescription("Print 
>>> out help").withShortName("h")
>>>             .create();
>>>
>>>     Group group = 
>>> gbuilder.withName("Options").withOption(seqOpt).withOption(outputOpt).withOption(
>>> -            
>>> dictTypeOpt).withOption(dictOpt).withOption(centroidJSonOpt).withOption(vectorAsKeyOpt).withOption(
>>> +            
>>> dictTypeOpt).withOption(dictOpt).withOption(csvOpt).withOption(vectorAsKeyOpt).withOption(
>>>             printKeyOpt).withOption(sizeOpt).create();
>>>
>>>     try {
>>> @@ -122,10 +128,12 @@ public final class VectorDumper {
>>>             throw new OptionException(dictTypeOpt);
>>>           }
>>>         }
>>> -        boolean useJSON = cmdLine.hasOption(centroidJSonOpt);
>>> +        boolean useJSON = cmdLine.hasOption(jsonOpt);
>>> +        boolean useCSV = cmdLine.hasOption(csvOpt);
>>> +
>>>         boolean sizeOnly = cmdLine.hasOption(sizeOpt);
>>>         SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, 
>>> conf);
>>> -
>>> +        boolean namesAsComments = cmdLine.hasOption(namesAsCommentsOpt);
>>>         Writable keyWritable = 
>>> reader.getKeyClass().asSubclass(Writable.class).newInstance();
>>>         Writable valueWritable = 
>>> reader.getValueClass().asSubclass(Writable.class).newInstance();
>>>         boolean transposeKeyValue = cmdLine.hasOption(vectorAsKeyOpt);
>>> @@ -140,6 +148,16 @@ public final class VectorDumper {
>>>           try {
>>>             boolean printKey = cmdLine.hasOption(printKeyOpt);
>>>             long i = 0;
>>> +            if (useCSV && dictionary != null){
>>> +              writer.write("#");
>>> +              for (int j = 0; j < dictionary.length; j++) {
>>> +                writer.write(dictionary[j]);
>>> +                if (j < dictionary.length - 1){
>>> +                  writer.write(',');
>>> +                }
>>> +              }
>>> +              writer.write('\n');
>>> +            }
>>>             while (reader.next(keyWritable, valueWritable)) {
>>>               if (printKey) {
>>>                 Writable notTheVectorWritable = transposeKeyValue ? 
>>> valueWritable : keyWritable;
>>> @@ -159,7 +177,14 @@ public final class VectorDumper {
>>>                 writer.write(String.valueOf(vector.size()));
>>>                 writer.write('\n');
>>>               } else {
>>> -                String fmtStr = useJSON ? vector.asFormatString() : 
>>> VectorHelper.vectorToString(vector, dictionary);
>>> +                String fmtStr;
>>> +                if (useJSON){
>>> +                  fmtStr = VectorHelper.vectorToJSONString(vector, 
>>> dictionary);
>>> +                } else if (useCSV){
>>> +                  fmtStr = VectorHelper.vectorToCSVString(vector, 
>>> namesAsComments);
>>> +                } else {
>>> +                  fmtStr = vector.asFormatString();
>>> +                }
>>>                 writer.write(fmtStr);
>>>                 writer.write('\n');
>>>               }
>>>
>>> Modified: 
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
>>> URL: 
>>> http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java?rev=1085397&r1=1085396&r2=1085397&view=diff
>>> ==============================================================================
>>> --- 
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
>>>  (original)
>>> +++ 
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
>>>  Fri Mar 25 14:28:12 2011
>>> @@ -40,14 +40,45 @@ import org.apache.mahout.math.map.OpenOb
>>>  public final class VectorHelper {
>>>
>>>   private static final Pattern TAB_PATTERN = Pattern.compile("\t");
>>> +
>>>
>>>   private VectorHelper() { }
>>> -
>>> +
>>> +  public static String vectorToCSVString(Vector vector, boolean 
>>> namesAsComments){
>>> +    StringBuilder bldr = new StringBuilder(2048);
>>> +    try {
>>> +      vectorToCSVString(vector, namesAsComments, bldr);
>>> +    } catch (IOException e) {
>>> +      throw new RuntimeException(e);
>>> +    }
>>> +    return bldr.toString();
>>> +  }
>>> +
>>> +  public static void vectorToCSVString(Vector vector, boolean 
>>> namesAsComments,
>>> +                                       Appendable bldr) throws IOException 
>>> {
>>> +    if (namesAsComments && vector instanceof NamedVector){
>>> +      
>>> bldr.append("#").append(((NamedVector)vector).getName()).append('\n');
>>> +    }
>>> +    Iterator<Vector.Element> iter = vector.iterator();
>>> +    boolean first = true;
>>> +    while (iter.hasNext()) {
>>> +      if (first) {
>>> +        first = false;
>>> +      } else {
>>> +        bldr.append(",");
>>> +      }
>>> +      Vector.Element elt = iter.next();
>>> +      bldr.append(String.valueOf(elt.get()));
>>> +    }
>>> +    bldr.append('\n');
>>> +  }
>>> +
>>> +
>>>   /**
>>>    * @return a String from a vector that fills in the values with the 
>>> appropriate value from a dictionary where
>>>    * each the ith entry is the term for the ith vector cell.
>>>    */
>>> -  public static String vectorToString(Vector vector, String[] dictionary) {
>>> +  public static String vectorToJSONString(Vector vector, String[] 
>>> dictionary) {
>>>     StringBuilder bldr = new StringBuilder(2048);
>>>
>>>     if (vector instanceof NamedVector) {
>>> @@ -67,12 +98,13 @@ public final class VectorHelper {
>>>       if (dictionary != null) {
>>>         bldr.append(dictionary[elt.index()]);
>>>       } else {
>>> -        bldr.append(elt.index());
>>> +        bldr.append(String.valueOf(elt.index()));
>>>       }
>>> -      bldr.append(':').append(elt.get());
>>> +      bldr.append(':').append(String.valueOf(elt.get()));
>>>     }
>>>     return bldr.append('}').toString();
>>>   }
>>> +
>>>
>>>   /**
>>>    * Read in a dictionary file. Format is:
>>>
>>> Added: 
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java
>>> URL: 
>>> http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java?rev=1085397&view=auto
>>> ==============================================================================
>>> --- 
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java
>>>  (added)
>>> +++ 
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java
>>>  Fri Mar 25 14:28:12 2011
>>> @@ -0,0 +1,94 @@
>>> +package org.apache.mahout.utils.vectors.csv;
>>> +/**
>>> + * Licensed to the Apache Software Foundation (ASF) under one or more
>>> + * contributor license agreements.  See the NOTICE file distributed with
>>> + * this work for additional information regarding copyright ownership.
>>> + * The ASF licenses this file to You under the Apache License, Version 2.0
>>> + * (the "License"); you may not use this file except in compliance with
>>> + * the License.  You may obtain a copy of the License at
>>> + *
>>> + *     http://www.apache.org/licenses/LICENSE-2.0
>>> + *
>>> + * Unless required by applicable law or agreed to in writing, software
>>> + * distributed under the License is distributed on an "AS IS" BASIS,
>>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>>> + * See the License for the specific language governing permissions and
>>> + * limitations under the License.
>>> + */
>>> +
>>> +import org.apache.commons.csv.CSVParser;
>>> +import org.apache.commons.csv.CSVStrategy;
>>> +import org.apache.mahout.math.DenseVector;
>>> +import org.apache.mahout.math.Vector;
>>> +
>>> +import java.io.BufferedReader;
>>> +import java.io.IOException;
>>> +import java.io.Reader;
>>> +import java.util.Iterator;
>>> +
>>> +
>>> +/**
>>> + * Iterates a CSV file and produces {@link org.apache.mahout.math.Vector}.
>>> + * <br/>
>>> + * The Iterator returned throws {@link UnsupportedOperationException} for 
>>> the {@link java.util.Iterator#remove()} method.
>>> + * <p/>
>>> + * Assumes DenseVector for now, but in the future may have the option of 
>>> mapping columns to sparse format
>>> + * <p/>
>>> + * The Iterator is not thread-safe.
>>> + *
>>> + *
>>> + **/
>>> +public class CSVVectorIterable implements Iterable<Vector> {
>>> +  protected CSVParser parser;
>>> +  protected String [] line;
>>> +
>>> +  public CSVVectorIterable(Reader reader) throws IOException {
>>> +    parser = new CSVParser(reader);
>>> +    line = parser.getLine();
>>> +  }
>>> +
>>> +  public CSVVectorIterable(Reader reader, CSVStrategy strategy) throws 
>>> IOException {
>>> +    parser = new CSVParser(reader, strategy);
>>> +    line = parser.getLine();
>>> +  }
>>> +
>>> +
>>> +  @Override
>>> +  public Iterator<Vector> iterator() {
>>> +    return new CSVIterator();
>>> +  }
>>> +
>>> +  private class CSVIterator implements Iterator<Vector>{
>>> +
>>> +
>>> +    public CSVIterator() {
>>> +    }
>>> +
>>> +    @Override
>>> +    public boolean hasNext() {
>>> +      return line != null;
>>> +    }
>>> +
>>> +    @Override
>>> +    public Vector next() {
>>> +
>>> +      Vector result = null;
>>> +      result = new DenseVector(line.length);
>>> +      for (int i = 0; i < line.length; i++) {
>>> +        result.setQuick(i, Double.parseDouble(line[i]));
>>> +      }
>>> +      //move the line forward
>>> +      try {
>>> +        line = parser.getLine();
>>> +      } catch (IOException e) {
>>> +        throw new RuntimeException(e);
>>> +      }
>>> +      return result;
>>> +    }
>>> +
>>> +    @Override
>>> +    public void remove() {
>>> +      throw new UnsupportedOperationException();
>>> +    }
>>> +  }
>>> +}
>>>
>>> Modified: 
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java
>>> URL: 
>>> http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java?rev=1085397&r1=1085396&r2=1085397&view=diff
>>> ==============================================================================
>>> --- 
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java
>>>  (original)
>>> +++ 
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java
>>>  Fri Mar 25 14:28:12 2011
>>> @@ -25,8 +25,8 @@ import org.apache.mahout.math.Vector;
>>>  /**
>>>  * Write out the vectors to any {@link java.io.Writer} using {@link 
>>> org.apache.mahout.math.Vector#asFormatString()}.
>>>  */
>>> -public class JWriterVectorWriter implements VectorWriter {
>>> -  private final Writer writer;
>>> +public class JWriterVectorWriter extends VectorWriter {
>>> +  protected final Writer writer;
>>>
>>>   public JWriterVectorWriter(Writer writer) {
>>>     this.writer = writer;
>>> @@ -45,14 +45,22 @@ public class JWriterVectorWriter impleme
>>>       if (result >= maxDocs) {
>>>         break;
>>>       }
>>> -      writer.write(vector.asFormatString());
>>> -      writer.write('\n');
>>> -
>>> +      formatVector(vector);
>>>       result++;
>>>     }
>>>     return result;
>>>   }
>>> -
>>> +
>>> +  protected void formatVector(Vector vector) throws IOException {
>>> +    writer.write(vector.asFormatString());
>>> +    writer.write('\n');
>>> +  }
>>> +
>>> +  @Override
>>> +  public void write(Vector vector) throws IOException {
>>> +    formatVector(vector);
>>> +  }
>>> +
>>>   @Override
>>>   public void close() throws IOException {
>>>     writer.flush();
>>>
>>> Added: 
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java
>>> URL: 
>>> http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java?rev=1085397&view=auto
>>> ==============================================================================
>>> --- 
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java
>>>  (added)
>>> +++ 
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java
>>>  Fri Mar 25 14:28:12 2011
>>> @@ -0,0 +1,105 @@
>>> +package org.apache.mahout.utils.vectors.io;
>>> +/**
>>> + * Licensed to the Apache Software Foundation (ASF) under one or more
>>> + * contributor license agreements.  See the NOTICE file distributed with
>>> + * this work for additional information regarding copyright ownership.
>>> + * The ASF licenses this file to You under the Apache License, Version 2.0
>>> + * (the "License"); you may not use this file except in compliance with
>>> + * the License.  You may obtain a copy of the License at
>>> + *
>>> + *     http://www.apache.org/licenses/LICENSE-2.0
>>> + *
>>> + * Unless required by applicable law or agreed to in writing, software
>>> + * distributed under the License is distributed on an "AS IS" BASIS,
>>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>>> + * See the License for the specific language governing permissions and
>>> + * limitations under the License.
>>> + */
>>> +
>>> +import org.apache.hadoop.conf.Configuration;
>>> +import org.apache.hadoop.fs.ContentSummary;
>>> +import org.apache.hadoop.fs.FileSystem;
>>> +import org.apache.hadoop.fs.Path;
>>> +import org.apache.hadoop.io.SequenceFile;
>>> +import org.apache.hadoop.io.Writable;
>>> +import org.apache.mahout.math.Vector;
>>> +import org.apache.mahout.math.VectorWritable;
>>> +
>>> +import java.io.IOException;
>>> +import java.util.Iterator;
>>> +
>>> +
>>> +/**
>>> + * Given a Sequence File containing vectors (actually, {@link 
>>> org.apache.mahout.math.VectorWritable}, iterate over it.
>>> + *
>>> + **/
>>> +public class SequenceFileVectorIterable implements Iterable<Vector>{
>>> +  protected SequenceFile.Reader reader;
>>> +  protected long fileLen;
>>> +  protected Writable keyWritable;
>>> +  protected Writable valueWritable;
>>> +  protected boolean useKey;
>>> +
>>> +  /**
>>> +   * Construct the Iterable
>>> +   * @param fs The {@link org.apache.hadoop.fs.FileSystem} containing the 
>>> {@link org.apache.hadoop.io.SequenceFile}
>>> +   * @param file The {@link org.apache.hadoop.fs.Path} containing the file
>>> +   * @param conf The {@link org.apache.hadoop.conf.Configuration} to use
>>> +   * @param useKey If true, use the key as the {@link 
>>> org.apache.mahout.math.VectorWritable}, otherwise use the value
>>> +   * @throws IllegalAccessException
>>> +   * @throws InstantiationException
>>> +   * @throws IOException
>>> +   */
>>> +  public SequenceFileVectorIterable(FileSystem fs, Path file, 
>>> Configuration conf, boolean useKey) throws IllegalAccessException, 
>>> InstantiationException, IOException {
>>> +    this.reader = new SequenceFile.Reader(fs, file, conf);
>>> +    ContentSummary summary = fs.getContentSummary(file);
>>> +    fileLen = summary.getLength();
>>> +    this.useKey = useKey;
>>> +    keyWritable = 
>>> reader.getKeyClass().asSubclass(Writable.class).newInstance();
>>> +    valueWritable = 
>>> reader.getValueClass().asSubclass(Writable.class).newInstance();
>>> +  }
>>> +
>>> +  /**
>>> +   * The Iterator returned does not support remove()
>>> +   * @return The {@link java.util.Iterator}
>>> +   */
>>> +  public Iterator<Vector> iterator() {
>>> +    return new SFIterator();
>>> +
>>> +  }
>>> +
>>> +  private final class SFIterator implements Iterator<Vector>{
>>> +    @Override
>>> +    public boolean hasNext() {
>>> +      //TODO: is this legitimate?  We can't call next here since it breaks 
>>> the iterator contract
>>> +      try {
>>> +        return reader.getPosition() < fileLen;
>>> +      } catch (IOException e) {
>>> +        return false;
>>> +      }
>>> +    }
>>> +
>>> +    @Override
>>> +    public Vector next() {
>>> +      Vector result = null;
>>> +      boolean valid = false;
>>> +      try {
>>> +        valid = reader.next(keyWritable, valueWritable);
>>> +        if (valid){
>>> +          result = ((VectorWritable) (useKey ? keyWritable : 
>>> valueWritable)).get();
>>> +        }
>>> +      } catch (IOException e) {
>>> +        throw new RuntimeException(e);
>>> +      }
>>> +
>>> +      return result;
>>> +    }
>>> +
>>> +    /**
>>> +     * Not supported
>>> +     */
>>> +    public void remove() {
>>> +      throw new UnsupportedOperationException();
>>> +    }
>>> +  }
>>> +}
>>>
>>> Modified: 
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java
>>> URL: 
>>> http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java?rev=1085397&r1=1085396&r2=1085397&view=diff
>>> ==============================================================================
>>> --- 
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java
>>>  (original)
>>> +++ 
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java
>>>  Fri Mar 25 14:28:12 2011
>>> @@ -30,16 +30,16 @@ import org.apache.mahout.math.VectorWrit
>>>  *
>>>  * Closes the writer when done
>>>  */
>>> -public class SequenceFileVectorWriter implements VectorWriter {
>>> +public class SequenceFileVectorWriter extends VectorWriter {
>>>   private final SequenceFile.Writer writer;
>>> -
>>> +  long recNum = 0;
>>>   public SequenceFileVectorWriter(SequenceFile.Writer writer) {
>>>     this.writer = writer;
>>>   }
>>>
>>>   @Override
>>>   public long write(Iterable<Vector> iterable, long maxDocs) throws 
>>> IOException {
>>> -    long recNum = 0;
>>> +
>>>     for (Vector point : iterable) {
>>>       if (recNum >= maxDocs) {
>>>         break;
>>> @@ -51,7 +51,13 @@ public class SequenceFileVectorWriter im
>>>     }
>>>     return recNum;
>>>   }
>>> -
>>> +
>>> +  @Override
>>> +  public void write(Vector vector) throws IOException {
>>> +    writer.append(new LongWritable(recNum++), new VectorWritable(vector));
>>> +
>>> +  }
>>> +
>>>   @Override
>>>   public long write(Iterable<Vector> iterable) throws IOException {
>>>     return write(iterable, Long.MAX_VALUE);
>>>
>>> Modified: 
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java
>>> URL: 
>>> http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java?rev=1085397&r1=1085396&r2=1085397&view=diff
>>> ==============================================================================
>>> --- 
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java
>>>  (original)
>>> +++ 
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java
>>>  Fri Mar 25 14:28:12 2011
>>> @@ -21,7 +21,7 @@ import java.io.IOException;
>>>
>>>  import org.apache.mahout.math.Vector;
>>>
>>> -public interface VectorWriter {
>>> +public abstract class VectorWriter {
>>>   /**
>>>    * Write all values in the Iterable to the output
>>>    * @param iterable The {@link Iterable} to loop over
>>> @@ -29,7 +29,15 @@ public interface VectorWriter {
>>>    * @throws IOException if there was a problem writing
>>>    *
>>>    */
>>> -  long write(Iterable<Vector> iterable) throws IOException;
>>> +  public abstract long write(Iterable<Vector> iterable) throws IOException;
>>> +
>>> +  /**
>>> +   * Write out a vector
>>> +   *
>>> +   * @param vector The {@link org.apache.mahout.math.Vector} to write
>>> +   * @throws IOException
>>> +   */
>>> +  public abstract void write(Vector vector) throws IOException;
>>>
>>>   /**
>>>    * Write the first <code>maxDocs</code> to the output.
>>> @@ -38,12 +46,12 @@ public interface VectorWriter {
>>>    * @return The number of docs written
>>>    * @throws IOException if there was a problem writing
>>>    */
>>> -  long write(Iterable<Vector> iterable, long maxDocs) throws IOException;
>>> +  public abstract long write(Iterable<Vector> iterable, long maxDocs) 
>>> throws IOException;
>>>
>>>   /**
>>>    * Close any internally held resources.  If external Writers are passed 
>>> in, the implementation should indicate
>>>    * whether it also closes them
>>>    * @throws IOException if there was an issue closing the item
>>>    */
>>> -  void close() throws IOException;
>>> +  public abstract void close() throws IOException;
>>>  }
>>>
>>> Added: 
>>> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java
>>> URL: 
>>> http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java?rev=1085397&view=auto
>>> ==============================================================================
>>> --- 
>>> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java
>>>  (added)
>>> +++ 
>>> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java
>>>  Fri Mar 25 14:28:12 2011
>>> @@ -0,0 +1,60 @@
>>> +package org.apache.mahout.utils.vectors.csv;
>>> +/**
>>> + * Licensed to the Apache Software Foundation (ASF) under one or more
>>> + * contributor license agreements.  See the NOTICE file distributed with
>>> + * this work for additional information regarding copyright ownership.
>>> + * The ASF licenses this file to You under the Apache License, Version 2.0
>>> + * (the "License"); you may not use this file except in compliance with
>>> + * the License.  You may obtain a copy of the License at
>>> + *
>>> + *     http://www.apache.org/licenses/LICENSE-2.0
>>> + *
>>> + * Unless required by applicable law or agreed to in writing, software
>>> + * distributed under the License is distributed on an "AS IS" BASIS,
>>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>>> + * See the License for the specific language governing permissions and
>>> + * limitations under the License.
>>> + */
>>> +
>>> +import org.apache.mahout.math.Vector;
>>> +import org.apache.mahout.utils.MahoutTestCase;
>>> +import org.apache.mahout.utils.vectors.RandomVectorIterable;
>>> +import org.apache.mahout.utils.vectors.VectorHelper;
>>> +import org.apache.mahout.utils.vectors.io.JWriterVectorWriter;
>>> +import org.junit.Test;
>>> +
>>> +import java.io.IOException;
>>> +import java.io.StringReader;
>>> +import java.io.StringWriter;
>>> +
>>> +
>>> +/**
>>> + *
>>> + *
>>> + **/
>>> +public class CSVVectorIterableTest extends MahoutTestCase {
>>> +
>>> +
>>> +  @Test
>>> +  public void test() throws Exception {
>>> +
>>> +    StringWriter sWriter = new StringWriter();
>>> +    JWriterVectorWriter jwvw = new JWriterVectorWriter(sWriter) {
>>> +
>>> +      protected void formatVector(Vector vector) throws IOException {
>>> +        String vecStr = VectorHelper.vectorToCSVString(vector, false);
>>> +        writer.write(vecStr);
>>> +      }
>>> +    };
>>> +    Iterable<Vector> iter = new RandomVectorIterable(50);
>>> +    jwvw.write(iter);
>>> +    jwvw.close();
>>> +    CSVVectorIterable csvIter = new CSVVectorIterable(new 
>>> StringReader(sWriter.getBuffer().toString()));
>>> +    int count = 0;
>>> +    for (Vector vector : csvIter) {
>>> +      //System.out.println("Vec: " + vector);
>>> +      count++;
>>> +    }
>>> +    assertEquals(50, count);
>>> +  }
>>> +}
>>>
>>> Added: 
>>> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java
>>> URL: 
>>> http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java?rev=1085397&view=auto
>>> ==============================================================================
>>> --- 
>>> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java
>>>  (added)
>>> +++ 
>>> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java
>>>  Fri Mar 25 14:28:12 2011
>>> @@ -0,0 +1,39 @@
>>> +package org.apache.mahout.utils.vectors.io;
>>> +
>>> +import org.apache.hadoop.conf.Configuration;
>>> +import org.apache.hadoop.fs.FileSystem;
>>> +import org.apache.hadoop.fs.Path;
>>> +import org.apache.hadoop.io.LongWritable;
>>> +import org.apache.hadoop.io.SequenceFile;
>>> +import org.apache.mahout.math.Vector;
>>> +import org.apache.mahout.math.VectorWritable;
>>> +import org.apache.mahout.utils.MahoutTestCase;
>>> +import org.apache.mahout.utils.vectors.RandomVectorIterable;
>>> +import org.junit.Test;
>>> +
>>> +
>>> +/**
>>> + *
>>> + *
>>> + **/
>>> +public class SequenceFileVectorIterableTest extends MahoutTestCase {
>>> +
>>> +
>>> +  @Test
>>> +  public void testSFVI() throws Exception {
>>> +    Path path = getTestTempFilePath("sfvw");
>>> +    Configuration conf = new Configuration();
>>> +    FileSystem fs = FileSystem.get(conf);
>>> +    SequenceFile.Writer seqWriter = new SequenceFile.Writer(fs, conf, 
>>> path, LongWritable.class, VectorWritable.class);
>>> +    SequenceFileVectorWriter writer = new 
>>> SequenceFileVectorWriter(seqWriter);
>>> +    Iterable<Vector> iter = new RandomVectorIterable(50);
>>> +    writer.write(iter);
>>> +    writer.close();
>>> +    SequenceFileVectorIterable sfVIter = new 
>>> SequenceFileVectorIterable(fs, path, conf, false);
>>> +    int count = 0;
>>> +    for (Vector vector : sfVIter) {
>>> +      count++;
>>> +    }
>>> +    assertEquals(50, count);
>>> +  }
>>> +}
>>>
>>>
>>>
>
> --------------------------
> Grant Ingersoll
> http://www.lucidimagination.com/
>
> Search the Lucene ecosystem docs using Solr/Lucene:
> http://www.lucidimagination.com/search
>
>

Reply via email to