Author: robinanil
Date: Sat Jan 16 23:10:31 2010
New Revision: 900034

URL: http://svn.apache.org/viewvc?rev=900034&view=rev
Log:
MAHOUT-237 Dictionary Vectorizer: running version which was tested over Wikipedia article dumps
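
The API-visible change in this commit: DictionaryVectorizer.createTermFrequencyVectors() now takes a Class<? extends Analyzer> rather than a live Analyzer, so each Hadoop task can build its own analyzer from the class name carried in the job configuration. A minimal driver against the new signature might look like the following sketch (the class name VectorizeDriverSketch and the paths are illustrative only, borrowed from the test at the bottom of this patch):

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.mahout.utils.vectors.text.DictionaryVectorizer;

    public class VectorizeDriverSketch {
      public static void main(String[] args) throws Exception {
        // A Class object, not an instance: the workers re-create the
        // analyzer from its name via the ANALYZER_CLASS conf key.
        Class<? extends Analyzer> analyzerClass = StandardAnalyzer.class;
        DictionaryVectorizer.createTermFrequencyVectors(
            "testdata/documents/docs.file", // SequenceFile<Text,Text> input
            "output/wordcount",             // output base directory
            analyzerClass,
            2,                              // minSupport: minimum term count
            100);                           // dictionary chunk size, in MB
      }
    }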
Added:
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DocumentTokenizerMapper.java
Modified:
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/WikipediaMapper.java
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/PartialVectorGenerator.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/TermCountMapper.java
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java?rev=900034&r1=900033&r2=900034&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java Sat Jan 16 23:10:31 2010
@@ -25,6 +25,7 @@
 import org.apache.commons.cli2.builder.GroupBuilder;
 import org.apache.commons.cli2.commandline.Parser;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.mahout.utils.vectors.text.DictionaryVectorizer;
 
 /**
@@ -58,7 +59,7 @@
         .create();
     
     Option analyzerNameOpt =
-        obuilder.withLongName("analyzerName").withRequired(true).withArgument(
+        obuilder.withLongName("analyzerName").withArgument(
           abuilder.withName("analyzerName").withMinimum(1).withMaximum(1)
               .create()).withDescription("The class name of the analyzer")
           .withShortName("a").create();
@@ -93,9 +94,15 @@
       minSupport = Integer.parseInt(minSupportString);
     }
     String analyzerName = (String) cmdLine.getValue(analyzerNameOpt);
-    Analyzer analyzer = (Analyzer) Class.forName(analyzerName).newInstance();
-    
+    Class<? extends Analyzer> analyzerClass = StandardAnalyzer.class;
+    if (cmdLine.hasOption(analyzerNameOpt)) {
+      String className = cmdLine.getValue(analyzerNameOpt).toString();
+      analyzerClass = (Class<? extends Analyzer>) Class.forName(className);
+      // try instantiating it, b/c there isn't any point in setting it if
+      // you can't instantiate it
+      analyzerClass.newInstance();
+    }
     DictionaryVectorizer.createTermFrequencyVectors(inputDir, outputDir,
-        analyzer, minSupport, chunkSize);
+        analyzerClass, minSupport, chunkSize);
   }
 }
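
The unchecked cast to Class<? extends Analyzer> above only fails when the analyzer is eventually instantiated. If tightening this up, Class.asSubclass() gives the same eager validation the patch wants from the trial newInstance() call; a sketch, not part of the patch:

    // Sketch only: asSubclass() replaces the unchecked cast and fails fast
    // with a ClassCastException if the named class is not an Analyzer.
    static Class<? extends Analyzer> loadAnalyzerClass(String className)
        throws ClassNotFoundException, InstantiationException, IllegalAccessException {
      Class<? extends Analyzer> analyzerClass =
          Class.forName(className).asSubclass(Analyzer.class);
      analyzerClass.newInstance(); // fail in the driver, not inside the Hadoop job
      return analyzerClass;
    }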
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/WikipediaMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/WikipediaMapper.java?rev=900034&r1=900033&r2=900034&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/WikipediaMapper.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/WikipediaMapper.java Sat Jan 16 23:10:31 2010
@@ -48,16 +48,15 @@
   private static final Logger log = LoggerFactory.getLogger(WikipediaMapper.class);
   
   private static final Pattern SPACE_NON_ALPHA_PATTERN =
-      Pattern.compile("[\\s\\W]");
-  
-  private static final Pattern DOCUMENT =
-      Pattern.compile("<text xml:space=\"preserve\">(.*)</text>");
-  private static final Pattern TITLE = Pattern.compile("<title>(.*)</title>");
+      Pattern.compile("[\\s]");
+  private static final String START_DOC = "<text xml:space=\"preserve\">";
+  private static final String END_DOC = "</text>";
+  private static final Pattern TITLE = Pattern.compile("<title>(.*)<\\/title>");
+  private static final String REDIRECT = "<redirect />";
   private Set<String> inputCategories;
   private boolean exactMatchOnly;
   private boolean all;
-  private Analyzer analyzer;
   
   @Override
   public void map(LongWritable key,
@@ -66,8 +65,16 @@
                   Reporter reporter) throws IOException {
     
     String content = value.toString();
-    String document = getDocument(content);
-    String title = getTitle(content);
+    if (content.indexOf(REDIRECT) != -1) return;
+    String document = "";
+    String title = "";
+    try {
+      document = getDocument(content);
+      title = getTitle(content);
+    } catch (Exception e) {
+      reporter.getCounter("Wikipedia", "Parse errors").increment(1);
+      return;
+    }
     
     if (!all) {
       String catMatch = findMatchingCategory(document);
@@ -81,13 +88,9 @@
   }
   
   private String getDocument(String xml) {
-    Matcher m = DOCUMENT.matcher(xml);
-    String ret = "";
-    if (m.find()) {
-      ret = m.group(1);
-    }
-    return ret;
-    
+    int start = xml.indexOf(START_DOC) + START_DOC.length();
+    int end = xml.indexOf(END_DOC, start);
+    return xml.substring(start, end);
   }
   
   private String getTitle(String xml) {
@@ -150,8 +153,6 @@
              + " All: "
              + all
              + " Exact Match: "
-             + exactMatchOnly
-             + " Analyzer: "
-             + analyzer.getClass().getName());
+             + exactMatchOnly);
   }
 }
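
The broad try/catch in map() exists because the new getDocument() is intentionally bare: if START_DOC is missing, indexOf() returns -1 and the substring bounds go wrong, and if END_DOC is missing, substring(start, -1) throws StringIndexOutOfBoundsException — either way the record is counted as a parse error and skipped. A null-returning variant, purely for illustration (getDocumentSafely is not in the patch):

    // Illustration only: lets the caller skip malformed pages
    // without paying for an exception.
    private String getDocumentSafely(String xml) {
      int start = xml.indexOf(START_DOC);
      if (start == -1) {
        return null;             // no opening marker at all
      }
      start += START_DOC.length();
      int end = xml.indexOf(END_DOC, start);
      return end == -1 ? null : xml.substring(start, end);
    }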
Default is false").withShortName("all") .create(); Option helpOpt = @@ -133,9 +133,9 @@ catFile = (String) cmdLine.getValue(categoriesOpt); } - boolean all = true; + boolean all = false; if (cmdLine.hasOption(allOpt)) { - cmdLine.getValue(allOpt); + all = true; } runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt), all); @@ -177,7 +177,7 @@ + " All Files: " + all); } - conf.set("xmlinput.start", "<page"); + conf.set("xmlinput.start", "<page>"); conf.set("xmlinput.end", "</page>"); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); @@ -193,7 +193,13 @@ conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization"); - + + /*conf.set("mapred.compress.map.output", "true"); + conf.set("mapred.map.output.compression.type", "BLOCK"); + conf.set("mapred.output.compress", "true"); + conf.set("mapred.output.compression.type", "BLOCK"); + conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec"); + */ FileSystem dfs = FileSystem.get(outPath.toUri(), conf); if (dfs.exists(outPath)) { dfs.delete(outPath, true); Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java?rev=900034&r1=900033&r2=900034&view=diff ============================================================================== --- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java (original) +++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java Sat Jan 16 23:10:31 2010 @@ -42,7 +42,6 @@ import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.mapred.lib.IdentityMapper; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.mahout.math.VectorWritable; /** @@ -73,7 +72,7 @@ private static final String OUTPUT_FILES_PATTERN = "/part-*"; - private static final int SEQUENCEFILE_BYTE_OVERHEAD = 4; + private static final int SEQUENCEFILE_BYTE_OVERHEAD = 45; private static final String VECTOR_OUTPUT_FOLDER = "/partial-vectors-"; @@ -117,7 +116,7 @@ */ public static void createTermFrequencyVectors(String input, String output, - Analyzer analyzer, + Class<? 
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java?rev=900034&r1=900033&r2=900034&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java Sat Jan 16 23:10:31 2010
@@ -42,7 +42,6 @@
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.hadoop.mapred.lib.IdentityMapper;
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.mahout.math.VectorWritable;
 
 /**
@@ -73,7 +72,7 @@
   
   private static final String OUTPUT_FILES_PATTERN = "/part-*";
   
-  private static final int SEQUENCEFILE_BYTE_OVERHEAD = 4;
+  private static final int SEQUENCEFILE_BYTE_OVERHEAD = 45;
   
   private static final String VECTOR_OUTPUT_FOLDER = "/partial-vectors-";
   
@@ -117,7 +116,7 @@
   */
  public static void createTermFrequencyVectors(String input,
                                                String output,
-                                               Analyzer analyzer,
+                                               Class<? extends Analyzer> analyzerClass,
                                                int minSupport,
                                                int chunkSizeInMegabytes)
    throws IOException,
          InterruptedException,
@@ -132,7 +131,7 @@
    
    Path inputPath = new Path(input);
    Path wordCountPath = new Path(output + WORDCOUNT_OUTPUT_FOLDER);
    
-    startWordCounting(inputPath, wordCountPath, analyzer);
+    startWordCounting(inputPath, analyzerClass, wordCountPath);
    List<Path> dictionaryChunks =
        createDictionaryChunks(minSupport, wordCountPath, output,
          chunkSizeInMegabytes);
@@ -143,8 +142,8 @@
      Path partialVectorOutputPath =
          getPath(output + VECTOR_OUTPUT_FOLDER, partialVectorIndex++);
      partialVectorPaths.add(partialVectorOutputPath);
-      makePartialVectors(input, dictionaryChunk, partialVectorOutputPath,
-        analyzer);
+      makePartialVectors(input, dictionaryChunk, analyzerClass,
+        partialVectorOutputPath);
    }
    
    createVectorFromPartialVectors(partialVectorPaths, output
@@ -221,9 +220,8 @@
        
        int fieldSize =
            SEQUENCEFILE_BYTE_OVERHEAD
-                + key.toString().getBytes(CHARSET).length
-                + Long.SIZE
-                / 8;
+                + (key.toString().length() * 2)
+                + (Long.SIZE / 8);
        currentChunkSize += fieldSize;
        writer.append(key, new LongWritable(i++));
        freqWriter.append(key, value);
@@ -261,8 +259,6 @@
             "org.apache.hadoop.io.serializer.JavaSerialization,"
             + "org.apache.hadoop.io.serializer.WritableSerialization");
    // this conf parameter needs to be set to enable serialisation of conf values
-    
-    conf.set(ANALYZER_CLASS, StandardAnalyzer.class.getName());
    conf
        .setJobName("DictionaryVectorizer Vector generator to group Partial Vectors");
    
@@ -324,11 +320,11 @@
   */
  private static void makePartialVectors(String input,
                                         Path dictionaryFilePath,
-                                        Path output,
-                                        Analyzer analyzer) throws IOException,
-                                                          InterruptedException,
-                                                          ClassNotFoundException,
-                                                          URISyntaxException {
+                                        Class<? extends Analyzer> analyzerClass,
+                                        Path output) throws IOException,
+                                                    InterruptedException,
+                                                    ClassNotFoundException,
+                                                    URISyntaxException {
    
    Configurable client = new JobClient();
    JobConf conf = new JobConf(DictionaryVectorizer.class);
@@ -337,7 +333,7 @@
             + "org.apache.hadoop.io.serializer.WritableSerialization");
    // this conf parameter needs to be set to enable serialisation of conf values
    
-    conf.set(ANALYZER_CLASS, StandardAnalyzer.class.getName());
+    conf.set(ANALYZER_CLASS, analyzerClass.getName());
    conf.setJobName("DictionaryVectorizer Partial Vector running over input: "
                    + input
                    + " using dictionary file"
@@ -353,11 +349,10 @@
    
    FileOutputFormat.setOutputPath(conf, output);
    
-    conf.setMapperClass(IdentityMapper.class);
+    conf.setMapperClass(DocumentTokenizerMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setReducerClass(PartialVectorGenerator.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
-    
    FileSystem dfs = FileSystem.get(output.toUri(), conf);
    if (dfs.exists(output)) {
      dfs.delete(output, true);
@@ -379,10 +374,10 @@
   * @throws ClassNotFoundException
   */
  private static void startWordCounting(Path input,
-                                        Path output,
-                                        Analyzer analyzer) throws IOException,
-                                                          InterruptedException,
-                                                          ClassNotFoundException {
+                                        Class<? extends Analyzer> analyzerClass,
+                                        Path output) throws IOException,
+                                                    InterruptedException,
+                                                    ClassNotFoundException {
    
    Configurable client = new JobClient();
    JobConf conf = new JobConf(DictionaryVectorizer.class);
@@ -391,7 +386,7 @@
             + "org.apache.hadoop.io.serializer.WritableSerialization");
    // this conf parameter needs to be set to enable serialisation of conf values
    
-    conf.set(ANALYZER_CLASS, StandardAnalyzer.class.getName());
+    conf.set(ANALYZER_CLASS, analyzerClass.getName());
    conf.setJobName("DictionaryVectorizer Word Count running over input: "
                    + input.toString());
    conf.setOutputKeyClass(Text.class);
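
The dictionary-chunking math above changed in two ways: the per-record SequenceFile overhead estimate rose from 4 to 45 bytes, and the key is now costed at two bytes per char rather than by encoding it to bytes first — cheaper per term, at the price of being an estimate instead of the exact UTF-8 length. Pulled out into a standalone helper for clarity (estimateFieldSize is illustrative, not in the patch):

    public final class ChunkSizeSketch {
      static final int SEQUENCEFILE_BYTE_OVERHEAD = 45; // the patch's revised guess

      static int estimateFieldSize(String term) {
        return SEQUENCEFILE_BYTE_OVERHEAD
            + (term.length() * 2)  // ~2 bytes per char instead of exact UTF-8
            + (Long.SIZE / 8);     // 8 bytes for the LongWritable id
      }

      public static void main(String[] args) {
        System.out.println(estimateFieldSize("wikipedia")); // 45 + 18 + 8 = 71
      }
    }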
Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DocumentTokenizerMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DocumentTokenizerMapper.java?rev=900034&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DocumentTokenizerMapper.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DocumentTokenizerMapper.java Sat Jan 16 23:10:31 2010
@@ -0,0 +1,91 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.text;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.commons.lang.mutable.MutableLong;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.mahout.common.parameters.ClassParameter;
+
+/**
+ * TextVectorizer Document Tokenizer Mapper. Tokenizes a text document and
+ * outputs the useful tokens, space separated
+ */
+public class DocumentTokenizerMapper extends MapReduceBase implements
+    Mapper<Text,Text,Text,Text> {
+  
+  private Analyzer analyzer;
+  private StringBuilder document = new StringBuilder();
+  
+  @Override
+  public void map(Text key,
+                  Text value,
+                  OutputCollector<Text,Text> output,
+                  Reporter reporter) throws IOException {
+    TokenStream stream =
+        analyzer
+            .tokenStream(key.toString(), new StringReader(value.toString()));
+    TermAttribute termAtt =
+        (TermAttribute) stream.addAttribute(TermAttribute.class);
+    document.setLength(0);
+    String sep = "";
+    while (stream.incrementToken()) {
+      String word = new String(termAtt.termBuffer(), 0, termAtt.termLength());
+      if (!word.isEmpty()) {
+        document.append(sep).append(word);
+        sep = " ";
+      }
+    }
+    output.collect(key, new Text(document.toString()));
+  }
+  
+  @Override
+  public void configure(JobConf job) {
+    super.configure(job);
+    try {
+      ClassLoader ccl = Thread.currentThread().getContextClassLoader();
+      Class<?> cl =
+          ccl.loadClass(job.get(DictionaryVectorizer.ANALYZER_CLASS,
+            StandardAnalyzer.class.getName()));
+      analyzer = (Analyzer) cl.newInstance();
+    } catch (ClassNotFoundException e) {
+      throw new IllegalStateException(e);
+    } catch (InstantiationException e) {
+      throw new IllegalStateException(e);
+    } catch (IllegalAccessException e) {
+      throw new IllegalStateException(e);
+    }
+  }
+  
+}
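
DocumentTokenizerMapper is what now feeds PartialVectorGenerator: it runs the analyzer once in the map phase and hands the reducer a space-separated token string, which is why the reducer below can drop Lucene entirely in favor of a StringTokenizer. It also uses Lucene's attribute-based TokenStream API (addAttribute/incrementToken) rather than the older Token-reuse loop being removed elsewhere in this patch. The pattern in isolation, assuming a Lucene 2.9/3.0-era classpath where TermAttribute still exists:

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    import org.apache.lucene.util.Version;

    public class TokenLoopSketch {
      public static void main(String[] args) throws Exception {
        TokenStream stream = new StandardAnalyzer(Version.LUCENE_CURRENT)
            .tokenStream("field", new StringReader("Dictionary vectorizer demo"));
        // The attribute object is registered once and reused across tokens.
        TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
        while (stream.incrementToken()) {
          System.out.println(termAtt.term()); // dictionary, vectorizer, demo
        }
      }
    }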
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/PartialVectorGenerator.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/PartialVectorGenerator.java?rev=900034&r1=900033&r2=900034&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/PartialVectorGenerator.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/PartialVectorGenerator.java Sat Jan 16 23:10:31 2010
@@ -18,14 +18,12 @@
 package org.apache.mahout.utils.vectors.text;
 
 import java.io.IOException;
-import java.io.StringReader;
 import java.net.URI;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
-import java.util.Map.Entry;
+import java.util.StringTokenizer;
 
-import org.apache.commons.lang.mutable.MutableInt;
 import org.apache.hadoop.filecache.DistributedCache;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -38,9 +36,6 @@
 import org.apache.hadoop.mapred.Reducer;
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.mahout.math.RandomAccessSparseVector;
 import org.apache.mahout.math.VectorWritable;
 
@@ -48,64 +43,44 @@
  * Converts a document into a sparse vector
  */
 public class PartialVectorGenerator extends MapReduceBase implements
-    Reducer<Text,Text,Text, VectorWritable> {
+    Reducer<Text,Text,Text,VectorWritable> {
   private Analyzer analyzer;
-  private Map<String,Integer> dictionary = new HashMap<String,Integer>();
+  private Map<String,int[]> dictionary = new HashMap<String,int[]>();
   private FileSystem fs; // local filesystem
   private URI[] localFiles; // local filenames from the distributed cache
-  
+
   private VectorWritable vectorWritable = new VectorWritable();
-  
-  @Override
+
   public void reduce(Text key,
                      Iterator<Text> values,
                      OutputCollector<Text,VectorWritable> output,
                      Reporter reporter) throws IOException {
+    if (values.hasNext() == false) return;
+    Text value = values.next();
+    String valueString = value.toString();
+    StringTokenizer stream = new StringTokenizer(valueString, " ");
     
-    if (values.hasNext()) {
-      Text value = values.next();
-      TokenStream ts =
-          analyzer.tokenStream(key.toString(), new StringReader(value
-              .toString()));
-      
-      Map<String,MutableInt> termFrequency = new HashMap<String,MutableInt>();
-      
-      Token token = new Token();
-      int count = 0;
-      while ((token = ts.next(token)) != null) {
-        String tk = new String(token.termBuffer(), 0, token.termLength());
-        if(dictionary.containsKey(tk) == false) continue;
-        if (termFrequency.containsKey(tk) == false) {
-          count += tk.length() + 1;
-          termFrequency.put(tk, new MutableInt(0));
-        }
-        termFrequency.get(tk).increment();
-      }
-      
-      RandomAccessSparseVector vector =
-          new RandomAccessSparseVector(key.toString(), Integer.MAX_VALUE, termFrequency
-              .size());
+    RandomAccessSparseVector vector =
+        new RandomAccessSparseVector(key.toString(), Integer.MAX_VALUE,
+            valueString.length() / 5); // guess at initial size
+    
+    while (stream.hasMoreTokens()) {
+      String tk = stream.nextToken();
+      if (dictionary.containsKey(tk) == false) continue;
+      int tokenKey = dictionary.get(tk)[0];
+      vector.setQuick(tokenKey, vector.getQuick(tokenKey) + 1);
       
-      for (Entry<String,MutableInt> pair : termFrequency.entrySet()) {
-        String tk = pair.getKey();
-        if (dictionary.containsKey(tk) == false) continue;
-        vector.setQuick(dictionary.get(tk).intValue(), pair.getValue()
-            .doubleValue());
-      }
-      vectorWritable.set(vector);
-      output.collect(key, vectorWritable);
     }
+    
+    vectorWritable.set(vector);
+    output.collect(key, vectorWritable);
+    
   }
   
   @Override
   public void configure(JobConf job) {
     super.configure(job);
     try {
-      ClassLoader ccl = Thread.currentThread().getContextClassLoader();
-      Class<?> cl =
-          ccl.loadClass(job.get(DictionaryVectorizer.ANALYZER_CLASS,
-            StandardAnalyzer.class.getName()));
-      analyzer = (Analyzer) cl.newInstance();
       
       localFiles = DistributedCache.getCacheFiles(job);
       if (localFiles == null || localFiles.length < 1) {
@@ -121,15 +96,9 @@
       // key is the word; value is its id
       while (reader.next(key, value)) {
-        dictionary.put(key.toString(), Long.valueOf(value.get()).intValue());
-        // System.out.println(key.toString() + "=>" + value.get());
+        dictionary.put(key.toString(), new int[] {Long.valueOf(value.get())
+            .intValue()});
       }
-    } catch (ClassNotFoundException e) {
-      throw new IllegalStateException(e);
-    } catch (InstantiationException e) {
-      throw new IllegalStateException(e);
-    } catch (IllegalAccessException e) {
-      throw new IllegalStateException(e);
     } catch (IOException e) {
       throw new IllegalStateException(e);
     }
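
Term counting now happens directly in the sparse vector — each dictionary hit bumps its cell with a getQuick/setQuick pair — and the dictionary values became one-element int[] holders, so reading an id back is a plain array access with no Integer unboxing. Both idioms in miniature (all values are made up for the sketch):

    import java.util.HashMap;
    import java.util.Map;
    import java.util.StringTokenizer;
    import org.apache.mahout.math.RandomAccessSparseVector;

    public class TermFrequencySketch {
      public static void main(String[] args) {
        Map<String,int[]> dictionary = new HashMap<String,int[]>();
        dictionary.put("big", new int[] {0});   // term -> dictionary id
        dictionary.put("cats", new int[] {1});
        RandomAccessSparseVector vector =
            new RandomAccessSparseVector("doc-1", Integer.MAX_VALUE, 10);
        StringTokenizer tokens = new StringTokenizer("big cats big dogs", " ");
        while (tokens.hasMoreTokens()) {
          String tk = tokens.nextToken();
          if (!dictionary.containsKey(tk)) continue; // "dogs": not in dictionary
          int id = dictionary.get(tk)[0];            // array read, no unboxing
          vector.setQuick(id, vector.getQuick(id) + 1);
        }
        // vector now holds {0: 2.0, 1: 1.0}
      }
    }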
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/TermCountMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/TermCountMapper.java?rev=900034&r1=900033&r2=900034&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/TermCountMapper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/TermCountMapper.java Sat Jan 16 23:10:31 2010
@@ -35,6 +35,8 @@
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.mahout.common.parameters.ClassParameter;
 
 /**
  * TextVectorizer Term Count Mapper. Tokenizes a text document and outputs the
@@ -51,20 +53,20 @@
                   Text value,
                   OutputCollector<Text,LongWritable> output,
                   Reporter reporter) throws IOException {
-    TokenStream ts =
+    TokenStream stream =
         analyzer
             .tokenStream(key.toString(), new StringReader(value.toString()));
-    
-    Token token = new Token();
-    Map<String,MutableLong> wordCount = new HashMap<String,MutableLong>();
-    while ((token = ts.next(token)) != null) {
-      char[] termBuffer = token.termBuffer();
-      String word = new String(termBuffer, 0, token.termLength());
+    Map<String,MutableLong> wordCount = new HashMap<String,MutableLong>();
+    TermAttribute termAtt =
+        (TermAttribute) stream.addAttribute(TermAttribute.class);
+    while (stream.incrementToken()) {
+      String word = new String(termAtt.termBuffer(), 0, termAtt.termLength());
       if (wordCount.containsKey(word) == false) {
         wordCount.put(word, new MutableLong(0));
       }
       wordCount.get(word).increment();
     }
+    
     for (Entry<String,MutableLong> entry : wordCount.entrySet()) {
       output.collect(new Text(entry.getKey()), new LongWritable(entry
           .getValue().longValue()));
Modified: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java?rev=900034&r1=900033&r2=900034&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java (original)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java Sat Jan 16 23:10:31 2010
@@ -29,7 +29,9 @@
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.util.Version;
 
 /**
  * Test the dictionary Vector
@@ -49,8 +51,8 @@
   
   public static final String DELIM = " .,?;:!\t\n\r";
   
-  public static final String ERRORSET = "`1234567890"
-      + "-...@#$%^&*()_+[]{}'\"/<>|\\";
+  public static final String ERRORSET =
+      "`1234567890" + "-...@#$%^&*()_+[]{}'\"/<>|\\";
   
   private static Random random = new Random();
   
@@ -60,20 +62,20 @@
     return DELIM.charAt(random.nextInt(DELIM.length()));
   }
   
-  private static String getRandomDocument() {
-    int length = (AVG_DOCUMENT_LENGTH >> 1)
-        + random.nextInt(AVG_DOCUMENT_LENGTH);
-    StringBuilder sb = new StringBuilder(length * AVG_SENTENCE_LENGTH
-        * AVG_WORD_LENGTH);
+  public static String getRandomDocument() {
+    int length =
+        (AVG_DOCUMENT_LENGTH >> 1) + random.nextInt(AVG_DOCUMENT_LENGTH);
+    StringBuilder sb =
+        new StringBuilder(length * AVG_SENTENCE_LENGTH * AVG_WORD_LENGTH);
     for (int i = 0; i < length; i++) {
       sb.append(getRandomSentence());
     }
     return sb.toString();
   }
   
-  private static String getRandomSentence() {
-    int length = (AVG_SENTENCE_LENGTH >> 1)
-        + random.nextInt(AVG_SENTENCE_LENGTH);
+  public static String getRandomSentence() {
+    int length =
+        (AVG_SENTENCE_LENGTH >> 1) + random.nextInt(AVG_SENTENCE_LENGTH);
     StringBuilder sb = new StringBuilder(length * AVG_WORD_LENGTH);
     for (int i = 0; i < length; i++) {
       sb.append(getRandomString()).append(' ');
@@ -82,7 +84,7 @@
     return sb.toString();
   }
   
-  private static String getRandomString() {
+  public static String getRandomString() {
     int length = (AVG_WORD_LENGTH >> 1) + random.nextInt(AVG_WORD_LENGTH);
     StringBuilder sb = new StringBuilder(length);
     for (int i = 0; i < length; i++) {
@@ -120,17 +122,18 @@
     Configuration conf = new Configuration();
     String pathString = "testdata/documents/docs.file";
     Path path = new Path(pathString);
-    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path,
-        Text.class, Text.class);
+    SequenceFile.Writer writer =
+        new SequenceFile.Writer(fs, conf, path, Text.class, Text.class);
     for (int i = 0; i < NUM_DOCS; i++) {
       writer.append(new Text("Document::ID::" + i), new Text(
         getRandomDocument()));
     }
     writer.close();
+    Class<? extends Analyzer> analyzer =
+        new StandardAnalyzer(Version.LUCENE_CURRENT).getClass();
     DictionaryVectorizer.createTermFrequencyVectors(pathString,
-        "output/wordcount", new StandardAnalyzer(), 2, 100);
-    
+        "output/wordcount", analyzer, 2, 100);
   }
 }
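
A last footnote on the test: it builds a StandardAnalyzer only to call getClass() on it. The class literal yields the same Class object without the throwaway instance, e.g.:

    // Equivalent to new StandardAnalyzer(Version.LUCENE_CURRENT).getClass(),
    // minus constructing an analyzer that is immediately discarded.
    Class<? extends Analyzer> analyzerClass = StandardAnalyzer.class;
    DictionaryVectorizer.createTermFrequencyVectors(pathString,
        "output/wordcount", analyzerClass, 2, 100);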