Author: srowen
Date: Sat Jun 4 17:43:31 2011
New Revision: 1131447
URL: http://svn.apache.org/viewvc?rev=1131447&view=rev
Log:
MAHOUT-706 use reusable TokenStream in Lucene for performance
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/BayesFileFormatter.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DefaultAnalyzer.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/document/SequenceFileTokenizerMapper.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/LuceneTextValueEncoder.java
mahout/trunk/examples/src/main/java/org/apache/mahout/analysis/WikipediaAnalyzer.java
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java
mahout/trunk/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/BayesFileFormatter.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/BayesFileFormatter.java?rev=1131447&r1=1131446&r2=1131447&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/BayesFileFormatter.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/BayesFileFormatter.java
Sat Jun 4 17:43:31 2011
@@ -203,11 +203,12 @@ public final class BayesFileFormatter {
Charset charset, Writer writer) throws
IOException {
Reader reader = Files.newReader(inFile, charset);
try {
- TokenStream ts = analyzer.tokenStream(label, reader);
+ TokenStream ts = analyzer.reusableTokenStream(label, reader);
writer.write(label);
writer.write('\t'); // edit: Inorder to match Hadoop standard
// TextInputFormat
TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
+ ts.reset();
while (ts.incrementToken()) {
char[] termBuffer = termAtt.termBuffer();
int termLen = termAtt.termLength();
@@ -229,10 +230,11 @@ public final class BayesFileFormatter {
* @return An array of unique tokens
*/
public static String[] readerToDocument(Analyzer analyzer, Reader reader)
throws IOException {
- TokenStream ts = analyzer.tokenStream("", reader);
+ TokenStream ts = analyzer.reusableTokenStream("", reader);
List<String> coll = new ArrayList<String>();
TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
+ ts.reset();
while (ts.incrementToken()) {
char[] termBuffer = termAtt.termBuffer();
int termLen = termAtt.termLength();
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DefaultAnalyzer.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DefaultAnalyzer.java?rev=1131447&r1=1131446&r2=1131447&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DefaultAnalyzer.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DefaultAnalyzer.java
Sat Jun 4 17:43:31 2011
@@ -21,6 +21,7 @@ import org.apache.lucene.analysis.TokenS
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;
+import java.io.IOException;
import java.io.Reader;
/**
@@ -36,4 +37,9 @@ public final class DefaultAnalyzer exten
public TokenStream tokenStream(String fieldName, Reader reader) {
return stdAnalyzer.tokenStream(fieldName, reader);
}
+
+ @Override
+ public TokenStream reusableTokenStream(String fieldName, Reader reader)
throws IOException {
+ return stdAnalyzer.reusableTokenStream(fieldName, reader);
+ }
}
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/document/SequenceFileTokenizerMapper.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/document/SequenceFileTokenizerMapper.java?rev=1131447&r1=1131446&r2=1131447&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/document/SequenceFileTokenizerMapper.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/document/SequenceFileTokenizerMapper.java
Sat Jun 4 17:43:31 2011
@@ -39,9 +39,10 @@ public class SequenceFileTokenizerMapper
@Override
protected void map(Text key, Text value, Context context) throws
IOException, InterruptedException {
- TokenStream stream = analyzer.tokenStream(key.toString(), new
StringReader(value.toString()));
+ TokenStream stream = analyzer.reusableTokenStream(key.toString(), new
StringReader(value.toString()));
CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
StringTuple document = new StringTuple();
+ stream.reset();
while (stream.incrementToken()) {
if (termAtt.length() > 0) {
document.add(new String(termAtt.buffer(), 0, termAtt.length()));
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/LuceneTextValueEncoder.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/LuceneTextValueEncoder.java?rev=1131447&r1=1131446&r2=1131447&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/LuceneTextValueEncoder.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/LuceneTextValueEncoder.java
Sat Jun 4 17:43:31 2011
@@ -49,9 +49,13 @@ public class LuceneTextValueEncoder exte
*/
@Override
protected Iterable<String> tokenize(CharSequence originalForm) {
- TokenStream ts = analyzer.tokenStream(getName(), new
CharSequenceReader(originalForm));
- ts.addAttribute(CharTermAttribute.class);
- return new LuceneTokenIterable(ts);
+ try {
+ TokenStream ts = analyzer.reusableTokenStream(getName(), new
CharSequenceReader(originalForm));
+ ts.addAttribute(CharTermAttribute.class);
+ return new LuceneTokenIterable(ts);
+ } catch (IOException ex) {
+ throw new IllegalStateException(ex);
+ }
}
private static final class CharSequenceReader extends Reader {
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/analysis/WikipediaAnalyzer.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/analysis/WikipediaAnalyzer.java?rev=1131447&r1=1131446&r2=1131447&view=diff
==============================================================================
---
mahout/trunk/examples/src/main/java/org/apache/mahout/analysis/WikipediaAnalyzer.java
(original)
+++
mahout/trunk/examples/src/main/java/org/apache/mahout/analysis/WikipediaAnalyzer.java
Sat Jun 4 17:43:31 2011
@@ -19,36 +19,34 @@ package org.apache.mahout.analysis;
import java.io.Reader;
-import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
import org.apache.lucene.util.Version;
-public class WikipediaAnalyzer extends Analyzer {
-
- private final CharArraySet stopSet;
+public class WikipediaAnalyzer extends StopwordAnalyzerBase {
public WikipediaAnalyzer() {
- stopSet = (CharArraySet) StopFilter.makeStopSet(Version.LUCENE_31,
- StopAnalyzer.ENGLISH_STOP_WORDS_SET.toArray(new
String[StopAnalyzer.ENGLISH_STOP_WORDS_SET.size()]));
+ super(Version.LUCENE_31, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
}
public WikipediaAnalyzer(CharArraySet stopSet) {
- this.stopSet = stopSet;
+ super(Version.LUCENE_31, stopSet);
}
@Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream result = new WikipediaTokenizer(reader);
- result = new StandardFilter(Version.LUCENE_31, result);
+ protected TokenStreamComponents createComponents(String fieldName, Reader
reader) {
+ Tokenizer tokenizer = new WikipediaTokenizer(reader);
+ TokenStream result = new StandardFilter(Version.LUCENE_31, tokenizer);
result = new LowerCaseFilter(Version.LUCENE_31, result);
- result = new StopFilter(Version.LUCENE_31, result, stopSet);
- return result;
+ result = new StopFilter(Version.LUCENE_31, result, stopwords);
+ return new TokenStreamComponents(tokenizer, result);
}
}
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java?rev=1131447&r1=1131446&r2=1131447&view=diff
==============================================================================
---
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java
(original)
+++
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java
Sat Jun 4 17:43:31 2011
@@ -67,8 +67,9 @@ public class WikipediaDatasetCreatorMapp
StringBuilder contents = new StringBuilder(1000);
document =
StringEscapeUtils.unescapeHtml(WikipediaDatasetCreatorMapper.CLOSE_TEXT_TAG_PATTERN.matcher(
WikipediaDatasetCreatorMapper.OPEN_TEXT_TAG_PATTERN.matcher(document).replaceFirst("")).replaceAll(""));
- TokenStream stream = analyzer.tokenStream(catMatch, new
StringReader(document));
+ TokenStream stream = analyzer.reusableTokenStream(catMatch, new
StringReader(document));
CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
+ stream.reset();
while (stream.incrementToken()) {
contents.append(termAtt.buffer(), 0, termAtt.length()).append(' ');
}
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java?rev=1131447&r1=1131446&r2=1131447&view=diff
==============================================================================
---
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java
(original)
+++
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java
Sat Jun 4 17:43:31 2011
@@ -320,8 +320,9 @@ public final class TrainNewsGroups {
}
private static void countWords(Analyzer analyzer, Collection<String> words,
Reader in) throws IOException {
- TokenStream ts = analyzer.tokenStream("text", in);
+ TokenStream ts = analyzer.reusableTokenStream("text", in);
ts.addAttribute(CharTermAttribute.class);
+ ts.reset();
while (ts.incrementToken()) {
String s = ts.getAttribute(CharTermAttribute.class).toString();
words.add(s);
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java?rev=1131447&r1=1131446&r2=1131447&view=diff
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
(original)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
Sat Jun 4 17:43:31 2011
@@ -17,17 +17,19 @@
package org.apache.mahout.text;
import java.io.IOException;
+import java.io.Reader;
import java.util.Arrays;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.ASCIIFoldingFilter;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.PorterStemFilter;
import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
@@ -39,12 +41,12 @@ import org.apache.lucene.util.Version;
* for clustering the ASF Mail Archives using an extended set of
* stop words, excluding non-alpha-numeric tokens, and porter stemming.
*/
-public final class MailArchivesClusteringAnalyzer extends Analyzer {
+public final class MailArchivesClusteringAnalyzer extends StopwordAnalyzerBase
{
// extended set of stop words composed of common mail terms like "hi",
// HTML tags, and Java keywords asmany of the messages in the archives
// are subversion check-in notifications
- private static final String[] STOP_WORDS = {
+ private static final CharArraySet STOP_WORDS = new
CharArraySet(Version.LUCENE_31, Arrays.asList(
"3d","7bit","a0","about","above","abstract","across","additional","after",
"afterwards","again","against","align","all","almost","alone","along",
"already","also","although","always","am","among","amongst","amoungst",
@@ -97,36 +99,29 @@ public final class MailArchivesClusterin
"whole","whom","whose","why","width","will","with","within","without",
"wont","would","wrote","www","yes","yet","you","your","yours","yourself",
"yourselves"
- };
+ ), false);
// Regex used to exclude non-alpha-numeric tokens
private static final Pattern alphaNumeric =
Pattern.compile("^[a-z][a-z0-9_]+$");
- private final CharArraySet stopSet;
public MailArchivesClusteringAnalyzer() {
- stopSet = (CharArraySet)StopFilter.makeStopSet(Version.LUCENE_31,
Arrays.asList(STOP_WORDS));
- /*
- Collection<String> tmp = new java.util.TreeSet<String>();
- for (Object entry : stopSet) {
- tmp.add(entry.toString());
- }
- */
+ super(Version.LUCENE_31, STOP_WORDS);
}
public MailArchivesClusteringAnalyzer(CharArraySet stopSet) {
- this.stopSet = stopSet;
+ super(Version.LUCENE_31, stopSet);
}
-
+
@Override
- public TokenStream tokenStream(String fieldName, java.io.Reader reader) {
-
- TokenStream result = new StandardTokenizer(Version.LUCENE_31, reader);
- result = new StandardFilter(Version.LUCENE_31, result);
+ protected TokenStreamComponents createComponents(String fieldName, Reader
reader) {
+ Tokenizer tokenizer = new StandardTokenizer(Version.LUCENE_31, reader);
+ TokenStream result = new StandardFilter(Version.LUCENE_31, tokenizer);
result = new LowerCaseFilter(Version.LUCENE_31, result);
result = new ASCIIFoldingFilter(result);
result = new AlphaNumericMaxLengthFilter(result);
- result = new StopFilter(Version.LUCENE_31, result, stopSet);
- return new PorterStemFilter(result);
+ result = new StopFilter(Version.LUCENE_31, result, stopwords);
+ result = new PorterStemFilter(result);
+ return new TokenStreamComponents(tokenizer, result);
}
/**