Author: srowen
Date: Sat Jun 4 15:47:07 2011
New Revision: 1131417
URL: http://svn.apache.org/viewvc?rev=1131417&view=rev
Log:
Consistently select Lucene 3.1 behavior. Make BloomTokenFilter final since
Lucene requires it.
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/BayesFileFormatter.java
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/Classify.java
mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/collocations/llr/CollocMapperTest.java
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/PrepareTwentyNewsgroups.java
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java
mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/dirichlet/TestL1ModelClustering.java
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/BayesFileFormatter.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/BayesFileFormatter.java?rev=1131417&r1=1131416&r2=1131417&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/BayesFileFormatter.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/BayesFileFormatter.java
Sat Jun 4 15:47:07 2011
@@ -17,7 +17,11 @@
package org.apache.mahout.classifier;
-import java.io.*;
+import java.io.File;
+import java.io.FileFilter;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.Writer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
@@ -38,7 +42,6 @@ import org.apache.lucene.analysis.TokenS
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;
-import org.apache.mahout.common.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -302,7 +305,7 @@ public final class BayesFileFormatter {
if (cmdLine.hasOption(analyzerOpt)) {
analyzer = Class.forName((String)
cmdLine.getValue(analyzerOpt)).asSubclass(Analyzer.class).newInstance();
} else {
- analyzer = new StandardAnalyzer(Version.LUCENE_30);
+ analyzer = new StandardAnalyzer(Version.LUCENE_31);
}
Charset charset = Charsets.UTF_8;
if (cmdLine.hasOption(charsetOpt)) {
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/Classify.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/Classify.java?rev=1131417&r1=1131416&r2=1131417&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/Classify.java
(original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/Classify.java
Sat Jun 4 15:47:07 2011
@@ -151,7 +151,7 @@ public final class Classify {
analyzer =
Class.forName(className).asSubclass(Analyzer.class).newInstance();
}
if (analyzer == null) {
- analyzer = new StandardAnalyzer(Version.LUCENE_30);
+ analyzer = new StandardAnalyzer(Version.LUCENE_31);
}
log.info("Converting input document to proper format");
Modified:
mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/collocations/llr/CollocMapperTest.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/collocations/llr/CollocMapperTest.java?rev=1131417&r1=1131416&r2=1131417&view=diff
==============================================================================
---
mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/collocations/llr/CollocMapperTest.java
(original)
+++
mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/collocations/llr/CollocMapperTest.java
Sat Jun 4 15:47:07 2011
@@ -86,7 +86,7 @@ public final class CollocMapperTest exte
for (String[] v : values) {
Type p = v[0].startsWith("h") ? Gram.Type.HEAD : Gram.Type.TAIL;
int frequency = 1;
- if (v[1].equals("of times")) {
+ if ("of times".equals(v[1])) {
frequency = 2;
}
@@ -150,8 +150,8 @@ public final class CollocMapperTest exte
Type p = v[0].startsWith("h") ? Gram.Type.HEAD : Gram.Type.TAIL;
p = v[0].startsWith("u") ? Gram.Type.UNIGRAM : p;
int frequency = 1;
- if (v[1].equals("of times") || v[1].equals("of") || v[1].equals("times")
- || v[1].equals("the")) {
+ if ("of times".equals(v[1]) || "of".equals(v[1]) || "times".equals(v[1])
+ || "the".equals(v[1])) {
frequency = 2;
}
@@ -190,7 +190,7 @@ public final class CollocMapperTest exte
private final Analyzer a;
public TestAnalyzer() {
- a = new StandardAnalyzer(Version.LUCENE_29, Collections.emptySet());
+ a = new StandardAnalyzer(Version.LUCENE_31, Collections.emptySet());
}
@Override
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/PrepareTwentyNewsgroups.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/PrepareTwentyNewsgroups.java?rev=1131417&r1=1131416&r2=1131417&view=diff
==============================================================================
---
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/PrepareTwentyNewsgroups.java
(original)
+++
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/PrepareTwentyNewsgroups.java
Sat Jun 4 15:47:07 2011
@@ -90,7 +90,7 @@ public final class PrepareTwentyNewsgrou
try {
analyzer =
Class.forName(analyzerName).asSubclass(Analyzer.class).newInstance();
} catch (InstantiationException e) {
- analyzer = (Analyzer)
Class.forName(analyzerName).getConstructor(Version.class).newInstance(Version.LUCENE_30);
+ analyzer = (Analyzer)
Class.forName(analyzerName).getConstructor(Version.class).newInstance(Version.LUCENE_31);
}
// parent dir contains dir by category
if (!parentDir.exists()) {
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java?rev=1131417&r1=1131416&r2=1131417&view=diff
==============================================================================
---
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java
(original)
+++
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java
Sat Jun 4 15:47:07 2011
@@ -122,7 +122,7 @@ public final class TrainNewsGroups {
new SimpleDateFormat("dd-MMM-yyyy HH:mm:ss", Locale.ENGLISH)
};
- private static final Analyzer analyzer = new
StandardAnalyzer(Version.LUCENE_30);
+ private static final Analyzer analyzer = new
StandardAnalyzer(Version.LUCENE_31);
private static final FeatureVectorEncoder encoder = new
StaticWordValueEncoder("body");
private static final FeatureVectorEncoder bias = new
ConstantValueEncoder("Intercept");
private static Multiset<String> overallCounts;
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java?rev=1131417&r1=1131416&r2=1131417&view=diff
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java
(original)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java
Sat Jun 4 15:47:07 2011
@@ -35,7 +35,7 @@ import org.apache.lucene.analysis.tokena
/**
* Emits tokens based on bloom filter membership.
*/
-public class BloomTokenFilter extends TokenFilter {
+public final class BloomTokenFilter extends TokenFilter {
private final Filter filter;
private final TermAttribute termAtt;
Modified:
mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java?rev=1131417&r1=1131416&r2=1131417&view=diff
==============================================================================
---
mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
(original)
+++
mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
Sat Jun 4 15:47:07 2011
@@ -110,7 +110,7 @@ public final class TestClusterDumper ext
sampleData = new ArrayList<VectorWritable>();
RAMDirectory directory = new RAMDirectory();
IndexWriter writer = new IndexWriter(directory,
- new
StandardAnalyzer(Version.LUCENE_30),
+ new
StandardAnalyzer(Version.LUCENE_31),
true,
IndexWriter.MaxFieldLength.UNLIMITED);
try {
Modified:
mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/dirichlet/TestL1ModelClustering.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/dirichlet/TestL1ModelClustering.java?rev=1131417&r1=1131416&r2=1131417&view=diff
==============================================================================
---
mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/dirichlet/TestL1ModelClustering.java
(original)
+++
mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/dirichlet/TestL1ModelClustering.java
Sat Jun 4 15:47:07 2011
@@ -102,7 +102,7 @@ public final class TestL1ModelClustering
sampleData = new ArrayList<VectorWritable>();
RAMDirectory directory = new RAMDirectory();
IndexWriter writer = new IndexWriter(directory,
- new
StandardAnalyzer(Version.LUCENE_30),
+ new
StandardAnalyzer(Version.LUCENE_31),
true,
IndexWriter.MaxFieldLength.UNLIMITED);
try {
Modified:
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java?rev=1131417&r1=1131416&r2=1131417&view=diff
==============================================================================
---
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
(original)
+++
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
Sat Jun 4 15:47:07 2011
@@ -34,6 +34,7 @@ import org.apache.lucene.analysis.TokenS
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.Version;
import org.apache.mahout.utils.MahoutTestCase;
import org.junit.Test;
@@ -76,7 +77,7 @@ public final class BloomTokenFilterTest
@Test
public void testAnalyzer() throws IOException {
StringReader reader = new StringReader(input);
- WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
+ WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_31);
TokenStream ts = analyzer.tokenStream(null, reader);
validateTokens(allTokens, ts);
}
@@ -85,7 +86,7 @@ public final class BloomTokenFilterTest
@Test
public void testNonKeepdAnalyzer() throws IOException {
StringReader reader = new StringReader(input);
- WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
+ WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_31);
TokenStream ts = analyzer.tokenStream(null, reader);
BloomTokenFilter f = new BloomTokenFilter(getFilter(filterTokens), false
/* toss matching tokens */, ts);
validateTokens(expectedNonKeepTokens, f);
@@ -95,7 +96,7 @@ public final class BloomTokenFilterTest
@Test
public void testKeepAnalyzer() throws IOException {
StringReader reader = new StringReader(input);
- WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
+ WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_31);
TokenStream ts = analyzer.tokenStream(null, reader);
BloomTokenFilter f = new BloomTokenFilter(getFilter(filterTokens), true /*
keep matching tokens */, ts);
validateTokens(expectedKeepTokens, f);
@@ -105,7 +106,7 @@ public final class BloomTokenFilterTest
@Test
public void testShingleFilteredAnalyzer() throws IOException {
StringReader reader = new StringReader(input);
- WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
+ WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_31);
TokenStream ts = analyzer.tokenStream(null, reader);
ShingleFilter sf = new ShingleFilter(ts, 3);
BloomTokenFilter f = new BloomTokenFilter(getFilter(shingleKeepTokens),
true, sf);
Modified:
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java?rev=1131417&r1=1131416&r2=1131417&view=diff
==============================================================================
---
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
(original)
+++
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
Sat Jun 4 15:47:07 2011
@@ -170,7 +170,7 @@ public final class LuceneIterableTest ex
int startingId) throws
IOException {
IndexWriter writer = new IndexWriter(
directory,
- new StandardAnalyzer(Version.LUCENE_30),
+ new StandardAnalyzer(Version.LUCENE_31),
createNew,
IndexWriter.MaxFieldLength.UNLIMITED);
try {