Repository: mahout Updated Branches: refs/heads/master 33c1eab11 -> 4d0cd66a6
MAHOUT-1876: Upgrade lucene to 5.5.2 and fix compilation failures, this closes apache/mahout#248 Project: http://git-wip-us.apache.org/repos/asf/mahout/repo Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/4d0cd66a Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/4d0cd66a Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/4d0cd66a Branch: refs/heads/master Commit: 4d0cd66a6269eb02fceaabdb11d70fd38d433474 Parents: 33c1eab Author: smarthi <[email protected]> Authored: Thu Aug 11 01:42:30 2016 -0400 Committer: smarthi <[email protected]> Committed: Thu Aug 11 01:42:30 2016 -0400 ---------------------------------------------------------------------- .../mahout/classifier/NewsgroupHelper.java | 3 +- .../text/MailArchivesClusteringAnalyzer.java | 31 ++++++++------------ .../text/wikipedia/WikipediaAnalyzer.java | 17 +++++------ .../mahout/utils/regex/AnalyzerTransformer.java | 3 +- .../vectors/lucene/AbstractLuceneIterator.java | 2 +- .../utils/vectors/lucene/CachedTermInfo.java | 2 +- .../utils/vectors/lucene/ClusterLabels.java | 19 ++++++------ .../mahout/utils/vectors/lucene/Driver.java | 3 +- .../mahout/clustering/TestClusterDumper.java | 6 ++-- .../collocations/llr/BloomTokenFilterTest.java | 9 +++--- .../vectors/lucene/CachedTermInfoTest.java | 6 ++-- .../mahout/utils/vectors/lucene/DriverTest.java | 17 +++++------ .../vectors/lucene/LuceneIterableTest.java | 8 ++--- .../mahout/common/lucene/AnalyzerUtils.java | 4 +-- .../org/apache/mahout/vectorizer/TFIDF.java | 4 +-- .../encoders/LuceneTextValueEncoder.java | 10 ++----- .../encoders/TextValueEncoderTest.java | 3 +- pom.xml | 2 +- 18 files changed, 67 insertions(+), 82 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java ---------------------------------------------------------------------- diff --git a/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java b/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java index 3674a57..5cec51c 100644 --- a/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java +++ b/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java @@ -26,7 +26,6 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.util.Version; import org.apache.mahout.common.RandomUtils; import org.apache.mahout.math.RandomAccessSparseVector; import org.apache.mahout.math.Vector; @@ -60,7 +59,7 @@ public final class NewsgroupHelper { private static final long WEEK = 7 * 24 * 3600; private final Random rand = RandomUtils.getRandom(); - private final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46); + private final Analyzer analyzer = new StandardAnalyzer(); private final FeatureVectorEncoder encoder = new StaticWordValueEncoder("body"); private final FeatureVectorEncoder bias = new ConstantValueEncoder("Intercept"); http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java b/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java index 8776c5f..12ed471 100644 --- a/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java +++ b/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java @@ -16,12 +16,6 @@ */ package org.apache.mahout.text; -import java.io.IOException; -import java.io.Reader; -import java.util.Arrays; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; @@ -34,7 +28,11 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; -import org.apache.lucene.util.Version; + +import java.io.IOException; +import java.util.Arrays; +import java.util.regex.Matcher; +import java.util.regex.Pattern; /** * Custom Lucene Analyzer designed for aggressive feature reduction @@ -42,13 +40,11 @@ import org.apache.lucene.util.Version; * stop words, excluding non-alpha-numeric tokens, and porter stemming. */ public final class MailArchivesClusteringAnalyzer extends StopwordAnalyzerBase { - private static final Version LUCENE_VERSION = Version.LUCENE_46; - // extended set of stop words composed of common mail terms like "hi", // HTML tags, and Java keywords asmany of the messages in the archives // are subversion check-in notifications - private static final CharArraySet STOP_SET = new CharArraySet(LUCENE_VERSION, Arrays.asList( + private static final CharArraySet STOP_SET = new CharArraySet(Arrays.asList( "3d","7bit","a0","about","above","abstract","across","additional","after", "afterwards","again","against","align","all","almost","alone","along", "already","also","although","always","am","among","amongst","amoungst", @@ -108,22 +104,21 @@ public final class MailArchivesClusteringAnalyzer extends StopwordAnalyzerBase { private static final Matcher MATCHER = ALPHA_NUMERIC.matcher(""); public MailArchivesClusteringAnalyzer() { - super(LUCENE_VERSION, STOP_SET); + super(STOP_SET); } public MailArchivesClusteringAnalyzer(CharArraySet stopSet) { - super(LUCENE_VERSION, stopSet); - + super(stopSet); } @Override - protected TokenStreamComponents createComponents(String fieldName, Reader reader) { - Tokenizer tokenizer = new StandardTokenizer(LUCENE_VERSION, reader); - TokenStream result = new StandardFilter(LUCENE_VERSION, tokenizer); - result = new LowerCaseFilter(LUCENE_VERSION, result); + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new StandardTokenizer(); + TokenStream result = new StandardFilter(tokenizer); + result = new LowerCaseFilter(result); result = new ASCIIFoldingFilter(result); result = new AlphaNumericMaxLengthFilter(result); - result = new StopFilter(LUCENE_VERSION, result, STOP_SET); + result = new StopFilter(result, STOP_SET); result = new PorterStemFilter(result); return new TokenStreamComponents(tokenizer, result); } http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java b/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java index ad55ba7..d50323d 100644 --- a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java +++ b/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java @@ -17,8 +17,6 @@ package org.apache.mahout.text.wikipedia; -import java.io.Reader; - import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.LowerCaseFilter; @@ -28,25 +26,24 @@ import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer; -import org.apache.lucene.util.Version; public class WikipediaAnalyzer extends StopwordAnalyzerBase { public WikipediaAnalyzer() { - super(Version.LUCENE_46, StopAnalyzer.ENGLISH_STOP_WORDS_SET); + super(StopAnalyzer.ENGLISH_STOP_WORDS_SET); } public WikipediaAnalyzer(CharArraySet stopSet) { - super(Version.LUCENE_46, stopSet); + super(stopSet); } @Override - protected TokenStreamComponents createComponents(String fieldName, Reader reader) { - Tokenizer tokenizer = new WikipediaTokenizer(reader); - TokenStream result = new StandardFilter(Version.LUCENE_46, tokenizer); - result = new LowerCaseFilter(Version.LUCENE_46, result); - result = new StopFilter(Version.LUCENE_46, result, getStopwordSet()); + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new WikipediaTokenizer(); + TokenStream result = new StandardFilter(tokenizer); + result = new LowerCaseFilter(result); + result = new StopFilter(result, getStopwordSet()); return new TokenStreamComponents(tokenizer, result); } } http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java b/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java index 36b166a..4585a0a 100644 --- a/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java +++ b/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java @@ -24,7 +24,6 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.util.Version; import org.apache.mahout.common.lucene.TokenStreamIterator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -37,7 +36,7 @@ public class AnalyzerTransformer implements RegexTransformer { private static final Logger log = LoggerFactory.getLogger(AnalyzerTransformer.class); public AnalyzerTransformer() { - this(new StandardAnalyzer(Version.LUCENE_46), "text"); + this(new StandardAnalyzer(), "text"); } public AnalyzerTransformer(Analyzer analyzer) { http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java index 233c95c..ff61a70 100644 --- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java +++ b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java @@ -113,7 +113,7 @@ public abstract class AbstractLuceneIterator extends AbstractIterator<Vector> { // The loop exits with termFreqVector and name set. - TermsEnum te = termFreqVector.iterator(null); + TermsEnum te = termFreqVector.iterator(); BytesRef term; TFDFMapper mapper = new TFDFMapper(indexReader.numDocs(), weight, this.terminfo); mapper.setExpectations(field, termFreqVector.size()); http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java index 718704a..0b59ed6 100644 --- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java +++ b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java @@ -42,7 +42,7 @@ public class CachedTermInfo implements TermInfo { public CachedTermInfo(IndexReader reader, String field, int minDf, int maxDfPercent) throws IOException { this.field = field; Terms t = MultiFields.getTerms(reader, field); - TermsEnum te = t.iterator(null); + TermsEnum te = t.iterator(); int numDocs = reader.numDocs(); double percent = numDocs * maxDfPercent / 100.0; http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java index 6ef7fba..b2568e7 100644 --- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java +++ b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java @@ -21,6 +21,7 @@ import java.io.File; import java.io.IOException; import java.io.OutputStreamWriter; import java.io.Writer; +import java.nio.file.Paths; import java.util.Collection; import java.util.Collections; import java.util.HashSet; @@ -44,9 +45,9 @@ import org.apache.commons.cli2.commandline.Parser; import org.apache.commons.io.Charsets; import org.apache.hadoop.fs.Path; import org.apache.lucene.index.DirectoryReader; -import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; @@ -55,7 +56,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.OpenBitSet; +import org.apache.lucene.util.FixedBitSet; import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable; import org.apache.mahout.common.CommandLineUtil; import org.apache.mahout.common.commandline.DefaultOptionCreator; @@ -149,7 +150,7 @@ public class ClusterLabels { } log.info("Processing Cluster {} with {} documents", integer, wpvws.size()); - Directory dir = FSDirectory.open(new File(this.indexDir)); + Directory dir = FSDirectory.open(Paths.get(this.indexDir)); IndexReader reader = DirectoryReader.open(dir); @@ -165,7 +166,7 @@ public class ClusterLabels { int numDocs = reader.numDocs(); - OpenBitSet clusterDocBitset = getClusterDocBitset(reader, idSet, this.idField); + FixedBitSet clusterDocBitset = getClusterDocBitset(reader, idSet, this.idField); log.info("Populating term infos from the index"); @@ -179,7 +180,7 @@ public class ClusterLabels { * frequency. */ Terms t = MultiFields.getTerms(reader, contentField); - TermsEnum te = t.iterator(null); + TermsEnum te = t.iterator(); Map<String, TermEntry> termEntryMap = new LinkedHashMap<>(); Bits liveDocs = MultiFields.getLiveDocs(reader); //WARNING: returns null if there are no deletions @@ -187,8 +188,8 @@ public class ClusterLabels { int count = 0; BytesRef term; while ((term = te.next()) != null) { - OpenBitSet termBitset = new OpenBitSet(reader.maxDoc()); - DocsEnum docsEnum = MultiFields.getTermDocsEnum(reader, null, contentField, term); + FixedBitSet termBitset = new FixedBitSet(reader.maxDoc()); + PostingsEnum docsEnum = MultiFields.getTermDocsEnum(reader, contentField, term); int docID; while ((docID = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { //check to see if we don't have an deletions (null) or if document is live @@ -230,12 +231,12 @@ public class ClusterLabels { return clusteredTermInfo.subList(0, Math.min(clusteredTermInfo.size(), maxLabels)); } - private static OpenBitSet getClusterDocBitset(IndexReader reader, + private static FixedBitSet getClusterDocBitset(IndexReader reader, Collection<String> idSet, String idField) throws IOException { int numDocs = reader.numDocs(); - OpenBitSet bitset = new OpenBitSet(numDocs); + FixedBitSet bitset = new FixedBitSet(numDocs); Set<String> idFieldSelector = null; if (idField != null) { http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java index 2eeebd9..876816f 100644 --- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java +++ b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java @@ -20,6 +20,7 @@ package org.apache.mahout.utils.vectors.lucene; import java.io.File; import java.io.IOException; import java.io.Writer; +import java.nio.file.Paths; import java.util.Iterator; import com.google.common.base.Preconditions; @@ -85,7 +86,7 @@ public final class Driver { Preconditions.checkArgument(minDf >= 1, "minDf must be >= 1"); Preconditions.checkArgument(maxDFPercent <= 99, "maxDFPercent must be <= 99"); - Directory dir = FSDirectory.open(file); + Directory dir = FSDirectory.open(Paths.get(file.getAbsolutePath())); IndexReader reader = DirectoryReader.open(dir); http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java b/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java index a1d2bbb..01d46fc 100644 --- a/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java +++ b/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java @@ -31,11 +31,11 @@ import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.StringField; import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.RAMDirectory; -import org.apache.lucene.util.Version; import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver; import org.apache.mahout.clustering.kmeans.KMeansDriver; import org.apache.mahout.clustering.kmeans.RandomSeedGenerator; @@ -94,7 +94,7 @@ public final class TestClusterDumper extends MahoutTestCase { sampleData = new ArrayList<>(); RAMDirectory directory = new RAMDirectory(); try (IndexWriter writer = new IndexWriter(directory, - new IndexWriterConfig(Version.LUCENE_46, new StandardAnalyzer(Version.LUCENE_46)))){ + new IndexWriterConfig(new StandardAnalyzer()))){ for (int i = 0; i < docs2.length; i++) { Document doc = new Document(); Field id = new StringField("id", "doc_" + i, Field.Store.YES); @@ -102,7 +102,7 @@ public final class TestClusterDumper extends MahoutTestCase { // Store both position and offset information FieldType fieldType = new FieldType(); fieldType.setStored(false); - fieldType.setIndexed(true); + fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); fieldType.setTokenized(true); fieldType.setStoreTermVectors(true); fieldType.setStoreTermVectorPositions(true); http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java b/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java index 37efc01..4fdbbbc 100644 --- a/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java +++ b/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java @@ -36,7 +36,6 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.core.WhitespaceAnalyzer; import org.apache.lucene.analysis.shingle.ShingleFilter; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.util.Version; import org.apache.mahout.common.MahoutTestCase; import org.junit.Test; @@ -79,7 +78,7 @@ public final class BloomTokenFilterTest extends MahoutTestCase { @Test public void testAnalyzer() throws IOException { Reader reader = new StringReader(input); - Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46); + Analyzer analyzer = new WhitespaceAnalyzer(); TokenStream ts = analyzer.tokenStream(null, reader); ts.reset(); validateTokens(allTokens, ts); @@ -91,7 +90,7 @@ public final class BloomTokenFilterTest extends MahoutTestCase { @Test public void testNonKeepdAnalyzer() throws IOException { Reader reader = new StringReader(input); - Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46); + Analyzer analyzer = new WhitespaceAnalyzer(); TokenStream ts = analyzer.tokenStream(null, reader); ts.reset(); TokenStream f = new BloomTokenFilter(getFilter(filterTokens), false /* toss matching tokens */, ts); @@ -104,7 +103,7 @@ public final class BloomTokenFilterTest extends MahoutTestCase { @Test public void testKeepAnalyzer() throws IOException { Reader reader = new StringReader(input); - Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46); + Analyzer analyzer = new WhitespaceAnalyzer(); TokenStream ts = analyzer.tokenStream(null, reader); ts.reset(); TokenStream f = new BloomTokenFilter(getFilter(filterTokens), true /* keep matching tokens */, ts); @@ -117,7 +116,7 @@ public final class BloomTokenFilterTest extends MahoutTestCase { @Test public void testShingleFilteredAnalyzer() throws IOException { Reader reader = new StringReader(input); - Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46); + Analyzer analyzer = new WhitespaceAnalyzer(); TokenStream ts = analyzer.tokenStream(null, reader); ts.reset(); ShingleFilter sf = new ShingleFilter(ts, 3); http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java index 44a91e9..890a14b 100644 --- a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java +++ b/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java @@ -28,11 +28,11 @@ import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.StringField; import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.RAMDirectory; -import org.apache.lucene.util.Version; import org.apache.mahout.common.MahoutTestCase; import org.junit.Before; import org.junit.Test; @@ -65,7 +65,7 @@ public class CachedTermInfoTest extends MahoutTestCase { FieldType fieldType = new FieldType(); fieldType.setStored(false); - fieldType.setIndexed(true); + fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); fieldType.setTokenized(true); fieldType.setStoreTermVectors(false); fieldType.setStoreTermVectorPositions(false); @@ -100,7 +100,7 @@ public class CachedTermInfoTest extends MahoutTestCase { static RAMDirectory createTestIndex(FieldType fieldType, RAMDirectory directory, int startingId) throws IOException { - IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_46, new WhitespaceAnalyzer(Version.LUCENE_46))); + IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new WhitespaceAnalyzer())); try { for (int i = 0; i < DOCS.length; i++) { http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java index 6ac2df8..86c8305 100644 --- a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java +++ b/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java @@ -30,18 +30,18 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; -import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.Directory; import org.apache.lucene.store.SimpleFSDirectory; -import org.apache.lucene.util.Version; import org.apache.mahout.common.MahoutTestCase; import org.junit.Before; import org.junit.Test; import java.io.File; import java.io.IOException; +import java.nio.file.Paths; import java.util.Set; public class DriverTest extends MahoutTestCase { @@ -73,9 +73,8 @@ public class DriverTest extends MahoutTestCase { public static final FieldType TYPE = new FieldType(); static { - TYPE.setIndexed(true); TYPE.setOmitNorms(true); - TYPE.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS); + TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS); TYPE.setStored(true); TYPE.setTokenized(true); TYPE.setStoreTermVectors(true); @@ -90,9 +89,10 @@ public class DriverTest extends MahoutTestCase { @Test public void sequenceFileDictionary() throws IOException { - Directory index = new SimpleFSDirectory(indexDir); - Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46); - IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46, analyzer); + Directory index = new SimpleFSDirectory(Paths.get(indexDir.getAbsolutePath())); + Analyzer analyzer = new StandardAnalyzer(); + IndexWriterConfig config = new IndexWriterConfig(analyzer); + config.setCommitOnClose(true); final IndexWriter writer = new IndexWriter(index, config); try { @@ -100,9 +100,8 @@ public class DriverTest extends MahoutTestCase { writer.addDocument(asDocument("One Ring to find them,")); writer.addDocument(asDocument("One Ring to bring them all")); writer.addDocument(asDocument("and in the darkness bind them")); - } finally { - writer.close(true); + writer.close(); } File seqDict = new File(outputDir, "dict.seq"); http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java index ba49a2d..8d92551 100644 --- a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java +++ b/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java @@ -29,11 +29,11 @@ import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.StringField; import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.RAMDirectory; -import org.apache.lucene.util.Version; import org.apache.mahout.common.MahoutTestCase; import org.apache.mahout.math.NamedVector; import org.apache.mahout.math.Vector; @@ -62,14 +62,14 @@ public final class LuceneIterableTest extends MahoutTestCase { @Before public void before() throws IOException { - TYPE_NO_TERM_VECTORS.setIndexed(true); + TYPE_NO_TERM_VECTORS.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); TYPE_NO_TERM_VECTORS.setTokenized(true); TYPE_NO_TERM_VECTORS.setStoreTermVectors(false); TYPE_NO_TERM_VECTORS.setStoreTermVectorPositions(false); TYPE_NO_TERM_VECTORS.setStoreTermVectorOffsets(false); TYPE_NO_TERM_VECTORS.freeze(); - TYPE_TERM_VECTORS.setIndexed(true); + TYPE_TERM_VECTORS.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); TYPE_TERM_VECTORS.setTokenized(true); TYPE_TERM_VECTORS.setStored(true); TYPE_TERM_VECTORS.setStoreTermVectors(true); @@ -177,7 +177,7 @@ public final class LuceneIterableTest extends MahoutTestCase { RAMDirectory directory, int startingId) throws IOException { - try (IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_46,new StandardAnalyzer(Version.LUCENE_46)))) { + try (IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new StandardAnalyzer()))) { for (int i = 0; i < DOCS.length; i++) { Document doc = new Document(); Field id = new StringField("id", "doc_" + (i + startingId), Field.Store.YES); http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java ---------------------------------------------------------------------- diff --git a/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java b/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java index 37ca383..742d6cf 100644 --- a/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java +++ b/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java @@ -32,7 +32,7 @@ public final class AnalyzerUtils { * @throws ClassNotFoundException - {@link ClassNotFoundException} */ public static Analyzer createAnalyzer(String analyzerClassName) throws ClassNotFoundException { - return createAnalyzer(analyzerClassName, Version.LUCENE_46); + return createAnalyzer(analyzerClassName, Version.LUCENE_5_5_2); } public static Analyzer createAnalyzer(String analyzerClassName, Version version) throws ClassNotFoundException { @@ -47,7 +47,7 @@ public final class AnalyzerUtils { * @return {@link Analyzer} */ public static Analyzer createAnalyzer(Class<? extends Analyzer> analyzerClass) { - return createAnalyzer(analyzerClass, Version.LUCENE_46); + return createAnalyzer(analyzerClass, Version.LUCENE_5_5_2); } public static Analyzer createAnalyzer(Class<? extends Analyzer> analyzerClass, Version version) { http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/mr/src/main/java/org/apache/mahout/vectorizer/TFIDF.java ---------------------------------------------------------------------- diff --git a/mr/src/main/java/org/apache/mahout/vectorizer/TFIDF.java b/mr/src/main/java/org/apache/mahout/vectorizer/TFIDF.java index 0a537eb..238fa03 100644 --- a/mr/src/main/java/org/apache/mahout/vectorizer/TFIDF.java +++ b/mr/src/main/java/org/apache/mahout/vectorizer/TFIDF.java @@ -17,11 +17,11 @@ package org.apache.mahout.vectorizer; -import org.apache.lucene.search.similarities.DefaultSimilarity; +import org.apache.lucene.search.similarities.ClassicSimilarity; //TODO: add a new class that supports arbitrary Lucene similarity implementations public class TFIDF implements Weight { - private final DefaultSimilarity sim = new DefaultSimilarity(); + private final ClassicSimilarity sim = new ClassicSimilarity(); @Override public double calculate(int tf, int df, int length, int numDocs) { http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/LuceneTextValueEncoder.java ---------------------------------------------------------------------- diff --git a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/LuceneTextValueEncoder.java b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/LuceneTextValueEncoder.java index 3bae26e..e3e133c 100644 --- a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/LuceneTextValueEncoder.java +++ b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/LuceneTextValueEncoder.java @@ -49,13 +49,9 @@ public class LuceneTextValueEncoder extends TextValueEncoder { */ @Override protected Iterable<String> tokenize(CharSequence originalForm) { - try { - TokenStream ts = analyzer.tokenStream(getName(), new CharSequenceReader(originalForm)); - ts.addAttribute(CharTermAttribute.class); - return new LuceneTokenIterable(ts, false); - } catch (IOException ex) { - throw new IllegalStateException(ex); - } + TokenStream ts = analyzer.tokenStream(getName(), new CharSequenceReader(originalForm)); + ts.addAttribute(CharTermAttribute.class); + return new LuceneTokenIterable(ts, false); } private static final class CharSequenceReader extends Reader { http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java ---------------------------------------------------------------------- diff --git a/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java b/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java index 4446fef..be3e03e 100644 --- a/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java +++ b/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java @@ -19,7 +19,6 @@ package org.apache.mahout.vectorizer.encoders; import com.google.common.collect.ImmutableMap; import org.apache.lucene.analysis.core.WhitespaceAnalyzer; -import org.apache.lucene.util.Version; import org.apache.mahout.common.MahoutTestCase; import org.apache.mahout.math.DenseVector; import org.apache.mahout.math.Vector; @@ -70,7 +69,7 @@ public final class TextValueEncoderTest extends MahoutTestCase { @Test public void testLuceneEncoding() throws Exception { LuceneTextValueEncoder enc = new LuceneTextValueEncoder("text"); - enc.setAnalyzer(new WhitespaceAnalyzer(Version.LUCENE_46)); + enc.setAnalyzer(new WhitespaceAnalyzer()); Vector v1 = new DenseVector(200); enc.addToVector("test1 and more", v1); enc.flush(1, v1); http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index ca0ea21..165e42e 100644 --- a/pom.xml +++ b/pom.xml @@ -117,7 +117,7 @@ <mjavadoc.version>2.10.3</mjavadoc.version> <mscala.version>3.2.0</mscala.version> <hbase.version>1.0.0</hbase.version> - <lucene.version>4.6.1</lucene.version> + <lucene.version>5.5.2</lucene.version> <slf4j.version>1.7.19</slf4j.version> <scala.compat.version>2.10</scala.compat.version> <scala.version>2.10.4</scala.version>
