Repository: mahout
Updated Branches:
  refs/heads/master 5e41bc096 -> daad3a4ce
Revert "MAHOUT-1649: Upgrade to be Lucene 4.10.4 compatible, patch from Frank Scholten" This reverts commit 4cff54295d9e494eaff46a190b73bcb8d491f7e5. Project: http://git-wip-us.apache.org/repos/asf/mahout/repo Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/db624ef3 Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/db624ef3 Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/db624ef3 Branch: refs/heads/master Commit: db624ef35c8174e9bd5a9f63f9e2957e8bd594b8 Parents: c70d708 Author: Andrew Musselman <[email protected]> Authored: Mon Apr 6 09:22:52 2015 -0700 Committer: Andrew Musselman <[email protected]> Committed: Mon Apr 6 09:22:52 2015 -0700 ---------------------------------------------------------------------- .../mahout/classifier/NewsgroupHelper.java | 25 +- integration/pom.xml | 11 - .../mahout/text/LuceneSegmentInputFormat.java | 5 +- .../mahout/text/LuceneSegmentInputSplit.java | 14 +- .../mahout/text/LuceneSegmentRecordReader.java | 4 +- .../mahout/text/LuceneStorageConfiguration.java | 5 +- .../text/MailArchivesClusteringAnalyzer.java | 21 +- .../text/ReadOnlyFileSystemDirectory.java | 354 +++++++++++++++++++ .../text/SequenceFilesFromLuceneStorage.java | 6 +- .../SequenceFilesFromLuceneStorageDriver.java | 4 +- .../text/wikipedia/WikipediaAnalyzer.java | 11 +- .../mahout/utils/regex/AnalyzerTransformer.java | 3 +- mr/src/main/java/org/apache/mahout/Version.java | 6 +- .../mahout/common/lucene/AnalyzerUtils.java | 4 +- .../mahout/vectorizer/DictionaryVectorizer.java | 14 +- .../mahout/vectorizer/DocumentProcessor.java | 2 +- .../encoders/AdaptiveWordValueEncoder.java | 2 +- .../encoders/ContinuousValueEncoder.java | 2 +- .../mahout/vectorizer/encoders/Dictionary.java | 9 +- .../encoders/FeatureVectorEncoder.java | 12 +- .../encoders/InteractionValueEncoder.java | 5 +- .../encoders/StaticWordValueEncoder.java | 4 +- .../vectorizer/encoders/TextValueEncoder.java | 12 +- .../vectorizer/term/TFPartialVectorReducer.java | 23 +- .../mahout/vectorizer/tfidf/TFIDFConverter.java | 12 +- .../encoders/TextValueEncoderTest.java | 12 +- pom.xml | 4 +- 27 files changed, 478 insertions(+), 108 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java ---------------------------------------------------------------------- diff --git a/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java b/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java index 532e023..3674a57 100644 --- a/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java +++ b/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java @@ -17,17 +17,6 @@ package org.apache.mahout.classifier; -import java.io.BufferedReader; -import java.io.File; -import java.io.IOException; -import java.io.Reader; -import java.io.StringReader; -import java.text.SimpleDateFormat; -import java.util.Collection; -import java.util.Date; -import java.util.Locale; -import java.util.Random; - import com.google.common.collect.ConcurrentHashMultiset; import com.google.common.collect.Multiset; import com.google.common.io.Closeables; @@ -37,6 +26,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.util.Version; import 
org.apache.mahout.common.RandomUtils; import org.apache.mahout.math.RandomAccessSparseVector; import org.apache.mahout.math.Vector; @@ -44,6 +34,17 @@ import org.apache.mahout.vectorizer.encoders.ConstantValueEncoder; import org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder; import org.apache.mahout.vectorizer.encoders.StaticWordValueEncoder; +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.text.SimpleDateFormat; +import java.util.Collection; +import java.util.Date; +import java.util.Locale; +import java.util.Random; + public final class NewsgroupHelper { private static final SimpleDateFormat[] DATE_FORMATS = { @@ -59,7 +60,7 @@ public final class NewsgroupHelper { private static final long WEEK = 7 * 24 * 3600; private final Random rand = RandomUtils.getRandom(); - private final Analyzer analyzer = new StandardAnalyzer(); + private final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46); private final FeatureVectorEncoder encoder = new StaticWordValueEncoder("body"); private final FeatureVectorEncoder bias = new ConstantValueEncoder("Intercept"); http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/integration/pom.xml ---------------------------------------------------------------------- diff --git a/integration/pom.xml b/integration/pom.xml index 8960417..fcb85cb 100644 --- a/integration/pom.xml +++ b/integration/pom.xml @@ -137,17 +137,6 @@ <artifactId>lucene-analyzers-common</artifactId> <optional>true</optional> </dependency> - <dependency> - <groupId>org.apache.solr</groupId> - <artifactId>solr-core</artifactId> - <version>${lucene.version}</version> - </dependency> - <dependency> - <groupId>commons-httpclient</groupId> - <artifactId>commons-httpclient</artifactId> - <version>3.1</version> - </dependency> - <dependency> <groupId>org.mongodb</groupId> http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputFormat.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputFormat.java b/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputFormat.java index f1a360e..1c4f8de 100644 --- a/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputFormat.java +++ b/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputFormat.java @@ -21,6 +21,7 @@ import java.util.ArrayList; import java.util.List; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; @@ -31,7 +32,6 @@ import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.lucene.index.SegmentCommitInfo; import org.apache.lucene.index.SegmentInfos; -import org.apache.solr.store.hdfs.HdfsDirectory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -52,7 +52,8 @@ public class LuceneSegmentInputFormat extends InputFormat { List<Path> indexPaths = lucene2SeqConfiguration.getIndexPaths(); for (Path indexPath : indexPaths) { - HdfsDirectory directory = new HdfsDirectory(indexPath, configuration); + ReadOnlyFileSystemDirectory directory = new ReadOnlyFileSystemDirectory(FileSystem.get(configuration), indexPath, + false, configuration); SegmentInfos segmentInfos = new SegmentInfos(); 
segmentInfos.read(directory); http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputSplit.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputSplit.java b/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputSplit.java index 1d94416..1441e32 100644 --- a/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputSplit.java +++ b/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputSplit.java @@ -16,18 +16,18 @@ */ package org.apache.mahout.text; -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.lucene.index.SegmentCommitInfo; import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.index.SegmentInfos; -import org.apache.solr.store.hdfs.HdfsDirectory; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; /** * {@link InputSplit} implementation that represents a Lucene segment. @@ -88,7 +88,9 @@ public class LuceneSegmentInputSplit extends InputSplit implements Writable { * @throws IOException if an error occurs when accessing the directory */ public SegmentCommitInfo getSegment(Configuration configuration) throws IOException { - HdfsDirectory directory = new HdfsDirectory(indexPath, configuration); + ReadOnlyFileSystemDirectory directory = new ReadOnlyFileSystemDirectory(FileSystem.get(configuration), indexPath, + false, configuration); + SegmentInfos segmentInfos = new SegmentInfos(); segmentInfos.read(directory); http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/integration/src/main/java/org/apache/mahout/text/LuceneSegmentRecordReader.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/LuceneSegmentRecordReader.java b/integration/src/main/java/org/apache/mahout/text/LuceneSegmentRecordReader.java index fe6a407..485e856 100644 --- a/integration/src/main/java/org/apache/mahout/text/LuceneSegmentRecordReader.java +++ b/integration/src/main/java/org/apache/mahout/text/LuceneSegmentRecordReader.java @@ -62,8 +62,9 @@ public class LuceneSegmentRecordReader extends RecordReader<Text, NullWritable> for (String field : lucene2SeqConfiguration.getFields()) { LuceneIndexHelper.fieldShouldExistInIndex(segmentReader, field); } + Weight weight = lucene2SeqConfiguration.getQuery().createWeight(searcher); - scorer = weight.scorer(segmentReader.getContext(), segmentReader.getLiveDocs()); + scorer = weight.scorer(segmentReader.getContext(), false, false, null); if (scorer == null) { throw new IllegalArgumentException("Could not create query scorer for query: " + lucene2SeqConfiguration.getQuery()); @@ -73,6 +74,7 @@ public class LuceneSegmentRecordReader extends RecordReader<Text, NullWritable> @Override public boolean nextKeyValue() throws IOException, InterruptedException { nextDocId = scorer.nextDoc(); + return nextDocId != Scorer.NO_MORE_DOCS; } http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/integration/src/main/java/org/apache/mahout/text/LuceneStorageConfiguration.java ---------------------------------------------------------------------- diff --git 
a/integration/src/main/java/org/apache/mahout/text/LuceneStorageConfiguration.java b/integration/src/main/java/org/apache/mahout/text/LuceneStorageConfiguration.java index f7a077b..b36f3e9 100644 --- a/integration/src/main/java/org/apache/mahout/text/LuceneStorageConfiguration.java +++ b/integration/src/main/java/org/apache/mahout/text/LuceneStorageConfiguration.java @@ -40,12 +40,13 @@ import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.Query; -import org.apache.lucene.util.Version; import org.apache.mahout.common.Pair; import org.apache.mahout.common.iterator.sequencefile.PathFilters; import org.apache.mahout.common.iterator.sequencefile.PathType; import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable; +import static org.apache.lucene.util.Version.LUCENE_46; + /** * Holds all the configuration for {@link SequenceFilesFromLuceneStorage}, which generates a sequence file * with id as the key and a content field as value. @@ -212,7 +213,7 @@ public class LuceneStorageConfiguration implements Writable { } idField = in.readUTF(); fields = Arrays.asList(in.readUTF().split(SEPARATOR_FIELDS)); - query = new QueryParser(Version.LUCENE_4_10_4, "query", new StandardAnalyzer()).parse(in.readUTF()); + query = new QueryParser(LUCENE_46, "query", new StandardAnalyzer(LUCENE_46)).parse(in.readUTF()); maxHits = in.readInt(); } catch (ParseException e) { throw new RuntimeException("Could not deserialize " + this.getClass().getName(), e); http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java b/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java index 9560142..8776c5f 100644 --- a/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java +++ b/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java @@ -34,6 +34,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; +import org.apache.lucene.util.Version; /** * Custom Lucene Analyzer designed for aggressive feature reduction @@ -41,12 +42,13 @@ import org.apache.lucene.analysis.util.StopwordAnalyzerBase; * stop words, excluding non-alpha-numeric tokens, and porter stemming. 
 */
 public final class MailArchivesClusteringAnalyzer extends StopwordAnalyzerBase {
-
+  private static final Version LUCENE_VERSION = Version.LUCENE_46;
+
   // extended set of stop words composed of common mail terms like "hi",
   // HTML tags, and Java keywords as many of the messages in the archives
   // are subversion check-in notifications
-  private static final CharArraySet STOP_SET = new CharArraySet(Arrays.asList(
+  private static final CharArraySet STOP_SET = new CharArraySet(LUCENE_VERSION, Arrays.asList(
     "3d","7bit","a0","about","above","abstract","across","additional","after",
     "afterwards","again","against","align","all","almost","alone","along",
     "already","also","although","always","am","among","amongst","amoungst",
@@ -106,17 +108,22 @@ public final class MailArchivesClusteringAnalyzer extends StopwordAnalyzerBase {
   private static final Matcher MATCHER = ALPHA_NUMERIC.matcher("");
 
   public MailArchivesClusteringAnalyzer() {
-    super(STOP_SET);
+    super(LUCENE_VERSION, STOP_SET);
   }
+
+  public MailArchivesClusteringAnalyzer(CharArraySet stopSet) {
+    super(LUCENE_VERSION, stopSet);
+
+  }
+
   @Override
   protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-    Tokenizer tokenizer = new StandardTokenizer(reader);
-    TokenStream result = new StandardFilter(tokenizer);
-    result = new LowerCaseFilter(result);
+    Tokenizer tokenizer = new StandardTokenizer(LUCENE_VERSION, reader);
+    TokenStream result = new StandardFilter(LUCENE_VERSION, tokenizer);
+    result = new LowerCaseFilter(LUCENE_VERSION, result);
     result = new ASCIIFoldingFilter(result);
     result = new AlphaNumericMaxLengthFilter(result);
-    result = new StopFilter(result, STOP_SET);
+    result = new StopFilter(LUCENE_VERSION, result, STOP_SET);
     result = new PorterStemFilter(result);
     return new TokenStreamComponents(tokenizer, result);
   }


http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/integration/src/main/java/org/apache/mahout/text/ReadOnlyFileSystemDirectory.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/ReadOnlyFileSystemDirectory.java b/integration/src/main/java/org/apache/mahout/text/ReadOnlyFileSystemDirectory.java
new file mode 100644
index 0000000..e97e35b
--- /dev/null
+++ b/integration/src/main/java/org/apache/mahout/text/ReadOnlyFileSystemDirectory.java
@@ -0,0 +1,354 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.mahout.text; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.lucene.store.BaseDirectory; +import org.apache.lucene.store.BufferedIndexInput; +import org.apache.lucene.store.BufferedIndexOutput; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.Lock; +import org.apache.lucene.store.LockFactory; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Collection; + +//TODO: is there a better way of doing this in Lucene 4.x? + +/** + * This class implements a read-only Lucene Directory on top of a general FileSystem. + * Currently it does not support locking. + * <p/> + * // TODO: Rename to FileSystemReadOnlyDirectory + */ +public class ReadOnlyFileSystemDirectory extends BaseDirectory { + + private final FileSystem fs; + private final Path directory; + private final int ioFileBufferSize; + + private static final Logger log = LoggerFactory.getLogger(ReadOnlyFileSystemDirectory.class); + + /** + * Constructor + * + * @param fs - filesystem + * @param directory - directory path + * @param create - if true create the directory + * @param conf - MR Job Configuration + * @throws IOException + */ + + public ReadOnlyFileSystemDirectory(FileSystem fs, Path directory, boolean create, + Configuration conf) throws IOException { + + this.fs = fs; + this.directory = directory; + this.ioFileBufferSize = conf.getInt("io.file.buffer.size", 4096); + + if (create) { + create(); + } + + boolean isDir = false; + try { + FileStatus status = fs.getFileStatus(directory); + if (status != null) { + isDir = status.isDir(); + } + } catch (IOException e) { + log.error(e.getMessage(), e); + } + if (!isDir) { + throw new IOException(directory + " is not a directory"); + } + } + + + private void create() throws IOException { + if (!fs.exists(directory)) { + fs.mkdirs(directory); + } + + boolean isDir = false; + try { + FileStatus status = fs.getFileStatus(directory); + if (status != null) { + isDir = status.isDir(); + } + } catch (IOException e) { + log.error(e.getMessage(), e); + } + if (!isDir) { + throw new IOException(directory + " is not a directory"); + } + + // clear old index files + FileStatus[] fileStatus = + fs.listStatus(directory, LuceneIndexFileNameFilter.getFilter()); + for (FileStatus status : fileStatus) { + if (!fs.delete(status.getPath(), true)) { + throw new IOException("Cannot delete index file " + + status.getPath()); + } + } + } + + public String[] list() throws IOException { + FileStatus[] fileStatus = + fs.listStatus(directory, LuceneIndexFileNameFilter.getFilter()); + String[] result = new String[fileStatus.length]; + for (int i = 0; i < fileStatus.length; i++) { + result[i] = fileStatus[i].getPath().getName(); + } + return result; + } + + @Override + public String[] listAll() throws IOException { + return list(); + } + + @Override + public boolean fileExists(String name) throws IOException { + return fs.exists(new Path(directory, name)); + } + + @Override + public long fileLength(String name) throws IOException { + return fs.getFileStatus(new Path(directory, name)).getLen(); + } + + @Override + public void deleteFile(String name) throws IOException { + if (!fs.delete(new 
Path(directory, name), true)) { + throw new IOException("Cannot delete index file " + name); + } + } + + @Override + public IndexOutput createOutput(String name, IOContext context) throws IOException { + //TODO: What should we be doing with the IOContext here, if anything? + Path file = new Path(directory, name); + if (fs.exists(file) && !fs.delete(file, true)) { + // delete the existing one if applicable + throw new IOException("Cannot overwrite index file " + file); + } + + return new FileSystemIndexOutput(file, ioFileBufferSize); + } + + @Override + public void sync(Collection<String> names) throws IOException { + // do nothing, as this is read-only + } + + @Override + public IndexInput openInput(String name, IOContext context) throws IOException { + return new FileSystemIndexInput(new Path(directory, name), ioFileBufferSize); + } + + @Override + public Lock makeLock(final String name) { + return new Lock() { + public boolean obtain() { + return true; + } + + public void release() { + } + + public boolean isLocked() { + throw new UnsupportedOperationException(); + } + + public String toString() { + return "Lock@" + new Path(directory, name); + } + }; + } + + @Override + public void clearLock(String name) throws IOException { + // do nothing + } + + @Override + public void close() throws IOException { + // do not close the file system + } + + @Override + public void setLockFactory(LockFactory lockFactory) throws IOException { + // do nothing + } + + @Override + public LockFactory getLockFactory() { + return null; + } + + @Override + public String toString() { + return this.getClass().getName() + "@" + directory; + } + + private class FileSystemIndexInput extends BufferedIndexInput implements Cloneable { + + // shared by clones + private class Descriptor { + public final FSDataInputStream in; + public long position; // cache of in.getPos() + + public Descriptor(Path file, int ioFileBufferSize) throws IOException { + this.in = fs.open(file, ioFileBufferSize); + } + } + + private final Path filePath; // for debugging + private final Descriptor descriptor; + private final long length; + private boolean isOpen; + private boolean isClone; + + public FileSystemIndexInput(Path path, int ioFileBufferSize) + throws IOException { + super("FSII_" + path.getName(), ioFileBufferSize); + filePath = path; + descriptor = new Descriptor(path, ioFileBufferSize); + length = fs.getFileStatus(path).getLen(); + isOpen = true; + } + + @Override + protected void readInternal(byte[] b, int offset, int len) + throws IOException { + long position = getFilePointer(); + if (position != descriptor.position) { + descriptor.in.seek(position); + descriptor.position = position; + } + int total = 0; + do { + int i = descriptor.in.read(b, offset + total, len - total); + if (i == -1) { + throw new IOException("Read past EOF"); + } + descriptor.position += i; + total += i; + } while (total < len); + } + + @Override + public void close() throws IOException { + if (!isClone) { + if (isOpen) { + descriptor.in.close(); + isOpen = false; + } else { + throw new IOException("Index file " + filePath + " already closed"); + } + } + } + + @Override + protected void seekInternal(long position) { + // handled in readInternal() + } + + @Override + public long length() { + return length; + } + + @Override + protected void finalize() throws Throwable { + super.finalize(); + if (!isClone && isOpen) { + close(); // close the file + } + } + + @Override + public BufferedIndexInput clone() { + FileSystemIndexInput clone = (FileSystemIndexInput) 
super.clone(); + clone.isClone = true; + return clone; + } + } + + private class FileSystemIndexOutput extends BufferedIndexOutput { + + private final Path filePath; // for debugging + private final FSDataOutputStream out; + private boolean isOpen; + + public FileSystemIndexOutput(Path path, int ioFileBufferSize) + throws IOException { + filePath = path; + // overwrite is true by default + out = fs.create(path, true, ioFileBufferSize); + isOpen = true; + } + + @Override + public void flushBuffer(byte[] b, int offset, int size) throws IOException { + out.write(b, offset, size); + } + + @Override + public void close() throws IOException { + if (isOpen) { + super.close(); + out.close(); + isOpen = false; + } else { + throw new IOException("Index file " + filePath + " already closed"); + } + } + + @Override + public void seek(long pos) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long length() throws IOException { + return out.getPos(); + } + + @Override + protected void finalize() throws Throwable { + super.finalize(); + if (isOpen) { + close(); // close the file + } + } + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorage.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorage.java b/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorage.java index 41c3f84..b7fd495 100644 --- a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorage.java +++ b/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorage.java @@ -22,7 +22,6 @@ import java.util.List; import com.google.common.base.Strings; import com.google.common.io.Closeables; -import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -41,6 +40,8 @@ import org.apache.lucene.store.FSDirectory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import static org.apache.commons.lang.StringUtils.isBlank; + /** * Generates a sequence file from a Lucene index with a specified id field as the key and a content field as the value. * Configure this class with a {@link LuceneStorageConfiguration} bean. 
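The segment-opening pattern above recurs at every call site this revert touches (LuceneSegmentInputFormat, LuceneSegmentInputSplit, LuceneSegmentRecordReader). A minimal sketch of that pattern, using only the API visible in this diff; the index path is a hypothetical example:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.lucene.index.SegmentCommitInfo;
    import org.apache.lucene.index.SegmentInfos;
    import org.apache.mahout.text.ReadOnlyFileSystemDirectory;

    import java.io.IOException;

    public class SegmentListSketch {
      public static void main(String[] args) throws IOException {
        Configuration configuration = new Configuration();
        Path indexPath = new Path("hdfs:///indexes/newsgroups"); // hypothetical index location
        // Read-only directory over the cluster filesystem; create=false since the index already exists
        ReadOnlyFileSystemDirectory directory =
            new ReadOnlyFileSystemDirectory(FileSystem.get(configuration), indexPath, false, configuration);
        SegmentInfos segmentInfos = new SegmentInfos();
        segmentInfos.read(directory); // Lucene 4.x API; this call is gone in Lucene 5
        for (SegmentCommitInfo segmentInfo : segmentInfos) {
          System.out.println(segmentInfo.info.name); // the input format creates one split per segment
        }
      }
    }

SegmentInfos.read(Directory) being a 4.x-only call is one reason this code pins lucene.version at 4.6.1.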
@@ -81,6 +82,7 @@ public class SequenceFilesFromLuceneStorage { processedDocs = writerCollector.processedDocs; Closeables.close(sequenceFileWriter, false); directory.close(); + //searcher.close(); reader.close(); } } @@ -115,7 +117,7 @@ public class SequenceFilesFromLuceneStorage { Text theValue = new Text(); LuceneSeqFileHelper.populateValues(doc, theValue, fields); //if they are both empty, don't write - if (StringUtils.isBlank(theKey.toString()) && StringUtils.isBlank(theValue.toString())) { + if (isBlank(theKey.toString()) && isBlank(theValue.toString())) { return; } sequenceFileWriter.append(theKey, theValue); http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriver.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriver.java b/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriver.java index cd019ac..1bd3f3e 100644 --- a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriver.java +++ b/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriver.java @@ -30,6 +30,7 @@ import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.Query; +import org.apache.lucene.util.Version; import org.apache.mahout.common.AbstractJob; import org.apache.mahout.common.commandline.DefaultOptionCreator; @@ -95,7 +96,8 @@ public class SequenceFilesFromLuceneStorageDriver extends AbstractJob { if (hasOption(OPTION_QUERY)) { try { String queryString = COMPILE.matcher(getOption(OPTION_QUERY)).replaceAll(""); - QueryParser queryParser = new QueryParser(queryString, new StandardAnalyzer()); + QueryParser queryParser = new QueryParser(Version.LUCENE_46, queryString, + new StandardAnalyzer(Version.LUCENE_46)); query = queryParser.parse(queryString); } catch (ParseException e) { throw new IllegalArgumentException(e.getMessage(), e); http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java b/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java index 4ad0367..ad55ba7 100644 --- a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java +++ b/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java @@ -28,24 +28,25 @@ import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer; +import org.apache.lucene.util.Version; public class WikipediaAnalyzer extends StopwordAnalyzerBase { public WikipediaAnalyzer() { - super(StopAnalyzer.ENGLISH_STOP_WORDS_SET); + super(Version.LUCENE_46, StopAnalyzer.ENGLISH_STOP_WORDS_SET); } public WikipediaAnalyzer(CharArraySet stopSet) { - super(stopSet); + super(Version.LUCENE_46, stopSet); } @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new WikipediaTokenizer(reader); - TokenStream result = new 
StandardFilter(tokenizer); - result = new LowerCaseFilter(result); - result = new StopFilter(result, getStopwordSet()); + TokenStream result = new StandardFilter(Version.LUCENE_46, tokenizer); + result = new LowerCaseFilter(Version.LUCENE_46, result); + result = new StopFilter(Version.LUCENE_46, result, getStopwordSet()); return new TokenStreamComponents(tokenizer, result); } } http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java b/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java index 5b7f0af..36b166a 100644 --- a/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java +++ b/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java @@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.util.Version; import org.apache.mahout.common.lucene.TokenStreamIterator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -36,7 +37,7 @@ public class AnalyzerTransformer implements RegexTransformer { private static final Logger log = LoggerFactory.getLogger(AnalyzerTransformer.class); public AnalyzerTransformer() { - this(new StandardAnalyzer()); + this(new StandardAnalyzer(Version.LUCENE_46), "text"); } public AnalyzerTransformer(Analyzer analyzer) { http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/mr/src/main/java/org/apache/mahout/Version.java ---------------------------------------------------------------------- diff --git a/mr/src/main/java/org/apache/mahout/Version.java b/mr/src/main/java/org/apache/mahout/Version.java index aac359c..5f3c879 100644 --- a/mr/src/main/java/org/apache/mahout/Version.java +++ b/mr/src/main/java/org/apache/mahout/Version.java @@ -17,10 +17,10 @@ package org.apache.mahout; -import java.io.IOException; - +import com.google.common.base.Charsets; import com.google.common.io.Resources; -import org.apache.commons.io.Charsets; + +import java.io.IOException; public final class Version { http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java ---------------------------------------------------------------------- diff --git a/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java b/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java index e91423d..37ca383 100644 --- a/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java +++ b/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java @@ -32,7 +32,7 @@ public final class AnalyzerUtils { * @throws ClassNotFoundException - {@link ClassNotFoundException} */ public static Analyzer createAnalyzer(String analyzerClassName) throws ClassNotFoundException { - return createAnalyzer(analyzerClassName, Version.LUCENE_4_10_4); + return createAnalyzer(analyzerClassName, Version.LUCENE_46); } public static Analyzer createAnalyzer(String analyzerClassName, Version version) throws ClassNotFoundException { @@ -47,7 +47,7 @@ public final class AnalyzerUtils { * @return {@link Analyzer} */ public static Analyzer createAnalyzer(Class<? 
extends Analyzer> analyzerClass) { - return createAnalyzer(analyzerClass, Version.LUCENE_4_10_4); + return createAnalyzer(analyzerClass, Version.LUCENE_46); } public static Analyzer createAnalyzer(Class<? extends Analyzer> analyzerClass, Version version) { http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/mr/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java ---------------------------------------------------------------------- diff --git a/mr/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java b/mr/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java index 6765c80..8a1f8f8 100644 --- a/mr/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java +++ b/mr/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java @@ -18,11 +18,8 @@ package org.apache.mahout.vectorizer; import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; import com.google.common.io.Closeables; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.filecache.DistributedCache; import org.apache.hadoop.fs.FileSystem; @@ -57,6 +54,10 @@ import org.apache.mahout.vectorizer.term.TermCountReducer; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; +import java.util.Collection; +import java.util.List; + /** * This class converts a set of input documents in the sequence file format to vectors. The Sequence file * input should have a {@link Text} key containing the unique document identifier and a {@link StringTuple} @@ -191,7 +192,7 @@ public final class DictionaryVectorizer extends AbstractJob implements Vectorize } int partialVectorIndex = 0; - Collection<Path> partialVectorPaths = new ArrayList<>(); + Collection<Path> partialVectorPaths = Lists.newArrayList(); for (Path dictionaryChunk : dictionaryChunks) { Path partialVectorOutputPath = new Path(output, VECTOR_OUTPUT_FOLDER + partialVectorIndex++); partialVectorPaths.add(partialVectorOutputPath); @@ -216,7 +217,7 @@ public final class DictionaryVectorizer extends AbstractJob implements Vectorize Configuration baseConf, int chunkSizeInMegabytes, int[] maxTermDimension) throws IOException { - List<Path> chunkPaths = new ArrayList<>(); + List<Path> chunkPaths = Lists.newArrayList(); Configuration conf = new Configuration(baseConf); @@ -274,7 +275,6 @@ public final class DictionaryVectorizer extends AbstractJob implements Vectorize * @param output * output directory were the partial vectors have to be created * @param dimension - * number of features * @param sequentialAccess * output vectors should be optimized for sequential access * @param namedVectors http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/mr/src/main/java/org/apache/mahout/vectorizer/DocumentProcessor.java ---------------------------------------------------------------------- diff --git a/mr/src/main/java/org/apache/mahout/vectorizer/DocumentProcessor.java b/mr/src/main/java/org/apache/mahout/vectorizer/DocumentProcessor.java index 8f57e3a..2c3c236 100644 --- a/mr/src/main/java/org/apache/mahout/vectorizer/DocumentProcessor.java +++ b/mr/src/main/java/org/apache/mahout/vectorizer/DocumentProcessor.java @@ -38,7 +38,7 @@ import org.apache.mahout.vectorizer.document.SequenceFileTokenizerMapper; * containing the unique document identifier and a * {@link Text} value containing the whole document. 
The document should be stored in UTF-8 encoding which is * recognizable by hadoop. It uses the given {@link Analyzer} to process the document into - * {@link org.apache.lucene.analysis.TokenStream}s. + * {@link org.apache.lucene.analysis.Token}s. * */ public final class DocumentProcessor { http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/mr/src/main/java/org/apache/mahout/vectorizer/encoders/AdaptiveWordValueEncoder.java ---------------------------------------------------------------------- diff --git a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/AdaptiveWordValueEncoder.java b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/AdaptiveWordValueEncoder.java index 6a746ce..04b718e 100644 --- a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/AdaptiveWordValueEncoder.java +++ b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/AdaptiveWordValueEncoder.java @@ -17,9 +17,9 @@ package org.apache.mahout.vectorizer.encoders; +import com.google.common.base.Charsets; import com.google.common.collect.HashMultiset; import com.google.common.collect.Multiset; -import org.apache.commons.io.Charsets; import org.apache.mahout.math.Vector; /** http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/mr/src/main/java/org/apache/mahout/vectorizer/encoders/ContinuousValueEncoder.java ---------------------------------------------------------------------- diff --git a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/ContinuousValueEncoder.java b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/ContinuousValueEncoder.java index a0349c0..14382a5 100644 --- a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/ContinuousValueEncoder.java +++ b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/ContinuousValueEncoder.java @@ -17,7 +17,7 @@ package org.apache.mahout.vectorizer.encoders; -import org.apache.commons.io.Charsets; +import com.google.common.base.Charsets; import org.apache.mahout.math.Vector; /** http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/mr/src/main/java/org/apache/mahout/vectorizer/encoders/Dictionary.java ---------------------------------------------------------------------- diff --git a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/Dictionary.java b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/Dictionary.java index 60c89f7..2ea9b1b 100644 --- a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/Dictionary.java +++ b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/Dictionary.java @@ -17,8 +17,9 @@ package org.apache.mahout.vectorizer.encoders; -import java.util.ArrayList; -import java.util.LinkedHashMap; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; + import java.util.List; import java.util.Map; @@ -26,7 +27,7 @@ import java.util.Map; * Assigns integer codes to strings as they appear. 
*/ public class Dictionary { - private final Map<String, Integer> dict = new LinkedHashMap<>(); + private final Map<String, Integer> dict = Maps.newLinkedHashMap(); public int intern(String s) { if (!dict.containsKey(s)) { @@ -37,7 +38,7 @@ public class Dictionary { public List<String> values() { // order of keySet is guaranteed to be insertion order - return new ArrayList<>(dict.keySet()); + return Lists.newArrayList(dict.keySet()); } public int size() { http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/mr/src/main/java/org/apache/mahout/vectorizer/encoders/FeatureVectorEncoder.java ---------------------------------------------------------------------- diff --git a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/FeatureVectorEncoder.java b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/FeatureVectorEncoder.java index 12f1848..96498d7 100644 --- a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/FeatureVectorEncoder.java +++ b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/FeatureVectorEncoder.java @@ -17,15 +17,15 @@ package org.apache.mahout.vectorizer.encoders; +import com.google.common.base.Charsets; +import com.google.common.collect.Sets; +import org.apache.mahout.math.MurmurHash; +import org.apache.mahout.math.Vector; + import java.util.Collections; -import java.util.HashSet; import java.util.Map; import java.util.Set; -import org.apache.commons.io.Charsets; -import org.apache.mahout.math.MurmurHash; -import org.apache.mahout.math.Vector; - /** * General interface for objects that record features into a feature vector. * <p/> @@ -257,7 +257,7 @@ public abstract class FeatureVectorEncoder { } Set<Integer> trace = traceDictionary.get(key); if (trace == null) { - trace = new HashSet<>(n); + trace = Sets.newHashSet(n); traceDictionary.put(key, trace); } else { trace.add(n); http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/mr/src/main/java/org/apache/mahout/vectorizer/encoders/InteractionValueEncoder.java ---------------------------------------------------------------------- diff --git a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/InteractionValueEncoder.java b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/InteractionValueEncoder.java index bd6594a..0be8823 100644 --- a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/InteractionValueEncoder.java +++ b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/InteractionValueEncoder.java @@ -19,9 +19,10 @@ package org.apache.mahout.vectorizer.encoders; import java.util.Locale; -import org.apache.commons.io.Charsets; import org.apache.mahout.math.Vector; +import com.google.common.base.Charsets; + public class InteractionValueEncoder extends FeatureVectorEncoder { private final FeatureVectorEncoder firstEncoder; private final FeatureVectorEncoder secondEncoder; @@ -87,7 +88,7 @@ public class InteractionValueEncoder extends FeatureVectorEncoder { int n = (k + j) % data.size(); if (isTraceEnabled()) { trace(String.format("%s:%s", new String(originalForm1, Charsets.UTF_8), new String(originalForm2, - Charsets.UTF_8)), n); + Charsets.UTF_8)), n); } data.set(n, data.get(n) + w); } http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/mr/src/main/java/org/apache/mahout/vectorizer/encoders/StaticWordValueEncoder.java ---------------------------------------------------------------------- diff --git a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/StaticWordValueEncoder.java b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/StaticWordValueEncoder.java index 
826f669..6f67ef4 100644
--- a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/StaticWordValueEncoder.java
+++ b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/StaticWordValueEncoder.java
@@ -17,11 +17,11 @@
 package org.apache.mahout.vectorizer.encoders;
 
+import com.google.common.base.Charsets;
+
 import java.util.Collections;
 import java.util.Map;
 
-import org.apache.commons.io.Charsets;
-
 /**
  * Encodes categorical values with an unbounded vocabulary. Values are encoded by incrementing a
  * few locations in the output vector with a weight that is either defaulted to 1 or that is looked


http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/mr/src/main/java/org/apache/mahout/vectorizer/encoders/TextValueEncoder.java
----------------------------------------------------------------------
diff --git a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/TextValueEncoder.java b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/TextValueEncoder.java
index 943fbc8..87de095 100644
--- a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/TextValueEncoder.java
+++ b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/TextValueEncoder.java
@@ -17,16 +17,16 @@
 package org.apache.mahout.vectorizer.encoders;
 
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.regex.Pattern;
-
+import com.google.common.base.Charsets;
 import com.google.common.base.Splitter;
 import com.google.common.collect.HashMultiset;
+import com.google.common.collect.Lists;
 import com.google.common.collect.Multiset;
-import org.apache.commons.io.Charsets;
 import org.apache.mahout.math.Vector;
 
+import java.util.Collection;
+import java.util.regex.Pattern;
+
 /**
  * Encodes text that is tokenized on non-alphanum separators. Each word is encoded using a
  * settable encoder which is by default a StaticWordValueEncoder which gives all
@@ -98,7 +98,7 @@ public class TextValueEncoder extends FeatureVectorEncoder {
   @Override
   protected Iterable<Integer> hashesForProbe(byte[] originalForm, int dataSize, String name, int probe) {
-    Collection<Integer> hashes = new ArrayList<>();
+    Collection<Integer> hashes = Lists.newArrayList();
     for (String word : tokenize(new String(originalForm, Charsets.UTF_8))) {
       hashes.add(hashForProbe(bytesForString(word), dataSize, name, probe));
     }


http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/mr/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java
----------------------------------------------------------------------
diff --git a/mr/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java b/mr/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java
index c878946..1496c90 100644
--- a/mr/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java
+++ b/mr/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java
@@ -17,12 +17,8 @@
 package org.apache.mahout.vectorizer.term;
 
-import java.io.IOException;
-import java.net.URI;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
+import com.google.common.collect.Lists;
+import com.google.common.io.Closeables;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.filecache.DistributedCache;
 import org.apache.hadoop.fs.Path;
@@ -46,6 +42,11 @@ import org.apache.mahout.math.map.OpenObjectIntHashMap;
 import org.apache.mahout.vectorizer.DictionaryVectorizer;
 import org.apache.mahout.vectorizer.common.PartialVectorMerger;
 
+import java.io.IOException;
+import java.net.URI;
+import java.util.Iterator; +import java.util.List; + /** * Converts a document in to a sparse vector */ @@ -67,7 +68,7 @@ public class TFPartialVectorReducer extends Reducer<Text, StringTuple, Text, Vec return; } - List<String> value = new ArrayList<>(); + List<String> value = Lists.newArrayList(); while (it.hasNext()) { value.addAll(it.next().getEntries()); @@ -76,8 +77,9 @@ public class TFPartialVectorReducer extends Reducer<Text, StringTuple, Text, Vec Vector vector = new RandomAccessSparseVector(dimension, value.size()); // guess at initial size if (maxNGramSize >= 2) { - try (ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.iterator()), maxNGramSize)){ - sf.reset(); + ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.iterator()), maxNGramSize); + sf.reset(); + try { do { String term = sf.getAttribute(CharTermAttribute.class).toString(); if (!term.isEmpty() && dictionary.containsKey(term)) { // ngram @@ -85,7 +87,10 @@ public class TFPartialVectorReducer extends Reducer<Text, StringTuple, Text, Vec vector.setQuick(termId, vector.getQuick(termId) + 1); } } while (sf.incrementToken()); + sf.end(); + } finally { + Closeables.close(sf, true); } } else { for (String term : value) { http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/mr/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFConverter.java ---------------------------------------------------------------------- diff --git a/mr/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFConverter.java b/mr/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFConverter.java index 3387255..5f9d666 100644 --- a/mr/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFConverter.java +++ b/mr/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFConverter.java @@ -17,11 +17,8 @@ package org.apache.mahout.vectorizer.tfidf; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; - import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; import com.google.common.io.Closeables; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.filecache.DistributedCache; @@ -46,6 +43,9 @@ import org.apache.mahout.vectorizer.common.PartialVectorMerger; import org.apache.mahout.vectorizer.term.TermDocumentCountMapper; import org.apache.mahout.vectorizer.term.TermDocumentCountReducer; +import java.io.IOException; +import java.util.List; + /** * This class converts a set of input vectors with term frequencies to TfIdf vectors. 
The Sequence file input * should have a {@link org.apache.hadoop.io.WritableComparable} key containing and a @@ -118,7 +118,7 @@ public final class TFIDFConverter { "normPower must be > 1 and not infinite if log normalization is chosen", normPower); int partialVectorIndex = 0; - List<Path> partialVectorPaths = new ArrayList<>(); + List<Path> partialVectorPaths = Lists.newArrayList(); List<Path> dictionaryChunks = datasetFeatures.getSecond(); for (Path dictionaryChunk : dictionaryChunks) { Path partialVectorOutputPath = new Path(output, VECTOR_OUTPUT_FOLDER + partialVectorIndex++); @@ -195,7 +195,7 @@ public final class TFIDFConverter { Path dictionaryPathBase, Configuration baseConf, int chunkSizeInMegabytes) throws IOException { - List<Path> chunkPaths = new ArrayList<>(); + List<Path> chunkPaths = Lists.newArrayList(); Configuration conf = new Configuration(baseConf); FileSystem fs = FileSystem.get(featureCountPath.toUri(), conf); http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java ---------------------------------------------------------------------- diff --git a/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java b/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java index 4cb6946..4446fef 100644 --- a/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java +++ b/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java @@ -17,15 +17,16 @@ package org.apache.mahout.vectorizer.encoders; -import java.util.Locale; - import com.google.common.collect.ImmutableMap; import org.apache.lucene.analysis.core.WhitespaceAnalyzer; +import org.apache.lucene.util.Version; import org.apache.mahout.common.MahoutTestCase; import org.apache.mahout.math.DenseVector; import org.apache.mahout.math.Vector; import org.junit.Test; +import java.util.Locale; + public final class TextValueEncoderTest extends MahoutTestCase { @Test @@ -40,7 +41,7 @@ public final class TextValueEncoderTest extends MahoutTestCase { // now some fancy weighting StaticWordValueEncoder w = new StaticWordValueEncoder("text"); - w.setDictionary(ImmutableMap.of("word1", 3.0, "word2", 1.5)); + w.setDictionary(ImmutableMap.<String, Double>of("word1", 3.0, "word2", 1.5)); enc.setWordEncoder(w); // should set 6 locations to something @@ -69,7 +70,7 @@ public final class TextValueEncoderTest extends MahoutTestCase { @Test public void testLuceneEncoding() throws Exception { LuceneTextValueEncoder enc = new LuceneTextValueEncoder("text"); - enc.setAnalyzer(new WhitespaceAnalyzer()); + enc.setAnalyzer(new WhitespaceAnalyzer(Version.LUCENE_46)); Vector v1 = new DenseVector(200); enc.addToVector("test1 and more", v1); enc.flush(1, v1); @@ -87,8 +88,7 @@ public final class TextValueEncoderTest extends MahoutTestCase { v1 = new DenseVector(200); StringBuilder builder = new StringBuilder(5000); - for (int i = 0; i < 1000; i++) { - //lucene's internal buffer length request is 4096, so let's make sure we can handle larger size + for (int i = 0; i < 1000; i++) {//lucene's internal buffer length request is 4096, so let's make sure we can handle larger size builder.append("token_").append(i).append(' '); } enc.addToVector(builder.toString(), v1); http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index 253f31b..80b4a20 100644 --- a/pom.xml +++ 
b/pom.xml
@@ -115,8 +115,8 @@
     <mfindbugs.version>2.5.2</mfindbugs.version>
     <mjavadoc.version>2.9.1</mjavadoc.version>
     <hbase.version>1.0.0</hbase.version>
-    <lucene.version>4.10.4</lucene.version>
-    <slf4j.version>1.7.12</slf4j.version>
+    <lucene.version>4.6.1</lucene.version>
+    <slf4j.version>1.7.10</slf4j.version>
     <scala.compat.version>2.10</scala.compat.version>
     <scala.version>2.10.4</scala.version>
     <spark.version>1.1.1</spark.version>
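The pom.xml hunk above is the crux of the revert: lucene.version drops from 4.10.4 back to 4.6.1, so every analyzer, tokenizer, filter, and query parser regains its explicit Version argument. A minimal sketch of the restored 4.6-style construction, mirroring the calls in this diff (field name and query text are made-up examples):

    import java.io.IOException;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.queryparser.classic.ParseException;
    import org.apache.lucene.queryparser.classic.QueryParser;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.util.Version;

    public class Lucene46Sketch {
      public static void main(String[] args) throws IOException, ParseException {
        // Lucene 4.6 constructors take the Version constant this revert reinstates
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);
        QueryParser parser = new QueryParser(Version.LUCENE_46, "body", analyzer);
        Query query = parser.parse("mahout AND lucene"); // arbitrary query text
        System.out.println(query);

        // Token streams follow the usual reset/incrementToken/end/close cycle
        try (TokenStream stream = analyzer.tokenStream("body", "Hello Mahout")) {
          CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
          stream.reset();
          while (stream.incrementToken()) {
            System.out.println(term.toString());
          }
          stream.end();
        }
      }
    }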

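Most of the remaining hunks swap java.util collection constructors for Guava factory methods and reorder imports, so runtime behavior is unchanged; Dictionary, for instance, still hands out codes in insertion order. A quick illustration against the class as it stands after this revert (the interned strings are arbitrary):

    import java.util.List;

    import org.apache.mahout.vectorizer.encoders.Dictionary;

    public class DictionarySketch {
      public static void main(String[] args) {
        Dictionary dict = new Dictionary();
        int first = dict.intern("alpha");     // 0: first string seen
        int second = dict.intern("beta");     // 1: next new string
        int again = dict.intern("alpha");     // 0 again: already interned
        List<String> ordered = dict.values(); // [alpha, beta], insertion order preserved
        System.out.println(first + " " + second + " " + again + " " + ordered + " size=" + dict.size());
      }
    }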