mahout git commit: MAHOUT-1649: Upgrade to be Lucene 4.10.4 compatible, patch from Frank Scholten

smarthi Sun, 05 Apr 2015 22:33:56 -0700

Repository: mahout
Updated Branches:
  refs/heads/master d67a1d034 -> 4cff54295



MAHOUT-1649: Upgrade to be Lucene 4.10.4 compatible, patch from Frank Scholten


Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/4cff5429
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/4cff5429
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/4cff5429

Branch: refs/heads/master
Commit: 4cff54295d9e494eaff46a190b73bcb8d491f7e5
Parents: d67a1d0
Author: Suneel Marthi <[email protected]>
Authored: Mon Apr 6 01:34:49 2015 -0400
Committer: Suneel Marthi <[email protected]>
Committed: Mon Apr 6 01:35:22 2015 -0400

----------------------------------------------------------------------
 .../mahout/classifier/NewsgroupHelper.java      |  25 +-
 integration/pom.xml                             |  11 +
 .../mahout/text/LuceneSegmentInputFormat.java   |   5 +-
 .../mahout/text/LuceneSegmentInputSplit.java    |  14 +-
 .../mahout/text/LuceneSegmentRecordReader.java  |   4 +-
 .../mahout/text/LuceneStorageConfiguration.java |   5 +-
 .../text/MailArchivesClusteringAnalyzer.java    |  21 +-
 .../text/ReadOnlyFileSystemDirectory.java       | 354 -------------------
 .../text/SequenceFilesFromLuceneStorage.java    |   6 +-
 .../SequenceFilesFromLuceneStorageDriver.java   |   4 +-
 .../text/wikipedia/WikipediaAnalyzer.java       |  11 +-
 .../mahout/utils/regex/AnalyzerTransformer.java |   3 +-
 mr/src/main/java/org/apache/mahout/Version.java |   6 +-
 .../mahout/common/lucene/AnalyzerUtils.java     |   4 +-
 .../mahout/vectorizer/DictionaryVectorizer.java |  14 +-
 .../mahout/vectorizer/DocumentProcessor.java    |   2 +-
 .../encoders/AdaptiveWordValueEncoder.java      |   2 +-
 .../encoders/ContinuousValueEncoder.java        |   2 +-
 .../mahout/vectorizer/encoders/Dictionary.java  |   9 +-
 .../encoders/FeatureVectorEncoder.java          |  12 +-
 .../encoders/InteractionValueEncoder.java       |   5 +-
 .../encoders/StaticWordValueEncoder.java        |   4 +-
 .../vectorizer/encoders/TextValueEncoder.java   |  12 +-
 .../vectorizer/term/TFPartialVectorReducer.java |  23 +-
 .../mahout/vectorizer/tfidf/TFIDFConverter.java |  12 +-
 .../encoders/TextValueEncoderTest.java          |  12 +-
 pom.xml                                         |   4 +-
 27 files changed, 108 insertions(+), 478 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mahout/blob/4cff5429/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java b/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java
index 3674a57..532e023 100644
--- a/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java
+++ b/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java
@@ -17,6 +17,17 @@
 
 package org.apache.mahout.classifier;
 
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.text.SimpleDateFormat;
+import java.util.Collection;
+import java.util.Date;
+import java.util.Locale;
+import java.util.Random;
+
 import com.google.common.collect.ConcurrentHashMultiset;
 import com.google.common.collect.Multiset;
 import com.google.common.io.Closeables;
@@ -26,7 +37,6 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.util.Version;
 import org.apache.mahout.common.RandomUtils;
 import org.apache.mahout.math.RandomAccessSparseVector;
 import org.apache.mahout.math.Vector;
@@ -34,17 +44,6 @@ import org.apache.mahout.vectorizer.encoders.ConstantValueEncoder;
 import org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder;
 import org.apache.mahout.vectorizer.encoders.StaticWordValueEncoder;
 
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.IOException;
-import java.io.Reader;
-import java.io.StringReader;
-import java.text.SimpleDateFormat;
-import java.util.Collection;
-import java.util.Date;
-import java.util.Locale;
-import java.util.Random;
-
 public final class NewsgroupHelper {
   
   private static final SimpleDateFormat[] DATE_FORMATS = {
@@ -60,7 +59,7 @@ public final class NewsgroupHelper {
   private static final long WEEK = 7 * 24 * 3600;
   
   private final Random rand = RandomUtils.getRandom();  
-  private final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);
+  private final Analyzer analyzer = new StandardAnalyzer();
   private final FeatureVectorEncoder encoder = new StaticWordValueEncoder("body");
   private final FeatureVectorEncoder bias = new ConstantValueEncoder("Intercept");
   

http://git-wip-us.apache.org/repos/asf/mahout/blob/4cff5429/integration/pom.xml
----------------------------------------------------------------------
diff --git a/integration/pom.xml b/integration/pom.xml
index fcb85cb..8960417 100644
--- a/integration/pom.xml
+++ b/integration/pom.xml
@@ -137,6 +137,17 @@
       <artifactId>lucene-analyzers-common</artifactId>
       <optional>true</optional>
     </dependency>
+    <dependency>
+      <groupId>org.apache.solr</groupId>
+      <artifactId>solr-core</artifactId>
+      <version>${lucene.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>commons-httpclient</groupId>
+      <artifactId>commons-httpclient</artifactId>
+      <version>3.1</version>
+    </dependency>
+
 
     <dependency>
       <groupId>org.mongodb</groupId>

http://git-wip-us.apache.org/repos/asf/mahout/blob/4cff5429/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputFormat.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputFormat.java b/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputFormat.java
index 1c4f8de..f1a360e 100644
--- a/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputFormat.java
+++ b/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputFormat.java
@@ -21,7 +21,6 @@ import java.util.ArrayList;
 import java.util.List;
 
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.NullWritable;
 import org.apache.hadoop.io.Text;
@@ -32,6 +31,7 @@ import org.apache.hadoop.mapreduce.RecordReader;
 import org.apache.hadoop.mapreduce.TaskAttemptContext;
 import org.apache.lucene.index.SegmentCommitInfo;
 import org.apache.lucene.index.SegmentInfos;
+import org.apache.solr.store.hdfs.HdfsDirectory;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -52,8 +52,7 @@ public class LuceneSegmentInputFormat extends InputFormat {
 
     List<Path> indexPaths = lucene2SeqConfiguration.getIndexPaths();
     for (Path indexPath : indexPaths) {
-      ReadOnlyFileSystemDirectory directory = new ReadOnlyFileSystemDirectory(FileSystem.get(configuration), indexPath,
-                                                                              false, configuration);
+      HdfsDirectory directory = new HdfsDirectory(indexPath, configuration);
       SegmentInfos segmentInfos = new SegmentInfos();
       segmentInfos.read(directory);
 

http://git-wip-us.apache.org/repos/asf/mahout/blob/4cff5429/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputSplit.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputSplit.java b/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputSplit.java
index 1441e32..1d94416 100644
--- a/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputSplit.java
+++ b/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputSplit.java
@@ -16,18 +16,18 @@
  */
 package org.apache.mahout.text;
 
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.mapreduce.InputSplit;
 import org.apache.lucene.index.SegmentCommitInfo;
 import org.apache.lucene.index.SegmentInfo;
 import org.apache.lucene.index.SegmentInfos;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
+import org.apache.solr.store.hdfs.HdfsDirectory;
 
 /**
  * {@link InputSplit} implementation that represents a Lucene segment.
@@ -88,9 +88,7 @@ public class LuceneSegmentInputSplit extends InputSplit implements Writable {
    * @throws IOException if an error occurs when accessing the directory
    */
   public SegmentCommitInfo getSegment(Configuration configuration) throws IOException {
-    ReadOnlyFileSystemDirectory directory = new ReadOnlyFileSystemDirectory(FileSystem.get(configuration), indexPath,
-                                                                            false, configuration);
-
+    HdfsDirectory directory = new HdfsDirectory(indexPath, configuration);
     SegmentInfos segmentInfos = new SegmentInfos();
     segmentInfos.read(directory);
 

http://git-wip-us.apache.org/repos/asf/mahout/blob/4cff5429/integration/src/main/java/org/apache/mahout/text/LuceneSegmentRecordReader.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/LuceneSegmentRecordReader.java b/integration/src/main/java/org/apache/mahout/text/LuceneSegmentRecordReader.java
index 485e856..fe6a407 100644
--- a/integration/src/main/java/org/apache/mahout/text/LuceneSegmentRecordReader.java
+++ b/integration/src/main/java/org/apache/mahout/text/LuceneSegmentRecordReader.java
@@ -62,9 +62,8 @@ public class LuceneSegmentRecordReader extends RecordReader<Text, NullWritable>
     for (String field : lucene2SeqConfiguration.getFields()) {
         LuceneIndexHelper.fieldShouldExistInIndex(segmentReader, field);
     }
-
     Weight weight = lucene2SeqConfiguration.getQuery().createWeight(searcher);
-    scorer = weight.scorer(segmentReader.getContext(), false, false, null);
+    scorer = weight.scorer(segmentReader.getContext(), segmentReader.getLiveDocs());
     if (scorer == null) {
       throw new IllegalArgumentException("Could not create query scorer for query: "
           + lucene2SeqConfiguration.getQuery());
@@ -74,7 +73,6 @@ public class LuceneSegmentRecordReader extends RecordReader<Text, NullWritable>
   @Override
   public boolean nextKeyValue() throws IOException, InterruptedException {
     nextDocId = scorer.nextDoc();
-
     return nextDocId != Scorer.NO_MORE_DOCS;
   }
 

http://git-wip-us.apache.org/repos/asf/mahout/blob/4cff5429/integration/src/main/java/org/apache/mahout/text/LuceneStorageConfiguration.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/LuceneStorageConfiguration.java b/integration/src/main/java/org/apache/mahout/text/LuceneStorageConfiguration.java
index b36f3e9..f7a077b 100644
--- a/integration/src/main/java/org/apache/mahout/text/LuceneStorageConfiguration.java
+++ b/integration/src/main/java/org/apache/mahout/text/LuceneStorageConfiguration.java
@@ -40,13 +40,12 @@ import org.apache.lucene.queryparser.classic.ParseException;
 import org.apache.lucene.queryparser.classic.QueryParser;
 import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.Query;
+import org.apache.lucene.util.Version;
 import org.apache.mahout.common.Pair;
 import org.apache.mahout.common.iterator.sequencefile.PathFilters;
 import org.apache.mahout.common.iterator.sequencefile.PathType;
 import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
 
-import static org.apache.lucene.util.Version.LUCENE_46;
-
 /**
  * Holds all the configuration for {@link SequenceFilesFromLuceneStorage}, which generates a sequence file
  * with id as the key and a content field as value.
@@ -213,7 +212,7 @@ public class LuceneStorageConfiguration implements Writable {
       }
       idField = in.readUTF();
       fields = Arrays.asList(in.readUTF().split(SEPARATOR_FIELDS));
-      query = new QueryParser(LUCENE_46, "query", new StandardAnalyzer(LUCENE_46)).parse(in.readUTF());
+      query = new QueryParser(Version.LUCENE_4_10_4, "query", new StandardAnalyzer()).parse(in.readUTF());
       maxHits = in.readInt();
     } catch (ParseException e) {
       throw new RuntimeException("Could not deserialize " + this.getClass().getName(), e);

http://git-wip-us.apache.org/repos/asf/mahout/blob/4cff5429/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java b/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
index 8776c5f..9560142 100644
--- a/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
+++ b/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
@@ -34,7 +34,6 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.util.Version;
 
 /**
  * Custom Lucene Analyzer designed for aggressive feature reduction
@@ -42,13 +41,12 @@ import org.apache.lucene.util.Version;
  * stop words, excluding non-alpha-numeric tokens, and porter stemming.
  */
 public final class MailArchivesClusteringAnalyzer extends StopwordAnalyzerBase {
-  private static final Version LUCENE_VERSION = Version.LUCENE_46;
-  
+
   // extended set of stop words composed of common mail terms like "hi",
   // HTML tags, and Java keywords asmany of the messages in the archives
   // are subversion check-in notifications
     
-  private static final CharArraySet STOP_SET = new CharArraySet(LUCENE_VERSION, Arrays.asList(
+  private static final CharArraySet STOP_SET = new CharArraySet(Arrays.asList(
     "3d","7bit","a0","about","above","abstract","across","additional","after",
     "afterwards","again","against","align","all","almost","alone","along",
     "already","also","although","always","am","among","amongst","amoungst",
@@ -108,22 +106,17 @@ public final class MailArchivesClusteringAnalyzer extends StopwordAnalyzerBase {
   private static final Matcher MATCHER = ALPHA_NUMERIC.matcher("");
 
   public MailArchivesClusteringAnalyzer() {
-    super(LUCENE_VERSION, STOP_SET);
+    super(STOP_SET);
   }
 
-  public MailArchivesClusteringAnalyzer(CharArraySet stopSet) {
-    super(LUCENE_VERSION, stopSet);
-
-  }
-  
   @Override
   protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-    Tokenizer tokenizer = new StandardTokenizer(LUCENE_VERSION, reader);
-    TokenStream result = new StandardFilter(LUCENE_VERSION, tokenizer);
-    result = new LowerCaseFilter(LUCENE_VERSION, result);
+    Tokenizer tokenizer = new StandardTokenizer(reader);
+    TokenStream result = new StandardFilter(tokenizer);
+    result = new LowerCaseFilter(result);
     result = new ASCIIFoldingFilter(result);
     result = new AlphaNumericMaxLengthFilter(result);
-    result = new StopFilter(LUCENE_VERSION, result, STOP_SET);
+    result = new StopFilter(result, STOP_SET);
     result = new PorterStemFilter(result);
     return new TokenStreamComponents(tokenizer, result);
   }

http://git-wip-us.apache.org/repos/asf/mahout/blob/4cff5429/integration/src/main/java/org/apache/mahout/text/ReadOnlyFileSystemDirectory.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/ReadOnlyFileSystemDirectory.java b/integration/src/main/java/org/apache/mahout/text/ReadOnlyFileSystemDirectory.java
deleted file mode 100644
index e97e35b..0000000
--- a/integration/src/main/java/org/apache/mahout/text/ReadOnlyFileSystemDirectory.java
+++ /dev/null
@@ -1,354 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.text;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.FSDataOutputStream;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.lucene.store.BaseDirectory;
-import org.apache.lucene.store.BufferedIndexInput;
-import org.apache.lucene.store.BufferedIndexOutput;
-import org.apache.lucene.store.IOContext;
-import org.apache.lucene.store.IndexInput;
-import org.apache.lucene.store.IndexOutput;
-import org.apache.lucene.store.Lock;
-import org.apache.lucene.store.LockFactory;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.util.Collection;
-
-//TODO: is there a better way of doing this in Lucene 4.x?
-
-/**
- * This class implements a read-only Lucene Directory on top of a general FileSystem.
- * Currently it does not support locking.
- * <p/>
- * // TODO: Rename to FileSystemReadOnlyDirectory
- */
-public class ReadOnlyFileSystemDirectory extends BaseDirectory {
-
-  private final FileSystem fs;
-  private final Path directory;
-  private final int ioFileBufferSize;
-
-  private static final Logger log = LoggerFactory.getLogger(ReadOnlyFileSystemDirectory.class);
-
-      /**
-       * Constructor
-       *
-       * @param fs - filesystem
-       * @param directory - directory path
-       * @param create - if true create the directory
-       * @param conf - MR Job Configuration
-       * @throws IOException
-       */
-
-  public ReadOnlyFileSystemDirectory(FileSystem fs, Path directory, boolean create,
-                                     Configuration conf) throws IOException {
-
-    this.fs = fs;
-    this.directory = directory;
-    this.ioFileBufferSize = conf.getInt("io.file.buffer.size", 4096);
-
-    if (create) {
-      create();
-    }
-
-    boolean isDir = false;
-    try {
-      FileStatus status = fs.getFileStatus(directory);
-      if (status != null) {
-        isDir = status.isDir();
-      }
-    } catch (IOException e) {
-      log.error(e.getMessage(), e);
-    }
-    if (!isDir) {
-      throw new IOException(directory + " is not a directory");
-    }
-  }
-
-
-  private void create() throws IOException {
-    if (!fs.exists(directory)) {
-      fs.mkdirs(directory);
-    }
-
-    boolean isDir = false;
-    try {
-      FileStatus status = fs.getFileStatus(directory);
-      if (status != null) {
-        isDir = status.isDir();
-      }
-    } catch (IOException e) {
-      log.error(e.getMessage(), e);
-    }
-    if (!isDir) {
-      throw new IOException(directory + " is not a directory");
-    }
-
-    // clear old index files
-    FileStatus[] fileStatus =
-            fs.listStatus(directory, LuceneIndexFileNameFilter.getFilter());
-    for (FileStatus status : fileStatus) {
-      if (!fs.delete(status.getPath(), true)) {
-        throw new IOException("Cannot delete index file "
-                + status.getPath());
-      }
-    }
-  }
-
-  public String[] list() throws IOException {
-    FileStatus[] fileStatus =
-            fs.listStatus(directory, LuceneIndexFileNameFilter.getFilter());
-    String[] result = new String[fileStatus.length];
-    for (int i = 0; i < fileStatus.length; i++) {
-      result[i] = fileStatus[i].getPath().getName();
-    }
-    return result;
-  }
-
-  @Override
-  public String[] listAll() throws IOException {
-    return list();
-  }
-
-  @Override
-  public boolean fileExists(String name) throws IOException {
-    return fs.exists(new Path(directory, name));
-  }
-
-  @Override
-  public long fileLength(String name) throws IOException {
-    return fs.getFileStatus(new Path(directory, name)).getLen();
-  }
-
-  @Override
-  public void deleteFile(String name) throws IOException {
-    if (!fs.delete(new Path(directory, name), true)) {
-      throw new IOException("Cannot delete index file " + name);
-    }
-  }
-
-  @Override
-  public IndexOutput createOutput(String name, IOContext context) throws IOException {
-    //TODO: What should we be doing with the IOContext here, if anything?
-    Path file = new Path(directory, name);
-    if (fs.exists(file) && !fs.delete(file, true)) {
-      // delete the existing one if applicable
-      throw new IOException("Cannot overwrite index file " + file);
-    }
-
-    return new FileSystemIndexOutput(file, ioFileBufferSize);
-  }
-
-  @Override
-  public void sync(Collection<String> names) throws IOException {
-    // do nothing, as this is read-only
-  }
-
-  @Override
-  public IndexInput openInput(String name, IOContext context) throws IOException {
-    return new FileSystemIndexInput(new Path(directory, name), ioFileBufferSize);
-  }
-
-  @Override
-  public Lock makeLock(final String name) {
-    return new Lock() {
-      public boolean obtain() {
-        return true;
-      }
-
-      public void release() {
-      }
-
-      public boolean isLocked() {
-        throw new UnsupportedOperationException();
-      }
-
-      public String toString() {
-        return "Lock@" + new Path(directory, name);
-      }
-    };
-  }
-
-  @Override
-  public void clearLock(String name) throws IOException {
-    // do nothing
-  }
-
-  @Override
-  public void close() throws IOException {
-    // do not close the file system
-  }
-
-  @Override
-  public void setLockFactory(LockFactory lockFactory) throws IOException {
-    // do nothing
-  }
-
-  @Override
-  public LockFactory getLockFactory() {
-    return null;
-  }
-
-  @Override
-  public String toString() {
-    return this.getClass().getName() + "@" + directory;
-  }
-
-  private class FileSystemIndexInput extends BufferedIndexInput implements Cloneable {
-
-    // shared by clones
-    private class Descriptor {
-      public final FSDataInputStream in;
-      public long position; // cache of in.getPos()
-
-      public Descriptor(Path file, int ioFileBufferSize) throws IOException {
-        this.in = fs.open(file, ioFileBufferSize);
-      }
-    }
-
-    private final Path filePath; // for debugging
-    private final Descriptor descriptor;
-    private final long length;
-    private boolean isOpen;
-    private boolean isClone;
-
-    public FileSystemIndexInput(Path path, int ioFileBufferSize)
-      throws IOException {
-      super("FSII_" + path.getName(), ioFileBufferSize);
-      filePath = path;
-      descriptor = new Descriptor(path, ioFileBufferSize);
-      length = fs.getFileStatus(path).getLen();
-      isOpen = true;
-    }
-
-    @Override
-    protected void readInternal(byte[] b, int offset, int len)
-      throws IOException {
-      long position = getFilePointer();
-      if (position != descriptor.position) {
-        descriptor.in.seek(position);
-        descriptor.position = position;
-      }
-      int total = 0;
-      do {
-        int i = descriptor.in.read(b, offset + total, len - total);
-        if (i == -1) {
-          throw new IOException("Read past EOF");
-        }
-        descriptor.position += i;
-        total += i;
-      } while (total < len);
-    }
-
-    @Override
-    public void close() throws IOException {
-      if (!isClone) {
-        if (isOpen) {
-          descriptor.in.close();
-          isOpen = false;
-        } else {
-          throw new IOException("Index file " + filePath + " already closed");
-        }
-      }
-    }
-
-    @Override
-    protected void seekInternal(long position) {
-      // handled in readInternal()
-    }
-
-    @Override
-    public long length() {
-      return length;
-    }
-
-    @Override
-    protected void finalize() throws Throwable {
-      super.finalize();
-      if (!isClone && isOpen) {
-        close(); // close the file
-      }
-    }
-
-    @Override
-    public BufferedIndexInput clone() {
-      FileSystemIndexInput clone = (FileSystemIndexInput) super.clone();
-      clone.isClone = true;
-      return clone;
-    }
-  }
-
-  private class FileSystemIndexOutput extends BufferedIndexOutput {
-
-    private final Path filePath; // for debugging
-    private final FSDataOutputStream out;
-    private boolean isOpen;
-
-    public FileSystemIndexOutput(Path path, int ioFileBufferSize)
-      throws IOException {
-      filePath = path;
-      // overwrite is true by default
-      out = fs.create(path, true, ioFileBufferSize);
-      isOpen = true;
-    }
-
-    @Override
-    public void flushBuffer(byte[] b, int offset, int size) throws IOException {
-      out.write(b, offset, size);
-    }
-
-    @Override
-    public void close() throws IOException {
-      if (isOpen) {
-        super.close();
-        out.close();
-        isOpen = false;
-      } else {
-        throw new IOException("Index file " + filePath + " already closed");
-      }
-    }
-
-    @Override
-    public void seek(long pos) throws IOException {
-      throw new UnsupportedOperationException();
-    }
-
-    @Override
-    public long length() throws IOException {
-      return out.getPos();
-    }
-
-    @Override
-    protected void finalize() throws Throwable {
-      super.finalize();
-      if (isOpen) {
-        close(); // close the file
-      }
-    }
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/4cff5429/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorage.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorage.java b/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorage.java
index b7fd495..41c3f84 100644
--- a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorage.java
+++ b/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorage.java
@@ -22,6 +22,7 @@ import java.util.List;
 
 import com.google.common.base.Strings;
 import com.google.common.io.Closeables;
+import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -40,8 +41,6 @@ import org.apache.lucene.store.FSDirectory;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import static org.apache.commons.lang.StringUtils.isBlank;
-
 /**
  * Generates a sequence file from a Lucene index with a specified id field as the key and a content field as the value.
  * Configure this class with a {@link LuceneStorageConfiguration} bean.
@@ -82,7 +81,6 @@ public class SequenceFilesFromLuceneStorage {
       processedDocs = writerCollector.processedDocs;
       Closeables.close(sequenceFileWriter, false);
       directory.close();
-      //searcher.close();
       reader.close();
     }
   }
@@ -117,7 +115,7 @@ public class SequenceFilesFromLuceneStorage {
         Text theValue = new Text();
         LuceneSeqFileHelper.populateValues(doc, theValue, fields);
         //if they are both empty, don't write
-        if (isBlank(theKey.toString()) && isBlank(theValue.toString())) {
+        if (StringUtils.isBlank(theKey.toString()) && StringUtils.isBlank(theValue.toString())) {
           return;
         }
         sequenceFileWriter.append(theKey, theValue);

http://git-wip-us.apache.org/repos/asf/mahout/blob/4cff5429/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriver.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriver.java b/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriver.java
index 1bd3f3e..cd019ac 100644
--- a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriver.java
+++ b/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriver.java
@@ -30,7 +30,6 @@ import org.apache.lucene.queryparser.classic.ParseException;
 import org.apache.lucene.queryparser.classic.QueryParser;
 import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.Query;
-import org.apache.lucene.util.Version;
 import org.apache.mahout.common.AbstractJob;
 import org.apache.mahout.common.commandline.DefaultOptionCreator;
 
@@ -96,8 +95,7 @@ public class SequenceFilesFromLuceneStorageDriver extends AbstractJob {
     if (hasOption(OPTION_QUERY)) {
       try {
         String queryString = COMPILE.matcher(getOption(OPTION_QUERY)).replaceAll("");
-        QueryParser queryParser = new QueryParser(Version.LUCENE_46, queryString,
-            new StandardAnalyzer(Version.LUCENE_46));
+        QueryParser queryParser = new QueryParser(queryString, new StandardAnalyzer());
         query = queryParser.parse(queryString);
       } catch (ParseException e) {
         throw new IllegalArgumentException(e.getMessage(), e);

http://git-wip-us.apache.org/repos/asf/mahout/blob/4cff5429/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java b/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java
index ad55ba7..4ad0367 100644
--- a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java
+++ b/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java
@@ -28,25 +28,24 @@ import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
-import org.apache.lucene.util.Version;
 
 
 public class WikipediaAnalyzer extends StopwordAnalyzerBase {
   
   public WikipediaAnalyzer() {
-    super(Version.LUCENE_46, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+    super(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
   }
   
   public WikipediaAnalyzer(CharArraySet stopSet) {
-    super(Version.LUCENE_46, stopSet);
+    super(stopSet);
   }
 
   @Override
   protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
     Tokenizer tokenizer = new WikipediaTokenizer(reader);
-    TokenStream result = new StandardFilter(Version.LUCENE_46, tokenizer);
-    result = new LowerCaseFilter(Version.LUCENE_46, result);
-    result = new StopFilter(Version.LUCENE_46, result, getStopwordSet());
+    TokenStream result = new StandardFilter(tokenizer);
+    result = new LowerCaseFilter(result);
+    result = new StopFilter(result, getStopwordSet());
     return new TokenStreamComponents(tokenizer, result);
   }
 }

http://git-wip-us.apache.org/repos/asf/mahout/blob/4cff5429/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java b/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java
index 36b166a..5b7f0af 100644
--- a/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java
+++ b/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java
@@ -24,7 +24,6 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.util.Version;
 import org.apache.mahout.common.lucene.TokenStreamIterator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -37,7 +36,7 @@ public class AnalyzerTransformer implements RegexTransformer {
   private static final Logger log = LoggerFactory.getLogger(AnalyzerTransformer.class);
 
   public AnalyzerTransformer() {
-    this(new StandardAnalyzer(Version.LUCENE_46), "text");
+    this(new StandardAnalyzer());
   }
 
   public AnalyzerTransformer(Analyzer analyzer) {

http://git-wip-us.apache.org/repos/asf/mahout/blob/4cff5429/mr/src/main/java/org/apache/mahout/Version.java
----------------------------------------------------------------------
diff --git a/mr/src/main/java/org/apache/mahout/Version.java 
b/mr/src/main/java/org/apache/mahout/Version.java
index 5f3c879..aac359c 100644
--- a/mr/src/main/java/org/apache/mahout/Version.java
+++ b/mr/src/main/java/org/apache/mahout/Version.java
@@ -17,11 +17,11 @@
 
 package org.apache.mahout;
 
-import com.google.common.base.Charsets;
-import com.google.common.io.Resources;
-
 import java.io.IOException;
 
+import com.google.common.io.Resources;
+import org.apache.commons.io.Charsets;
+
 public final class Version {
 
   private Version() {

http://git-wip-us.apache.org/repos/asf/mahout/blob/4cff5429/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java
----------------------------------------------------------------------
diff --git 
a/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java 
b/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java
index 37ca383..e91423d 100644
--- a/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java
+++ b/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java
@@ -32,7 +32,7 @@ public final class AnalyzerUtils {
    * @throws ClassNotFoundException - {@link ClassNotFoundException}
    */
   public static Analyzer createAnalyzer(String analyzerClassName) throws ClassNotFoundException {
-    return createAnalyzer(analyzerClassName, Version.LUCENE_46);
+    return createAnalyzer(analyzerClassName, Version.LUCENE_4_10_4);
   }
 
   public static Analyzer createAnalyzer(String analyzerClassName, Version version) throws ClassNotFoundException {
@@ -47,7 +47,7 @@ public final class AnalyzerUtils {
    * @return {@link Analyzer}
    */
   public static Analyzer createAnalyzer(Class<? extends Analyzer> analyzerClass) {
-    return createAnalyzer(analyzerClass, Version.LUCENE_46);
+    return createAnalyzer(analyzerClass, Version.LUCENE_4_10_4);
   }
 
   public static Analyzer createAnalyzer(Class<? extends Analyzer> analyzerClass, Version version) {

http://git-wip-us.apache.org/repos/asf/mahout/blob/4cff5429/mr/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java
----------------------------------------------------------------------
diff --git 
a/mr/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java 
b/mr/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java
index 8a1f8f8..6765c80 100644
--- a/mr/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java
+++ b/mr/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java
@@ -18,8 +18,11 @@
 package org.apache.mahout.vectorizer;
 
 import com.google.common.base.Preconditions;
-import com.google.common.collect.Lists;
 import com.google.common.io.Closeables;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.filecache.DistributedCache;
 import org.apache.hadoop.fs.FileSystem;
@@ -54,10 +57,6 @@ import org.apache.mahout.vectorizer.term.TermCountReducer;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.IOException;
-import java.util.Collection;
-import java.util.List;
-
 /**
  * This class converts a set of input documents in the sequence file format to 
vectors. The Sequence file
  * input should have a {@link Text} key containing the unique document 
identifier and a {@link StringTuple}
@@ -192,7 +191,7 @@ public final class DictionaryVectorizer extends AbstractJob 
implements Vectorize
     }
     
     int partialVectorIndex = 0;
-    Collection<Path> partialVectorPaths = Lists.newArrayList();
+    Collection<Path> partialVectorPaths = new ArrayList<>();
     for (Path dictionaryChunk : dictionaryChunks) {
       Path partialVectorOutputPath = new Path(output, VECTOR_OUTPUT_FOLDER + 
partialVectorIndex++);
       partialVectorPaths.add(partialVectorOutputPath);
@@ -217,7 +216,7 @@ public final class DictionaryVectorizer extends AbstractJob 
implements Vectorize
                                                    Configuration baseConf,
                                                    int chunkSizeInMegabytes,
                                                    int[] maxTermDimension) 
throws IOException {
-    List<Path> chunkPaths = Lists.newArrayList();
+    List<Path> chunkPaths = new ArrayList<>();
     
     Configuration conf = new Configuration(baseConf);
     
@@ -275,6 +274,7 @@ public final class DictionaryVectorizer extends AbstractJob 
implements Vectorize
    * @param output
    *          output directory were the partial vectors have to be created
    * @param dimension
+   *          number of features
    * @param sequentialAccess
    *          output vectors should be optimized for sequential access
    * @param namedVectors

http://git-wip-us.apache.org/repos/asf/mahout/blob/4cff5429/mr/src/main/java/org/apache/mahout/vectorizer/DocumentProcessor.java
----------------------------------------------------------------------
diff --git 
a/mr/src/main/java/org/apache/mahout/vectorizer/DocumentProcessor.java 
b/mr/src/main/java/org/apache/mahout/vectorizer/DocumentProcessor.java
index 2c3c236..8f57e3a 100644
--- a/mr/src/main/java/org/apache/mahout/vectorizer/DocumentProcessor.java
+++ b/mr/src/main/java/org/apache/mahout/vectorizer/DocumentProcessor.java
@@ -38,7 +38,7 @@ import 
org.apache.mahout.vectorizer.document.SequenceFileTokenizerMapper;
  * containing the unique document identifier and a
  * {@link Text} value containing the whole document. The document should be 
stored in UTF-8 encoding which is
  * recognizable by hadoop. It uses the given {@link Analyzer} to process the 
document into
- * {@link org.apache.lucene.analysis.Token}s.
+ * {@link org.apache.lucene.analysis.TokenStream}s.
  * 
  */
 public final class DocumentProcessor {

http://git-wip-us.apache.org/repos/asf/mahout/blob/4cff5429/mr/src/main/java/org/apache/mahout/vectorizer/encoders/AdaptiveWordValueEncoder.java
----------------------------------------------------------------------
diff --git 
a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/AdaptiveWordValueEncoder.java
 
b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/AdaptiveWordValueEncoder.java
index 04b718e..6a746ce 100644
--- 
a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/AdaptiveWordValueEncoder.java
+++ 
b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/AdaptiveWordValueEncoder.java
@@ -17,9 +17,9 @@
 
 package org.apache.mahout.vectorizer.encoders;
 
-import com.google.common.base.Charsets;
 import com.google.common.collect.HashMultiset;
 import com.google.common.collect.Multiset;
+import org.apache.commons.io.Charsets;
 import org.apache.mahout.math.Vector;
 
 /**

http://git-wip-us.apache.org/repos/asf/mahout/blob/4cff5429/mr/src/main/java/org/apache/mahout/vectorizer/encoders/ContinuousValueEncoder.java
----------------------------------------------------------------------
diff --git 
a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/ContinuousValueEncoder.java
 
b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/ContinuousValueEncoder.java
index 14382a5..a0349c0 100644
--- 
a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/ContinuousValueEncoder.java
+++ 
b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/ContinuousValueEncoder.java
@@ -17,7 +17,7 @@
 
 package org.apache.mahout.vectorizer.encoders;
 
-import com.google.common.base.Charsets;
+import org.apache.commons.io.Charsets;
 import org.apache.mahout.math.Vector;
 
 /**

http://git-wip-us.apache.org/repos/asf/mahout/blob/4cff5429/mr/src/main/java/org/apache/mahout/vectorizer/encoders/Dictionary.java
----------------------------------------------------------------------
diff --git 
a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/Dictionary.java 
b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/Dictionary.java
index 2ea9b1b..60c89f7 100644
--- a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/Dictionary.java
+++ b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/Dictionary.java
@@ -17,9 +17,8 @@
 
 package org.apache.mahout.vectorizer.encoders;
 
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
 
@@ -27,7 +26,7 @@ import java.util.Map;
 * Assigns integer codes to strings as they appear.
 */
 public class Dictionary {
-  private final Map<String, Integer> dict = Maps.newLinkedHashMap();
+  private final Map<String, Integer> dict = new LinkedHashMap<>();
 
   public int intern(String s) {
     if (!dict.containsKey(s)) {
@@ -38,7 +37,7 @@ public class Dictionary {
 
   public List<String> values() {
     // order of keySet is guaranteed to be insertion order
-    return Lists.newArrayList(dict.keySet());
+    return new ArrayList<>(dict.keySet());
   }
 
   public int size() {

http://git-wip-us.apache.org/repos/asf/mahout/blob/4cff5429/mr/src/main/java/org/apache/mahout/vectorizer/encoders/FeatureVectorEncoder.java
----------------------------------------------------------------------
diff --git 
a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/FeatureVectorEncoder.java
 
b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/FeatureVectorEncoder.java
index 96498d7..12f1848 100644
--- 
a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/FeatureVectorEncoder.java
+++ 
b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/FeatureVectorEncoder.java
@@ -17,15 +17,15 @@
 
 package org.apache.mahout.vectorizer.encoders;
 
-import com.google.common.base.Charsets;
-import com.google.common.collect.Sets;
-import org.apache.mahout.math.MurmurHash;
-import org.apache.mahout.math.Vector;
-
 import java.util.Collections;
+import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;
 
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.math.MurmurHash;
+import org.apache.mahout.math.Vector;
+
 /**
  * General interface for objects that record features into a feature vector.
  * <p/>
@@ -257,7 +257,7 @@ public abstract class FeatureVectorEncoder {
       }
       Set<Integer> trace = traceDictionary.get(key);
       if (trace == null) {
-        trace = Sets.newHashSet(n);
+        trace = new HashSet<>(Collections.singleton(n)); // holds n; new HashSet<>(n) would only set capacity
         traceDictionary.put(key, trace);
       } else {
         trace.add(n);

http://git-wip-us.apache.org/repos/asf/mahout/blob/4cff5429/mr/src/main/java/org/apache/mahout/vectorizer/encoders/InteractionValueEncoder.java
----------------------------------------------------------------------
diff --git 
a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/InteractionValueEncoder.java
 
b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/InteractionValueEncoder.java
index 0be8823..bd6594a 100644
--- 
a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/InteractionValueEncoder.java
+++ 
b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/InteractionValueEncoder.java
@@ -19,10 +19,9 @@ package org.apache.mahout.vectorizer.encoders;
 
 import java.util.Locale;
 
+import org.apache.commons.io.Charsets;
 import org.apache.mahout.math.Vector;
 
-import com.google.common.base.Charsets;
-
 public class InteractionValueEncoder extends FeatureVectorEncoder {
   private final FeatureVectorEncoder firstEncoder;
   private final FeatureVectorEncoder secondEncoder;
@@ -88,7 +87,7 @@ public class InteractionValueEncoder extends 
FeatureVectorEncoder {
           int n = (k + j) % data.size();
           if (isTraceEnabled()) {
             trace(String.format("%s:%s", new String(originalForm1, Charsets.UTF_8), new String(originalForm2,
-               Charsets.UTF_8)), n);
+                Charsets.UTF_8)), n);
           }
           data.set(n, data.get(n) + w);
         }

http://git-wip-us.apache.org/repos/asf/mahout/blob/4cff5429/mr/src/main/java/org/apache/mahout/vectorizer/encoders/StaticWordValueEncoder.java
----------------------------------------------------------------------
diff --git 
a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/StaticWordValueEncoder.java
 
b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/StaticWordValueEncoder.java
index 6f67ef4..826f669 100644
--- 
a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/StaticWordValueEncoder.java
+++ 
b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/StaticWordValueEncoder.java
@@ -17,11 +17,11 @@
 
 package org.apache.mahout.vectorizer.encoders;
 
-import com.google.common.base.Charsets;
-
 import java.util.Collections;
 import java.util.Map;
 
+import org.apache.commons.io.Charsets;
+
 /**
  * Encodes a categorical values with an unbounded vocabulary.  Values are 
encoding by incrementing a
  * few locations in the output vector with a weight that is either defaulted 
to 1 or that is looked

http://git-wip-us.apache.org/repos/asf/mahout/blob/4cff5429/mr/src/main/java/org/apache/mahout/vectorizer/encoders/TextValueEncoder.java
----------------------------------------------------------------------
diff --git 
a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/TextValueEncoder.java 
b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/TextValueEncoder.java
index 87de095..943fbc8 100644
--- 
a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/TextValueEncoder.java
+++ 
b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/TextValueEncoder.java
@@ -17,16 +17,16 @@
 
 package org.apache.mahout.vectorizer.encoders;
 
-import com.google.common.base.Charsets;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.regex.Pattern;
+
 import com.google.common.base.Splitter;
 import com.google.common.collect.HashMultiset;
-import com.google.common.collect.Lists;
 import com.google.common.collect.Multiset;
+import org.apache.commons.io.Charsets;
 import org.apache.mahout.math.Vector;
 
-import java.util.Collection;
-import java.util.regex.Pattern;
-
 /**
  * Encodes text that is tokenized on non-alphanum separators.  Each word is 
encoded using a
  * settable encoder which is by default an StaticWordValueEncoder which gives 
all
@@ -98,7 +98,7 @@ public class TextValueEncoder extends FeatureVectorEncoder {
 
   @Override
   protected Iterable<Integer> hashesForProbe(byte[] originalForm, int dataSize, String name, int probe) {
-    Collection<Integer> hashes = Lists.newArrayList();
+    Collection<Integer> hashes = new ArrayList<>();
     for (String word : tokenize(new String(originalForm, Charsets.UTF_8))) {
       hashes.add(hashForProbe(bytesForString(word), dataSize, name, probe));
     }

http://git-wip-us.apache.org/repos/asf/mahout/blob/4cff5429/mr/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java
----------------------------------------------------------------------
diff --git 
a/mr/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java
 
b/mr/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java
index 1496c90..c878946 100644
--- 
a/mr/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java
+++ 
b/mr/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java
@@ -17,8 +17,12 @@
 
 package org.apache.mahout.vectorizer.term;
 
-import com.google.common.collect.Lists;
-import com.google.common.io.Closeables;
+import java.io.IOException;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.filecache.DistributedCache;
 import org.apache.hadoop.fs.Path;
@@ -42,11 +46,6 @@ import org.apache.mahout.math.map.OpenObjectIntHashMap;
 import org.apache.mahout.vectorizer.DictionaryVectorizer;
 import org.apache.mahout.vectorizer.common.PartialVectorMerger;
 
-import java.io.IOException;
-import java.net.URI;
-import java.util.Iterator;
-import java.util.List;
-
 /**
  * Converts a document in to a sparse vector
  */
@@ -68,7 +67,7 @@ public class TFPartialVectorReducer extends Reducer<Text, 
StringTuple, Text, Vec
       return;
     }
 
-    List<String> value = Lists.newArrayList();
+    List<String> value = new ArrayList<>();
 
     while (it.hasNext()) {
       value.addAll(it.next().getEntries());
@@ -77,9 +76,8 @@ public class TFPartialVectorReducer extends Reducer<Text, 
StringTuple, Text, Vec
     Vector vector = new RandomAccessSparseVector(dimension, value.size()); // guess at initial size
 
     if (maxNGramSize >= 2) {
-      ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.iterator()), maxNGramSize);
-      sf.reset();
-      try {
+      try (ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.iterator()), maxNGramSize)){
+        sf.reset();
         do {
           String term = sf.getAttribute(CharTermAttribute.class).toString();
           if (!term.isEmpty() && dictionary.containsKey(term)) { // ngram
@@ -87,10 +85,7 @@ public class TFPartialVectorReducer extends Reducer<Text, 
StringTuple, Text, Vec
             vector.setQuick(termId, vector.getQuick(termId) + 1);
           }
         } while (sf.incrementToken());
-
         sf.end();
-      } finally {
-        Closeables.close(sf, true);
       }
     } else {
       for (String term : value) {

http://git-wip-us.apache.org/repos/asf/mahout/blob/4cff5429/mr/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFConverter.java
----------------------------------------------------------------------
diff --git 
a/mr/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFConverter.java 
b/mr/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFConverter.java
index 5f9d666..3387255 100644
--- a/mr/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFConverter.java
+++ b/mr/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFConverter.java
@@ -17,8 +17,11 @@
 
 package org.apache.mahout.vectorizer.tfidf;
 
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
 import com.google.common.base.Preconditions;
-import com.google.common.collect.Lists;
 import com.google.common.io.Closeables;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.filecache.DistributedCache;
@@ -43,9 +46,6 @@ import 
org.apache.mahout.vectorizer.common.PartialVectorMerger;
 import org.apache.mahout.vectorizer.term.TermDocumentCountMapper;
 import org.apache.mahout.vectorizer.term.TermDocumentCountReducer;
 
-import java.io.IOException;
-import java.util.List;
-
 /**
  * This class converts a set of input vectors with term frequencies to TfIdf 
vectors. The Sequence file input
  * should have a {@link org.apache.hadoop.io.WritableComparable} key 
containing and a
@@ -118,7 +118,7 @@ public final class TFIDFConverter {
         "normPower must be > 1 and not infinite if log normalization is 
chosen", normPower);
 
     int partialVectorIndex = 0;
-    List<Path> partialVectorPaths = Lists.newArrayList();
+    List<Path> partialVectorPaths = new ArrayList<>();
     List<Path> dictionaryChunks = datasetFeatures.getSecond();
     for (Path dictionaryChunk : dictionaryChunks) {
       Path partialVectorOutputPath = new Path(output, VECTOR_OUTPUT_FOLDER + 
partialVectorIndex++);
@@ -195,7 +195,7 @@ public final class TFIDFConverter {
                                                                  Path 
dictionaryPathBase,
                                                                  Configuration 
baseConf,
                                                                  int 
chunkSizeInMegabytes) throws IOException {
-    List<Path> chunkPaths = Lists.newArrayList();
+    List<Path> chunkPaths = new ArrayList<>();
     Configuration conf = new Configuration(baseConf);
 
     FileSystem fs = FileSystem.get(featureCountPath.toUri(), conf);

http://git-wip-us.apache.org/repos/asf/mahout/blob/4cff5429/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java
----------------------------------------------------------------------
diff --git 
a/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java
 
b/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java
index 4446fef..4cb6946 100644
--- 
a/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java
+++ 
b/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java
@@ -17,16 +17,15 @@
 
 package org.apache.mahout.vectorizer.encoders;
 
+import java.util.Locale;
+
 import com.google.common.collect.ImmutableMap;
 import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
-import org.apache.lucene.util.Version;
 import org.apache.mahout.common.MahoutTestCase;
 import org.apache.mahout.math.DenseVector;
 import org.apache.mahout.math.Vector;
 import org.junit.Test;
 
-import java.util.Locale;
-
 public final class TextValueEncoderTest extends MahoutTestCase {
 
   @Test
@@ -41,7 +40,7 @@ public final class TextValueEncoderTest extends 
MahoutTestCase {
 
     // now some fancy weighting
     StaticWordValueEncoder w = new StaticWordValueEncoder("text");
-    w.setDictionary(ImmutableMap.<String, Double>of("word1", 3.0, "word2", 1.5));
+    w.setDictionary(ImmutableMap.of("word1", 3.0, "word2", 1.5));
     enc.setWordEncoder(w);
 
     // should set 6 locations to something
@@ -70,7 +69,7 @@ public final class TextValueEncoderTest extends 
MahoutTestCase {
   @Test
   public void testLuceneEncoding() throws Exception {
     LuceneTextValueEncoder enc = new LuceneTextValueEncoder("text");
-    enc.setAnalyzer(new WhitespaceAnalyzer(Version.LUCENE_46));
+    enc.setAnalyzer(new WhitespaceAnalyzer());
     Vector v1 = new DenseVector(200);
     enc.addToVector("test1 and more", v1);
     enc.flush(1, v1);
@@ -88,7 +87,8 @@ public final class TextValueEncoderTest extends 
MahoutTestCase {
 
     v1 = new DenseVector(200);
     StringBuilder builder = new StringBuilder(5000);
-    for (int i = 0; i < 1000; i++) {//lucene's internal buffer length request is 4096, so let's make sure we can handle larger size
+    for (int i = 0; i < 1000; i++) {
+      //lucene's internal buffer length request is 4096, so let's make sure we can handle larger size
       builder.append("token_").append(i).append(' ');
     }
     enc.addToVector(builder.toString(), v1);

http://git-wip-us.apache.org/repos/asf/mahout/blob/4cff5429/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 80b4a20..253f31b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -115,8 +115,8 @@
     <mfindbugs.version>2.5.2</mfindbugs.version>
     <mjavadoc.version>2.9.1</mjavadoc.version>
     <hbase.version>1.0.0</hbase.version>
-    <lucene.version>4.6.1</lucene.version>
-    <slf4j.version>1.7.10</slf4j.version>
+    <lucene.version>4.10.4</lucene.version>
+    <slf4j.version>1.7.12</slf4j.version>
     <scala.compat.version>2.10</scala.compat.version>
     <scala.version>2.10.4</scala.version>
     <spark.version>1.1.1</spark.version>

mahout git commit: MAHOUT-1649: Upgrade to be Lucene 4.10.4 compatible, patch from Frank Scholten

Reply via email to