[19/52] [partial] mahout git commit: removed all files except for website directory

vanstee Tue, 27 Jun 2017 09:14:45 -0700

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/utils/SplitInputTest.java
----------------------------------------------------------------------
diff --git 
a/integration/src/test/java/org/apache/mahout/utils/SplitInputTest.java 
b/integration/src/test/java/org/apache/mahout/utils/SplitInputTest.java
deleted file mode 100644
index 7ffa690..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/SplitInputTest.java
+++ /dev/null
@@ -1,418 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils;
-
-import java.io.BufferedWriter;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.nio.charset.Charset;
-
-import com.google.common.io.Closeables;
-import org.apache.commons.io.Charsets;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.mahout.classifier.ClassifierData;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
-import 
org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterable;
-import org.apache.mahout.math.SequentialAccessSparseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.math.map.OpenObjectIntHashMap;
-import org.junit.Before;
-import org.junit.Test;
-
-public final class SplitInputTest extends MahoutTestCase {
-
-  private OpenObjectIntHashMap<String> countMap;
-  private Charset charset;
-  private FileSystem fs;
-  private Path tempInputFile;
-  private Path tempTrainingDirectory;
-  private Path tempTestDirectory;
-  private Path tempMapRedOutputDirectory;
-  private Path tempInputDirectory;
-  private Path tempSequenceDirectory;
-  private SplitInput si;
-
-  @Override
-  @Before
-  public void setUp() throws Exception {
-    Configuration conf = getConfiguration();
-    fs = FileSystem.get(conf);
-
-    super.setUp();
-
-    countMap = new OpenObjectIntHashMap<>();
-
-    charset = Charsets.UTF_8;
-    tempSequenceDirectory = getTestTempFilePath("tmpsequence");
-    tempInputFile = getTestTempFilePath("bayesinputfile");
-    tempTrainingDirectory = getTestTempDirPath("bayestrain");
-    tempTestDirectory = getTestTempDirPath("bayestest");
-    tempMapRedOutputDirectory = new Path(getTestTempDirPath(), "mapRedOutput");
-    tempInputDirectory = getTestTempDirPath("bayesinputdir");
-
-    si = new SplitInput();
-    si.setTrainingOutputDirectory(tempTrainingDirectory);
-    si.setTestOutputDirectory(tempTestDirectory);
-    si.setInputDirectory(tempInputDirectory);
-  }
-
-  private void writeMultipleInputFiles() throws IOException {
-    Writer writer = null;
-    String currentLabel = null;
-    try {
-     for (String[] entry : ClassifierData.DATA) {
-      if (!entry[0].equals(currentLabel)) {
-        currentLabel = entry[0];
-        Closeables.close(writer, false);
-
-        writer = new BufferedWriter(new OutputStreamWriter(fs.create(new 
Path(tempInputDirectory, currentLabel)),
-            Charsets.UTF_8));
-      }
-      countMap.adjustOrPutValue(currentLabel, 1, 1);
-      writer.write(currentLabel + '\t' + entry[1] + '\n');
-     }
-    }finally {
-     Closeables.close(writer, false);
-    }
-  }
-
-  private void writeSingleInputFile() throws IOException {
-    Writer writer = new BufferedWriter(new 
OutputStreamWriter(fs.create(tempInputFile), Charsets.UTF_8));
-    try {
-      for (String[] entry : ClassifierData.DATA) {
-        writer.write(entry[0] + '\t' + entry[1] + '\n');
-      }
-    } finally {
-      Closeables.close(writer, true);
-    }
-  }
-
-  @Test
-  public void testSplitDirectory() throws Exception {
-
-    writeMultipleInputFiles();
-
-    final int testSplitSize = 1;
-    si.setTestSplitSize(testSplitSize);
-    si.setCallback(new SplitInput.SplitCallback() {
-          @Override
-          public void splitComplete(Path inputFile, int lineCount, int 
trainCount, int testCount, int testSplitStart) {
-            int trainingLines = countMap.get(inputFile.getName()) - 
testSplitSize;
-            assertSplit(fs, inputFile, charset, testSplitSize, trainingLines, 
tempTrainingDirectory, tempTestDirectory);
-          }
-    });
-
-    si.splitDirectory(tempInputDirectory);
-  }
-
-  @Test
-  public void testSplitFile() throws Exception {
-    writeSingleInputFile();
-    si.setTestSplitSize(2);
-    si.setCallback(new TestCallback(2, 10));
-    si.splitFile(tempInputFile);
-  }
-
-  @Test
-  public void testSplitFileLocation() throws Exception {
-    writeSingleInputFile();
-    si.setTestSplitSize(2);
-    si.setSplitLocation(50);
-    si.setCallback(new TestCallback(2, 10));
-    si.splitFile(tempInputFile);
-  }
-
-  @Test
-  public void testSplitFilePct() throws Exception {
-    writeSingleInputFile();
-    si.setTestSplitPct(25);
-
-    si.setCallback(new TestCallback(3, 9));
-    si.splitFile(tempInputFile);
-  }
-
-  @Test
-  public void testSplitFilePctLocation() throws Exception {
-    writeSingleInputFile();
-    si.setTestSplitPct(25);
-    si.setSplitLocation(50);
-    si.setCallback(new TestCallback(3, 9));
-    si.splitFile(tempInputFile);
-  }
-
-  @Test
-  public void testSplitFileRandomSelectionSize() throws Exception {
-    writeSingleInputFile();
-    si.setTestRandomSelectionSize(5);
-
-    si.setCallback(new TestCallback(5, 7));
-    si.splitFile(tempInputFile);
-  }
-
-  @Test
-  public void testSplitFileRandomSelectionPct() throws Exception {
-    writeSingleInputFile();
-    si.setTestRandomSelectionPct(25);
-
-    si.setCallback(new TestCallback(3, 9));
-    si.splitFile(tempInputFile);
-  }
-
-  /**
-   * Create a Sequencefile for testing consisting of IntWritable
-   * keys and VectorWritable values
-   * @param path path for test SequenceFile
-   * @param testPoints number of records in test SequenceFile
-   */
-  private void writeVectorSequenceFile(Path path, int testPoints) throws 
IOException {
-    Path tempSequenceFile = new Path(path, "part-00000");
-    Configuration conf = getConfiguration();
-    IntWritable key = new IntWritable();
-    VectorWritable value = new VectorWritable();
-    try (SequenceFile.Writer writer =
-             SequenceFile.createWriter(fs, conf, tempSequenceFile, 
IntWritable.class, VectorWritable.class)) {
-      for (int i = 0; i < testPoints; i++) {
-        key.set(i);
-        Vector v = new SequentialAccessSparseVector(4);
-        v.assign(i);
-        value.set(v);
-        writer.append(key, value);
-      }
-    }
-  }
-
-  /**
-   * Create a Sequencefile for testing consisting of IntWritable keys and Text 
values
-   * @param path path for test SequenceFile
-   * @param testPoints number of records in test SequenceFile
-   */
-  private void writeTextSequenceFile(Path path, int testPoints) throws 
IOException {
-    Path tempSequenceFile = new Path(path, "part-00000");
-    Configuration conf = getConfiguration();
-    Text key = new Text();
-    Text value = new Text();
-    try (SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, 
tempSequenceFile, Text.class, Text.class)){
-      for (int i = 0; i < testPoints; i++) {
-        key.set(Integer.toString(i));
-        value.set("Line " + i);
-        writer.append(key, value);
-      }
-    }
-  }
-
-  /**
-   * Display contents of a SequenceFile
-   * @param sequenceFilePath path to SequenceFile
-   */
-  private void displaySequenceFile(Path sequenceFilePath) throws IOException {
-    for (Pair<?,?> record : new SequenceFileIterable<>(sequenceFilePath, true, 
getConfiguration())) {
-      System.out.println(record.getFirst() + "\t" + record.getSecond());
-    }
-  }
-
-  /**
-   * Determine number of records in a SequenceFile
-   * @param sequenceFilePath path to SequenceFile
-   * @return number of records
-   */
-  private int getNumberRecords(Path sequenceFilePath) throws IOException {
-    int numberRecords = 0;
-    for (Object value : new SequenceFileValueIterable<>(sequenceFilePath, 
true, getConfiguration())) {
-      numberRecords++;
-    }
-    return numberRecords;
-  }
-
-  /**
-   * Test map reduce version of split input with Text, Text key value
-   * pairs in input
-   */
-  @Test
-  public void testSplitInputMapReduceText() throws Exception {
-    writeTextSequenceFile(tempSequenceDirectory, 1000);
-    testSplitInputMapReduce(1000);
-  }
-
-  /** Test map reduce version of split input with Text, Text key value pairs 
in input called from command line */
-  @Test
-  public void testSplitInputMapReduceTextCli() throws Exception {
-    writeTextSequenceFile(tempSequenceDirectory, 1000);
-    testSplitInputMapReduceCli(1000);
-  }
-
-  /**
-   * Test map reduce version of split input with IntWritable, Vector key value
-   * pairs in input
-   */
-  @Test
-  public void testSplitInputMapReduceVector() throws Exception {
-    writeVectorSequenceFile(tempSequenceDirectory, 1000);
-    testSplitInputMapReduce(1000);
-  }
-
-  /**
-   * Test map reduce version of split input with IntWritable, Vector key value
-   * pairs in input called from command line
-   */
-  @Test
-  public void testSplitInputMapReduceVectorCli() throws Exception {
-    writeVectorSequenceFile(tempSequenceDirectory, 1000);
-    testSplitInputMapReduceCli(1000);
-  }
-
-  /**
-   * Test map reduce version of split input through CLI
-   */
-  private void testSplitInputMapReduceCli(int numPoints) throws Exception {
-    int randomSelectionPct = 25;
-    int keepPct = 10;
-    String[] args =
-        { "--method", "mapreduce", "--input", tempSequenceDirectory.toString(),
-            "--mapRedOutputDir", tempMapRedOutputDirectory.toString(),
-            "--randomSelectionPct", Integer.toString(randomSelectionPct),
-            "--keepPct", Integer.toString(keepPct), "-ow" };
-    ToolRunner.run(getConfiguration(), new SplitInput(), args);
-    validateSplitInputMapReduce(numPoints, randomSelectionPct, keepPct);
-  }
-
-  /**
-   * Test map reduce version of split input through method call
-   */
-  private void testSplitInputMapReduce(int numPoints) throws Exception {
-    int randomSelectionPct = 25;
-    si.setTestRandomSelectionPct(randomSelectionPct);
-    int keepPct = 10;
-    si.setKeepPct(keepPct);
-    si.setMapRedOutputDirectory(tempMapRedOutputDirectory);
-    si.setUseMapRed(true);
-    si.splitDirectory(getConfiguration(), tempSequenceDirectory);
-
-    validateSplitInputMapReduce(numPoints, randomSelectionPct, keepPct);
-  }
-
-  /**
-   * Validate that number of test records and number of training records
-   * are consistant with keepPct and randomSelectionPct
-   */
-  private void validateSplitInputMapReduce(int numPoints, int 
randomSelectionPct, int keepPct) throws IOException {
-    Path testPath = new Path(tempMapRedOutputDirectory, "test-r-00000");
-    Path trainingPath = new Path(tempMapRedOutputDirectory, 
"training-r-00000");
-    int numberTestRecords = getNumberRecords(testPath);
-    int numberTrainingRecords = getNumberRecords(trainingPath);
-    System.out.printf("Test data: %d records\n", numberTestRecords);
-    displaySequenceFile(testPath);
-    System.out.printf("Training data: %d records\n", numberTrainingRecords);
-    displaySequenceFile(trainingPath);
-    assertEquals((randomSelectionPct / 100.0) * (keepPct / 100.0) * numPoints,
-        numberTestRecords, 2);
-    assertEquals(
-        (1 - randomSelectionPct / 100.0) * (keepPct / 100.0) * numPoints,
-        numberTrainingRecords, 2);
-  }
-
-  @Test
-  public void testValidate() throws Exception {
-    SplitInput st = new SplitInput();
-    assertValidateException(st);
-
-    st.setTestSplitSize(100);
-    assertValidateException(st);
-
-    st.setTestOutputDirectory(tempTestDirectory);
-    assertValidateException(st);
-
-    st.setTrainingOutputDirectory(tempTrainingDirectory);
-    st.validate();
-
-    st.setTestSplitPct(50);
-    assertValidateException(st);
-
-    st = new SplitInput();
-    st.setTestRandomSelectionPct(50);
-    st.setTestOutputDirectory(tempTestDirectory);
-    st.setTrainingOutputDirectory(tempTrainingDirectory);
-    st.validate();
-
-    st.setTestSplitPct(50);
-    assertValidateException(st);
-
-    st = new SplitInput();
-    st.setTestRandomSelectionPct(50);
-    st.setTestOutputDirectory(tempTestDirectory);
-    st.setTrainingOutputDirectory(tempTrainingDirectory);
-    st.validate();
-
-    st.setTestSplitSize(100);
-    assertValidateException(st);
-  }
-
-  private class TestCallback implements SplitInput.SplitCallback {
-    private final int testSplitSize;
-    private final int trainingLines;
-
-    private TestCallback(int testSplitSize, int trainingLines) {
-      this.testSplitSize = testSplitSize;
-      this.trainingLines = trainingLines;
-    }
-
-    @Override
-    public void splitComplete(Path inputFile, int lineCount, int trainCount, 
int testCount, int testSplitStart) {
-      assertSplit(fs, tempInputFile, charset, testSplitSize, trainingLines, 
tempTrainingDirectory, tempTestDirectory);
-    }
-  }
-
-  private static void assertValidateException(SplitInput st) throws 
IOException {
-    try {
-      st.validate();
-      fail("Expected IllegalArgumentException");
-    } catch (IllegalArgumentException iae) {
-      // good
-    }
-  }
-
-  private static void assertSplit(FileSystem fs,
-                                  Path tempInputFile,
-                                  Charset charset,
-                                  int testSplitSize,
-                                  int trainingLines,
-                                  Path tempTrainingDirectory,
-                                  Path tempTestDirectory) {
-
-    try {
-      Path testFile = new Path(tempTestDirectory, tempInputFile.getName());
-      //assertTrue("test file exists", testFile.isFile());
-      assertEquals("test line count", testSplitSize, SplitInput.countLines(fs, 
testFile, charset));
-
-      Path trainingFile = new Path(tempTrainingDirectory, 
tempInputFile.getName());
-      //assertTrue("training file exists", trainingFile.isFile());
-      assertEquals("training line count", trainingLines, 
SplitInput.countLines(fs, trainingFile, charset));
-    } catch (IOException ioe) {
-      fail(ioe.toString());
-    }
-  }
-}


http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/utils/email/MailProcessorTest.java
----------------------------------------------------------------------
diff --git 
a/integration/src/test/java/org/apache/mahout/utils/email/MailProcessorTest.java
 
b/integration/src/test/java/org/apache/mahout/utils/email/MailProcessorTest.java
deleted file mode 100644
index c519f85..0000000
--- 
a/integration/src/test/java/org/apache/mahout/utils/email/MailProcessorTest.java
+++ /dev/null
@@ -1,72 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.email;
-
-import java.io.File;
-import java.io.StringWriter;
-import java.net.URL;
-import java.util.regex.Pattern;
-
-import org.apache.commons.io.Charsets;
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Test;
-
-public final class MailProcessorTest extends MahoutTestCase {
-
-  @Test
-  public void testLabel() throws Exception {
-    StringWriter writer = new StringWriter();
-    MailOptions options = new MailOptions();
-    options.setSeparator(":::");
-    options.setCharset(Charsets.UTF_8);
-        options.setPatternsToMatch(new Pattern[]{
-        MailProcessor.FROM_PREFIX, MailProcessor.SUBJECT_PREFIX, 
MailProcessor.TO_PREFIX});
-    options.setInput(new File(System.getProperty("user.dir")));
-    MailProcessor proc = new MailProcessor(options, "", writer);
-    URL url = 
MailProcessorTest.class.getClassLoader().getResource("test.mbox");
-    File file = new File(url.toURI());
-    long count = proc.parseMboxLineByLine(file);
-    assertEquals(7, count);
-  }
-
-  @Test
-  public void testStripQuoted() throws Exception {
-    StringWriter writer = new StringWriter();
-    MailOptions options = new MailOptions();
-    options.setSeparator(":::");
-    options.setCharset(Charsets.UTF_8);
-        options.setPatternsToMatch(new Pattern[]{
-        MailProcessor.SUBJECT_PREFIX});
-    options.setInput(new File(System.getProperty("user.dir")));
-    options.setIncludeBody(true);
-    MailProcessor proc = new MailProcessor(options, "", writer);
-    URL url = 
MailProcessorTest.class.getClassLoader().getResource("test.mbox");
-    File file = new File(url.toURI());
-    long count = proc.parseMboxLineByLine(file);
-    assertEquals(7, count);
-    assertTrue(writer.getBuffer().toString().contains("> Cocoon Cron Block 
Configurable Clustering"));
-    writer = new StringWriter();
-    proc = new MailProcessor(options, "", writer);
-    options.setStripQuotedText(true);
-    count = proc.parseMboxLineByLine(file);
-    assertEquals(7, count);
-    assertFalse(writer.getBuffer().toString().contains("> Cocoon Cron Block 
Configurable Clustering"));
-
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
----------------------------------------------------------------------
diff --git 
a/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
 
b/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
deleted file mode 100644
index 4fdbbbc..0000000
--- 
a/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
+++ /dev/null
@@ -1,154 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.mahout.utils.nlp.collocations.llr;
-
-import java.io.IOException;
-import java.io.Reader;
-import java.io.StringReader;
-import java.nio.ByteBuffer;
-import java.nio.CharBuffer;
-import java.nio.charset.CharsetEncoder;
-
-import org.apache.commons.io.Charsets;
-import org.apache.hadoop.util.bloom.BloomFilter;
-import org.apache.hadoop.util.bloom.Filter;
-import org.apache.hadoop.util.bloom.Key;
-import org.apache.hadoop.util.hash.Hash;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
-import org.apache.lucene.analysis.shingle.ShingleFilter;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Test;
-
-public final class BloomTokenFilterTest extends MahoutTestCase {
-  
-  private static final CharsetEncoder encoder = Charsets.UTF_8.newEncoder();
-
-  private static final String input = "The best of times the worst of times";
-  private static final String[] allTokens = {
-      "The", "best", "of", "times", "the", "worst", "of", "times"
-  };
-  private static final String[] expectedNonKeepTokens = { "best", "times", 
"the", "worst", "times" };
-  private static final String[] expectedKeepTokens = { "The", "of", "of" };
-  private static final String[] filterTokens    = { "The", "of" };
-  private static final String[] notFilterTokens = { "best", "worst", "the", 
"times"};
-  private static final String[] shingleKeepTokens = {
-      "The best", "best of times", "the worst", "worst of times", "of times"
-  };
-  private static final String[] expectedShingleTokens = {
-      "The best", "best of times", "of times", "the worst", "worst of times", 
"of times"
-  };
-  
-  /** test standalone filter without tokenfilter wrapping */
-  @Test
-  public void testFilter() throws IOException {
-    Filter filter = getFilter(filterTokens);
-    Key k = new Key();
-    for (String s: filterTokens) {
-      setKey(k,s);
-      assertTrue("Key for string " + s + " should be filter member", 
filter.membershipTest(k));
-    }
-    
-    for (String s: notFilterTokens)  {
-      setKey(k,s);
-      assertFalse("Key for string " + s + " should not be filter member", 
filter.membershipTest(k));
-    }
-  }
-  
-  /** normal case, unfiltered analyzer */
-  @Test
-  public void testAnalyzer() throws IOException {
-    Reader reader = new StringReader(input);
-    Analyzer analyzer = new WhitespaceAnalyzer();
-    TokenStream ts = analyzer.tokenStream(null, reader);
-    ts.reset();
-    validateTokens(allTokens, ts);
-    ts.end();
-    ts.close();
-  }
-  
-  /** filtered analyzer */
-  @Test
-  public void testNonKeepdAnalyzer() throws IOException {
-    Reader reader = new StringReader(input);
-    Analyzer analyzer = new WhitespaceAnalyzer();
-    TokenStream ts = analyzer.tokenStream(null, reader);
-    ts.reset();
-    TokenStream f = new BloomTokenFilter(getFilter(filterTokens), false /* 
toss matching tokens */, ts);
-    validateTokens(expectedNonKeepTokens, f);
-    ts.end();
-    ts.close();
-  }
-
-  /** keep analyzer */
-  @Test
-  public void testKeepAnalyzer() throws IOException {
-    Reader reader = new StringReader(input);
-    Analyzer analyzer = new WhitespaceAnalyzer();
-    TokenStream ts = analyzer.tokenStream(null, reader);
-    ts.reset();
-    TokenStream f = new BloomTokenFilter(getFilter(filterTokens), true /* keep 
matching tokens */, ts);
-    validateTokens(expectedKeepTokens, f);
-    ts.end();
-    ts.close();
-  }
-  
-  /** shingles, keep those matching whitelist */
-  @Test
-  public void testShingleFilteredAnalyzer() throws IOException {
-    Reader reader = new StringReader(input);
-    Analyzer analyzer = new WhitespaceAnalyzer();
-    TokenStream ts = analyzer.tokenStream(null, reader);
-    ts.reset();
-    ShingleFilter sf = new ShingleFilter(ts, 3);
-    TokenStream f = new BloomTokenFilter(getFilter(shingleKeepTokens),  true, 
sf);
-    validateTokens(expectedShingleTokens, f);
-    ts.end();
-    ts.close();
-  }
-  
-  private static void setKey(Key k, String s) throws IOException {
-    ByteBuffer buffer = encoder.encode(CharBuffer.wrap(s.toCharArray()));
-    k.set(buffer.array(), 1.0);
-  }
-  
-  private static void validateTokens(String[] expected, TokenStream ts) throws 
IOException {
-    int pos = 0;
-    while (ts.incrementToken()) {
-      assertTrue("Analyzer produced too many tokens", pos <= expected.length);
-      CharTermAttribute termAttr = ts.getAttribute(CharTermAttribute.class);
-      assertEquals("Unexpected term", expected[pos++], termAttr.toString());
-    }
-    assertEquals("Analyzer produced too few terms", expected.length, pos);
-  }
-
-  private static Filter getFilter(String[] tokens) throws IOException {
-    Filter filter = new BloomFilter(100,50, Hash.JENKINS_HASH);
-    Key k = new Key();
-    for (String s: tokens) {
-      setKey(k,s);
-      filter.add(k);
-    }
-    return filter;
-  }
-  
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/utils/regex/RegexMapperTest.java
----------------------------------------------------------------------
diff --git 
a/integration/src/test/java/org/apache/mahout/utils/regex/RegexMapperTest.java 
b/integration/src/test/java/org/apache/mahout/utils/regex/RegexMapperTest.java
deleted file mode 100644
index 8ab643b..0000000
--- 
a/integration/src/test/java/org/apache/mahout/utils/regex/RegexMapperTest.java
+++ /dev/null
@@ -1,104 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.regex;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.mahout.common.DummyRecordWriter;
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Test;
-
-import java.util.List;
-
-public final class RegexMapperTest extends MahoutTestCase {
-
-  @Test
-  public void testRegex() throws Exception {
-    RegexMapper mapper = new RegexMapper();
-    Configuration conf = getConfiguration();
-    conf.set(RegexMapper.REGEX, "(?<=(\\?|&)q=).*?(?=&|$)");
-    conf.set(RegexMapper.TRANSFORMER_CLASS, 
URLDecodeTransformer.class.getName());
-    DummyRecordWriter<LongWritable, Text> mapWriter = new 
DummyRecordWriter<>();
-    Mapper<LongWritable, Text, LongWritable, Text>.Context mapContext = 
DummyRecordWriter
-            .build(mapper, conf, mapWriter);
-
-    mapper.setup(mapContext);
-    for (int i = 0; i < RegexUtilsTest.TEST_STRS.length; i++) {
-      String testStr = RegexUtilsTest.TEST_STRS[i];
-
-      LongWritable key = new LongWritable(i);
-      mapper.map(key, new Text(testStr), mapContext);
-      List<Text> value = mapWriter.getValue(key);
-      if (!RegexUtilsTest.GOLD[i].isEmpty()) {
-        assertEquals(1, value.size());
-        assertEquals(RegexUtilsTest.GOLD[i], value.get(0).toString());
-      }
-    }
-  }
-
-  @Test
-  public void testGroups() throws Exception {
-    RegexMapper mapper = new RegexMapper();
-    Configuration conf = getConfiguration();
-    conf.set(RegexMapper.REGEX, "(\\d+)\\.(\\d+)\\.(\\d+)");
-    conf.set(RegexMapper.TRANSFORMER_CLASS, 
URLDecodeTransformer.class.getName());
-    conf.setStrings(RegexMapper.GROUP_MATCHERS, "1", "3");
-    DummyRecordWriter<LongWritable, Text> mapWriter = new 
DummyRecordWriter<>();
-    Mapper<LongWritable, Text, LongWritable, Text>.Context mapContext = 
DummyRecordWriter
-            .build(mapper, conf, mapWriter);
-
-    mapper.setup(mapContext);
-    for (int i = 0; i < RegexUtilsTest.TEST_STRS.length; i++) {
-      String testStr = RegexUtilsTest.TEST_STRS[i];
-
-      LongWritable key = new LongWritable(i);
-      mapper.map(key, new Text(testStr), mapContext);
-      List<Text> value = mapWriter.getValue(key);
-      assertEquals(1, value.size());
-      assertEquals("127 0", value.get(0).toString());
-    }
-  }
-
-  @Test
-  public void testFPGFormatter() throws Exception {
-    RegexMapper mapper = new RegexMapper();
-    Configuration conf = getConfiguration();
-    conf.set(RegexMapper.REGEX, "(?<=(\\?|&)q=).*?(?=&|$)");
-    conf.set(RegexMapper.TRANSFORMER_CLASS, 
URLDecodeTransformer.class.getName());
-    conf.set(RegexMapper.FORMATTER_CLASS, FPGFormatter.class.getName());
-    DummyRecordWriter<LongWritable, Text> mapWriter = new 
DummyRecordWriter<>();
-    Mapper<LongWritable, Text, LongWritable, Text>.Context mapContext = 
DummyRecordWriter
-            .build(mapper, conf, mapWriter);
-
-    mapper.setup(mapContext);
-    RegexFormatter formatter = new FPGFormatter();
-    for (int i = 0; i < RegexUtilsTest.TEST_STRS.length; i++) {
-      String testStr = RegexUtilsTest.TEST_STRS[i];
-
-      LongWritable key = new LongWritable(i);
-      mapper.map(key, new Text(testStr), mapContext);
-      List<Text> value = mapWriter.getValue(key);
-      if (!RegexUtilsTest.GOLD[i].isEmpty()) {
-        assertEquals(1, value.size());
-        assertEquals(formatter.format(RegexUtilsTest.GOLD[i]), 
value.get(0).toString());
-      }
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java
----------------------------------------------------------------------
diff --git 
a/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java 
b/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java
deleted file mode 100644
index 8ae10a5..0000000
--- 
a/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.regex;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.regex.Pattern;
-
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Test;
-
-public final class RegexUtilsTest extends MahoutTestCase {
-
-  static final String[] TEST_STRS = {
-          "127.0.0.1 -  -  [01/10/2011:00:01:51 +0000] \"GET 
/solr/collection1/browse?q=foo&rows=10&wt=json&hl=true&hl.fl=body&hl.fl=content",
-          "127.0.0.1 -  -  [01/10/2011:00:20:58 +0000] \"GET 
/solr/collection1/browse?q=Using+Solr+Search+RDBMS&fq=%7B%21tag%3Dsource%7D%28%28source%3Alucid+AND+lucid_facet%3A%28site%29%29%29&rows=10",
-          "127.0.0.1 -  -  [01/10/2011:00:21:21 +0000] \"GET 
/solr/collection1/browse?q=language+detection&start=560&rows=10 HTTP/1.1\" 200 
45071",
-          "127.0.0.1 -  -  [01/10/2011:00:21:21 +0000] \"GET 
/solr/collection1/browse?q=&start=560&rows=10 HTTP/1.1\" 200 45071"
-  };
-  static final String[] GOLD = {"foo", "Using Solr Search RDBMS", "language 
detection", ""};
-
-  @Test
-  public void testExtract() throws Exception {
-    Pattern pattern = Pattern.compile("(?<=(\\?|&)q=).*?(?=&|$)");
-    String line = "127.0.0.1 -  -  [24/05/2010:01:19:22 +0000] \"GET 
/solr/select?q=import statement&start=1 HTTP/1.1\" 200 37571";
-    String res = RegexUtils.extract(line, pattern, 
Collections.<Integer>emptyList(), " ", RegexUtils.IDENTITY_TRANSFORMER);
-    assertEquals(res, "import statement", res);
-
-    for (int i = 0; i < TEST_STRS.length; i++) {
-      String testStr = TEST_STRS[i];
-      res = RegexUtils.extract(testStr, pattern, 
Collections.<Integer>emptyList(), " ", new URLDecodeTransformer());
-      assertEquals(GOLD[i], res);
-    }
-
-    pattern = 
Pattern.compile("((?<=(\\?|&)q=)(.*?)(?=(&|$))|(?<=((\\?|&)start=))(\\d+))");
-    res = RegexUtils.extract(line, pattern, Collections.<Integer>emptyList(), 
" ", RegexUtils.IDENTITY_TRANSFORMER);
-    assertEquals(res, "import statement 1", res);
-
-    pattern = Pattern.compile("(start=1) HTTP");
-    Collection<Integer> groupsToKeep = new ArrayList<>();
-    groupsToKeep.add(1);
-    res = RegexUtils.extract(line, pattern, groupsToKeep, " ", 
RegexUtils.IDENTITY_TRANSFORMER);
-    assertEquals(res, "start=1", res);
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/utils/vectors/RandomVectorIterable.java
----------------------------------------------------------------------
diff --git 
a/integration/src/test/java/org/apache/mahout/utils/vectors/RandomVectorIterable.java
 
b/integration/src/test/java/org/apache/mahout/utils/vectors/RandomVectorIterable.java
deleted file mode 100644
index 2ddce14..0000000
--- 
a/integration/src/test/java/org/apache/mahout/utils/vectors/RandomVectorIterable.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors;
-
-import java.util.Iterator;
-import java.util.Random;
-
-import com.google.common.base.Function;
-import com.google.common.collect.Iterators;
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.common.iterator.CountingIterator;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.RandomAccessSparseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.function.DoubleFunction;
-
-public final class RandomVectorIterable implements Iterable<Vector> {
-
-  public enum VectorType {DENSE, SPARSE}
-
-  private final int numItems;
-  private final VectorType type;
-  
-  public RandomVectorIterable() {
-    this(100, VectorType.SPARSE);
-  }
-  
-  public RandomVectorIterable(int numItems) {
-    this(numItems, VectorType.SPARSE);
-  }
-  
-  public RandomVectorIterable(int numItems, VectorType type) {
-    this.numItems = numItems;
-    this.type = type;
-  }
-  
-  @Override
-  public Iterator<Vector> iterator() {
-    return Iterators.transform(
-        new CountingIterator(numItems),
-        new Function<Integer, Vector>() {
-          private final Random random = RandomUtils.getRandom();
-          @Override
-          public Vector apply(Integer dummy) {
-            Vector result =
-                type == VectorType.SPARSE ? new 
RandomAccessSparseVector(numItems) : new DenseVector(numItems);
-            result.assign(new DoubleFunction() {
-              @Override
-              public double apply(double ignored) {
-                return random.nextDouble();
-              }
-            });
-            return result;
-          }
-        });
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java
----------------------------------------------------------------------
diff --git 
a/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java
 
b/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java
deleted file mode 100644
index c55fd8d..0000000
--- 
a/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors;
-
-import java.util.Random;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Text;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.math.SequentialAccessSparseVector;
-import org.apache.mahout.math.Vector;
-import org.junit.Before;
-import org.junit.Test;
-
-public final class VectorHelperTest extends MahoutTestCase {
-
-  private static final int NUM_DOCS = 100;
-
-  private Path inputPathOne;
-  private Path inputPathTwo;
-
-  private Configuration conf;
-
-  @Override
-  @Before
-  public void setUp() throws Exception {
-    super.setUp();
-    conf = getConfiguration();
-
-    inputPathOne = getTestTempFilePath("documents/docs-one.file");
-    FileSystem fs = FileSystem.get(inputPathOne.toUri(), conf);
-    try (SequenceFile.Writer writer =
-             new SequenceFile.Writer(fs, conf, inputPathOne, Text.class, 
IntWritable.class)) {
-      Random rd = RandomUtils.getRandom();
-      for (int i = 0; i < NUM_DOCS; i++) {
-        // Make all indices higher than dictionary size
-        writer.append(new Text("Document::ID::" + i), new IntWritable(NUM_DOCS 
+ rd.nextInt(NUM_DOCS)));
-      }
-    }
-
-    inputPathTwo = getTestTempFilePath("documents/docs-two.file");
-    fs = FileSystem.get(inputPathTwo.toUri(), conf);
-    try (SequenceFile.Writer writer =
-             new SequenceFile.Writer(fs, conf, inputPathTwo, Text.class, 
IntWritable.class)) {
-      Random rd = RandomUtils.getRandom();
-      for (int i = 0; i < NUM_DOCS; i++) {
-        // Keep indices within number of documents
-        writer.append(new Text("Document::ID::" + i), new 
IntWritable(rd.nextInt(NUM_DOCS)));
-      }
-    }
-  }
-
-  @Test
-  public void testJsonFormatting() throws Exception {
-    Vector v = new SequentialAccessSparseVector(10);
-    v.set(2, 3.1);
-    v.set(4, 1.0);
-    v.set(6, 8.1);
-    v.set(7, -100);
-    v.set(9, 12.2);
-    String UNUSED = "UNUSED";
-    String[] dictionary = {
-        UNUSED, UNUSED, "two", UNUSED, "four", UNUSED, "six", "seven", UNUSED, 
"nine"
-    };
-
-    assertEquals("sorted json form incorrect: ", "{nine:12.2,six:8.1,two:3.1}",
-        VectorHelper.vectorToJson(v, dictionary, 3, true));
-    assertEquals("unsorted form incorrect: ", "{two:3.1,four:1.0}",
-        VectorHelper.vectorToJson(v, dictionary, 2, false));
-    assertEquals("sorted json form incorrect: ", 
"{nine:12.2,six:8.1,two:3.1,four:1.0}",
-        VectorHelper.vectorToJson(v, dictionary, 4, true));
-    assertEquals("sorted json form incorrect: ", 
"{nine:12.2,six:8.1,two:3.1,four:1.0,seven:-100.0}",
-        VectorHelper.vectorToJson(v, dictionary, 5, true));
-    assertEquals("sorted json form incorrect: ", "{nine:12.2,six:8.1}",
-        VectorHelper.vectorToJson(v, dictionary, 2, true));
-    assertEquals("unsorted form incorrect: ", "{two:3.1,four:1.0}",
-        VectorHelper.vectorToJson(v, dictionary, 2, false));
-  }
-
-  @Test
-  public void testTopEntries() throws Exception {
-    Vector v = new SequentialAccessSparseVector(10);
-    v.set(2, 3.1);
-    v.set(4, 1.0);
-    v.set(6, 8.1);
-    v.set(7, -100);
-    v.set(9, 12.2);
-    v.set(1, 0.0);
-    v.set(3, 0.0);
-    v.set(8, 2.7);
-    // check if sizeOFNonZeroElementsInVector = maxEntries
-    assertEquals(6, VectorHelper.topEntries(v, 6).size());
-    // check if sizeOfNonZeroElementsInVector < maxEntries
-    assertTrue(VectorHelper.topEntries(v, 9).size() < 9);
-    // check if sizeOfNonZeroElementsInVector > maxEntries
-    assertTrue(VectorHelper.topEntries(v, 5).size() < 
v.getNumNonZeroElements());
-  }
-
-  @Test
-  public void testTopEntriesWhenAllZeros() throws Exception {
-    Vector v = new SequentialAccessSparseVector(10);
-    v.set(2, 0.0);
-    v.set(4, 0.0);
-    v.set(6, 0.0);
-    v.set(7, 0);
-    v.set(9, 0.0);
-    v.set(1, 0.0);
-    v.set(3, 0.0);
-    v.set(8, 0.0);
-    assertEquals(0, VectorHelper.topEntries(v, 6).size());
-  }
-
-  @Test
-  public void testLoadTermDictionary() throws Exception {
-    // With indices higher than dictionary size
-    VectorHelper.loadTermDictionary(conf, inputPathOne.toString());
-    // With dictionary size higher than indices
-    VectorHelper.loadTermDictionary(conf, inputPathTwo.toString());
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFTypeTest.java
----------------------------------------------------------------------
diff --git 
a/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFTypeTest.java
 
b/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFTypeTest.java
deleted file mode 100644
index 2ea8b89..0000000
--- 
a/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFTypeTest.java
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.arff;
-
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Test;
-
-public final class ARFFTypeTest extends MahoutTestCase {
-
-  @Test
-  public void removeQuotes() {
-    assertNull(ARFFType.removeQuotes(null));
-    assertEquals("", ARFFType.removeQuotes("\"\""));
-    assertEquals("", ARFFType.removeQuotes("''"));
-    assertEquals("", ARFFType.removeQuotes(""));
-    assertEquals("", ARFFType.removeQuotes("  "));
-    assertEquals("single", ARFFType.removeQuotes("'single'"));
-    assertEquals("double", ARFFType.removeQuotes("\"double\""));
-    assertEquals("trim", ARFFType.removeQuotes(" trim "));
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
----------------------------------------------------------------------
diff --git 
a/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
 
b/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
deleted file mode 100644
index 4c7f17a..0000000
--- 
a/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
+++ /dev/null
@@ -1,289 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.arff;
-
-import java.io.IOException;
-import java.text.DateFormat;
-import java.text.SimpleDateFormat;
-import java.util.Date;
-import java.util.Iterator;
-import java.util.Locale;
-import java.util.Map;
-
-import com.google.common.io.Resources;
-import org.apache.commons.io.Charsets;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.RandomAccessSparseVector;
-import org.apache.mahout.math.Vector;
-import org.junit.Test;
-
-public final class ARFFVectorIterableTest extends MahoutTestCase {
-
-  @Test
-  public void testValues() throws Exception {
-    ARFFVectorIterable iterable = readModelFromResource("sample.arff");
-
-    assertEquals("Mahout", iterable.getModel().getRelation());
-    Map<String, Integer> bindings = iterable.getModel().getLabelBindings();
-    assertNotNull(bindings);
-    assertEquals(5, bindings.size());
-    Iterator<Vector> iter = iterable.iterator();
-    assertTrue(iter.hasNext());
-    Vector next = iter.next();
-    assertNotNull(next);
-    assertTrue("Wrong instanceof", next instanceof DenseVector);
-    assertEquals(1.0, next.get(0), EPSILON);
-    assertEquals(2.0, next.get(1), EPSILON);
-    assertTrue(iter.hasNext());
-    next = iter.next();
-    assertNotNull(next);
-    assertTrue("Wrong instanceof", next instanceof DenseVector);
-    assertEquals(2.0, next.get(0), EPSILON);
-    assertEquals(3.0, next.get(1), EPSILON);
-
-    assertTrue(iter.hasNext());
-    next = iter.next();
-    assertNotNull(next);
-    assertTrue("Wrong instanceof", next instanceof RandomAccessSparseVector);
-    assertEquals(5.0, next.get(0), EPSILON);
-    assertEquals(23.0, next.get(1), EPSILON);
-
-    assertFalse(iter.hasNext());
-  }
-
-  @Test
-  public void testDense() throws Exception {
-    Iterable<Vector> iterable = readModelFromResource("sample-dense.arff");
-    Vector firstVector = iterable.iterator().next();
-    assertEquals(1.0, firstVector.get(0), 0);
-    assertEquals(65.0, firstVector.get(1), 0);
-    assertEquals(1.0, firstVector.get(3), 0);
-    assertEquals(1.0, firstVector.get(4), 0);
-
-    int count = 0;
-    for (Vector vector : iterable) {
-      assertTrue("Vector is not dense", vector instanceof DenseVector);
-      count++;
-    }
-    assertEquals(5, count);
-  }
-
-  @Test
-  public void testSparse() throws Exception {
-    Iterable<Vector> iterable = readModelFromResource("sample-sparse.arff");
-
-    Vector firstVector = iterable.iterator().next();
-    assertEquals(23.1, firstVector.get(1), 0);
-    assertEquals(3.23, firstVector.get(2), 0);
-    assertEquals(1.2, firstVector.get(3), 0);
-
-    int count = 0;
-    for (Vector vector : iterable) {
-      assertTrue("Vector is not dense", vector instanceof 
RandomAccessSparseVector);
-      count++;
-    }
-    assertEquals(9, count);
-  }
-
-  @Test
-  public void testNonNumeric() throws Exception {
-    MapBackedARFFModel model = new MapBackedARFFModel();
-    ARFFVectorIterable iterable = getVectors("non-numeric-1.arff", model);
-    int count = 0;
-    for (Vector vector : iterable) {
-      assertTrue("Vector is not dense", vector instanceof 
RandomAccessSparseVector);
-      count++;
-    }
-
-    iterable = getVectors("non-numeric-1.arff", model);
-    Iterator<Vector> iter = iterable.iterator();
-    Vector firstVector = iter.next();
-
-    assertEquals(1.0, firstVector.get(2), 0);
-
-    assertEquals(10, count);
-    Map<String, Map<String, Integer>> nominalMap = 
iterable.getModel().getNominalMap();
-    assertNotNull(nominalMap);
-    assertEquals(1, nominalMap.size());
-    Map<String, Integer> noms = nominalMap.get("bar");
-    assertNotNull("nominals for bar are null", noms);
-    assertEquals(5, noms.size());
-    Map<Integer, ARFFType> integerARFFTypeMap = model.getTypeMap();
-    assertNotNull("Type map null", integerARFFTypeMap);
-    assertEquals(5, integerARFFTypeMap.size());
-    Map<String, Long> words = model.getWords();
-    assertNotNull("words null", words);
-    assertEquals(10, words.size());
-    Map<Integer, DateFormat> integerDateFormatMap = model.getDateMap();
-    assertNotNull("date format null", integerDateFormatMap);
-    assertEquals(1, integerDateFormatMap.size());
-  }
-
-  @Test
-  public void testDate() throws Exception {
-    ARFFVectorIterable iterable = readModelFromResource("date.arff");
-    Iterator<Vector> iter = iterable.iterator();
-    Vector firstVector = iter.next();
-
-    DateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", 
Locale.ENGLISH);
-    Date date = format.parse("2001-07-04T12:08:56");
-    long result = date.getTime();
-    assertEquals(result, firstVector.get(1), 0);
-
-    format = new SimpleDateFormat("yyyy.MM.dd G 'at' HH:mm:ss z", 
Locale.ENGLISH);
-    date = format.parse("2001.07.04 AD at 12:08:56 PDT");
-    result = date.getTime();
-    assertEquals(result, firstVector.get(2), 0);
-
-    format = new SimpleDateFormat("EEE, MMM d, ''yy", Locale.ENGLISH);
-    date = format.parse("Wed, Jul 4, '01,4 0:08 PM, PDT");
-    result = date.getTime();
-    assertEquals(result, firstVector.get(3), 0);
-
-    format = new SimpleDateFormat("K:mm a, z", Locale.ENGLISH);
-    date = format.parse("0:08 PM, PDT");
-    result = date.getTime();
-    assertEquals(result, firstVector.get(4), 0);
-
-    format = new SimpleDateFormat("yyyyy.MMMMM.dd GGG hh:mm aaa", 
Locale.ENGLISH);
-    date = format.parse("02001.July.04 AD 12:08 PM");
-    result = date.getTime();
-    assertEquals(result, firstVector.get(5), 0);
-
-    format = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", 
Locale.ENGLISH);
-    date = format.parse("Wed, 4 Jul 2001 12:08:56 -0700");
-    result = date.getTime();
-    assertEquals(result, firstVector.get(6), 0);
-
-  }
-
-  @Test
-  public void testMultipleNoms() throws Exception {
-    MapBackedARFFModel model = new MapBackedARFFModel();
-    ARFFVectorIterable iterable = getVectors("non-numeric-1.arff", model);
-    int count = 0;
-    for (Vector vector : iterable) {
-      assertTrue("Vector is not dense", vector instanceof 
RandomAccessSparseVector);
-      count++;
-    }
-    assertEquals(10, count);
-    Map<String,Map<String,Integer>> nominalMap = 
iterable.getModel().getNominalMap();
-    assertNotNull(nominalMap);
-    assertEquals(1, nominalMap.size());
-    Map<String,Integer> noms = nominalMap.get("bar");
-    assertNotNull("nominals for bar are null", noms);
-    assertEquals(5, noms.size());
-    Map<Integer,ARFFType> integerARFFTypeMap = model.getTypeMap();
-    assertNotNull("Type map null", integerARFFTypeMap);
-    assertEquals(5, integerARFFTypeMap.size());
-    Map<String,Long> words = model.getWords();
-    assertNotNull("words null", words);
-    assertEquals(10, words.size());
-
-    Map<Integer,DateFormat> integerDateFormatMap = model.getDateMap();
-    assertNotNull("date format null", integerDateFormatMap);
-    assertEquals(1, integerDateFormatMap.size());
-
-
-    iterable = getVectors("non-numeric-2.arff", model);
-    count = 0;
-    for (Vector vector : iterable) {
-      assertTrue("Vector is not dense", vector instanceof 
RandomAccessSparseVector);
-      count++;
-    }
-    nominalMap = model.getNominalMap();
-    assertNotNull(nominalMap);
-    assertEquals(2, nominalMap.size());
-    noms = nominalMap.get("test");
-    assertNotNull("nominals for bar are null", noms);
-    assertEquals(2, noms.size());
-  }
-
-  @Test
-  public void testNumerics() throws Exception {
-    String arff = "@RELATION numerics\n"
-      + "@ATTRIBUTE theNumeric NUMERIC\n"
-      + "@ATTRIBUTE theInteger INTEGER\n"
-      + "@ATTRIBUTE theReal REAL\n"
-      + "@DATA\n"
-      + "1.0,2,3.0";
-    ARFFModel model = new MapBackedARFFModel();
-    ARFFVectorIterable iterable = new ARFFVectorIterable(arff, model);
-    model = iterable.getModel();
-    assertNotNull(model);
-    assertEquals(3, model.getLabelSize());
-    assertEquals(ARFFType.NUMERIC, model.getARFFType(0));
-    assertEquals(ARFFType.INTEGER, model.getARFFType(1));
-    assertEquals(ARFFType.REAL, model.getARFFType(2));
-    Iterator<Vector> it = iterable.iterator();
-    Vector vector = it.next();
-    assertEquals(1.0, vector.get(0), EPSILON);
-    assertEquals(2.0, vector.get(1), EPSILON);
-    assertEquals(3.0, vector.get(2), EPSILON);
-  }
-
-  @Test
-  public void testQuotes() throws Exception {
-    // ARFF allows quotes on identifiers
-    ARFFModel model = new MapBackedARFFModel();
-    ARFFVectorIterable iterable = getVectors("quoted-id.arff", model);
-    model = iterable.getModel();
-    assertNotNull(model);
-    assertEquals("quotes", model.getRelation());
-
-    // check attribute labels
-    assertEquals(4, model.getLabelSize());
-    assertEquals(ARFFType.NUMERIC, model.getARFFType(0));
-    assertEquals(ARFFType.INTEGER, model.getARFFType(1));
-    assertEquals(ARFFType.REAL, model.getARFFType(2));
-    assertEquals(ARFFType.NOMINAL, model.getARFFType(3));
-
-    Map<String, Integer> labelBindings = model.getLabelBindings();
-    assertTrue(labelBindings.keySet().contains("thenumeric"));
-    assertTrue(labelBindings.keySet().contains("theinteger"));
-    assertTrue(labelBindings.keySet().contains("thereal"));
-    assertTrue(labelBindings.keySet().contains("thenominal"));
-
-    // check nominal values
-    Map<String, Integer> nominalMap = model.getNominalMap().get("thenominal");
-    assertNotNull(nominalMap);
-    assertEquals(3, nominalMap.size());
-    assertTrue(nominalMap.keySet().contains("double-quote"));
-    assertTrue(nominalMap.keySet().contains("single-quote"));
-    assertTrue(nominalMap.keySet().contains("no-quote"));
-
-    // check data values
-    Iterator<Vector> it = iterable.iterator();
-    Vector vector = it.next();
-    assertEquals(nominalMap.get("no-quote"), vector.get(3), EPSILON);
-    assertEquals(nominalMap.get("single-quote"), it.next().get(3), EPSILON);
-    assertEquals(nominalMap.get("double-quote"), it.next().get(3), EPSILON);
-  }
-
-  static ARFFVectorIterable getVectors(String resourceName, ARFFModel model) 
throws IOException {
-    String sample = Resources.toString(Resources.getResource(resourceName), 
Charsets.UTF_8);
-    return new ARFFVectorIterable(sample, model);
-  }
-
-  private static ARFFVectorIterable readModelFromResource(String resourceName) 
throws IOException {
-    ARFFModel model = new MapBackedARFFModel();
-    return getVectors(resourceName, model);
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/utils/vectors/arff/DriverTest.java
----------------------------------------------------------------------
diff --git 
a/integration/src/test/java/org/apache/mahout/utils/vectors/arff/DriverTest.java
 
b/integration/src/test/java/org/apache/mahout/utils/vectors/arff/DriverTest.java
deleted file mode 100644
index 7e7623e..0000000
--- 
a/integration/src/test/java/org/apache/mahout/utils/vectors/arff/DriverTest.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.utils.vectors.arff;
-
-import java.io.IOException;
-import java.io.StringWriter;
-
-import com.google.common.io.Resources;
-import org.apache.commons.io.Charsets;
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Test;
-
-/**
- * Test case for {@link Driver}
- */
-public class DriverTest extends MahoutTestCase {
-
-  @Test
-  public void dictionary() throws IOException {
-
-    ARFFModel model = new MapBackedARFFModel();
-    ARFFVectorIterableTest.getVectors("sample-dense.arff", model);
-    StringWriter writer = new StringWriter();
-    Driver.writeLabelBindings(writer, model, ",");
-    String expected1 = 
Resources.toString(Resources.getResource("expected-arff-dictionary.csv"), 
Charsets.UTF_8);
-    String expected2 = 
Resources.toString(Resources.getResource("expected-arff-dictionary-2.csv"), 
Charsets.UTF_8);
-    assertTrue(expected1.equals(writer.toString()) || 
expected2.equals(writer.toString()));
-  }
-
-
-  @Test
-  public void dictionaryJSON() throws IOException {
-    ARFFModel model = new MapBackedARFFModel();
-    ARFFVectorIterableTest.getVectors("sample-dense.arff", model);
-    StringWriter writer = new StringWriter();
-    Driver.writeLabelBindingsJSON(writer, model);
-    String expected1 = 
Resources.toString(Resources.getResource("expected-arff-schema.json"), 
Charsets.UTF_8);
-    String expected2 = 
Resources.toString(Resources.getResource("expected-arff-schema-2.json"), 
Charsets.UTF_8);
-    assertTrue(expected1.equals(writer.toString()) || 
expected2.equals(writer.toString()));
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java
----------------------------------------------------------------------
diff --git 
a/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java
 
b/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java
deleted file mode 100644
index 2867640..0000000
--- 
a/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.arff;
-
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Test;
-
-import java.util.Map;
-
-public class MapBackedARFFModelTest extends MahoutTestCase {
-
-  @Test
-  public void processNominal() {
-    String windy = "windy";
-    String breezy = "breezy";
-
-    ARFFModel model = new MapBackedARFFModel();
-    model.addNominal(windy, breezy, 77);
-    model.addNominal(windy, "strong", 23);
-    model.addNominal(windy, "nuking", 55);
-    Map<String, Map<String, Integer>> nominalMap = model.getNominalMap();
-
-    assertEquals(1, nominalMap.size());
-    Map<String, Integer> windyValues = nominalMap.get(windy);
-    assertEquals(77, windyValues.get(breezy).intValue());
-  }
-
-  @Test
-  public void processBadNumeric() {
-    ARFFModel model = new MapBackedARFFModel();
-    model.addLabel("b1shkt70694difsmmmdv0ikmoh", 77);
-    model.addType(77, ARFFType.REAL);
-    assertTrue(Double.isNaN(model.getValue("b1shkt70694difsmmmdv0ikmoh", 77)));
-  }
-
-  @Test
-  public void processGoodNumeric() {
-    ARFFModel model = new MapBackedARFFModel();
-    model.addLabel("1234", 77);
-    model.addType(77, ARFFType.INTEGER);
-    assertTrue(1234 == model.getValue("1234", 77));
-    model.addLabel("131.34", 78);
-    model.addType(78, ARFFType.REAL);
-    assertTrue(131.34 == model.getValue("131.34", 78));
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIteratorTest.java
----------------------------------------------------------------------
diff --git 
a/integration/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIteratorTest.java
 
b/integration/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIteratorTest.java
deleted file mode 100644
index e76cf70..0000000
--- 
a/integration/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIteratorTest.java
+++ /dev/null
@@ -1,57 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.csv;
-
-import java.io.IOException;
-import java.io.StringReader;
-import java.io.StringWriter;
-import java.util.Iterator;
-
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.utils.vectors.RandomVectorIterable;
-import org.apache.mahout.utils.vectors.VectorHelper;
-import org.apache.mahout.utils.vectors.io.TextualVectorWriter;
-import org.junit.Test;
-
-public class CSVVectorIteratorTest extends MahoutTestCase {
-
-  @Test
-  public void testCount() throws Exception {
-
-    StringWriter sWriter = new StringWriter();
-    try (TextualVectorWriter writer = new TextualVectorWriter(sWriter) {
-      @Override
-      public void write(Vector vector) throws IOException {
-        String vecStr = VectorHelper.vectorToCSVString(vector, false);
-        getWriter().write(vecStr);
-      }
-    }) {
-      Iterable<Vector> iter = new RandomVectorIterable(50);
-      writer.write(iter);
-    }
-
-    Iterator<Vector> csvIter = new CSVVectorIterator(new 
StringReader(sWriter.getBuffer().toString()));
-    int count = 0;
-    while (csvIter.hasNext()) {
-      csvIter.next();
-      count++;
-    }
-    assertEquals(50, count);
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java
----------------------------------------------------------------------
diff --git 
a/integration/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java
 
b/integration/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java
deleted file mode 100644
index e2f7032..0000000
--- 
a/integration/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.io;
-
-import java.io.StringWriter;
-import java.util.ArrayList;
-import java.util.Collection;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.utils.vectors.RandomVectorIterable;
-import org.junit.Test;
-
-public final class VectorWriterTest extends MahoutTestCase {
-
-  @Test
-  public void testSFVW() throws Exception {
-    Path path = getTestTempFilePath("sfvw");
-    Configuration conf = getConfiguration();
-    FileSystem fs = FileSystem.get(conf);
-    SequenceFile.Writer seqWriter = new SequenceFile.Writer(fs, conf, path, 
LongWritable.class, VectorWritable.class);
-    try (SequenceFileVectorWriter writer = new 
SequenceFileVectorWriter(seqWriter)) {
-      writer.write(new RandomVectorIterable(50));
-    }
-
-    long count = HadoopUtil.countRecords(path, conf);
-    assertEquals(50, count);
-  }
-
-  @Test
-  public void testTextOutputSize() throws Exception {
-    StringWriter strWriter = new StringWriter();
-    try (VectorWriter writer = new TextualVectorWriter(strWriter)) {
-      Collection<Vector> vectors = new ArrayList<>();
-      vectors.add(new DenseVector(new double[]{0.3, 1.5, 4.5}));
-      vectors.add(new DenseVector(new double[]{1.3, 1.5, 3.5}));
-      writer.write(vectors);
-    }
-    String buffer = strWriter.toString();
-    assertNotNull(buffer);
-    assertFalse(buffer.isEmpty());
-    
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java
----------------------------------------------------------------------
diff --git 
a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java
 
b/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java
deleted file mode 100644
index 890a14b..0000000
--- 
a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-
-import java.io.IOException;
-
-import com.google.common.io.Closeables;
-
-import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.FieldType;
-import org.apache.lucene.document.StringField;
-import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.IndexOptions;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.store.RAMDirectory;
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Before;
-import org.junit.Test;
-
-public class CachedTermInfoTest extends MahoutTestCase {
-  private RAMDirectory directory;
-  private static final String[] DOCS = {
-          "a a b b c c",
-          "a b a b a b a b",
-          "a b a",
-          "a",
-          "b",
-          "a",
-          "a"
-  };
-
-  private static final String[] DOCS2 = {
-          "d d d d",
-          "e e e e",
-          "d e d e",
-          "d",
-          "e",
-          "d",
-          "e"
-  };
-
-  @Before
-  public void before() throws IOException {
-    directory = new RAMDirectory();
-
-    FieldType fieldType = new FieldType();
-    fieldType.setStored(false);
-    
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
-    fieldType.setTokenized(true);
-    fieldType.setStoreTermVectors(false);
-    fieldType.setStoreTermVectorPositions(false);
-    fieldType.setStoreTermVectorOffsets(false);
-    fieldType.freeze();
-
-    directory = createTestIndex(fieldType, directory, 0);
-  }
-
-  @Test
-  public void test() throws Exception {
-    IndexReader reader = DirectoryReader.open(directory);
-    CachedTermInfo cti = new CachedTermInfo(reader, "content", 0, 100);
-    assertEquals(3, cti.totalTerms("content"));
-    assertNotNull(cti.getTermEntry("content", "a"));
-    assertNull(cti.getTermEntry("content", "e"));
-    //minDf
-    cti = new CachedTermInfo(reader, "content", 3, 100);
-    assertEquals(2, cti.totalTerms("content"));
-    assertNotNull(cti.getTermEntry("content", "a"));
-    assertNull(cti.getTermEntry("content", "c"));
-    //maxDFPercent, a is in 6 of 7 docs: numDocs * maxDfPercent / 100 < 6 to 
exclude, 85% should suffice to exclude a
-    cti = new CachedTermInfo(reader, "content", 0, 85);
-    assertEquals(2, cti.totalTerms("content"));
-    assertNotNull(cti.getTermEntry("content", "b"));
-    assertNotNull(cti.getTermEntry("content", "c"));
-    assertNull(cti.getTermEntry("content", "a"));
-
-
-  }
-
-  static RAMDirectory createTestIndex(FieldType fieldType,
-                                      RAMDirectory directory,
-                                      int startingId) throws IOException {
-    IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new 
WhitespaceAnalyzer()));
-
-    try {
-      for (int i = 0; i < DOCS.length; i++) {
-        Document doc = new Document();
-        Field id = new StringField("id", "doc_" + (i + startingId), 
Field.Store.YES);
-        doc.add(id);
-        Field text = new Field("content", DOCS[i], fieldType);
-        doc.add(text);
-        Field text2 = new Field("content2", DOCS2[i], fieldType);
-        doc.add(text2);
-        writer.addDocument(doc);
-      }
-    } finally {
-      Closeables.close(writer, false);
-    }
-    return directory;
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java
----------------------------------------------------------------------
diff --git 
a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java
 
b/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java
deleted file mode 100644
index 86c8305..0000000
--- 
a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java
+++ /dev/null
@@ -1,136 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import com.google.common.collect.Sets;
-import com.google.common.io.Closeables;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Text;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.FieldType;
-import org.apache.lucene.index.IndexOptions;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.SimpleFSDirectory;
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Before;
-import org.junit.Test;
-
-import java.io.File;
-import java.io.IOException;
-import java.nio.file.Paths;
-import java.util.Set;
-
-public class DriverTest extends MahoutTestCase {
-
-  private File indexDir;
-  private File outputDir;
-  private Configuration conf;
-
-  @Before
-  @Override
-  public void setUp() throws Exception {
-    super.setUp();
-    indexDir = getTestTempDir("intermediate");
-    indexDir.delete();
-    outputDir = getTestTempDir("output");
-    outputDir.delete();
-
-    conf = getConfiguration();
-  }
-
-  private Document asDocument(String line) {
-    Document doc = new Document();
-    doc.add(new TextFieldWithTermVectors("text", line));
-    return doc;
-  }
-
-  static class TextFieldWithTermVectors extends Field {
-
-    public static final FieldType TYPE = new FieldType();
-
-    static {
-      TYPE.setOmitNorms(true);
-      TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
-      TYPE.setStored(true);
-      TYPE.setTokenized(true);
-      TYPE.setStoreTermVectors(true);
-      TYPE.freeze();
-    }
-
-    public TextFieldWithTermVectors(String name, String value) {
-      super(name, value, TYPE);
-    }
-  }
-
-  @Test
-  public void sequenceFileDictionary() throws IOException {
-
-    Directory index = new 
SimpleFSDirectory(Paths.get(indexDir.getAbsolutePath()));
-    Analyzer analyzer = new StandardAnalyzer();
-    IndexWriterConfig config = new IndexWriterConfig(analyzer);
-    config.setCommitOnClose(true);
-    final IndexWriter writer = new IndexWriter(index, config);
-
-    try {
-      writer.addDocument(asDocument("One Ring to rule them all"));
-      writer.addDocument(asDocument("One Ring to find them,"));
-      writer.addDocument(asDocument("One Ring to bring them all"));
-      writer.addDocument(asDocument("and in the darkness bind them"));
-    } finally {
-      writer.close();
-    }
-
-    File seqDict = new File(outputDir, "dict.seq");
-
-    Driver.main(new String[] {
-        "--dir", indexDir.getAbsolutePath(),
-        "--output", new File(outputDir, "out").getAbsolutePath(),
-        "--field", "text",
-        "--dictOut", new File(outputDir, "dict.txt").getAbsolutePath(),
-        "--seqDictOut", seqDict.getAbsolutePath(),
-    });
-
-    SequenceFile.Reader reader = null;
-    Set<String> indexTerms = Sets.newHashSet();
-    try {
-      reader = new SequenceFile.Reader(FileSystem.getLocal(conf), new 
Path(seqDict.getAbsolutePath()), conf);
-      Text term = new Text();
-      IntWritable termIndex = new IntWritable();
-
-      while (reader.next(term, termIndex)) {
-        indexTerms.add(term.toString());
-      }
-    } finally {
-      Closeables.close(reader, true);
-    }
-
-    Set<String> expectedIndexTerms = Sets.newHashSet("all", "bind", "bring", 
"darkness", "find", "one", "ring", "rule");
-
-    // should contain the same terms as expected
-    assertEquals(expectedIndexTerms.size(), Sets.union(expectedIndexTerms, 
indexTerms).size());
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
----------------------------------------------------------------------
diff --git 
a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
 
b/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
deleted file mode 100644
index 8d92551..0000000
--- 
a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
+++ /dev/null
@@ -1,195 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import java.io.IOException;
-import java.util.Iterator;
-
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Iterators;
-
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.FieldType;
-import org.apache.lucene.document.StringField;
-import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.IndexOptions;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.store.RAMDirectory;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.math.NamedVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.utils.vectors.TermInfo;
-import org.apache.mahout.vectorizer.TFIDF;
-import org.apache.mahout.vectorizer.Weight;
-import org.junit.Before;
-import org.junit.Test;
-
-public final class LuceneIterableTest extends MahoutTestCase {
-
-  private static final String [] DOCS = {
-      "The quick red fox jumped over the lazy brown dogs.",
-      "Mary had a little lamb whose fleece was white as snow.",
-      "Moby Dick is a story of a whale and a man obsessed.",
-      "The robber wore a black fleece jacket and a baseball cap.",
-      "The English Springer Spaniel is the best of all dogs."
-  };
-
-  private RAMDirectory directory;
-
-  private final FieldType TYPE_NO_TERM_VECTORS = new FieldType();
-
-  private final FieldType TYPE_TERM_VECTORS = new FieldType();
-
-  @Before
-  public void before() throws IOException {
-
-    
TYPE_NO_TERM_VECTORS.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
-    TYPE_NO_TERM_VECTORS.setTokenized(true);
-    TYPE_NO_TERM_VECTORS.setStoreTermVectors(false);
-    TYPE_NO_TERM_VECTORS.setStoreTermVectorPositions(false);
-    TYPE_NO_TERM_VECTORS.setStoreTermVectorOffsets(false);
-    TYPE_NO_TERM_VECTORS.freeze();
-
-    
TYPE_TERM_VECTORS.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
-    TYPE_TERM_VECTORS.setTokenized(true);
-    TYPE_TERM_VECTORS.setStored(true);
-    TYPE_TERM_VECTORS.setStoreTermVectors(true);
-    TYPE_TERM_VECTORS.setStoreTermVectorPositions(true);
-    TYPE_TERM_VECTORS.setStoreTermVectorOffsets(true);
-    TYPE_TERM_VECTORS.freeze();
-
-    directory = createTestIndex(TYPE_TERM_VECTORS);
-  }
-
-  @Test
-  public void testIterable() throws Exception {
-    IndexReader reader = DirectoryReader.open(directory);
-    Weight weight = new TFIDF();
-    TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
-    LuceneIterable iterable = new LuceneIterable(reader, "id", "content", 
termInfo,weight);
-
-    //TODO: do something more meaningful here
-    for (Vector vector : iterable) {
-      assertNotNull(vector);
-      assertTrue("vector is not an instanceof " + NamedVector.class, vector 
instanceof NamedVector);
-      assertTrue("vector Size: " + vector.size() + " is not greater than: " + 
0, vector.size() > 0);
-      assertTrue(((NamedVector) vector).getName().startsWith("doc_"));
-    }
-
-    iterable = new LuceneIterable(reader, "id", "content", termInfo,weight, 3);
-
-    //TODO: do something more meaningful here
-    for (Vector vector : iterable) {
-      assertNotNull(vector);
-      assertTrue("vector is not an instanceof " + NamedVector.class, vector 
instanceof NamedVector);
-      assertTrue("vector Size: " + vector.size() + " is not greater than: " + 
0, vector.size() > 0);
-      assertTrue(((NamedVector) vector).getName().startsWith("doc_"));
-    }
-
-  }
-
-  @Test(expected = IllegalStateException.class)
-  public void testIterableNoTermVectors() throws IOException {
-    RAMDirectory directory = createTestIndex(TYPE_NO_TERM_VECTORS);
-    IndexReader reader = DirectoryReader.open(directory);
-
-    Weight weight = new TFIDF();
-    TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
-    LuceneIterable iterable = new LuceneIterable(reader, "id", "content",  
termInfo,weight);
-
-    Iterator<Vector> iterator = iterable.iterator();
-    Iterators.advance(iterator, 1);
-  }
-
-  @Test
-  public void testIterableSomeNoiseTermVectors() throws IOException {
-    //get noise vectors
-    RAMDirectory directory = createTestIndex(TYPE_TERM_VECTORS, new 
RAMDirectory(), 0);
-    //get real vectors
-    createTestIndex(TYPE_NO_TERM_VECTORS, directory, 5);
-    IndexReader reader = DirectoryReader.open(directory);
-
-    Weight weight = new TFIDF();
-    TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
-
-    boolean exceptionThrown;
-    //0 percent tolerance
-    LuceneIterable iterable = new LuceneIterable(reader, "id", "content", 
termInfo, weight);
-    try {
-      Iterables.skip(iterable, Iterables.size(iterable));
-      exceptionThrown = false;
-    }
-    catch(IllegalStateException ise) {
-        exceptionThrown = true;
-    }
-    assertTrue(exceptionThrown);
-
-    //100 percent tolerance
-    iterable = new LuceneIterable(reader, "id", "content", termInfo,weight, 
-1, 1.0);
-    try {
-      Iterables.skip(iterable, Iterables.size(iterable));
-      exceptionThrown = false;
-    }
-    catch(IllegalStateException ise) {
-        exceptionThrown = true;
-    }
-    assertFalse(exceptionThrown);
-
-    //50 percent tolerance
-    iterable = new LuceneIterable(reader, "id", "content", termInfo,weight, 
-1, 0.5);
-    Iterator<Vector> iterator = iterable.iterator();
-    Iterators.advance(iterator, 5);
-
-    try {
-      Iterators.advance(iterator, Iterators.size(iterator));
-      exceptionThrown = false;
-    }
-    catch(IllegalStateException ise) {
-      exceptionThrown = true;
-    }
-    assertTrue(exceptionThrown);
-  }
-
-  static RAMDirectory createTestIndex(FieldType fieldType) throws IOException {
-      return createTestIndex(fieldType, new RAMDirectory(), 0);
-  }
-
-  static RAMDirectory createTestIndex(FieldType fieldType,
-                                              RAMDirectory directory,
-                                              int startingId) throws 
IOException {
-
-    try (IndexWriter writer = new IndexWriter(directory, new 
IndexWriterConfig(new StandardAnalyzer()))) {
-      for (int i = 0; i < DOCS.length; i++) {
-        Document doc = new Document();
-        Field id = new StringField("id", "doc_" + (i + startingId), 
Field.Store.YES);
-        doc.add(id);
-        //Store both position and offset information
-        Field text = new Field("content", DOCS[i], fieldType);
-        doc.add(text);
-        Field text2 = new Field("content2", DOCS[i], fieldType);
-        doc.add(text2);
-        writer.addDocument(doc);
-      }
-    }
-    return directory;
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/resources/date.arff
----------------------------------------------------------------------
diff --git a/integration/src/test/resources/date.arff 
b/integration/src/test/resources/date.arff
deleted file mode 100644
index 9daeb52..0000000
--- a/integration/src/test/resources/date.arff
+++ /dev/null
@@ -1,18 +0,0 @@
-   % Comments
-   %
-   % Comments go here   %
-   @RELATION MahoutDateTest
-
-   @ATTRIBUTE junk  NUMERIC
-   @ATTRIBUTE date1   date
-   @ATTRIBUTE date2   date "yyyy.MM.dd G 'at' HH:mm:ss z"
-   @ATTRIBUTE date3   date "EEE, MMM d, ''yy"
-   @ATTRIBUTE date4   date "K:mm a, z"
-   @ATTRIBUTE date5   date "yyyyy.MMMMM.dd GGG hh:mm aaa"
-   @ATTRIBUTE date6   date "EEE, d MMM yyyy HH:mm:ss Z"
-
-
-
-   @DATA
-   {0 1,1 "2001-07-04T12:08:56",2 "2001.07.04 AD at 12:08:56 PDT",3 "Wed, Jul 
4, '01,4 0:08 PM, PDT",4 "0:08 PM, PDT", 5 "02001.July.04 AD 12:08 PM" ,6 "Wed, 
4 Jul 2001 12:08:56 -0700"  }
-   {0 2,1 "2001-08-04T12:09:56",2 "2011.07.04 AD at 12:08:56 PDT",3 "Mon, Jul 
4, '11,4 0:08 PM, PDT",4 "0:08 PM, PDT", 5 "02001.July.14 AD 12:08 PM" ,6 "Mon, 
4 Jul 2011 12:08:56 -0700"  }

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/resources/expected-arff-dictionary-2.csv
----------------------------------------------------------------------
diff --git a/integration/src/test/resources/expected-arff-dictionary-2.csv 
b/integration/src/test/resources/expected-arff-dictionary-2.csv
deleted file mode 100644
index acb1c43..0000000
--- a/integration/src/test/resources/expected-arff-dictionary-2.csv
+++ /dev/null
@@ -1,22 +0,0 @@
-Label bindings for Relation golf
-temperature,1
-humidity,2
-outlook,0
-class,4
-windy,3
-
-Values for nominal attributes
-3
-outlook
-3
-rain,3
-overcast,2
-sunny,1
-class
-2
-play,2
-dont_play,1
-windy
-2
-false,1
-true,2

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/resources/expected-arff-dictionary.csv
----------------------------------------------------------------------
diff --git a/integration/src/test/resources/expected-arff-dictionary.csv 
b/integration/src/test/resources/expected-arff-dictionary.csv
deleted file mode 100644
index f2dac13..0000000
--- a/integration/src/test/resources/expected-arff-dictionary.csv
+++ /dev/null
@@ -1,22 +0,0 @@
-Label bindings for Relation golf
-humidity,2
-windy,3
-outlook,0
-class,4
-temperature,1
-
-Values for nominal attributes
-3
-windy
-2
-true,2
-false,1
-outlook
-3
-sunny,1
-overcast,2
-rain,3
-class
-2
-play,2
-dont_play,1

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/resources/expected-arff-schema-2.json
----------------------------------------------------------------------
diff --git a/integration/src/test/resources/expected-arff-schema-2.json 
b/integration/src/test/resources/expected-arff-schema-2.json
deleted file mode 100644
index b73f55c..0000000
--- a/integration/src/test/resources/expected-arff-schema-2.json
+++ /dev/null
@@ -1 +0,0 @@
-[{"values":["rain","overcast","sunny"],"label":"false","attribute":"outlook","type":"categorical"},{"label":"false","attribute":"temperature","type":"numerical"},{"label":"false","attribute":"humidity","type":"numerical"},{"values":["false","true"],"label":"false","attribute":"windy","type":"categorical"},{"values":["play","dont_play"],"label":"true","attribute":"class","type":"categorical"}]
\ No newline at end of file

[19/52] [partial] mahout git commit: removed all files except for website directory

Reply via email to