http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/MultipleFieldsDocument.java
----------------------------------------------------------------------
diff --git 
a/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/MultipleFieldsDocument.java
 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/MultipleFieldsDocument.java
new file mode 100644
index 0000000..7483b2d
--- /dev/null
+++ 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/MultipleFieldsDocument.java
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.text.doc;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
+
+/**
+ * Test document for lucene2seq that extends {@link SingleFieldDocument} with
+ * two extra stored text fields, {@link #FIELD1} and {@link #FIELD2}.
+ */
+@Deprecated
+public class MultipleFieldsDocument extends SingleFieldDocument {
+
+  /** Lucene field name under which {@code field1} is stored. */
+  public static final String FIELD1 = "field1";
+  /** Lucene field name under which {@code field2} is stored. */
+  public static final String FIELD2 = "field2";
+
+  private String field1;
+  private String field2;
+
+  /**
+   * @param id     document id, handed to the parent class
+   * @param field  main field text, handed to the parent class
+   * @param field1 text stored under {@link #FIELD1}
+   * @param field2 text stored under {@link #FIELD2}
+   */
+  public MultipleFieldsDocument(String id, String field, String field1, String field2) {
+    super(id, field);
+    this.field1 = field1;
+    this.field2 = field2;
+  }
+
+  public String getField1() {
+    return field1;
+  }
+
+  public String getField2() {
+    return field2;
+  }
+
+  /**
+   * Returns the parent's Lucene document with the two additional text fields
+   * appended, both stored.
+   */
+  @Override
+  public Document asLuceneDocument() {
+    Document luceneDoc = super.asLuceneDocument();
+    Field first = new TextField(FIELD1, this.field1, Field.Store.YES);
+    Field second = new TextField(FIELD2, this.field2, Field.Store.YES);
+    luceneDoc.add(first);
+    luceneDoc.add(second);
+    return luceneDoc;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/NumericFieldDocument.java
----------------------------------------------------------------------
diff --git 
a/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/NumericFieldDocument.java
 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/NumericFieldDocument.java
new file mode 100644
index 0000000..e06e8d6
--- /dev/null
+++ 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/NumericFieldDocument.java
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.text.doc;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.IntField;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
+
+/**
+ * Test document that adds a stored integer field, {@link #NUMERIC_FIELD}, to
+ * the id and text fields of {@link SingleFieldDocument}.
+ */
+@Deprecated
+public class NumericFieldDocument extends SingleFieldDocument {
+
+  /** Lucene field name under which the integer value is stored. */
+  public static final String NUMERIC_FIELD = "numeric";
+
+  private int numericField;
+
+  /**
+   * @param id           document id, handed to the parent class
+   * @param field        main field text, handed to the parent class
+   * @param numericField integer value stored under {@link #NUMERIC_FIELD}
+   */
+  public NumericFieldDocument(String id, String field, int numericField) {
+    super(id, field);
+    this.numericField = numericField;
+  }
+
+  /**
+   * Returns the parent's Lucene document (id field, then text field) with the
+   * stored integer field appended.
+   */
+  @Override
+  public Document asLuceneDocument() {
+    // Delegate the id/text fields to the parent instead of rebuilding them here,
+    // matching MultipleFieldsDocument and UnstoredFieldsDocument; the resulting
+    // fields and their order are unchanged.
+    Document document = super.asLuceneDocument();
+    document.add(new IntField(NUMERIC_FIELD, numericField, Field.Store.YES));
+    return document;
+  }
+
+  public int getNumericField() {
+    return numericField;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/SingleFieldDocument.java
----------------------------------------------------------------------
diff --git 
a/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/SingleFieldDocument.java
 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/SingleFieldDocument.java
new file mode 100644
index 0000000..4636a51
--- /dev/null
+++ 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/SingleFieldDocument.java
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.text.doc;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
+
+/**
+ * Test document for lucene2seq holding an id and a single text field.
+ */
+@Deprecated
+public class SingleFieldDocument implements TestDocument {
+
+  /** Lucene field name under which the id is stored. */
+  public static final String ID_FIELD = "idField";
+  /** Lucene field name under which the text is stored. */
+  public static final String FIELD = "field";
+
+  private String id;
+  private String field;
+
+  /**
+   * @param id    value stored under {@link #ID_FIELD}
+   * @param field text stored under {@link #FIELD}
+   */
+  public SingleFieldDocument(String id, String field) {
+    this.id = id;
+    this.field = field;
+  }
+
+  @Override
+  public String getId() {
+    return id;
+  }
+
+  @Override
+  public String getField() {
+    return field;
+  }
+
+  /**
+   * Builds a new Lucene document containing the id as a stored
+   * {@link StringField} followed by the text as a stored {@link TextField}.
+   */
+  @Override
+  public Document asLuceneDocument() {
+    Document luceneDoc = new Document();
+    luceneDoc.add(new StringField(ID_FIELD, getId(), Field.Store.YES));
+    luceneDoc.add(new TextField(FIELD, getField(), Field.Store.YES));
+    return luceneDoc;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/TestDocument.java
----------------------------------------------------------------------
diff --git 
a/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/TestDocument.java
 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/TestDocument.java
new file mode 100644
index 0000000..7243c71
--- /dev/null
+++ 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/TestDocument.java
@@ -0,0 +1,29 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.text.doc;
+
+import org.apache.lucene.document.Document;
+/**
+ * Contract for the test document fixtures in this package; implemented by
+ * {@link SingleFieldDocument}.
+ */
+@Deprecated
+public interface TestDocument {
+
+  /** Returns this document's id value. */
+  String getId();
+
+  /** Returns this document's field text. */
+  String getField();
+
+  /** Returns this document represented as a Lucene {@link Document}. */
+  Document asLuceneDocument();
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/UnstoredFieldsDocument.java
----------------------------------------------------------------------
diff --git 
a/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/UnstoredFieldsDocument.java
 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/UnstoredFieldsDocument.java
new file mode 100644
index 0000000..6eb43f6
--- /dev/null
+++ 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/UnstoredFieldsDocument.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.text.doc;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+
+/**
+ * Test document for lucene2seq that adds one field which is indexed but not
+ * stored ({@link Field.Store#NO}) to a {@link SingleFieldDocument}.
+ */
+@Deprecated
+public class UnstoredFieldsDocument extends SingleFieldDocument {
+
+  /** Lucene field name of the unstored field. */
+  public static final String UNSTORED_FIELD = "unstored";
+
+  /**
+   * @param id    document id, handed to the parent class
+   * @param field main field text, handed to the parent class
+   */
+  public UnstoredFieldsDocument(String id, String field) {
+    super(id, field);
+  }
+
+  /**
+   * Returns the parent's Lucene document with an empty-valued, unstored
+   * {@link StringField} appended.
+   */
+  @Override
+  public Document asLuceneDocument() {
+    Document luceneDoc = super.asLuceneDocument();
+    Field unstored = new StringField(UNSTORED_FIELD, "", Field.Store.NO);
+    luceneDoc.add(unstored);
+    return luceneDoc;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/Bump125Test.java
----------------------------------------------------------------------
diff --git 
a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/Bump125Test.java
 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/Bump125Test.java
new file mode 100644
index 0000000..65b308f
--- /dev/null
+++ 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/Bump125Test.java
@@ -0,0 +1,42 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils;
+
+import com.google.common.collect.Lists;
+
+import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Test;
+
+import java.util.Iterator;
+
+/** Checks the first 50 values returned by successive {@code Bump125.increment()} calls. */
+public class Bump125Test extends MahoutTestCase {
+  @Test
+  public void testIncrement() throws Exception {
+    // The 50 reference values, in the order increment() is expected to return them.
+    long[] expected = {
+        1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+        12, 14, 16, 18, 20, 25, 30, 35, 40, 50,
+        60, 70, 80, 100, 120, 140, 160, 180, 200, 250,
+        300, 350, 400, 500, 600, 700, 800, 1000, 1200, 1400,
+        1600, 1800, 2000, 2500, 3000, 3500, 4000, 5000, 6000, 7000
+    };
+    Bump125 bump = new Bump125();
+    for (long want : expected) {
+      long got = bump.increment();
+      assertEquals(want, got);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/SplitInputTest.java
----------------------------------------------------------------------
diff --git 
a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/SplitInputTest.java
 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/SplitInputTest.java
new file mode 100644
index 0000000..7ffa690
--- /dev/null
+++ 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/SplitInputTest.java
@@ -0,0 +1,418 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils;
+
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.nio.charset.Charset;
+
+import com.google.common.io.Closeables;
+import org.apache.commons.io.Charsets;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.classifier.ClassifierData;
+import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
+import 
org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterable;
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.map.OpenObjectIntHashMap;
+import org.junit.Before;
+import org.junit.Test;
+
+public final class SplitInputTest extends MahoutTestCase {
+
+  private OpenObjectIntHashMap<String> countMap;
+  private Charset charset;
+  private FileSystem fs;
+  private Path tempInputFile;
+  private Path tempTrainingDirectory;
+  private Path tempTestDirectory;
+  private Path tempMapRedOutputDirectory;
+  private Path tempInputDirectory;
+  private Path tempSequenceDirectory;
+  private SplitInput si;
+
+  /**
+   * Prepares the shared fixture: the file system handle, the per-label line
+   * counter, the temporary input/output paths and a {@link SplitInput}
+   * pre-configured with the training, test and input directories.
+   */
+  @Override
+  @Before
+  public void setUp() throws Exception {
+    // NOTE(review): the configuration and file system are obtained before
+    // super.setUp() runs; confirm this ordering is intentional.
+    Configuration conf = getConfiguration();
+    fs = FileSystem.get(conf);
+
+    super.setUp();
+
+    // Filled by writeMultipleInputFiles(): label -> number of lines written.
+    countMap = new OpenObjectIntHashMap<>();
+
+    charset = Charsets.UTF_8;
+    tempSequenceDirectory = getTestTempFilePath("tmpsequence");
+    tempInputFile = getTestTempFilePath("bayesinputfile");
+    tempTrainingDirectory = getTestTempDirPath("bayestrain");
+    tempTestDirectory = getTestTempDirPath("bayestest");
+    tempMapRedOutputDirectory = new Path(getTestTempDirPath(), "mapRedOutput");
+    tempInputDirectory = getTestTempDirPath("bayesinputdir");
+
+    // Note: the map-reduce output directory is not set here; the tests that
+    // need it set it themselves.
+    si = new SplitInput();
+    si.setTrainingOutputDirectory(tempTrainingDirectory);
+    si.setTestOutputDirectory(tempTestDirectory);
+    si.setInputDirectory(tempInputDirectory);
+  }
+
+  /**
+   * Writes the rows of {@code ClassifierData.DATA} into {@code tempInputDirectory}.
+   * A new file, named after the label in column 0, is started whenever the
+   * label differs from the previous row's; each row is written as
+   * {@code label<TAB>text<NEWLINE>} and counted per label in {@code countMap}.
+   */
+  private void writeMultipleInputFiles() throws IOException {
+    Writer out = null;
+    String label = null;
+    try {
+      for (String[] row : ClassifierData.DATA) {
+        String rowLabel = row[0];
+        if (!rowLabel.equals(label)) {
+          // Label changed: finish the previous file and open one for the new label.
+          label = rowLabel;
+          Closeables.close(out, false);
+
+          Path labelFile = new Path(tempInputDirectory, label);
+          out = new BufferedWriter(new OutputStreamWriter(fs.create(labelFile), Charsets.UTF_8));
+        }
+        countMap.adjustOrPutValue(label, 1, 1);
+        out.write(label + '\t' + row[1] + '\n');
+      }
+    } finally {
+      Closeables.close(out, false);
+    }
+  }
+
+  /**
+   * Writes every row of {@code ClassifierData.DATA} to {@code tempInputFile}
+   * as {@code label<TAB>text<NEWLINE>}, encoded as UTF-8.
+   *
+   * @throws IOException if creating, writing or closing the file fails
+   */
+  private void writeSingleInputFile() throws IOException {
+    // try-with-resources rather than Closeables.close(writer, true): closing a
+    // buffered writer flushes it, so a failure on close must not be swallowed
+    // or the fixture file could be silently truncated.
+    try (Writer writer =
+             new BufferedWriter(new OutputStreamWriter(fs.create(tempInputFile), Charsets.UTF_8))) {
+      for (String[] entry : ClassifierData.DATA) {
+        writer.write(entry[0] + '\t' + entry[1] + '\n');
+      }
+    }
+  }
+
+  @Test
+  public void testSplitDirectory() throws Exception {
+
+    writeMultipleInputFiles();
+
+    final int testSplitSize = 1;
+    si.setTestSplitSize(testSplitSize);
+    si.setCallback(new SplitInput.SplitCallback() {
+          @Override
+          public void splitComplete(Path inputFile, int lineCount, int 
trainCount, int testCount, int testSplitStart) {
+            int trainingLines = countMap.get(inputFile.getName()) - 
testSplitSize;
+            assertSplit(fs, inputFile, charset, testSplitSize, trainingLines, 
tempTrainingDirectory, tempTestDirectory);
+          }
+    });
+
+    si.splitDirectory(tempInputDirectory);
+  }
+
+  @Test
+  public void testSplitFile() throws Exception {
+    writeSingleInputFile();
+    si.setTestSplitSize(2);
+    si.setCallback(new TestCallback(2, 10));
+    si.splitFile(tempInputFile);
+  }
+
+  @Test
+  public void testSplitFileLocation() throws Exception {
+    writeSingleInputFile();
+    si.setTestSplitSize(2);
+    si.setSplitLocation(50);
+    si.setCallback(new TestCallback(2, 10));
+    si.splitFile(tempInputFile);
+  }
+
+  @Test
+  public void testSplitFilePct() throws Exception {
+    writeSingleInputFile();
+    si.setTestSplitPct(25);
+
+    si.setCallback(new TestCallback(3, 9));
+    si.splitFile(tempInputFile);
+  }
+
+  @Test
+  public void testSplitFilePctLocation() throws Exception {
+    writeSingleInputFile();
+    si.setTestSplitPct(25);
+    si.setSplitLocation(50);
+    si.setCallback(new TestCallback(3, 9));
+    si.splitFile(tempInputFile);
+  }
+
+  @Test
+  public void testSplitFileRandomSelectionSize() throws Exception {
+    writeSingleInputFile();
+    si.setTestRandomSelectionSize(5);
+
+    si.setCallback(new TestCallback(5, 7));
+    si.splitFile(tempInputFile);
+  }
+
+  @Test
+  public void testSplitFileRandomSelectionPct() throws Exception {
+    writeSingleInputFile();
+    si.setTestRandomSelectionPct(25);
+
+    si.setCallback(new TestCallback(3, 9));
+    si.splitFile(tempInputFile);
+  }
+
+  /**
+   * Writes a SequenceFile named {@code part-00000} under {@code path} with
+   * IntWritable keys and VectorWritable values. Record {@code i} has key
+   * {@code i} and a cardinality-4 sparse vector on which {@code assign(i)}
+   * was called.
+   * @param path directory in which the test SequenceFile is created
+   * @param testPoints number of records in test SequenceFile
+   */
+  private void writeVectorSequenceFile(Path path, int testPoints) throws IOException {
+    Path sequenceFile = new Path(path, "part-00000");
+    Configuration conf = getConfiguration();
+    // The key and value holders are reused for every appended record.
+    IntWritable recordKey = new IntWritable();
+    VectorWritable recordValue = new VectorWritable();
+    try (SequenceFile.Writer out =
+             SequenceFile.createWriter(fs, conf, sequenceFile, IntWritable.class, VectorWritable.class)) {
+      for (int point = 0; point < testPoints; point++) {
+        Vector vector = new SequentialAccessSparseVector(4);
+        vector.assign(point);
+        recordKey.set(point);
+        recordValue.set(vector);
+        out.append(recordKey, recordValue);
+      }
+    }
+  }
+
+  /**
+   * Writes a SequenceFile named {@code part-00000} under {@code path} with
+   * Text keys and Text values. Record {@code i} has the decimal string of
+   * {@code i} as key and {@code "Line " + i} as value.
+   * @param path directory in which the test SequenceFile is created
+   * @param testPoints number of records in test SequenceFile
+   */
+  private void writeTextSequenceFile(Path path, int testPoints) throws IOException {
+    Path sequenceFile = new Path(path, "part-00000");
+    Configuration conf = getConfiguration();
+    // The key and value holders are reused for every appended record.
+    Text recordKey = new Text();
+    Text recordValue = new Text();
+    try (SequenceFile.Writer out =
+             SequenceFile.createWriter(fs, conf, sequenceFile, Text.class, Text.class)) {
+      for (int point = 0; point < testPoints; point++) {
+        recordKey.set(Integer.toString(point));
+        recordValue.set("Line " + point);
+        out.append(recordKey, recordValue);
+      }
+    }
+  }
+
+  /**
+   * Prints every record of a SequenceFile to standard output, one
+   * tab-separated key/value pair per line.
+   * @param sequenceFilePath path to SequenceFile
+   */
+  private void displaySequenceFile(Path sequenceFilePath) throws IOException {
+    for (Pair<?,?> entry : new SequenceFileIterable<>(sequenceFilePath, true, getConfiguration())) {
+      String line = entry.getFirst() + "\t" + entry.getSecond();
+      System.out.println(line);
+    }
+  }
+
+  /**
+   * Counts the records of a SequenceFile by iterating over all of its values.
+   * @param sequenceFilePath path to SequenceFile
+   * @return number of records
+   */
+  private int getNumberRecords(Path sequenceFilePath) throws IOException {
+    int count = 0;
+    // Only the number of values matters; the values themselves are not inspected.
+    for (Object ignored : new SequenceFileValueIterable<>(sequenceFilePath, true, getConfiguration())) {
+      count += 1;
+    }
+    return count;
+  }
+
+  /**
+   * Map-reduce split, invoked through the API, on 1000 records of
+   * Text/Text key-value pairs.
+   */
+  @Test
+  public void testSplitInputMapReduceText() throws Exception {
+    final int numPoints = 1000;
+    writeTextSequenceFile(tempSequenceDirectory, numPoints);
+    testSplitInputMapReduce(numPoints);
+  }
+
+  /**
+   * Map-reduce split, invoked through the command line, on 1000 records of
+   * Text/Text key-value pairs.
+   */
+  @Test
+  public void testSplitInputMapReduceTextCli() throws Exception {
+    final int numPoints = 1000;
+    writeTextSequenceFile(tempSequenceDirectory, numPoints);
+    testSplitInputMapReduceCli(numPoints);
+  }
+
+  /**
+   * Map-reduce split, invoked through the API, on 1000 records of
+   * IntWritable/VectorWritable key-value pairs.
+   */
+  @Test
+  public void testSplitInputMapReduceVector() throws Exception {
+    final int numPoints = 1000;
+    writeVectorSequenceFile(tempSequenceDirectory, numPoints);
+    testSplitInputMapReduce(numPoints);
+  }
+
+  /**
+   * Map-reduce split, invoked through the command line, on 1000 records of
+   * IntWritable/VectorWritable key-value pairs.
+   */
+  @Test
+  public void testSplitInputMapReduceVectorCli() throws Exception {
+    final int numPoints = 1000;
+    writeVectorSequenceFile(tempSequenceDirectory, numPoints);
+    testSplitInputMapReduceCli(numPoints);
+  }
+
+  /**
+   * Test map reduce version of split input through CLI
+   */
+  private void testSplitInputMapReduceCli(int numPoints) throws Exception {
+    int randomSelectionPct = 25;
+    int keepPct = 10;
+    String[] args =
+        { "--method", "mapreduce", "--input", tempSequenceDirectory.toString(),
+            "--mapRedOutputDir", tempMapRedOutputDirectory.toString(),
+            "--randomSelectionPct", Integer.toString(randomSelectionPct),
+            "--keepPct", Integer.toString(keepPct), "-ow" };
+    ToolRunner.run(getConfiguration(), new SplitInput(), args);
+    validateSplitInputMapReduce(numPoints, randomSelectionPct, keepPct);
+  }
+
+  /**
+   * Test map reduce version of split input through method call
+   */
+  private void testSplitInputMapReduce(int numPoints) throws Exception {
+    int randomSelectionPct = 25;
+    si.setTestRandomSelectionPct(randomSelectionPct);
+    int keepPct = 10;
+    si.setKeepPct(keepPct);
+    si.setMapRedOutputDirectory(tempMapRedOutputDirectory);
+    si.setUseMapRed(true);
+    si.splitDirectory(getConfiguration(), tempSequenceDirectory);
+
+    validateSplitInputMapReduce(numPoints, randomSelectionPct, keepPct);
+  }
+
+  /**
+   * Validates that the number of test records and the number of training
+   * records in the map-reduce output are consistent with keepPct and
+   * randomSelectionPct, allowing a deviation of 2 records each. Both output
+   * files are also printed to standard output.
+   */
+  private void validateSplitInputMapReduce(int numPoints, int randomSelectionPct, int keepPct)
+      throws IOException {
+    Path testPath = new Path(tempMapRedOutputDirectory, "test-r-00000");
+    Path trainingPath = new Path(tempMapRedOutputDirectory, "training-r-00000");
+    int numberTestRecords = getNumberRecords(testPath);
+    int numberTrainingRecords = getNumberRecords(trainingPath);
+
+    System.out.printf("Test data: %d records\n", numberTestRecords);
+    displaySequenceFile(testPath);
+    System.out.printf("Training data: %d records\n", numberTrainingRecords);
+    displaySequenceFile(trainingPath);
+
+    double testFraction = randomSelectionPct / 100.0;
+    double keepFraction = keepPct / 100.0;
+    double expectedTestRecords = testFraction * keepFraction * numPoints;
+    double expectedTrainingRecords = (1 - testFraction) * keepFraction * numPoints;
+
+    assertEquals(expectedTestRecords, numberTestRecords, 2);
+    assertEquals(expectedTrainingRecords, numberTrainingRecords, 2);
+  }
+
+  /**
+   * Walks {@link SplitInput#validate()} through a series of configurations,
+   * expecting an IllegalArgumentException for each incomplete or conflicting
+   * one and no exception for each complete one.
+   */
+  @Test
+  public void testValidate() throws Exception {
+    // Nothing configured: expected to be rejected.
+    SplitInput st = new SplitInput();
+    assertValidateException(st);
+
+    // Split size alone, no output directories: still expected to be rejected.
+    st.setTestSplitSize(100);
+    assertValidateException(st);
+
+    // Test output directory set, training output directory still missing.
+    st.setTestOutputDirectory(tempTestDirectory);
+    assertValidateException(st);
+
+    // Split size plus both output directories: expected to validate.
+    st.setTrainingOutputDirectory(tempTrainingDirectory);
+    st.validate();
+
+    // Adding a split percentage on top of the split size is expected to be rejected.
+    st.setTestSplitPct(50);
+    assertValidateException(st);
+
+    // Fresh instance: random selection percentage plus both directories validates.
+    st = new SplitInput();
+    st.setTestRandomSelectionPct(50);
+    st.setTestOutputDirectory(tempTestDirectory);
+    st.setTrainingOutputDirectory(tempTrainingDirectory);
+    st.validate();
+
+    // Random selection percentage combined with a split percentage is expected to be rejected.
+    st.setTestSplitPct(50);
+    assertValidateException(st);
+
+    // Fresh instance again, same valid random-selection configuration.
+    st = new SplitInput();
+    st.setTestRandomSelectionPct(50);
+    st.setTestOutputDirectory(tempTestDirectory);
+    st.setTrainingOutputDirectory(tempTrainingDirectory);
+    st.validate();
+
+    // Random selection percentage combined with a split size is expected to be rejected.
+    st.setTestSplitSize(100);
+    assertValidateException(st);
+  }
+
+  private class TestCallback implements SplitInput.SplitCallback {
+    private final int testSplitSize;
+    private final int trainingLines;
+
+    private TestCallback(int testSplitSize, int trainingLines) {
+      this.testSplitSize = testSplitSize;
+      this.trainingLines = trainingLines;
+    }
+
+    @Override
+    public void splitComplete(Path inputFile, int lineCount, int trainCount, 
int testCount, int testSplitStart) {
+      assertSplit(fs, tempInputFile, charset, testSplitSize, trainingLines, 
tempTrainingDirectory, tempTestDirectory);
+    }
+  }
+
+  /**
+   * Asserts that {@code st.validate()} throws an IllegalArgumentException;
+   * fails the test if it returns normally.
+   */
+  private static void assertValidateException(SplitInput st) throws IOException {
+    boolean rejected = false;
+    try {
+      st.validate();
+    } catch (IllegalArgumentException expected) {
+      // good
+      rejected = true;
+    }
+    if (!rejected) {
+      fail("Expected IllegalArgumentException");
+    }
+  }
+
+  /**
+   * Asserts the line counts of the test and training files that share the
+   * input file's name inside the test and training directories. An
+   * IOException while counting fails the test.
+   */
+  private static void assertSplit(FileSystem fs,
+                                  Path tempInputFile,
+                                  Charset charset,
+                                  int testSplitSize,
+                                  int trainingLines,
+                                  Path tempTrainingDirectory,
+                                  Path tempTestDirectory) {
+    String fileName = tempInputFile.getName();
+    try {
+      Path testFile = new Path(tempTestDirectory, fileName);
+      int actualTestLines = SplitInput.countLines(fs, testFile, charset);
+      assertEquals("test line count", testSplitSize, actualTestLines);
+
+      Path trainingFile = new Path(tempTrainingDirectory, fileName);
+      int actualTrainingLines = SplitInput.countLines(fs, trainingFile, charset);
+      assertEquals("training line count", trainingLines, actualTrainingLines);
+    } catch (IOException ioe) {
+      fail(ioe.toString());
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/email/MailProcessorTest.java
----------------------------------------------------------------------
diff --git 
a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/email/MailProcessorTest.java
 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/email/MailProcessorTest.java
new file mode 100644
index 0000000..c519f85
--- /dev/null
+++ 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/email/MailProcessorTest.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.email;
+
+import java.io.File;
+import java.io.StringWriter;
+import java.net.URL;
+import java.util.regex.Pattern;
+
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Test;
+
+/** Tests {@link MailProcessor} against the {@code test.mbox} classpath resource. */
+public final class MailProcessorTest extends MahoutTestCase {
+
+  /** Quoted line present in the mbox bodies; used to check quoted-text stripping. */
+  private static final String QUOTED_LINE = "> Cocoon Cron Block Configurable Clustering";
+
+  /**
+   * Creates the options common to both tests: ":::" separator, UTF-8 charset,
+   * the given header patterns and the working directory as input.
+   */
+  private static MailOptions newOptions(Pattern... patterns) {
+    MailOptions options = new MailOptions();
+    options.setSeparator(":::");
+    options.setCharset(Charsets.UTF_8);
+    options.setPatternsToMatch(patterns);
+    options.setInput(new File(System.getProperty("user.dir")));
+    return options;
+  }
+
+  /** Resolves the {@code test.mbox} classpath resource to a file. */
+  private static File testMbox() throws Exception {
+    URL url = MailProcessorTest.class.getClassLoader().getResource("test.mbox");
+    return new File(url.toURI());
+  }
+
+  /** Parsing with the from/subject/to patterns is expected to report 7 messages. */
+  @Test
+  public void testLabel() throws Exception {
+    StringWriter output = new StringWriter();
+    MailOptions options =
+        newOptions(MailProcessor.FROM_PREFIX, MailProcessor.SUBJECT_PREFIX, MailProcessor.TO_PREFIX);
+    MailProcessor processor = new MailProcessor(options, "", output);
+
+    long count = processor.parseMboxLineByLine(testMbox());
+    assertEquals(7, count);
+  }
+
+  /**
+   * With bodies included, the quoted line appears in the output; once
+   * quoted-text stripping is switched on it must not. Both passes are
+   * expected to report 7 messages.
+   */
+  @Test
+  public void testStripQuoted() throws Exception {
+    StringWriter withQuotes = new StringWriter();
+    MailOptions options = newOptions(MailProcessor.SUBJECT_PREFIX);
+    options.setIncludeBody(true);
+    MailProcessor processor = new MailProcessor(options, "", withQuotes);
+    File mbox = testMbox();
+
+    long count = processor.parseMboxLineByLine(mbox);
+    assertEquals(7, count);
+    assertTrue(withQuotes.getBuffer().toString().contains(QUOTED_LINE));
+
+    StringWriter stripped = new StringWriter();
+    processor = new MailProcessor(options, "", stripped);
+    // As in the original flow, stripping is enabled on the shared options
+    // after the second processor has been constructed.
+    options.setStripQuotedText(true);
+    count = processor.parseMboxLineByLine(mbox);
+    assertEquals(7, count);
+    assertFalse(stripped.getBuffer().toString().contains(QUOTED_LINE));
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
----------------------------------------------------------------------
diff --git 
a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
new file mode 100644
index 0000000..4fdbbbc
--- /dev/null
+++ 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
@@ -0,0 +1,154 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.mahout.utils.nlp.collocations.llr;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharsetEncoder;
+
+import org.apache.commons.io.Charsets;
+import org.apache.hadoop.util.bloom.BloomFilter;
+import org.apache.hadoop.util.bloom.Filter;
+import org.apache.hadoop.util.bloom.Key;
+import org.apache.hadoop.util.hash.Hash;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Test;
+
+/**
+ * Exercises a Hadoop {@link BloomFilter} directly and through {@link BloomTokenFilter} wrapped
+ * around whitespace-tokenized and shingled token streams.
+ */
+public final class BloomTokenFilterTest extends MahoutTestCase {
+
+  private static final CharsetEncoder encoder = Charsets.UTF_8.newEncoder();
+
+  private static final String input = "The best of times the worst of times";
+  private static final String[] allTokens = {
+      "The", "best", "of", "times", "the", "worst", "of", "times"
+  };
+  private static final String[] expectedNonKeepTokens = { "best", "times", "the", "worst", "times" };
+  private static final String[] expectedKeepTokens = { "The", "of", "of" };
+  private static final String[] filterTokens    = { "The", "of" };
+  private static final String[] notFilterTokens = { "best", "worst", "the", "times"};
+  private static final String[] shingleKeepTokens = {
+      "The best", "best of times", "the worst", "worst of times", "of times"
+  };
+  private static final String[] expectedShingleTokens = {
+      "The best", "best of times", "of times", "the worst", "worst of times", "of times"
+  };
+
+  /** test standalone filter without tokenfilter wrapping */
+  @Test
+  public void testFilter() throws IOException {
+    Filter filter = getFilter(filterTokens);
+    Key k = new Key();
+    for (String s : filterTokens) {
+      setKey(k, s);
+      assertTrue("Key for string " + s + " should be filter member", filter.membershipTest(k));
+    }
+
+    for (String s : notFilterTokens) {
+      setKey(k, s);
+      assertFalse("Key for string " + s + " should not be filter member", filter.membershipTest(k));
+    }
+  }
+
+  /** normal case, unfiltered analyzer */
+  @Test
+  public void testAnalyzer() throws IOException {
+    Reader reader = new StringReader(input);
+    // try-with-resources closes the stream and the analyzer even when an assertion fails
+    try (Analyzer analyzer = new WhitespaceAnalyzer();
+         TokenStream ts = analyzer.tokenStream(null, reader)) {
+      ts.reset();
+      validateTokens(allTokens, ts);
+      ts.end();
+    }
+  }
+
+  /** filtered analyzer */
+  @Test
+  public void testNonKeepdAnalyzer() throws IOException {
+    Reader reader = new StringReader(input);
+    try (Analyzer analyzer = new WhitespaceAnalyzer();
+         TokenStream ts = analyzer.tokenStream(null, reader)) {
+      ts.reset();
+      TokenStream f = new BloomTokenFilter(getFilter(filterTokens), false /* toss matching tokens */, ts);
+      validateTokens(expectedNonKeepTokens, f);
+      ts.end();
+    }
+  }
+
+  /** keep analyzer */
+  @Test
+  public void testKeepAnalyzer() throws IOException {
+    Reader reader = new StringReader(input);
+    try (Analyzer analyzer = new WhitespaceAnalyzer();
+         TokenStream ts = analyzer.tokenStream(null, reader)) {
+      ts.reset();
+      TokenStream f = new BloomTokenFilter(getFilter(filterTokens), true /* keep matching tokens */, ts);
+      validateTokens(expectedKeepTokens, f);
+      ts.end();
+    }
+  }
+
+  /** shingles, keep those matching whitelist */
+  @Test
+  public void testShingleFilteredAnalyzer() throws IOException {
+    Reader reader = new StringReader(input);
+    try (Analyzer analyzer = new WhitespaceAnalyzer();
+         TokenStream ts = analyzer.tokenStream(null, reader)) {
+      ts.reset();
+      ShingleFilter sf = new ShingleFilter(ts, 3);
+      TokenStream f = new BloomTokenFilter(getFilter(shingleKeepTokens),  true, sf);
+      validateTokens(expectedShingleTokens, f);
+      ts.end();
+    }
+  }
+
+  /**
+   * Encodes {@code s} with the shared UTF-8 encoder and stores the resulting buffer's backing
+   * array in {@code k} with weight 1.0.
+   */
+  private static void setKey(Key k, String s) throws IOException {
+    ByteBuffer buffer = encoder.encode(CharBuffer.wrap(s.toCharArray()));
+    // NOTE(review): array() hands over the whole backing array, not just the bytes up to limit();
+    // left unchanged because the keys presumably have to hash the same way BloomTokenFilter
+    // builds them - verify before trimming.
+    k.set(buffer.array(), 1.0);
+  }
+
+  /**
+   * Consumes {@code ts} and asserts that it yields exactly the terms in {@code expected}, in order.
+   * The caller is responsible for reset(), end() and close().
+   */
+  private static void validateTokens(String[] expected, TokenStream ts) throws IOException {
+    // The attribute instance is shared across tokens, so it is looked up once.
+    CharTermAttribute termAttr = ts.getAttribute(CharTermAttribute.class);
+    int pos = 0;
+    while (ts.incrementToken()) {
+      // Strictly less-than: pos == expected.length would index past the end of the array
+      // and surface as ArrayIndexOutOfBoundsException instead of this assertion message.
+      assertTrue("Analyzer produced too many tokens", pos < expected.length);
+      assertEquals("Unexpected term", expected[pos++], termAttr.toString());
+    }
+    assertEquals("Analyzer produced too few terms", expected.length, pos);
+  }
+
+  /**
+   * Builds a Bloom filter (constructor arguments 100, 50 and Jenkins hashing) holding one key
+   * per entry of {@code tokens}.
+   */
+  private static Filter getFilter(String[] tokens) throws IOException {
+    Filter filter = new BloomFilter(100,50, Hash.JENKINS_HASH);
+    Key k = new Key();
+    for (String s : tokens) {
+      setKey(k, s);
+      filter.add(k);
+    }
+    return filter;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/regex/RegexMapperTest.java
----------------------------------------------------------------------
diff --git 
a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/regex/RegexMapperTest.java
 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/regex/RegexMapperTest.java
new file mode 100644
index 0000000..8ab643b
--- /dev/null
+++ 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/regex/RegexMapperTest.java
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.regex;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.common.DummyRecordWriter;
+import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Test;
+
+import java.util.List;
+
+/**
+ * Feeds the sample lines from {@link RegexUtilsTest#TEST_STRS} through {@link RegexMapper} under
+ * three configurations and checks what ends up in a {@link DummyRecordWriter}.
+ */
+public final class RegexMapperTest extends MahoutTestCase {
+
+  /**
+   * Query-extraction regex with URL decoding: every line with a non-empty gold value must emit
+   * exactly that value.
+   */
+  @Test
+  public void testRegex() throws Exception {
+    RegexMapper mapper = new RegexMapper();
+    Configuration conf = getConfiguration();
+    conf.set(RegexMapper.REGEX, "(?<=(\\?|&)q=).*?(?=&|$)");
+    conf.set(RegexMapper.TRANSFORMER_CLASS, URLDecodeTransformer.class.getName());
+    DummyRecordWriter<LongWritable, Text> mapWriter = new DummyRecordWriter<>();
+    Mapper<LongWritable, Text, LongWritable, Text>.Context mapContext =
+        DummyRecordWriter.build(mapper, conf, mapWriter);
+    mapper.setup(mapContext);
+
+    int index = 0;
+    for (String line : RegexUtilsTest.TEST_STRS) {
+      LongWritable key = new LongWritable(index);
+      mapper.map(key, new Text(line), mapContext);
+      List<Text> emitted = mapWriter.getValue(key);
+      String gold = RegexUtilsTest.GOLD[index];
+      index++;
+      if (gold.isEmpty()) {
+        // nothing is asserted for lines whose gold value is empty
+        continue;
+      }
+      assertEquals(1, emitted.size());
+      assertEquals(gold, emitted.get(0).toString());
+    }
+  }
+
+  /**
+   * Dotted-number regex keeping groups 1 and 3: every sample line must emit {@code "127 0"}.
+   */
+  @Test
+  public void testGroups() throws Exception {
+    RegexMapper mapper = new RegexMapper();
+    Configuration conf = getConfiguration();
+    conf.set(RegexMapper.REGEX, "(\\d+)\\.(\\d+)\\.(\\d+)");
+    conf.set(RegexMapper.TRANSFORMER_CLASS, URLDecodeTransformer.class.getName());
+    conf.setStrings(RegexMapper.GROUP_MATCHERS, "1", "3");
+    DummyRecordWriter<LongWritable, Text> mapWriter = new DummyRecordWriter<>();
+    Mapper<LongWritable, Text, LongWritable, Text>.Context mapContext =
+        DummyRecordWriter.build(mapper, conf, mapWriter);
+    mapper.setup(mapContext);
+
+    int index = 0;
+    for (String line : RegexUtilsTest.TEST_STRS) {
+      LongWritable key = new LongWritable(index++);
+      mapper.map(key, new Text(line), mapContext);
+      List<Text> emitted = mapWriter.getValue(key);
+      assertEquals(1, emitted.size());
+      assertEquals("127 0", emitted.get(0).toString());
+    }
+  }
+
+  /**
+   * Same regex as {@link #testRegex()} with {@link FPGFormatter} configured: the emitted value
+   * must equal the gold value passed through a fresh {@code FPGFormatter}.
+   */
+  @Test
+  public void testFPGFormatter() throws Exception {
+    RegexMapper mapper = new RegexMapper();
+    Configuration conf = getConfiguration();
+    conf.set(RegexMapper.REGEX, "(?<=(\\?|&)q=).*?(?=&|$)");
+    conf.set(RegexMapper.TRANSFORMER_CLASS, URLDecodeTransformer.class.getName());
+    conf.set(RegexMapper.FORMATTER_CLASS, FPGFormatter.class.getName());
+    DummyRecordWriter<LongWritable, Text> mapWriter = new DummyRecordWriter<>();
+    Mapper<LongWritable, Text, LongWritable, Text>.Context mapContext =
+        DummyRecordWriter.build(mapper, conf, mapWriter);
+    mapper.setup(mapContext);
+
+    RegexFormatter formatter = new FPGFormatter();
+    int index = 0;
+    for (String line : RegexUtilsTest.TEST_STRS) {
+      LongWritable key = new LongWritable(index);
+      mapper.map(key, new Text(line), mapContext);
+      List<Text> emitted = mapWriter.getValue(key);
+      String gold = RegexUtilsTest.GOLD[index];
+      index++;
+      if (gold.isEmpty()) {
+        // nothing is asserted for lines whose gold value is empty
+        continue;
+      }
+      assertEquals(1, emitted.size());
+      assertEquals(formatter.format(gold), emitted.get(0).toString());
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java
----------------------------------------------------------------------
diff --git 
a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java
 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java
new file mode 100644
index 0000000..8ae10a5
--- /dev/null
+++ 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.regex;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.regex.Pattern;
+
+import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Test;
+
+/**
+ * Tests {@link RegexUtils#extract} against sample Solr access-log lines. {@link #TEST_STRS} and
+ * {@link #GOLD} are package-private so that other tests in this package can reuse them.
+ */
+public final class RegexUtilsTest extends MahoutTestCase {
+
+  /** Sample access-log lines; entry {@code i} pairs with {@code GOLD[i]}. */
+  static final String[] TEST_STRS = {
+          "127.0.0.1 -  -  [01/10/2011:00:01:51 +0000] \"GET /solr/collection1/browse?q=foo&rows=10&wt=json&hl=true&hl.fl=body&hl.fl=content",
+          "127.0.0.1 -  -  [01/10/2011:00:20:58 +0000] \"GET /solr/collection1/browse?q=Using+Solr+Search+RDBMS&fq=%7B%21tag%3Dsource%7D%28%28source%3Alucid+AND+lucid_facet%3A%28site%29%29%29&rows=10",
+          "127.0.0.1 -  -  [01/10/2011:00:21:21 +0000] \"GET /solr/collection1/browse?q=language+detection&start=560&rows=10 HTTP/1.1\" 200 45071",
+          "127.0.0.1 -  -  [01/10/2011:00:21:21 +0000] \"GET /solr/collection1/browse?q=&start=560&rows=10 HTTP/1.1\" 200 45071"
+  };
+  /** Expected URL-decoded {@code q=} value for each entry of {@link #TEST_STRS}. */
+  static final String[] GOLD = {"foo", "Using Solr Search RDBMS", "language detection", ""};
+
+  /**
+   * Checks extraction of the {@code q=} parameter (raw and URL-decoded), of an alternation that
+   * matches both {@code q=} and {@code start=}, and of an explicitly selected capture group.
+   */
+  @Test
+  public void testExtract() throws Exception {
+    Pattern pattern = Pattern.compile("(?<=(\\?|&)q=).*?(?=&|$)");
+    String line = "127.0.0.1 -  -  [24/05/2010:01:19:22 +0000] \"GET /solr/select?q=import statement&start=1 HTTP/1.1\" 200 37571";
+    String res = RegexUtils.extract(line, pattern, Collections.<Integer>emptyList(), " ", RegexUtils.IDENTITY_TRANSFORMER);
+    // Two-argument form: the actual value is already part of JUnit's failure output, so it
+    // should not be passed again as the message.
+    assertEquals("import statement", res);
+
+    for (int i = 0; i < TEST_STRS.length; i++) {
+      String testStr = TEST_STRS[i];
+      res = RegexUtils.extract(testStr, pattern, Collections.<Integer>emptyList(), " ", new URLDecodeTransformer());
+      assertEquals(GOLD[i], res);
+    }
+
+    pattern = Pattern.compile("((?<=(\\?|&)q=)(.*?)(?=(&|$))|(?<=((\\?|&)start=))(\\d+))");
+    res = RegexUtils.extract(line, pattern, Collections.<Integer>emptyList(), " ", RegexUtils.IDENTITY_TRANSFORMER);
+    assertEquals("import statement 1", res);
+
+    pattern = Pattern.compile("(start=1) HTTP");
+    Collection<Integer> groupsToKeep = new ArrayList<>();
+    groupsToKeep.add(1);
+    res = RegexUtils.extract(line, pattern, groupsToKeep, " ", RegexUtils.IDENTITY_TRANSFORMER);
+    assertEquals("start=1", res);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/RandomVectorIterable.java
----------------------------------------------------------------------
diff --git 
a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/RandomVectorIterable.java
 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/RandomVectorIterable.java
new file mode 100644
index 0000000..2ddce14
--- /dev/null
+++ 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/RandomVectorIterable.java
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors;
+
+import java.util.Iterator;
+import java.util.Random;
+
+import com.google.common.base.Function;
+import com.google.common.collect.Iterators;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.iterator.CountingIterator;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.function.DoubleFunction;
+
+/**
+ * An {@link Iterable} whose iterators yield {@code numItems} vectors, each of cardinality
+ * {@code numItems}, with values assigned from {@link Random#nextDouble()}.
+ */
+public final class RandomVectorIterable implements Iterable<Vector> {
+
+  /** Backing implementation used for each generated vector. */
+  public enum VectorType {DENSE, SPARSE}
+
+  private final int numItems;
+  private final VectorType type;
+
+  /** Creates an iterable of 100 sparse vectors. */
+  public RandomVectorIterable() {
+    this(100, VectorType.SPARSE);
+  }
+
+  /** Creates an iterable of {@code numItems} sparse vectors. */
+  public RandomVectorIterable(int numItems) {
+    this(numItems, VectorType.SPARSE);
+  }
+
+  /** Creates an iterable of {@code numItems} vectors of the given {@code type}. */
+  public RandomVectorIterable(int numItems, VectorType type) {
+    this.numItems = numItems;
+    this.type = type;
+  }
+
+  /**
+   * Returns a new iterator backed by its own {@link Random} obtained from
+   * {@link RandomUtils#getRandom()}; vectors are built lazily as the iterator advances.
+   */
+  @Override
+  public Iterator<Vector> iterator() {
+    final Random generator = RandomUtils.getRandom();
+
+    // Ignores the current cell value and replaces it with the next random double.
+    final DoubleFunction randomFill = new DoubleFunction() {
+      @Override
+      public double apply(double ignored) {
+        return generator.nextDouble();
+      }
+    };
+
+    Function<Integer, Vector> toRandomVector = new Function<Integer, Vector>() {
+      @Override
+      public Vector apply(Integer dummy) {
+        Vector result;
+        if (type == VectorType.SPARSE) {
+          result = new RandomAccessSparseVector(numItems);
+        } else {
+          result = new DenseVector(numItems);
+        }
+        result.assign(randomFill);
+        return result;
+      }
+    };
+
+    return Iterators.transform(new CountingIterator(numItems), toRandomVector);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java
----------------------------------------------------------------------
diff --git 
a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java
 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java
new file mode 100644
index 0000000..c55fd8d
--- /dev/null
+++ 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors;
+
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Tests for {@link VectorHelper}: JSON rendering, top-entry selection and loading a term
+ * dictionary from sequence files written in {@link #setUp()}.
+ */
+public final class VectorHelperTest extends MahoutTestCase {
+
+  private static final int NUM_DOCS = 100;
+
+  private Path inputPathOne;
+  private Path inputPathTwo;
+
+  private Configuration conf;
+
+  /** Writes the two dictionary sequence files used by {@link #testLoadTermDictionary()}. */
+  @Override
+  @Before
+  public void setUp() throws Exception {
+    super.setUp();
+    conf = getConfiguration();
+
+    // Make all indices higher than dictionary size
+    inputPathOne = getTestTempFilePath("documents/docs-one.file");
+    writeDocuments(inputPathOne, NUM_DOCS);
+
+    // Keep indices within number of documents
+    inputPathTwo = getTestTempFilePath("documents/docs-two.file");
+    writeDocuments(inputPathTwo, 0);
+  }
+
+  /**
+   * Writes {@code NUM_DOCS} (Text, IntWritable) records to {@code target}; each value is
+   * {@code indexOffset} plus a random int below {@code NUM_DOCS}.
+   */
+  private void writeDocuments(Path target, int indexOffset) throws Exception {
+    FileSystem fs = FileSystem.get(target.toUri(), conf);
+    try (SequenceFile.Writer writer =
+             new SequenceFile.Writer(fs, conf, target, Text.class, IntWritable.class)) {
+      Random rd = RandomUtils.getRandom();
+      for (int doc = 0; doc < NUM_DOCS; doc++) {
+        writer.append(new Text("Document::ID::" + doc), new IntWritable(indexOffset + rd.nextInt(NUM_DOCS)));
+      }
+    }
+  }
+
+  /** Checks sorted and unsorted JSON output for several entry limits. */
+  @Test
+  public void testJsonFormatting() throws Exception {
+    Vector v = new SequentialAccessSparseVector(10);
+    v.set(2, 3.1);
+    v.set(4, 1.0);
+    v.set(6, 8.1);
+    v.set(7, -100);
+    v.set(9, 12.2);
+    String unused = "UNUSED";
+    String[] dictionary = {
+        unused, unused, "two", unused, "four", unused, "six", "seven", unused, "nine"
+    };
+
+    String sortedMessage = "sorted json form incorrect: ";
+    String unsortedMessage = "unsorted form incorrect: ";
+
+    assertEquals(sortedMessage, "{nine:12.2,six:8.1,two:3.1}",
+        VectorHelper.vectorToJson(v, dictionary, 3, true));
+    assertEquals(unsortedMessage, "{two:3.1,four:1.0}",
+        VectorHelper.vectorToJson(v, dictionary, 2, false));
+    assertEquals(sortedMessage, "{nine:12.2,six:8.1,two:3.1,four:1.0}",
+        VectorHelper.vectorToJson(v, dictionary, 4, true));
+    assertEquals(sortedMessage, "{nine:12.2,six:8.1,two:3.1,four:1.0,seven:-100.0}",
+        VectorHelper.vectorToJson(v, dictionary, 5, true));
+    assertEquals(sortedMessage, "{nine:12.2,six:8.1}",
+        VectorHelper.vectorToJson(v, dictionary, 2, true));
+    assertEquals(unsortedMessage, "{two:3.1,four:1.0}",
+        VectorHelper.vectorToJson(v, dictionary, 2, false));
+  }
+
+  /** Compares the size of the top-entries result with the number of non-zero elements. */
+  @Test
+  public void testTopEntries() throws Exception {
+    Vector v = new SequentialAccessSparseVector(10);
+    v.set(2, 3.1);
+    v.set(4, 1.0);
+    v.set(6, 8.1);
+    v.set(7, -100);
+    v.set(9, 12.2);
+    v.set(1, 0.0);
+    v.set(3, 0.0);
+    v.set(8, 2.7);
+
+    // maxEntries equal to the number of non-zero elements
+    int sizeAtSix = VectorHelper.topEntries(v, 6).size();
+    assertEquals(6, sizeAtSix);
+
+    // maxEntries larger than the number of non-zero elements
+    int sizeAtNine = VectorHelper.topEntries(v, 9).size();
+    assertTrue(sizeAtNine < 9);
+
+    // maxEntries smaller than the number of non-zero elements
+    int sizeAtFive = VectorHelper.topEntries(v, 5).size();
+    assertTrue(sizeAtFive < v.getNumNonZeroElements());
+  }
+
+  /** A vector whose explicitly set values are all zero must yield no top entries. */
+  @Test
+  public void testTopEntriesWhenAllZeros() throws Exception {
+    Vector v = new SequentialAccessSparseVector(10);
+    v.set(2, 0.0);
+    v.set(4, 0.0);
+    v.set(6, 0.0);
+    v.set(7, 0);
+    v.set(9, 0.0);
+    v.set(1, 0.0);
+    v.set(3, 0.0);
+    v.set(8, 0.0);
+
+    int topCount = VectorHelper.topEntries(v, 6).size();
+    assertEquals(0, topCount);
+  }
+
+  /** Loads both dictionaries; the test passes as long as neither call throws. */
+  @Test
+  public void testLoadTermDictionary() throws Exception {
+    // With indices higher than dictionary size
+    String oversizedIndices = inputPathOne.toString();
+    VectorHelper.loadTermDictionary(conf, oversizedIndices);
+    // With dictionary size higher than indices
+    String boundedIndices = inputPathTwo.toString();
+    VectorHelper.loadTermDictionary(conf, boundedIndices);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFTypeTest.java
----------------------------------------------------------------------
diff --git 
a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFTypeTest.java
 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFTypeTest.java
new file mode 100644
index 0000000..2ea8b89
--- /dev/null
+++ 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFTypeTest.java
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.arff;
+
+import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Test;
+
+/** Tests for {@link ARFFType#removeQuotes(String)}. */
+public final class ARFFTypeTest extends MahoutTestCase {
+
+  /** Checks null, empty, blank, quoted and padded inputs against their expected results. */
+  @Test
+  public void removeQuotes() {
+    assertNull(ARFFType.removeQuotes(null));
+
+    // Each row is { raw input, expected result }.
+    String[][] cases = {
+        {"\"\"", ""},
+        {"''", ""},
+        {"", ""},
+        {"  ", ""},
+        {"'single'", "single"},
+        {"\"double\"", "double"},
+        {" trim ", "trim"},
+    };
+    for (String[] pair : cases) {
+      assertEquals(pair[1], ARFFType.removeQuotes(pair[0]));
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
----------------------------------------------------------------------
diff --git 
a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
new file mode 100644
index 0000000..4c7f17a
--- /dev/null
+++ 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
@@ -0,0 +1,289 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.arff;
+
+import java.io.IOException;
+import java.text.DateFormat;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.Iterator;
+import java.util.Locale;
+import java.util.Map;
+
+import com.google.common.io.Resources;
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.junit.Test;
+
+public final class ARFFVectorIterableTest extends MahoutTestCase {
+
+  /**
+   * Loads {@code sample.arff} and checks the relation name, the number of label bindings, and the
+   * type and first two values of each of the three vectors the iterable yields.
+   */
+  @Test
+  public void testValues() throws Exception {
+    ARFFVectorIterable arff = readModelFromResource("sample.arff");
+
+    assertEquals("Mahout", arff.getModel().getRelation());
+    Map<String, Integer> labelBindings = arff.getModel().getLabelBindings();
+    assertNotNull(labelBindings);
+    assertEquals(5, labelBindings.size());
+
+    Iterator<Vector> rows = arff.iterator();
+    // the first two rows are expected to be dense, the third one sparse
+    checkNextRow(rows, DenseVector.class, 1.0, 2.0);
+    checkNextRow(rows, DenseVector.class, 2.0, 3.0);
+    checkNextRow(rows, RandomAccessSparseVector.class, 5.0, 23.0);
+
+    assertFalse(rows.hasNext());
+  }
+
+  /**
+   * Takes the next vector from {@code rows} and asserts that it exists, is an instance of
+   * {@code expectedType}, and holds the given values at indices 0 and 1.
+   */
+  private void checkNextRow(Iterator<Vector> rows,
+                            Class<? extends Vector> expectedType,
+                            double expectedFirst,
+                            double expectedSecond) {
+    assertTrue(rows.hasNext());
+    Vector row = rows.next();
+    assertNotNull(row);
+    assertTrue("Wrong instanceof", expectedType.isInstance(row));
+    assertEquals(expectedFirst, row.get(0), EPSILON);
+    assertEquals(expectedSecond, row.get(1), EPSILON);
+  }
+
+  /**
+   * Loads {@code sample-dense.arff}, checks selected values of its first vector and that exactly
+   * five vectors are produced, all of them {@link DenseVector}s.
+   */
+  @Test
+  public void testDense() throws Exception {
+    Iterable<Vector> vectors = readModelFromResource("sample-dense.arff");
+
+    Vector head = vectors.iterator().next();
+    assertEquals(1.0, head.get(0), 0);
+    assertEquals(65.0, head.get(1), 0);
+    assertEquals(1.0, head.get(3), 0);
+    assertEquals(1.0, head.get(4), 0);
+
+    // walk the data again from the start with a fresh iterator
+    int seen = 0;
+    Iterator<Vector> it = vectors.iterator();
+    while (it.hasNext()) {
+      assertTrue("Vector is not dense", it.next() instanceof DenseVector);
+      seen++;
+    }
+    assertEquals(5, seen);
+  }
+
+  /**
+   * Loads {@code sample-sparse.arff}, checks selected values of its first vector and that exactly
+   * nine vectors are produced, all of them {@link RandomAccessSparseVector}s.
+   */
+  @Test
+  public void testSparse() throws Exception {
+    Iterable<Vector> iterable = readModelFromResource("sample-sparse.arff");
+
+    Vector firstVector = iterable.iterator().next();
+    assertEquals(23.1, firstVector.get(1), 0);
+    assertEquals(3.23, firstVector.get(2), 0);
+    assertEquals(1.2, firstVector.get(3), 0);
+
+    int count = 0;
+    for (Vector vector : iterable) {
+      // The failure message previously said "not dense" although the check is for a sparse vector.
+      assertTrue("Vector is not sparse", vector instanceof RandomAccessSparseVector);
+      count++;
+    }
+    assertEquals(9, count);
+  }
+
+  /**
+   * Parses {@code non-numeric-1.arff} twice into the same {@link MapBackedARFFModel} and checks the
+   * vector count and type, one value of the first vector, and the sizes of the model's nominal,
+   * type, word and date-format maps.
+   */
+  @Test
+  public void testNonNumeric() throws Exception {
+    MapBackedARFFModel model = new MapBackedARFFModel();
+    ARFFVectorIterable iterable = getVectors("non-numeric-1.arff", model);
+    int count = 0;
+    for (Vector vector : iterable) {
+      // The failure message previously said "not dense" although the check is for a sparse vector.
+      assertTrue("Vector is not sparse", vector instanceof RandomAccessSparseVector);
+      count++;
+    }
+
+    // Second pass over the same resource, reusing the model populated above.
+    iterable = getVectors("non-numeric-1.arff", model);
+    Iterator<Vector> iter = iterable.iterator();
+    Vector firstVector = iter.next();
+
+    assertEquals(1.0, firstVector.get(2), 0);
+
+    assertEquals(10, count);
+    Map<String, Map<String, Integer>> nominalMap = iterable.getModel().getNominalMap();
+    assertNotNull(nominalMap);
+    assertEquals(1, nominalMap.size());
+    Map<String, Integer> noms = nominalMap.get("bar");
+    assertNotNull("nominals for bar are null", noms);
+    assertEquals(5, noms.size());
+    Map<Integer, ARFFType> integerARFFTypeMap = model.getTypeMap();
+    assertNotNull("Type map null", integerARFFTypeMap);
+    assertEquals(5, integerARFFTypeMap.size());
+    Map<String, Long> words = model.getWords();
+    assertNotNull("words null", words);
+    assertEquals(10, words.size());
+    Map<Integer, DateFormat> integerDateFormatMap = model.getDateMap();
+    assertNotNull("date format null", integerDateFormatMap);
+    assertEquals(1, integerDateFormatMap.size());
+  }
+
+  /**
+   * Loads {@code date.arff} and checks that entries 1 to 6 of its first vector equal the epoch
+   * milliseconds obtained by parsing the expected date text with the matching
+   * {@link SimpleDateFormat} pattern.
+   */
+  @Test
+  public void testDate() throws Exception {
+    Vector row = readModelFromResource("date.arff").iterator().next();
+
+    assertEquals(epochMillis("yyyy-MM-dd'T'HH:mm:ss", "2001-07-04T12:08:56"), row.get(1), 0);
+    assertEquals(epochMillis("yyyy.MM.dd G 'at' HH:mm:ss z", "2001.07.04 AD at 12:08:56 PDT"), row.get(2), 0);
+    assertEquals(epochMillis("EEE, MMM d, ''yy", "Wed, Jul 4, '01,4 0:08 PM, PDT"), row.get(3), 0);
+    assertEquals(epochMillis("K:mm a, z", "0:08 PM, PDT"), row.get(4), 0);
+    assertEquals(epochMillis("yyyyy.MMMMM.dd GGG hh:mm aaa", "02001.July.04 AD 12:08 PM"), row.get(5), 0);
+    assertEquals(epochMillis("EEE, d MMM yyyy HH:mm:ss Z", "Wed, 4 Jul 2001 12:08:56 -0700"), row.get(6), 0);
+  }
+
+  /**
+   * Parses {@code text} with an English-locale {@link SimpleDateFormat} built from {@code pattern}
+   * and returns the resulting time in milliseconds since the epoch.
+   */
+  private static long epochMillis(String pattern, String text) throws Exception {
+    DateFormat format = new SimpleDateFormat(pattern, Locale.ENGLISH);
+    return format.parse(text).getTime();
+  }
+
+  /**
+   * Parses {@code non-numeric-1.arff} and then {@code non-numeric-2.arff} into the same
+   * {@link MapBackedARFFModel}, checking after each file that every vector is sparse and that the
+   * model's nominal map has the expected number of entries.
+   */
+  @Test
+  public void testMultipleNoms() throws Exception {
+    MapBackedARFFModel model = new MapBackedARFFModel();
+    ARFFVectorIterable iterable = getVectors("non-numeric-1.arff", model);
+    int count = 0;
+    for (Vector vector : iterable) {
+      // The failure message previously said "not dense" although the check is for a sparse vector.
+      assertTrue("Vector is not sparse", vector instanceof RandomAccessSparseVector);
+      count++;
+    }
+    assertEquals(10, count);
+    Map<String,Map<String,Integer>> nominalMap = iterable.getModel().getNominalMap();
+    assertNotNull(nominalMap);
+    assertEquals(1, nominalMap.size());
+    Map<String,Integer> noms = nominalMap.get("bar");
+    assertNotNull("nominals for bar are null", noms);
+    assertEquals(5, noms.size());
+    Map<Integer,ARFFType> integerARFFTypeMap = model.getTypeMap();
+    assertNotNull("Type map null", integerARFFTypeMap);
+    assertEquals(5, integerARFFTypeMap.size());
+    Map<String,Long> words = model.getWords();
+    assertNotNull("words null", words);
+    assertEquals(10, words.size());
+
+    Map<Integer,DateFormat> integerDateFormatMap = model.getDateMap();
+    assertNotNull("date format null", integerDateFormatMap);
+    assertEquals(1, integerDateFormatMap.size());
+
+
+    // Second file, parsed into the model that already holds the first file's state.
+    iterable = getVectors("non-numeric-2.arff", model);
+    // The vectors are only type-checked here; the old per-vector counter was never asserted on
+    // and has been dropped.
+    for (Vector vector : iterable) {
+      assertTrue("Vector is not sparse", vector instanceof RandomAccessSparseVector);
+    }
+    nominalMap = model.getNominalMap();
+    assertNotNull(nominalMap);
+    assertEquals(2, nominalMap.size());
+    noms = nominalMap.get("test");
+    // The message used to name attribute "bar" although "test" is the one being looked up.
+    assertNotNull("nominals for test are null", noms);
+    assertEquals(2, noms.size());
+  }
+
+  @Test
+  public void testNumerics() throws Exception {
+    String arff = "@RELATION numerics\n"
+      + "@ATTRIBUTE theNumeric NUMERIC\n"
+      + "@ATTRIBUTE theInteger INTEGER\n"
+      + "@ATTRIBUTE theReal REAL\n"
+      + "@DATA\n"
+      + "1.0,2,3.0";
+    ARFFModel model = new MapBackedARFFModel();
+    ARFFVectorIterable iterable = new ARFFVectorIterable(arff, model);
+    model = iterable.getModel();
+    assertNotNull(model);
+    assertEquals(3, model.getLabelSize());
+    assertEquals(ARFFType.NUMERIC, model.getARFFType(0));
+    assertEquals(ARFFType.INTEGER, model.getARFFType(1));
+    assertEquals(ARFFType.REAL, model.getARFFType(2));
+    Iterator<Vector> it = iterable.iterator();
+    Vector vector = it.next();
+    assertEquals(1.0, vector.get(0), EPSILON);
+    assertEquals(2.0, vector.get(1), EPSILON);
+    assertEquals(3.0, vector.get(2), EPSILON);
+  }
+
+  @Test
+  public void testQuotes() throws Exception {
+    // ARFF allows quotes on identifiers
+    ARFFModel model = new MapBackedARFFModel();
+    ARFFVectorIterable iterable = getVectors("quoted-id.arff", model);
+    model = iterable.getModel();
+    assertNotNull(model);
+    assertEquals("quotes", model.getRelation());
+
+    // check attribute labels
+    assertEquals(4, model.getLabelSize());
+    assertEquals(ARFFType.NUMERIC, model.getARFFType(0));
+    assertEquals(ARFFType.INTEGER, model.getARFFType(1));
+    assertEquals(ARFFType.REAL, model.getARFFType(2));
+    assertEquals(ARFFType.NOMINAL, model.getARFFType(3));
+
+    Map<String, Integer> labelBindings = model.getLabelBindings();
+    assertTrue(labelBindings.keySet().contains("thenumeric"));
+    assertTrue(labelBindings.keySet().contains("theinteger"));
+    assertTrue(labelBindings.keySet().contains("thereal"));
+    assertTrue(labelBindings.keySet().contains("thenominal"));
+
+    // check nominal values
+    Map<String, Integer> nominalMap = model.getNominalMap().get("thenominal");
+    assertNotNull(nominalMap);
+    assertEquals(3, nominalMap.size());
+    assertTrue(nominalMap.keySet().contains("double-quote"));
+    assertTrue(nominalMap.keySet().contains("single-quote"));
+    assertTrue(nominalMap.keySet().contains("no-quote"));
+
+    // check data values
+    Iterator<Vector> it = iterable.iterator();
+    Vector vector = it.next();
+    assertEquals(nominalMap.get("no-quote"), vector.get(3), EPSILON);
+    assertEquals(nominalMap.get("single-quote"), it.next().get(3), EPSILON);
+    assertEquals(nominalMap.get("double-quote"), it.next().get(3), EPSILON);
+  }
+
+  /**
+   * Reads the named classpath resource as UTF-8 text and wraps it in an {@link ARFFVectorIterable}
+   * backed by the supplied model.
+   *
+   * @param resourceName name of the resource to load
+   * @param model model handed to the {@link ARFFVectorIterable} constructor
+   * @return the iterable created from the resource text
+   * @throws IOException if the resource cannot be read
+   */
+  static ARFFVectorIterable getVectors(String resourceName, ARFFModel model) throws IOException {
+    java.net.URL location = Resources.getResource(resourceName);
+    String arffText = Resources.toString(location, Charsets.UTF_8);
+    return new ARFFVectorIterable(arffText, model);
+  }
+
+  /**
+   * Convenience wrapper around {@link #getVectors(String, ARFFModel)} that supplies a fresh
+   * {@link MapBackedARFFModel}.
+   */
+  private static ARFFVectorIterable readModelFromResource(String resourceName) throws IOException {
+    return getVectors(resourceName, new MapBackedARFFModel());
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/DriverTest.java
----------------------------------------------------------------------
diff --git 
a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/DriverTest.java
 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/DriverTest.java
new file mode 100644
index 0000000..7e7623e
--- /dev/null
+++ 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/DriverTest.java
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.utils.vectors.arff;
+
+import java.io.IOException;
+import java.io.StringWriter;
+
+import com.google.common.io.Resources;
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Test;
+
+/**
+ * Test case for {@link Driver}
+ */
+public class DriverTest extends MahoutTestCase {
+
+  /**
+   * Writes the label bindings obtained from {@code sample-dense.arff} as comma-separated text and
+   * expects the output to equal one of the two reference dictionaries.
+   */
+  @Test
+  public void dictionary() throws IOException {
+    ARFFModel model = new MapBackedARFFModel();
+    // The returned iterable is not used; presumably constructing it is what fills the model.
+    ARFFVectorIterableTest.getVectors("sample-dense.arff", model);
+    StringWriter writer = new StringWriter();
+    Driver.writeLabelBindings(writer, model, ",");
+    assertMatchesEither(writer.toString(), "expected-arff-dictionary.csv", "expected-arff-dictionary-2.csv");
+  }
+
+  /**
+   * Writes the label bindings obtained from {@code sample-dense.arff} as JSON and expects the
+   * output to equal one of the two reference schemas.
+   */
+  @Test
+  public void dictionaryJSON() throws IOException {
+    ARFFModel model = new MapBackedARFFModel();
+    // The returned iterable is not used; presumably constructing it is what fills the model.
+    ARFFVectorIterableTest.getVectors("sample-dense.arff", model);
+    StringWriter writer = new StringWriter();
+    Driver.writeLabelBindingsJSON(writer, model);
+    assertMatchesEither(writer.toString(), "expected-arff-schema.json", "expected-arff-schema-2.json");
+  }
+
+  /**
+   * Asserts that {@code actual} equals the UTF-8 content of at least one of the two named
+   * classpath resources. The failure message includes the actual text so a mismatch can be
+   * diagnosed from the test report.
+   */
+  private void assertMatchesEither(String actual, String firstResource, String secondResource)
+    throws IOException {
+    String expected1 = Resources.toString(Resources.getResource(firstResource), Charsets.UTF_8);
+    String expected2 = Resources.toString(Resources.getResource(secondResource), Charsets.UTF_8);
+    assertTrue("Output matches neither " + firstResource + " nor " + secondResource + ": " + actual,
+        expected1.equals(actual) || expected2.equals(actual));
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java
----------------------------------------------------------------------
diff --git 
a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java
 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java
new file mode 100644
index 0000000..2867640
--- /dev/null
+++ 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.arff;
+
+import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Test;
+
+import java.util.Map;
+
+/**
+ * Unit tests for {@link MapBackedARFFModel}, exercised through the {@link ARFFModel} interface.
+ */
+public class MapBackedARFFModelTest extends MahoutTestCase {
+
+  /**
+   * Adds three nominal values under one label and checks that the nominal map has a single entry
+   * for that label and returns the index registered for the first value.
+   */
+  @Test
+  public void processNominal() {
+    String windy = "windy";
+    String breezy = "breezy";
+
+    ARFFModel model = new MapBackedARFFModel();
+    model.addNominal(windy, breezy, 77);
+    model.addNominal(windy, "strong", 23);
+    model.addNominal(windy, "nuking", 55);
+    Map<String, Map<String, Integer>> nominalMap = model.getNominalMap();
+
+    assertEquals(1, nominalMap.size());
+    Map<String, Integer> windyValues = nominalMap.get(windy);
+    assertEquals(77, windyValues.get(breezy).intValue());
+  }
+
+  /** A non-numeric string looked up under a REAL-typed index must yield NaN. */
+  @Test
+  public void processBadNumeric() {
+    ARFFModel model = new MapBackedARFFModel();
+    model.addLabel("b1shkt70694difsmmmdv0ikmoh", 77);
+    model.addType(77, ARFFType.REAL);
+    assertTrue(Double.isNaN(model.getValue("b1shkt70694difsmmmdv0ikmoh", 77)));
+  }
+
+  /** Numeric strings looked up under INTEGER- and REAL-typed indices must yield their exact value. */
+  @Test
+  public void processGoodNumeric() {
+    ARFFModel model = new MapBackedARFFModel();
+    model.addLabel("1234", 77);
+    model.addType(77, ARFFType.INTEGER);
+    // assertEquals with a zero delta keeps the exact comparison but reports the actual value on failure.
+    assertEquals(1234.0, model.getValue("1234", 77), 0);
+    model.addLabel("131.34", 78);
+    model.addType(78, ARFFType.REAL);
+    assertEquals(131.34, model.getValue("131.34", 78), 0);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIteratorTest.java
----------------------------------------------------------------------
diff --git 
a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIteratorTest.java
 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIteratorTest.java
new file mode 100644
index 0000000..e76cf70
--- /dev/null
+++ 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIteratorTest.java
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.csv;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.io.StringWriter;
+import java.util.Iterator;
+
+import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.utils.vectors.RandomVectorIterable;
+import org.apache.mahout.utils.vectors.VectorHelper;
+import org.apache.mahout.utils.vectors.io.TextualVectorWriter;
+import org.junit.Test;
+
+/**
+ * Round-trip test for {@link CSVVectorIterator}.
+ */
+public class CSVVectorIteratorTest extends MahoutTestCase {
+
+  /**
+   * Writes the vectors of a {@code RandomVectorIterable(50)} as CSV text into a string and checks
+   * that {@link CSVVectorIterator} yields 50 vectors when reading that text back.
+   */
+  @Test
+  public void testCount() throws Exception {
+    StringWriter csvSink = new StringWriter();
+    // The anonymous subclass replaces the per-vector output with VectorHelper's CSV rendering.
+    try (TextualVectorWriter csvWriter = new TextualVectorWriter(csvSink) {
+      @Override
+      public void write(Vector vector) throws IOException {
+        getWriter().write(VectorHelper.vectorToCSVString(vector, false));
+      }
+    }) {
+      Iterable<Vector> randomVectors = new RandomVectorIterable(50);
+      csvWriter.write(randomVectors);
+    }
+
+    Iterator<Vector> parsed = new CSVVectorIterator(new StringReader(csvSink.toString()));
+    int seen = 0;
+    for (; parsed.hasNext(); seen++) {
+      parsed.next();
+    }
+    assertEquals(50, seen);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java
----------------------------------------------------------------------
diff --git 
a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java
 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java
new file mode 100644
index 0000000..e2f7032
--- /dev/null
+++ 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.io;
+
+import java.io.StringWriter;
+import java.util.ArrayList;
+import java.util.Collection;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.utils.vectors.RandomVectorIterable;
+import org.junit.Test;
+
+/**
+ * Tests for the sequence-file and textual vector writers.
+ */
+public final class VectorWriterTest extends MahoutTestCase {
+
+  /**
+   * Writes the vectors of a {@code RandomVectorIterable(50)} through a
+   * {@link SequenceFileVectorWriter} into a temporary sequence file and checks that the file then
+   * holds 50 records.
+   */
+  @Test
+  public void testSFVW() throws Exception {
+    Configuration conf = getConfiguration();
+    Path target = getTestTempFilePath("sfvw");
+    FileSystem fs = FileSystem.get(conf);
+    SequenceFile.Writer sequenceWriter =
+        new SequenceFile.Writer(fs, conf, target, LongWritable.class, VectorWritable.class);
+    try (SequenceFileVectorWriter vectorWriter = new SequenceFileVectorWriter(sequenceWriter)) {
+      vectorWriter.write(new RandomVectorIterable(50));
+    }
+
+    assertEquals(50, HadoopUtil.countRecords(target, conf));
+  }
+
+  /**
+   * Writes two dense vectors through a {@link TextualVectorWriter} and checks that some text was
+   * produced.
+   */
+  @Test
+  public void testTextOutputSize() throws Exception {
+    StringWriter sink = new StringWriter();
+    try (VectorWriter textWriter = new TextualVectorWriter(sink)) {
+      Collection<Vector> input = new ArrayList<>();
+      input.add(new DenseVector(new double[]{0.3, 1.5, 4.5}));
+      input.add(new DenseVector(new double[]{1.3, 1.5, 3.5}));
+      textWriter.write(input);
+    }
+    String written = sink.toString();
+    assertNotNull(written);
+    assertFalse(written.isEmpty());
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java
----------------------------------------------------------------------
diff --git 
a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java
 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java
new file mode 100644
index 0000000..890a14b
--- /dev/null
+++ 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.lucene;
+
+
+import java.io.IOException;
+
+import com.google.common.io.Closeables;
+
+import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Before;
+import org.junit.Test;
+
+public class CachedTermInfoTest extends MahoutTestCase {
+  private RAMDirectory directory;
+  private static final String[] DOCS = {
+          "a a b b c c",
+          "a b a b a b a b",
+          "a b a",
+          "a",
+          "b",
+          "a",
+          "a"
+  };
+
+  private static final String[] DOCS2 = {
+          "d d d d",
+          "e e e e",
+          "d e d e",
+          "d",
+          "e",
+          "d",
+          "e"
+  };
+
+  /**
+   * Creates a fresh in-memory directory and fills it through
+   * {@link #createTestIndex(FieldType, RAMDirectory, int)} using an unstored, tokenized field type
+   * without term vectors.
+   */
+  @Before
+  public void before() throws IOException {
+    directory = new RAMDirectory();
+
+    FieldType contentType = new FieldType();
+    contentType.setTokenized(true);
+    contentType.setStored(false);
+    contentType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    // no term vectors of any kind
+    contentType.setStoreTermVectors(false);
+    contentType.setStoreTermVectorPositions(false);
+    contentType.setStoreTermVectorOffsets(false);
+    contentType.freeze();
+
+    directory = createTestIndex(contentType, directory, 0);
+  }
+
+  /**
+   * Builds {@link CachedTermInfo} instances over the "content" field with different minDf and
+   * maxDfPercent settings and checks which terms each one retains.
+   */
+  @Test
+  public void test() throws Exception {
+    // try-with-resources: the reader used to be left open; now it is closed even if an assertion fails.
+    try (IndexReader reader = DirectoryReader.open(directory)) {
+      CachedTermInfo cti = new CachedTermInfo(reader, "content", 0, 100);
+      assertEquals(3, cti.totalTerms("content"));
+      assertNotNull(cti.getTermEntry("content", "a"));
+      assertNull(cti.getTermEntry("content", "e"));
+      //minDf
+      cti = new CachedTermInfo(reader, "content", 3, 100);
+      assertEquals(2, cti.totalTerms("content"));
+      assertNotNull(cti.getTermEntry("content", "a"));
+      assertNull(cti.getTermEntry("content", "c"));
+      //maxDFPercent, a is in 6 of 7 docs: numDocs * maxDfPercent / 100 < 6 to exclude,
+      //85% should suffice to exclude a
+      cti = new CachedTermInfo(reader, "content", 0, 85);
+      assertEquals(2, cti.totalTerms("content"));
+      assertNotNull(cti.getTermEntry("content", "b"));
+      assertNotNull(cti.getTermEntry("content", "c"));
+      assertNull(cti.getTermEntry("content", "a"));
+    }
+  }
+
+  /**
+   * Indexes one document per entry of {@code DOCS} into {@code directory}. Each document gets a
+   * stored "id" of the form {@code doc_<n>}, a "content" field taken from {@code DOCS} and a
+   * "content2" field taken from the same position of {@code DOCS2}.
+   *
+   * @param fieldType field type used for the "content" and "content2" fields
+   * @param directory directory the index is written to
+   * @param startingId offset added to the document position to form the id
+   * @return the {@code directory} that was passed in
+   * @throws IOException if writing or closing the index fails
+   */
+  static RAMDirectory createTestIndex(FieldType fieldType,
+                                      RAMDirectory directory,
+                                      int startingId) throws IOException {
+    IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(new WhitespaceAnalyzer()));
+
+    try {
+      int position = 0;
+      for (String content : DOCS) {
+        Document document = new Document();
+        document.add(new StringField("id", "doc_" + (position + startingId), Field.Store.YES));
+        document.add(new Field("content", content, fieldType));
+        document.add(new Field("content2", DOCS2[position], fieldType));
+        indexWriter.addDocument(document);
+        position++;
+      }
+    } finally {
+      // swallowIOException = false: a failure while closing is propagated to the caller
+      Closeables.close(indexWriter, false);
+    }
+    return directory;
+  }
+}

Reply via email to