http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/regex/RegexMapper.java
----------------------------------------------------------------------
diff --git 
a/integration/src/main/java/org/apache/mahout/utils/regex/RegexMapper.java 
b/integration/src/main/java/org/apache/mahout/utils/regex/RegexMapper.java
deleted file mode 100644
index 04cacaa..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/regex/RegexMapper.java
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.regex;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.regex.Pattern;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.mahout.common.ClassUtils;
-
-public class RegexMapper extends Mapper<LongWritable, Text, LongWritable, 
Text> {
-
-  public static final String REGEX = "regex";
-  public static final String GROUP_MATCHERS = "regex.groups";
-  public static final String TRANSFORMER_CLASS = "transformer.class";
-  public static final String FORMATTER_CLASS = "formatter.class";
-
-  private Pattern regex;
-  private List<Integer> groupsToKeep;
-  private RegexTransformer transformer = RegexUtils.IDENTITY_TRANSFORMER;
-  private RegexFormatter formatter = RegexUtils.IDENTITY_FORMATTER;
-  public static final String ANALYZER_NAME = "analyzerName";
-
-
-  @Override
-  protected void setup(Context context) throws IOException, 
InterruptedException {
-    groupsToKeep = new ArrayList<>();
-    Configuration config = context.getConfiguration();
-    String regexStr = config.get(REGEX);
-    regex = Pattern.compile(regexStr);
-    String[] groups = config.getStrings(GROUP_MATCHERS);
-    if (groups != null) {
-      for (String group : groups) {
-        groupsToKeep.add(Integer.parseInt(group));
-      }
-    }
-
-    transformer = ClassUtils.instantiateAs(config.get(TRANSFORMER_CLASS, 
IdentityTransformer.class.getName()),
-        RegexTransformer.class);
-    String analyzerName = config.get(ANALYZER_NAME);
-    if (analyzerName != null && transformer instanceof AnalyzerTransformer) {
-      Analyzer analyzer = ClassUtils.instantiateAs(analyzerName, 
Analyzer.class);
-      ((AnalyzerTransformer)transformer).setAnalyzer(analyzer);
-    }
-
-    formatter = ClassUtils.instantiateAs(config.get(FORMATTER_CLASS, 
IdentityFormatter.class.getName()),
-        RegexFormatter.class);
-  }
-
-
-  @Override
-  protected void map(LongWritable key, Text text, Context context) throws 
IOException, InterruptedException {
-    String result = RegexUtils.extract(text.toString(), regex, groupsToKeep, " 
", transformer);
-    if (!result.isEmpty()) {
-      String format = formatter.format(result);
-      context.write(key, new Text(format));
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/regex/RegexTransformer.java
----------------------------------------------------------------------
diff --git 
a/integration/src/main/java/org/apache/mahout/utils/regex/RegexTransformer.java 
b/integration/src/main/java/org/apache/mahout/utils/regex/RegexTransformer.java
deleted file mode 100644
index adbc98f..0000000
--- 
a/integration/src/main/java/org/apache/mahout/utils/regex/RegexTransformer.java
+++ /dev/null
@@ -1,27 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.regex;
-
-/**
- * Transforms the match of a regular expression.
- */
-public interface RegexTransformer {
-
-  String transformMatch(String match);
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/regex/RegexUtils.java
----------------------------------------------------------------------
diff --git 
a/integration/src/main/java/org/apache/mahout/utils/regex/RegexUtils.java 
b/integration/src/main/java/org/apache/mahout/utils/regex/RegexUtils.java
deleted file mode 100644
index 5e32b99..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/regex/RegexUtils.java
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.regex;
-
-import java.util.Collection;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-public final class RegexUtils {
-
-  public static final RegexTransformer IDENTITY_TRANSFORMER = new 
IdentityTransformer();
-  public static final RegexFormatter IDENTITY_FORMATTER = new 
IdentityFormatter();
-
-  private RegexUtils() {
-  }
-
-  public static String extract(CharSequence line, Pattern pattern, 
Collection<Integer> groupsToKeep,
-                               String separator, RegexTransformer transformer) 
{
-    StringBuilder bldr = new StringBuilder();
-    extract(line, bldr, pattern, groupsToKeep, separator, transformer);
-    return bldr.toString();
-  }
-
-  public static void extract(CharSequence line, StringBuilder outputBuffer,
-                             Pattern pattern, Collection<Integer> 
groupsToKeep, String separator,
-                             RegexTransformer transformer) {
-    if (transformer == null) {
-      transformer = IDENTITY_TRANSFORMER;
-    }
-    Matcher matcher = pattern.matcher(line);
-    String match;
-    if (groupsToKeep.isEmpty()) {
-      while (matcher.find()) {
-        match = matcher.group();
-        if (match != null) {
-          
outputBuffer.append(transformer.transformMatch(match)).append(separator);
-        }
-      }
-    } else {
-      while (matcher.find()) {
-        for (Integer groupNum : groupsToKeep) {
-          match = matcher.group(groupNum);
-          if (match != null) {
-            
outputBuffer.append(transformer.transformMatch(match)).append(separator);
-          }
-        }
-      }
-    }
-    //trim off the last separator, which is always there
-    if (outputBuffer.length() > 0) {
-      outputBuffer.setLength(outputBuffer.length() - separator.length());
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/regex/URLDecodeTransformer.java
----------------------------------------------------------------------
diff --git 
a/integration/src/main/java/org/apache/mahout/utils/regex/URLDecodeTransformer.java
 
b/integration/src/main/java/org/apache/mahout/utils/regex/URLDecodeTransformer.java
deleted file mode 100644
index 3eb7fc0..0000000
--- 
a/integration/src/main/java/org/apache/mahout/utils/regex/URLDecodeTransformer.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.regex;
-
-import java.io.UnsupportedEncodingException;
-import java.net.URLDecoder;
-
-public final class URLDecodeTransformer implements RegexTransformer {
-
-  private final String enc;
-
-  public URLDecodeTransformer() {
-    enc = "UTF-8";
-  }
-
-  public URLDecodeTransformer(String encoding) {
-    this.enc = encoding;
-  }
-
-  @Override
-  public String transformMatch(String match) {
-    try {
-      return URLDecoder.decode(match, enc);
-    } catch (UnsupportedEncodingException e) {
-      throw new IllegalStateException(e);
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java
----------------------------------------------------------------------
diff --git 
a/integration/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java 
b/integration/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java
deleted file mode 100644
index 13d61b8..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java
+++ /dev/null
@@ -1,99 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors;
-
-import java.util.List;
-import java.util.Map;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.mahout.common.AbstractJob;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.iterator.sequencefile.PathFilters;
-import org.apache.mahout.common.iterator.sequencefile.PathType;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
-import org.apache.mahout.math.VectorWritable;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Converts a vector representation of documents into a {@code document x 
terms} matrix.
- * The input data is in {@code SequenceFile<Text,VectorWritable>} format (as 
generated by
- * {@link org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles 
SparseVectorsFromSequenceFiles}
- * or by {@link org.apache.mahout.vectorizer.EncodedVectorsFromSequenceFiles 
EncodedVectorsFromSequenceFiles})
- * and generates the following two files as output:
- * <ul><li>A file called "matrix" of format {@code 
SequenceFile<IntWritable,VectorWritable>}.</li>
- * <li>A file called "docIndex" of format {@code 
SequenceFile<IntWritable,Text>}.</li></ul>
- * The input file can be regenerated by joining the two output files on the 
generated int key.
- * In other words, {@code RowIdJob} replaces the document text ids by integers.
- * The original document text ids can still be retrieved from the "docIndex".
- */
-public class RowIdJob extends AbstractJob {
-  private static final Logger log = LoggerFactory.getLogger(RowIdJob.class);
-
-  @Override
-  public int run(String[] args) throws Exception {
-
-    addInputOption();
-    addOutputOption();
-
-    Map<String, List<String>> parsedArgs = parseArguments(args);
-    if (parsedArgs == null) {
-      return -1;
-    }
-
-    Configuration conf = getConf();
-    FileSystem fs = FileSystem.get(conf);
-
-    Path outputPath = getOutputPath();
-    Path indexPath = new Path(outputPath, "docIndex");
-    Path matrixPath = new Path(outputPath, "matrix");
-
-    try (SequenceFile.Writer indexWriter = SequenceFile.createWriter(fs, conf, 
indexPath,
-        IntWritable.class, Text.class);
-         SequenceFile.Writer matrixWriter = SequenceFile.createWriter(fs, 
conf, matrixPath, IntWritable.class,
-             VectorWritable.class)) {
-      IntWritable docId = new IntWritable();
-      int i = 0;
-      int numCols = 0;
-      for (Pair<Text, VectorWritable> record
-          : new SequenceFileDirIterable<Text, VectorWritable>(getInputPath(), 
PathType.LIST, PathFilters.logsCRCFilter(),
-          null, true, conf)) {
-        VectorWritable value = record.getSecond();
-        docId.set(i);
-        indexWriter.append(docId, record.getFirst());
-        matrixWriter.append(docId, value);
-        i++;
-        numCols = value.get().size();
-      }
-
-      log.info("Wrote out matrix with {} rows and {} columns to {}", i, 
numCols, matrixPath);
-      return 0;
-    }
-  }
-
-  public static void main(String[] args) throws Exception {
-    ToolRunner.run(new RowIdJob(), args);
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java
----------------------------------------------------------------------
diff --git 
a/integration/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java 
b/integration/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java
deleted file mode 100644
index d74803f..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java
+++ /dev/null
@@ -1,46 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors;
-
-/**
- * Each entry in a {@link TermInfo} dictionary. Contains information about a 
term.
- */
-public class TermEntry {
-
-  private final String term;
-  private final int termIdx;
-  private final int docFreq;
-  
-  public TermEntry(String term, int termIdx, int docFreq) {
-    this.term = term;
-    this.termIdx = termIdx;
-    this.docFreq = docFreq;
-  }
-
-  public String getTerm() {
-    return term;
-  }
-
-  public int getTermIdx() {
-    return termIdx;
-  }
-
-  public int getDocFreq() {
-    return docFreq;
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/TermInfo.java
----------------------------------------------------------------------
diff --git 
a/integration/src/main/java/org/apache/mahout/utils/vectors/TermInfo.java 
b/integration/src/main/java/org/apache/mahout/utils/vectors/TermInfo.java
deleted file mode 100644
index 4fb36a3..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/TermInfo.java
+++ /dev/null
@@ -1,33 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors;
-
-import java.util.Iterator;
-
-/**
- * Contains the term dictionary information associated with a vectorized 
collection of text documents
- *
- */
-public interface TermInfo {
-  
-  int totalTerms(String field);
-  
-  TermEntry getTermEntry(String field, String term);
-  
-  Iterator<TermEntry> getAllEntries();
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
----------------------------------------------------------------------
diff --git 
a/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java 
b/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
deleted file mode 100644
index e1c3fbc..0000000
--- 
a/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
+++ /dev/null
@@ -1,266 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors;
-
-import com.google.common.collect.Sets;
-import com.google.common.io.Closeables;
-import com.google.common.io.Files;
-import org.apache.commons.io.Charsets;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.FileUtil;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
-import org.apache.mahout.common.AbstractJob;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.iterator.sequencefile.PathFilters;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
-import org.apache.mahout.math.NamedVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.util.Iterator;
-import java.util.Set;
-
-/**
- * Can read in a {@link org.apache.hadoop.io.SequenceFile} of {@link Vector}s 
and dump
- * out the results using {@link Vector#asFormatString()} to either the console 
or to a
- * file.
- */
-public final class VectorDumper extends AbstractJob {
-
-  private static final Logger log = 
LoggerFactory.getLogger(VectorDumper.class);
-
-  private VectorDumper() {
-  }
-
-  @Override
-  public int run(String[] args) throws Exception {
-    /**
-     Option seqOpt = 
obuilder.withLongName("seqFile").withRequired(false).withArgument(
-     
abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).withDescription(
-     "The Sequence File containing the Vectors").withShortName("s").create();
-     Option dirOpt = 
obuilder.withLongName("seqDirectory").withRequired(false).withArgument(
-     abuilder.withName("seqDirectory").withMinimum(1).withMaximum(1).create())
-     .withDescription("The directory containing Sequence File of Vectors")
-     .withShortName("d").create();
-     */
-    addInputOption();
-    addOutputOption();
-    addOption("useKey", "u", "If the Key is a vector than dump that instead");
-    addOption("printKey", "p", "Print out the key as well, delimited by tab 
(or the value if useKey is true");
-    addOption("dictionary", "d", "The dictionary file.", false);
-    addOption("dictionaryType", "dt", "The dictionary file type 
(text|seqfile)", false);
-    addOption("csv", "c", "Output the Vector as CSV.  Otherwise it substitutes 
in the terms for vector cell entries");
-    addOption("namesAsComments", "n", "If using CSV output, optionally add a 
comment line for each NamedVector "
-        + "(if the vector is one) printing out the name");
-    addOption("nameOnly", "N", "Use the name as the value for each NamedVector 
(skip other vectors)");
-    addOption("sortVectors", "sort", "Sort output key/value pairs of the 
vector entries in abs magnitude "
-        + "descending order");
-    addOption("quiet", "q", "Print only file contents");
-    addOption("sizeOnly", "sz", "Dump only the size of the vector");
-    addOption("numItems", "ni", "Output at most <n> vecors", false);
-    addOption("vectorSize", "vs", "Truncate vectors to <vs> length when 
dumping (most useful when in"
-        + " conjunction with -sort", false);
-    addOption(buildOption("filter", "fi", "Only dump out those vectors whose 
name matches the filter."
-        + "  Multiple items may be specified by repeating the argument.", 
true, 1, Integer.MAX_VALUE, false, null));
-
-    if (parseArguments(args, false, true) == null) {
-      return -1;
-    }
-
-    Path[] pathArr;
-    Configuration conf = new Configuration();
-    FileSystem fs = FileSystem.get(conf);
-    Path input = getInputPath();
-    FileStatus fileStatus = fs.getFileStatus(input);
-    if (fileStatus.isDir()) {
-      pathArr = FileUtil.stat2Paths(fs.listStatus(input, 
PathFilters.logsCRCFilter()));
-    } else {
-      FileStatus[] inputPaths = fs.globStatus(input);
-      pathArr = new Path[inputPaths.length];
-      int i = 0;
-      for (FileStatus fstatus : inputPaths) {
-        pathArr[i++] = fstatus.getPath();
-      }
-    }
-
-
-    String dictionaryType = getOption("dictionaryType", "text");
-
-    boolean sortVectors = hasOption("sortVectors");
-    boolean quiet = hasOption("quiet");
-    if (!quiet) {
-      log.info("Sort? {}", sortVectors);
-    }
-
-    String[] dictionary = null;
-    if (hasOption("dictionary")) {
-      String dictFile = getOption("dictionary");
-      switch (dictionaryType) {
-        case "text":
-          dictionary = VectorHelper.loadTermDictionary(new File(dictFile));
-          break;
-        case "sequencefile":
-          dictionary = VectorHelper.loadTermDictionary(conf, dictFile);
-          break;
-        default:
-          //TODO: support Lucene's FST as a dictionary type
-          throw new IOException("Invalid dictionary type: " + dictionaryType);
-      }
-    }
-
-    Set<String> filters;
-    if (hasOption("filter")) {
-      filters = Sets.newHashSet(getOptions("filter"));
-    } else {
-      filters = null;
-    }
-
-    boolean useCSV = hasOption("csv");
-
-    boolean sizeOnly = hasOption("sizeOnly");
-    boolean nameOnly = hasOption("nameOnly");
-    boolean namesAsComments = hasOption("namesAsComments");
-    boolean transposeKeyValue = hasOption("vectorAsKey");
-    Writer writer;
-    boolean shouldClose;
-    File output = getOutputFile();
-    if (output != null) {
-      shouldClose = true;
-      log.info("Output file: {}", output);
-      Files.createParentDirs(output);
-      writer = Files.newWriter(output, Charsets.UTF_8);
-    } else {
-      shouldClose = false;
-      writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
-    }
-    try {
-      boolean printKey = hasOption("printKey");
-      if (useCSV && dictionary != null) {
-        writer.write("#");
-        for (int j = 0; j < dictionary.length; j++) {
-          writer.write(dictionary[j]);
-          if (j < dictionary.length - 1) {
-            writer.write(',');
-          }
-        }
-        writer.write('\n');
-      }
-      Long numItems = null;
-      if (hasOption("numItems")) {
-        numItems = Long.parseLong(getOption("numItems"));
-        if (quiet) {
-          writer.append("#Max Items to dump: 
").append(String.valueOf(numItems)).append('\n');
-        }
-      }
-      int maxIndexesPerVector = hasOption("vectorSize")
-          ? Integer.parseInt(getOption("vectorSize"))
-          : Integer.MAX_VALUE;
-      long itemCount = 0;
-      int fileCount = 0;
-      for (Path path : pathArr) {
-        if (numItems != null && numItems <= itemCount) {
-          break;
-        }
-        if (quiet) {
-          log.info("Processing file '{}' ({}/{})", path, ++fileCount, 
pathArr.length);
-        }
-        SequenceFileIterable<Writable, Writable> iterable = new 
SequenceFileIterable<>(path, true, conf);
-        Iterator<Pair<Writable, Writable>> iterator = iterable.iterator();
-        long i = 0;
-        while (iterator.hasNext() && (numItems == null || itemCount < 
numItems)) {
-          Pair<Writable, Writable> record = iterator.next();
-          Writable keyWritable = record.getFirst();
-          Writable valueWritable = record.getSecond();
-          if (printKey) {
-            Writable notTheVectorWritable = transposeKeyValue ? valueWritable 
: keyWritable;
-            writer.write(notTheVectorWritable.toString());
-            writer.write('\t');
-          }
-          Vector vector;
-          try {
-            vector = ((VectorWritable)
-                (transposeKeyValue ? keyWritable : valueWritable)).get();
-          } catch (ClassCastException e) {
-            if ((transposeKeyValue ? keyWritable : valueWritable)
-                instanceof WeightedPropertyVectorWritable) {
-              vector =
-                  ((WeightedPropertyVectorWritable)
-                      (transposeKeyValue ? keyWritable : 
valueWritable)).getVector();
-            } else {
-              throw e;
-            }
-          }
-          if (filters == null
-              || !(vector instanceof NamedVector)
-              || filters.contains(((NamedVector) vector).getName())) {
-            if (sizeOnly) {
-              if (vector instanceof NamedVector) {
-                writer.write(((NamedVector) vector).getName());
-                writer.write(":");
-              } else {
-                writer.write(String.valueOf(i++));
-                writer.write(":");
-              }
-              writer.write(String.valueOf(vector.size()));
-              writer.write('\n');
-            } else if (nameOnly) {
-              if (vector instanceof NamedVector) {
-                writer.write(((NamedVector) vector).getName());
-                writer.write('\n');
-              }
-            } else {
-              String fmtStr;
-              if (useCSV) {
-                fmtStr = VectorHelper.vectorToCSVString(vector, 
namesAsComments);
-              } else {
-                fmtStr = VectorHelper.vectorToJson(vector, dictionary, 
maxIndexesPerVector,
-                    sortVectors);
-              }
-              writer.write(fmtStr);
-              writer.write('\n');
-            }
-            itemCount++;
-          }
-        }
-      }
-      writer.flush();
-    } finally {
-      if (shouldClose) {
-        Closeables.close(writer, false);
-      }
-    }
-
-    return 0;
-  }
-
-  public static void main(String[] args) throws Exception {
-    ToolRunner.run(new Configuration(), new VectorDumper(), args);
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
----------------------------------------------------------------------
diff --git 
a/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java 
b/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
deleted file mode 100644
index 66c3fb6..0000000
--- 
a/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
+++ /dev/null
@@ -1,256 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors;
-
-import com.google.common.base.Function;
-import com.google.common.collect.Collections2;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.lucene.util.PriorityQueue;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.iterator.FileLineIterator;
-import org.apache.mahout.common.iterator.sequencefile.PathType;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
-import org.apache.mahout.math.NamedVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.Vector.Element;
-import org.apache.mahout.math.map.OpenObjectIntHashMap;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.Iterator;
-import java.util.List;
-import java.util.regex.Pattern;
-
-/** Static utility methods related to vectors. */
-public final class VectorHelper {
-
-  private static final Pattern TAB_PATTERN = Pattern.compile("\t");
-
-
-  private VectorHelper() {
-  }
-
-  public static String vectorToCSVString(Vector vector, boolean 
namesAsComments) throws IOException {
-    Appendable bldr = new StringBuilder(2048);
-    vectorToCSVString(vector, namesAsComments, bldr);
-    return bldr.toString();
-  }
-
-  public static String buildJson(Iterable<Pair<String, Double>> iterable) {
-    return buildJson(iterable, new StringBuilder(2048));
-  }
-
-  public static String buildJson(Iterable<Pair<String, Double>> iterable, 
StringBuilder bldr) {
-    bldr.append('{');
-    for (Pair<String, Double> p : iterable) {
-      bldr.append(p.getFirst());
-      bldr.append(':');
-      bldr.append(p.getSecond());
-      bldr.append(',');
-    }
-    if (bldr.length() > 1) {
-      bldr.setCharAt(bldr.length() - 1, '}');
-    }
-    return bldr.toString();
-  }
-
-  public static List<Pair<Integer, Double>> topEntries(Vector vector, int 
maxEntries) {
-
-    // Get the size of nonZero elements in the input vector
-    int sizeOfNonZeroElementsInVector = vector.getNumNonZeroElements();
-
-    // If the sizeOfNonZeroElementsInVector < maxEntries then set maxEntries = 
sizeOfNonZeroElementsInVector
-    // otherwise the call to queue.pop() returns a Pair(null, null) and the 
subsequent call
-    // to pair.getFirst() throws a NullPointerException
-    if (sizeOfNonZeroElementsInVector < maxEntries) {
-      maxEntries = sizeOfNonZeroElementsInVector;
-    }
-
-    PriorityQueue<Pair<Integer, Double>> queue = new TDoublePQ<>(-1, 
maxEntries);
-    for (Element e : vector.nonZeroes()) {
-      queue.insertWithOverflow(Pair.of(e.index(), e.get()));
-    }
-    List<Pair<Integer, Double>> entries = new ArrayList<>();
-    Pair<Integer, Double> pair;
-    while ((pair = queue.pop()) != null) {
-      if (pair.getFirst() > -1) {
-        entries.add(pair);
-      }
-    }
-    Collections.sort(entries, new Comparator<Pair<Integer, Double>>() {
-      @Override
-      public int compare(Pair<Integer, Double> a, Pair<Integer, Double> b) {
-        return b.getSecond().compareTo(a.getSecond());
-      }
-    });
-    return entries;
-  }
-
-  public static List<Pair<Integer, Double>> firstEntries(Vector vector, int 
maxEntries) {
-    List<Pair<Integer, Double>> entries = new ArrayList<>();
-    Iterator<Vector.Element> it = vector.nonZeroes().iterator();
-    int i = 0;
-    while (it.hasNext() && i++ < maxEntries) {
-      Vector.Element e = it.next();
-      entries.add(Pair.of(e.index(), e.get()));
-    }
-    return entries;
-  }
-
-  public static List<Pair<String, Double>> 
toWeightedTerms(Collection<Pair<Integer, Double>> entries,
-                                                           final String[] 
dictionary) {
-    if (dictionary != null) {
-      return new ArrayList<>(Collections2.transform(entries,
-        new Function<Pair<Integer, Double>, Pair<String, Double>>() {
-          @Override
-          public Pair<String, Double> apply(Pair<Integer, Double> p) {
-            return Pair.of(dictionary[p.getFirst()], p.getSecond());
-          }
-        }));
-    } else {
-      return new ArrayList<>(Collections2.transform(entries,
-        new Function<Pair<Integer, Double>, Pair<String, Double>>() {
-          @Override
-          public Pair<String, Double> apply(Pair<Integer, Double> p) {
-            return Pair.of(Integer.toString(p.getFirst()), p.getSecond());
-          }
-        }));
-    }
-  }
-
-  public static String vectorToJson(Vector vector, String[] dictionary, int 
maxEntries, boolean sort) {
-    return buildJson(toWeightedTerms(sort
-            ? topEntries(vector, maxEntries)
-            : firstEntries(vector, maxEntries), dictionary));
-  }
-
-  public static void vectorToCSVString(Vector vector,
-                                       boolean namesAsComments,
-                                       Appendable bldr) throws IOException {
-    if (namesAsComments && vector instanceof NamedVector) {
-      bldr.append('#').append(((NamedVector) vector).getName()).append('\n');
-    }
-    Iterator<Vector.Element> iter = vector.all().iterator();
-    boolean first = true;
-    while (iter.hasNext()) {
-      if (first) {
-        first = false;
-      } else {
-        bldr.append(',');
-      }
-      Vector.Element elt = iter.next();
-      bldr.append(String.valueOf(elt.get()));
-    }
-    bldr.append('\n');
-  }
-
-  /**
-   * Read in a dictionary file. Format is:
-   * <p/>
-   * <pre>
-   * term DocFreq Index
-   * </pre>
-   */
-  public static String[] loadTermDictionary(File dictFile) throws IOException {
-    try (InputStream in = new FileInputStream(dictFile)) {
-      return loadTermDictionary(in);
-    }
-  }
-
-  /**
-   * Read a dictionary in {@link org.apache.hadoop.io.SequenceFile} generated 
by
-   * {@link org.apache.mahout.vectorizer.DictionaryVectorizer}
-   *
-   * @param filePattern <PATH TO DICTIONARY>/dictionary.file-*
-   */
-  public static String[] loadTermDictionary(Configuration conf, String 
filePattern) {
-    OpenObjectIntHashMap<String> dict = new OpenObjectIntHashMap<>();
-    int maxIndexValue = 0;
-    for (Pair<Text, IntWritable> record
-        : new SequenceFileDirIterable<Text, IntWritable>(new 
Path(filePattern), PathType.GLOB, null, null, true,
-                                                         conf)) {
-      dict.put(record.getFirst().toString(), record.getSecond().get());
-      if (record.getSecond().get() > maxIndexValue) {
-        maxIndexValue = record.getSecond().get();
-      }
-    }
-    // Set dictionary size to greater of (maxIndexValue + 1, dict.size())
-    int maxDictionarySize = maxIndexValue + 1 > dict.size() ? maxIndexValue + 
1 : dict.size();
-    String[] dictionary = new String[maxDictionarySize];
-    for (String feature : dict.keys()) {
-      dictionary[dict.get(feature)] = feature;
-    }
-    return dictionary;
-  }
-
-  /**
-   * Read in a dictionary file. Format is: First line is the number of entries
-   * <p/>
-   * <pre>
-   * term DocFreq Index
-   * </pre>
-   */
-  private static String[] loadTermDictionary(InputStream is) throws 
IOException {
-    FileLineIterator it = new FileLineIterator(is);
-
-    int numEntries = Integer.parseInt(it.next());
-    String[] result = new String[numEntries];
-
-    while (it.hasNext()) {
-      String line = it.next();
-      if (line.startsWith("#")) {
-        continue;
-      }
-      String[] tokens = TAB_PATTERN.split(line);
-      if (tokens.length < 3) {
-        continue;
-      }
-      int index = Integer.parseInt(tokens[2]); // tokens[1] is the doc freq
-      result[index] = tokens[0];
-    }
-    return result;
-  }
-
-  private static final class TDoublePQ<T> extends PriorityQueue<Pair<T, 
Double>> {
-    private final T sentinel;
-
-    private TDoublePQ(T sentinel, int size) {
-      super(size);
-      this.sentinel = sentinel;
-    }
-
-    @Override
-    protected boolean lessThan(Pair<T, Double> a, Pair<T, Double> b) {
-      return a.getSecond().compareTo(b.getSecond()) < 0;
-    }
-
-    @Override
-    protected Pair<T, Double> getSentinelObject() {
-      return Pair.of(sentinel, Double.NEGATIVE_INFINITY);
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java
----------------------------------------------------------------------
diff --git 
a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java
 
b/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java
deleted file mode 100644
index f2632a4..0000000
--- 
a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java
+++ /dev/null
@@ -1,144 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.arff;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import com.google.common.collect.AbstractIterator;
-import com.google.common.io.Closeables;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.RandomAccessSparseVector;
-import org.apache.mahout.math.Vector;
-
-final class ARFFIterator extends AbstractIterator<Vector> {
-
-  // This pattern will make sure a , inside a string is not a point for split.
-  // Ex: "Arizona" , "0:08 PM, PDT" , 110 will be split considering "0:08 PM, 
PDT" as one string
-  private static final Pattern WORDS_WITHOUT_SPARSE = 
Pattern.compile("([\\w[^{]])*");
-  private static final Pattern DATA_PATTERN = 
Pattern.compile("^\\"+ARFFModel.ARFF_SPARSE+"(.*)\\"+ARFFModel.ARFF_SPARSE_END+"$");
-
-  private final BufferedReader reader;
-  private final ARFFModel model;
-
-  ARFFIterator(BufferedReader reader, ARFFModel model) {
-    this.reader = reader;
-    this.model = model;
-  }
-
-  @Override
-  protected Vector computeNext() {
-    String line;
-    try {
-      while ((line = reader.readLine()) != null) {
-        line = line.trim();
-        if (!line.isEmpty() && !line.startsWith(ARFFModel.ARFF_COMMENT)) {
-          break;
-        }
-      }
-    } catch (IOException ioe) {
-      throw new IllegalStateException(ioe);
-    }
-    if (line == null) {
-      try {
-        Closeables.close(reader, true);
-      } catch (IOException e) {
-        throw new IllegalStateException(e);
-      }
-      return endOfData();
-    }
-    Vector result;
-    Matcher contents = DATA_PATTERN.matcher(line);
-    if (contents.find()) {
-      line = contents.group(1);
-      String[] splits = splitCSV(line);
-      result = new RandomAccessSparseVector(model.getLabelSize());
-      for (String split : splits) {
-        int idIndex = split.indexOf(' ');
-        int idx = Integer.parseInt(split.substring(0, idIndex).trim());
-        String data = split.substring(idIndex).trim();
-        if (!"?".equals(data)) {
-          result.setQuick(idx, model.getValue(data, idx));
-        }
-      }
-    } else {
-      result = new DenseVector(model.getLabelSize());
-      String[] splits = splitCSV(line);
-      for (int i = 0; i < splits.length; i++) {
-        String split = splits[i];
-        split = split.trim();
-        if (WORDS_WITHOUT_SPARSE.matcher(split).matches() && 
!"?".equals(split)) {
-          result.setQuick(i, model.getValue(split, i));
-        }
-      }
-    }
-    return result;
-  }
-
-  /**
-   * Splits a string by comma, ignores commas inside quotes and escaped quotes.
-   * As quotes are both double and single possible, because there is no exact 
definition
-   * for ARFF files
-   * @param line -
-   * @return String[]
-   */
-  public static String[] splitCSV(String line) {
-    StringBuilder sb = new StringBuilder(128);
-    List<String> tokens = new ArrayList<>();
-    char escapeChar = '\0';
-    for (int i = 0; i < line.length(); i++) {
-      char c = line.charAt(i);
-      if (c == '\\') {
-        i++;
-        sb.append(line.charAt(i));
-      }
-      else if (c == '"' || c == '\'') {
-        // token is closed
-        if (c == escapeChar) {
-          escapeChar = '\0';
-        }
-        else if (escapeChar == '\0') {
-          escapeChar = c;
-        }
-        sb.append(c);
-      }
-      else if (c == ',') {
-        if (escapeChar == '\0') {
-          tokens.add(sb.toString().trim());
-          sb.setLength(0); // start work on next token
-        }
-        else {
-          sb.append(c);
-        }
-      }
-      else {
-        sb.append(c);
-      }
-    }
-    if (sb.length() > 0) {
-      tokens.add(sb.toString().trim());
-    }
-
-    return tokens.toArray(new String[tokens.size()]);
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java
----------------------------------------------------------------------
diff --git 
a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java 
b/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java
deleted file mode 100644
index fc86997..0000000
--- 
a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java
+++ /dev/null
@@ -1,76 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.arff;
-
-import java.text.DateFormat;
-import java.util.Map;
-
-/**
- * An interface for representing an ARFFModel.  Implementations can decide on 
the best approach
- * for storing the model, as some approaches will be fine for smaller files, 
while larger
- * ones may require a better implementation.
- */
-public interface ARFFModel {
-  String ARFF_SPARSE = "{"; //indicates the vector is sparse
-  String ARFF_SPARSE_END = "}";
-  String ARFF_COMMENT = "%";
-  String ATTRIBUTE = "@attribute";
-  String DATA = "@data";
-  String RELATION = "@relation";
-  
-  
-  String getRelation();
-  
-  void setRelation(String relation);
-  
-  /**
-   * The vector attributes (labels in Mahout speak)
-   * @return the map
-   */
-  Map<String, Integer> getLabelBindings();
-  
-  Integer getNominalValue(String label, String nominal);
-  
-  void addNominal(String label, String nominal, int idx);
-  
-  DateFormat getDateFormat(Integer idx);
-  
-  void addDateFormat(Integer idx, DateFormat format);
-  
-  Integer getLabelIndex(String label);
-  
-  void addLabel(String label, Integer idx);
-  
-  ARFFType getARFFType(Integer idx);
-  
-  void addType(Integer idx, ARFFType type);
-  
-  /**
-   * The count of the number of words seen
-   * @return the count
-   */
-  long getWordCount();
-  
-  double getValue(String data, int idx);
-  
-  Map<String, Map<String, Integer>> getNominalMap();
-  
-  int getLabelSize();
-  
-  Map<String, Long> getWords();
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java
----------------------------------------------------------------------
diff --git 
a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java 
b/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java
deleted file mode 100644
index 9ba7c31..0000000
--- 
a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java
+++ /dev/null
@@ -1,62 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.arff;
-
-public enum ARFFType {
-
-  NUMERIC("numeric"),
-  INTEGER("integer"),
-  REAL("real"),
-  NOMINAL("{"),
-  DATE("date"),
-  STRING("string");
-  
-  private final String indicator;
-  
-  ARFFType(String indicator) {
-    this.indicator = indicator;
-  }
-  
-  public String getIndicator() {
-    return indicator;
-  }
-  
-  public String getLabel(String line) {
-    int idx = line.lastIndexOf(indicator);
-    return removeQuotes(line.substring(ARFFModel.ATTRIBUTE.length(), idx));
-  }
-
-  /**
-   * Remove quotes and leading/trailing whitespace from a single or double 
quoted string
-   * @param str quotes from
-   * @return  A string without quotes
-   */
-  public static String removeQuotes(String str) {
-    String cleaned = str;
-    if (cleaned != null) {
-      cleaned = cleaned.trim();
-      boolean isQuoted = cleaned.length() > 1
-          && (cleaned.startsWith("\"") &&  cleaned.endsWith("\"")
-          || cleaned.startsWith("'") &&  cleaned.endsWith("'"));
-      if (isQuoted) {
-        cleaned = cleaned.substring(1, cleaned.length() - 1);
-      }
-    }
-    return cleaned;
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
----------------------------------------------------------------------
diff --git 
a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
 
b/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
deleted file mode 100644
index 180a1e1..0000000
--- 
a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
+++ /dev/null
@@ -1,155 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.arff;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.IOException;
-import java.io.Reader;
-import java.io.StringReader;
-import java.nio.charset.Charset;
-import java.text.DateFormat;
-import java.text.SimpleDateFormat;
-import java.util.Iterator;
-import java.util.Locale;
-
-import com.google.common.io.Files;
-import org.apache.commons.io.Charsets;
-import org.apache.mahout.math.Vector;
-
-/**
- * Read in ARFF (http://www.cs.waikato.ac.nz/~ml/weka/arff.html) and create 
{@link Vector}s
- * <p/>
- * Attribute type handling:
- * <ul>
- * <li>Numeric -> As is</li>
- * <li>Nominal -> ordinal(value) i.e. @attribute lumber 
{'\'(-inf-0.5]\'','\'(0.5-inf)\''}
- * will convert -inf-0.5 -> 0, and 0.5-inf -> 1</li>
- * <li>Dates -> Convert to time as a long</li>
- * <li>Strings -> Create a map of String -> long</li>
- * </ul>
- * NOTE: This class does not set the label bindings on every vector.  If you 
want the label
- * bindings, call {@link MapBackedARFFModel#getLabelBindings()}, as they are 
the same for every vector.
- */
-public class ARFFVectorIterable implements Iterable<Vector> {
-
-  private final BufferedReader buff;
-  private final ARFFModel model;
-
-  public ARFFVectorIterable(File file, ARFFModel model) throws IOException {
-    this(file, Charsets.UTF_8, model);
-  }
-
-  public ARFFVectorIterable(File file, Charset encoding, ARFFModel model) 
throws IOException {
-    this(Files.newReader(file, encoding), model);
-  }
-
-  public ARFFVectorIterable(String arff, ARFFModel model) throws IOException {
-    this(new StringReader(arff), model);
-  }
-
-  public ARFFVectorIterable(Reader reader, ARFFModel model) throws IOException 
{
-    if (reader instanceof BufferedReader) {
-      buff = (BufferedReader) reader;
-    } else {
-      buff = new BufferedReader(reader);
-    }
-    //grab the attributes, then start the iterator at the first line of data
-    this.model = model;
-
-    int labelNumber = 0;
-    String line;
-    while ((line = buff.readLine()) != null) {
-      line = line.trim();
-      if (!line.startsWith(ARFFModel.ARFF_COMMENT) && !line.isEmpty()) {
-        Integer labelNumInt = labelNumber;
-        String[] lineParts = line.split("[\\s\\t]+", 2);
-
-        // is it a relation name?
-        if (lineParts[0].equalsIgnoreCase(ARFFModel.RELATION)) {
-          model.setRelation(ARFFType.removeQuotes(lineParts[1]));
-        }
-        // or an attribute
-        else if (lineParts[0].equalsIgnoreCase(ARFFModel.ATTRIBUTE)) {
-          String label;
-          ARFFType type;
-
-          // split the name of the attribute and its description
-          String[] attrParts = lineParts[1].split("[\\s\\t]+", 2);
-          if (attrParts.length < 2)
-            throw new UnsupportedOperationException("No type for attribute 
found: " + lineParts[1]);
-
-          // label is attribute name
-          label = ARFFType.removeQuotes(attrParts[0].toLowerCase());
-          if (attrParts[1].equalsIgnoreCase(ARFFType.NUMERIC.getIndicator())) {
-            type = ARFFType.NUMERIC;
-          } else if 
(attrParts[1].equalsIgnoreCase(ARFFType.INTEGER.getIndicator())) {
-            type = ARFFType.INTEGER;
-          } else if 
(attrParts[1].equalsIgnoreCase(ARFFType.REAL.getIndicator())) {
-            type = ARFFType.REAL;
-          } else if 
(attrParts[1].equalsIgnoreCase(ARFFType.STRING.getIndicator())) {
-            type = ARFFType.STRING;
-          } else if 
(attrParts[1].toLowerCase().startsWith(ARFFType.NOMINAL.getIndicator())) {
-            type = ARFFType.NOMINAL;
-            // nominal example:
-            // @ATTRIBUTE class        {Iris-setosa,'Iris 
versicolor',Iris-virginica}
-            String[] classes = ARFFIterator.splitCSV(attrParts[1].substring(1, 
attrParts[1].length() - 1));
-            for (int i = 0; i < classes.length; i++) {
-              model.addNominal(label, ARFFType.removeQuotes(classes[i]), i + 
1);
-            }
-          } else if 
(attrParts[1].toLowerCase().startsWith(ARFFType.DATE.getIndicator())) {
-            type = ARFFType.DATE;
-            //TODO: DateFormatter map
-            DateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", 
Locale.ENGLISH);
-            String formStr = 
attrParts[1].substring(ARFFType.DATE.getIndicator().length()).trim();
-            if (!formStr.isEmpty()) {
-              if (formStr.startsWith("\"")) {
-                formStr = formStr.substring(1, formStr.length() - 1);
-              }
-              format = new SimpleDateFormat(formStr, Locale.ENGLISH);
-            }
-            model.addDateFormat(labelNumInt, format);
-            //@attribute <name> date [<date-format>]
-          } else {
-            throw new UnsupportedOperationException("Invalid attribute: " + 
attrParts[1]);
-          }
-          model.addLabel(label, labelNumInt);
-          model.addType(labelNumInt, type);
-          labelNumber++;
-        } else if (lineParts[0].equalsIgnoreCase(ARFFModel.DATA)) {
-          break; //skip it
-        }
-      }
-    }
-
-  }
-
-  @Override
-  public Iterator<Vector> iterator() {
-    return new ARFFIterator(buff, model);
-  }
-
-  /**
-   * Returns info about the ARFF content that was parsed.
-   *
-   * @return the model
-   */
-  public ARFFModel getModel() {
-    return model;
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java
----------------------------------------------------------------------
diff --git 
a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java 
b/integration/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java
deleted file mode 100644
index ccecbb1..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java
+++ /dev/null
@@ -1,263 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.arff;
-
-import java.io.File;
-import java.io.FilenameFilter;
-import java.io.IOException;
-import java.io.Writer;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.Map.Entry;
-import java.util.Set;
-
-import com.google.common.io.Files;
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.OptionException;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.commons.io.Charsets;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.mahout.common.CommandLineUtil;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.utils.vectors.io.SequenceFileVectorWriter;
-import org.apache.mahout.utils.vectors.io.VectorWriter;
-import org.codehaus.jackson.map.ObjectMapper;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public final class Driver {
-
-  private static final Logger log = LoggerFactory.getLogger(Driver.class);
-
-  /** used for JSON serialization/deserialization */
-  private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
-
-  private Driver() {
-  }
-
-  public static void main(String[] args) throws IOException {
-    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
-    ArgumentBuilder abuilder = new ArgumentBuilder();
-    GroupBuilder gbuilder = new GroupBuilder();
-
-    Option inputOpt = obuilder
-        .withLongName("input")
-        .withRequired(true)
-        
.withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
-        .withDescription(
-            "The file or directory containing the ARFF files.  If it is a 
directory, all .arff files will be converted")
-        .withShortName("d").create();
-
-    Option outputOpt = 
obuilder.withLongName("output").withRequired(true).withArgument(
-        
abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The output directory.  Files will have the same name as the input, 
but with the extension .mvc")
-        .withShortName("o").create();
-
-    Option maxOpt = 
obuilder.withLongName("max").withRequired(false).withArgument(
-        
abuilder.withName("max").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The maximum number of vectors to output.  If not specified, then it 
will loop over all docs")
-        .withShortName("m").create();
-
-    Option dictOutOpt = 
obuilder.withLongName("dictOut").withRequired(true).withArgument(
-        
abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The file to output the label bindings").withShortName("t").create();
-
-    Option jsonDictonaryOpt = 
obuilder.withLongName("json-dictonary").withRequired(false)
-        .withDescription("Write dictonary in JSON 
format").withShortName("j").create();
-
-    Option delimiterOpt = 
obuilder.withLongName("delimiter").withRequired(false).withArgument(
-        
abuilder.withName("delimiter").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The delimiter for outputing the 
dictionary").withShortName("l").create();
-
-    Option helpOpt = obuilder.withLongName("help").withDescription("Print out 
help").withShortName("h")
-        .create();
-    Group group = 
gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(maxOpt)
-        
.withOption(helpOpt).withOption(dictOutOpt).withOption(jsonDictonaryOpt).withOption(delimiterOpt)
-        .create();
-
-    try {
-      Parser parser = new Parser();
-      parser.setGroup(group);
-      CommandLine cmdLine = parser.parse(args);
-
-      if (cmdLine.hasOption(helpOpt)) {
-
-        CommandLineUtil.printHelp(group);
-        return;
-      }
-      if (cmdLine.hasOption(inputOpt)) { // Lucene case
-        File input = new File(cmdLine.getValue(inputOpt).toString());
-        long maxDocs = Long.MAX_VALUE;
-        if (cmdLine.hasOption(maxOpt)) {
-          maxDocs = Long.parseLong(cmdLine.getValue(maxOpt).toString());
-        }
-        if (maxDocs < 0) {
-          throw new IllegalArgumentException("maxDocs must be >= 0");
-        }
-        String outDir = cmdLine.getValue(outputOpt).toString();
-        log.info("Output Dir: {}", outDir);
-
-        String delimiter = cmdLine.hasOption(delimiterOpt) ? 
cmdLine.getValue(delimiterOpt).toString() : "\t";
-        File dictOut = new File(cmdLine.getValue(dictOutOpt).toString());
-        boolean jsonDictonary = cmdLine.hasOption(jsonDictonaryOpt);
-        ARFFModel model = new MapBackedARFFModel();
-        if (input.exists() && input.isDirectory()) {
-          File[] files = input.listFiles(new FilenameFilter() {
-            @Override
-            public boolean accept(File file, String name) {
-              return name.endsWith(".arff");
-            }
-          });
-
-          for (File file : files) {
-            writeFile(outDir, file, maxDocs, model, dictOut, delimiter, 
jsonDictonary);
-          }
-        } else {
-          writeFile(outDir, input, maxDocs, model, dictOut, delimiter, 
jsonDictonary);
-        }
-      }
-
-    } catch (OptionException e) {
-      log.error("Exception", e);
-      CommandLineUtil.printHelp(group);
-    }
-  }
-
-  protected static void writeLabelBindings(File dictOut, ARFFModel arffModel, 
String delimiter, boolean jsonDictonary)
-      throws IOException {
-    try (Writer writer = Files.newWriterSupplier(dictOut, Charsets.UTF_8, 
true).getOutput()) {
-      if (jsonDictonary) {
-        writeLabelBindingsJSON(writer, arffModel);
-      } else {
-        writeLabelBindings(writer, arffModel, delimiter);
-      }
-    }
-  }
-
-  protected static void writeLabelBindingsJSON(Writer writer, ARFFModel 
arffModel) throws IOException {
-
-    // Turn the map of labels into a list order by order of appearance
-    List<Entry<String, Integer>> attributes = new ArrayList<>();
-    attributes.addAll(arffModel.getLabelBindings().entrySet());
-    Collections.sort(attributes, new Comparator<Map.Entry<String, Integer>>() {
-      @Override
-      public int compare(Entry<String, Integer> t, Entry<String, Integer> t1) {
-        return t.getValue().compareTo(t1.getValue());
-      }
-    });
-
-    // write a map for each object
-    List<Map<String, Object>> jsonObjects = new LinkedList<>();
-    for (int i = 0; i < attributes.size(); i++) {
-
-      Entry<String, Integer> modelRepresentation = attributes.get(i);
-      Map<String, Object> jsonRepresentation = new HashMap<>();
-      jsonObjects.add(jsonRepresentation);
-      // the last one is the class label
-      jsonRepresentation.put("label", i < (attributes.size() - 1) ? 
String.valueOf(false) : String.valueOf(true));
-      String attribute = modelRepresentation.getKey();
-      jsonRepresentation.put("attribute", attribute);
-      Map<String, Integer> nominalValues = 
arffModel.getNominalMap().get(attribute);
-
-      if (nominalValues != null) {
-        String[] values = nominalValues.keySet().toArray(new String[1]);
-
-        jsonRepresentation.put("values", values);
-        jsonRepresentation.put("type", "categorical");
-      } else {
-        jsonRepresentation.put("type", "numerical");
-      }
-    }
-    writer.write(OBJECT_MAPPER.writeValueAsString(jsonObjects));
-  }
-
-  protected static void writeLabelBindings(Writer writer, ARFFModel arffModel, 
String delimiter) throws IOException {
-
-    Map<String, Integer> labels = arffModel.getLabelBindings();
-    writer.write("Label bindings for Relation " + arffModel.getRelation() + 
'\n');
-    for (Map.Entry<String, Integer> entry : labels.entrySet()) {
-      writer.write(entry.getKey());
-      writer.write(delimiter);
-      writer.write(String.valueOf(entry.getValue()));
-      writer.write('\n');
-    }
-    writer.write('\n');
-    writer.write("Values for nominal attributes\n");
-    // emit allowed values for NOMINAL/categorical/enumerated attributes
-    Map<String, Map<String, Integer>> nominalMap = arffModel.getNominalMap();
-    // how many nominal attributes
-    writer.write(String.valueOf(nominalMap.size()) + "\n");
-
-    for (Entry<String, Map<String, Integer>> entry : nominalMap.entrySet()) {
-      // the label of this attribute
-      writer.write(entry.getKey() + "\n");
-      Set<Entry<String, Integer>> attributeValues = 
entry.getValue().entrySet();
-      // how many values does this attribute have
-      writer.write(attributeValues.size() + "\n");
-      for (Map.Entry<String, Integer> value : attributeValues) {
-        // the value and the value index
-        writer.write(String.format("%s%s%s\n", value.getKey(), delimiter, 
value.getValue().toString()));
-      }
-    }
-  }
-
-  protected static void writeFile(String outDir,
-                                  File file,
-                                  long maxDocs,
-                                  ARFFModel arffModel,
-                                  File dictOut,
-                                  String delimiter,
-                                  boolean jsonDictonary) throws IOException {
-    log.info("Converting File: {}", file);
-    ARFFModel model = new MapBackedARFFModel(arffModel.getWords(), 
arffModel.getWordCount() + 1, arffModel
-        .getNominalMap());
-    Iterable<Vector> iteratable = new ARFFVectorIterable(file, model);
-    String outFile = outDir + '/' + file.getName() + ".mvc";
-
-    try (VectorWriter vectorWriter = getSeqFileWriter(outFile)) {
-      long numDocs = vectorWriter.write(iteratable, maxDocs);
-      writeLabelBindings(dictOut, model, delimiter, jsonDictonary);
-      log.info("Wrote: {} vectors", numDocs);
-    }
-  }
-
-  private static VectorWriter getSeqFileWriter(String outFile) throws 
IOException {
-    Path path = new Path(outFile);
-    Configuration conf = new Configuration();
-    FileSystem fs = FileSystem.get(conf);
-    SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, path, 
LongWritable.class,
-        VectorWritable.class);
-    return new SequenceFileVectorWriter(seqWriter);
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java
----------------------------------------------------------------------
diff --git 
a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java
 
b/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java
deleted file mode 100644
index e911b1a..0000000
--- 
a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java
+++ /dev/null
@@ -1,282 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.arff;
-
-import java.text.DateFormat;
-import java.text.NumberFormat;
-import java.text.ParseException;
-import java.text.ParsePosition;
-import java.text.SimpleDateFormat;
-import java.util.Collections;
-import java.util.Date;
-import java.util.HashMap;
-import java.util.Locale;
-import java.util.Map;
-import java.util.regex.Pattern;
-
-/**
- * Holds ARFF information in {@link Map}.
- */
/**
 * Holds ARFF information in {@link Map}s.
 *
 * <p>Keeps the attribute (label) bindings, per-index ARFF types, per-index date
 * formats, nominal-value tables, and a string-to-id word table. The word table and
 * nominal map may be SHARED between instances (the conversion driver re-feeds
 * {@link #getWords()} into new models so ids stay stable across files), so both are
 * intentionally mutable. Not thread-safe.
 */
public class MapBackedARFFModel implements ARFFModel {
  
  // Compiled once: used to strip double quotes from raw attribute data.
  private static final Pattern QUOTE_PATTERN = Pattern.compile("\"");
  
  // Next id to assign to an unseen STRING value; ids start at 1 (0 is never used).
  private long wordCount = 1;
  
  private String relation;
  
  private final Map<String,Integer> labelBindings;          // label -> vector index
  private final Map<Integer,String> idxLabel;               // vector index -> label (reverse map)
  private final Map<Integer,ARFFType> typeMap; // key is the vector index, value is the type
  private final Map<Integer,DateFormat> dateMap;            // vector index -> date parser
  private final Map<String,Map<String,Integer>> nominalMap; // label -> (nominal value -> ordinal)
  private final Map<String,Long> words;                     // STRING value -> numeric id
  
  public MapBackedARFFModel() {
    this(new HashMap<String,Long>(), 1, new HashMap<String,Map<String,Integer>>());
  }
  
  /**
   * @param words      word table to use; retained (not copied), so it may be shared
   * @param wordCount  first id to hand out for unseen STRING values
   * @param nominalMap nominal-value tables to use; retained (not copied)
   */
  public MapBackedARFFModel(Map<String,Long> words, long wordCount, Map<String,Map<String,Integer>> nominalMap) {
    this.words = words;
    this.wordCount = wordCount;
    labelBindings = new HashMap<>();
    idxLabel = new HashMap<>();
    typeMap = new HashMap<>();
    dateMap = new HashMap<>();
    this.nominalMap = nominalMap;
    
  }
  
  @Override
  public String getRelation() {
    return relation;
  }
  
  @Override
  public void setRelation(String relation) {
    this.relation = relation;
  }
  
  /**
   * Convert a piece of String data at a specific spot into a value
   * 
   * @param data
   *          The data to convert
   * @param idx
   *          The position in the ARFF data
   * @return A double representing the data
   * @throws IllegalArgumentException if no type has been registered for {@code idx}
   */
  @Override
  public double getValue(String data, int idx) {
    ARFFType type = typeMap.get(idx);
    if (type == null) {
      throw new IllegalArgumentException("Attribute type cannot be NULL, attribute index was: " + idx);
    }
    // Quotes are stripped and whitespace trimmed BEFORE dispatch; the per-type
    // helpers below therefore see already-normalized data.
    data = QUOTE_PATTERN.matcher(data).replaceAll("");
    data = data.trim();
    double result;
    switch (type) {
      case NUMERIC:
      case INTEGER:
      case REAL:
        result = processNumeric(data);
        break;
      case DATE:
        result = processDate(data, idx);
        break;
      case STRING:
        // may have quotes
        result = processString(data);
        break;
      case NOMINAL:
        String label = idxLabel.get(idx);
        result = processNominal(label, data);
        break;
      default:
        throw new IllegalStateException("Unknown type: " + type);
    }
    return result;
  }
  
  /**
   * Maps a nominal value to its registered ordinal.
   *
   * @throws IllegalStateException    if {@code data} is not a registered value for {@code label}
   * @throws IllegalArgumentException if {@code label} has no nominal table at all
   */
  protected double processNominal(String label, String data) {
    double result;
    Map<String,Integer> classes = nominalMap.get(label);
    if (classes != null) {
      Integer ord = classes.get(ARFFType.removeQuotes(data));
      if (ord != null) {
        result = ord;
      } else {
        throw new IllegalStateException("Invalid nominal: " + data + " for label: " + label);
      }
    } else {
      throw new IllegalArgumentException("Invalid nominal label: " + label + " Data: " + data);
    }
    
    return result;
  }

  // Not sure how scalable this is going to be
  // Assigns (and remembers) a fresh numeric id per distinct string. The quote
  // strip here is redundant when called via getValue(), which already strips,
  // but kept for direct callers.
  protected double processString(String data) {
    data = QUOTE_PATTERN.matcher(data).replaceAll("");
    // map it to an long
    Long theLong = words.get(data);
    if (theLong == null) {
      theLong = wordCount++;
      words.put(data, theLong);
    }
    return theLong;
  }
  
  /** Parses {@code data} as a double; non-numeric input yields {@code Double.NaN} (not an exception). */
  protected static double processNumeric(String data) {
    if (isNumeric(data)) {
      return Double.parseDouble(data);
    }
    return Double.NaN;
  }

  /**
   * True if the ENTIRE string parses under the default-locale NumberFormat.
   * NOTE(review): default-locale parsing means e.g. grouping separators are
   * locale-dependent — confirm this matches the ARFF files being read.
   */
  public static boolean isNumeric(String str) {
    NumberFormat formatter = NumberFormat.getInstance();
    ParsePosition parsePosition = new ParsePosition(0);
    formatter.parse(str, parsePosition);
    return str.length() == parsePosition.getIndex();
  }

  /**
   * Parses a date using the format registered for {@code idx}, falling back to
   * ISO-ish {@code yyyy-MM-dd'T'HH:mm:ss}, and returns the epoch millis as a double.
   */
  protected double processDate(String data, int idx) {
    DateFormat format = dateMap.get(idx);
    if (format == null) {
      format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH);
    }
    double result;
    try {
      Date date = format.parse(data);
      result = date.getTime(); // hmmm, what kind of loss casting long to double?
    } catch (ParseException e) {
      throw new IllegalArgumentException(e);
    }
    return result;
  }
  
  /**
   * The vector attributes (labels in Mahout speak), unmodifiable
   * 
   * @return the map
   */
  @Override
  public Map<String,Integer> getLabelBindings() {
    return Collections.unmodifiableMap(labelBindings);
  }
  
  /**
   * The map of types encountered
   * 
   * @return the map
   */
  public Map<Integer,ARFFType> getTypeMap() {
    return Collections.unmodifiableMap(typeMap);
  }
  
  /**
   * Map of Date formatters used
   * 
   * @return the map
   */
  public Map<Integer,DateFormat> getDateMap() {
    return Collections.unmodifiableMap(dateMap);
  }
  
  /**
   * Map nominals to ids. Should only be modified by calling {@link ARFFModel#addNominal(String, String, int)}
   * 
   * @return the map
   */
  @Override
  public Map<String,Map<String,Integer>> getNominalMap() {
    return nominalMap;
  }
  
  /**
   * The live word table mapping words to the long id used for those words.
   * NOTE(review): this is the internal, MUTABLE map (the javadoc used to claim
   * immutability); callers such as the conversion driver rely on sharing it
   * across model instances, so do not wrap it in an unmodifiable view.
   * 
   * @return The map
   */
  @Override
  public Map<String,Long> getWords() {
    return words;
  }
  
  // NOTE(review): throws NullPointerException if no table exists for label —
  // callers are expected to have registered the nominal first.
  @Override
  public Integer getNominalValue(String label, String nominal) {
    return nominalMap.get(label).get(nominal);
  }
  
  @Override
  public void addNominal(String label, String nominal, int idx) {
    Map<String,Integer> noms = nominalMap.get(label);
    if (noms == null) {
      noms = new HashMap<>();
      nominalMap.put(label, noms);
    }
    noms.put(nominal, idx);
  }
  
  @Override
  public DateFormat getDateFormat(Integer idx) {
    return dateMap.get(idx);
  }
  
  @Override
  public void addDateFormat(Integer idx, DateFormat format) {
    dateMap.put(idx, format);
  }
  
  @Override
  public Integer getLabelIndex(String label) {
    return labelBindings.get(label);
  }
  
  // Maintains both the forward (label -> idx) and reverse (idx -> label) maps.
  @Override
  public void addLabel(String label, Integer idx) {
    labelBindings.put(label, idx);
    idxLabel.put(idx, label);
  }
  
  @Override
  public ARFFType getARFFType(Integer idx) {
    return typeMap.get(idx);
  }
  
  @Override
  public void addType(Integer idx, ARFFType type) {
    typeMap.put(idx, type);
  }
  
  /**
   * The count of the number of words seen
   * 
   * @return the count
   */
  @Override
  public long getWordCount() {
    return wordCount;
  }
  
  @Override
  public int getLabelSize() {
    return labelBindings.size();
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java
----------------------------------------------------------------------
diff --git 
a/integration/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java
 
b/integration/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java
deleted file mode 100644
index 3c583fd..0000000
--- 
a/integration/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java
+++ /dev/null
@@ -1,69 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.csv;
-
-import java.io.IOException;
-import java.io.Reader;
-
-import com.google.common.collect.AbstractIterator;
-import org.apache.commons.csv.CSVParser;
-import org.apache.commons.csv.CSVStrategy;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.Vector;
-
-/**
- * Iterates a CSV file and produces {@link org.apache.mahout.math.Vector}.
- * <br/>
- * The Iterator returned throws {@link UnsupportedOperationException} for the 
{@link java.util.Iterator#remove()}
- * method.
- * <p/>
- * Assumes DenseVector for now, but in the future may have the option of 
mapping columns to sparse format
- * <p/>
- * The Iterator is not thread-safe.
- */
-public class CSVVectorIterator extends AbstractIterator<Vector> {
-
-  private final CSVParser parser;
-
-  public CSVVectorIterator(Reader reader) {
-    parser = new CSVParser(reader);
-  }
-
-  public CSVVectorIterator(Reader reader, CSVStrategy strategy) {
-    parser = new CSVParser(reader, strategy);
-  }
-
-  @Override
-  protected Vector computeNext() {
-    String[] line;
-    try {
-      line = parser.getLine();
-    } catch (IOException e) {
-      throw new IllegalStateException(e);
-    }
-    if (line == null) {
-      return endOfData();
-    }
-    Vector result = new DenseVector(line.length);
-    for (int i = 0; i < line.length; i++) {
-      result.setQuick(i, Double.parseDouble(line[i]));
-    }
-    return result;
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/io/DelimitedTermInfoWriter.java
----------------------------------------------------------------------
diff --git 
a/integration/src/main/java/org/apache/mahout/utils/vectors/io/DelimitedTermInfoWriter.java
 
b/integration/src/main/java/org/apache/mahout/utils/vectors/io/DelimitedTermInfoWriter.java
deleted file mode 100644
index b5f9f2b..0000000
--- 
a/integration/src/main/java/org/apache/mahout/utils/vectors/io/DelimitedTermInfoWriter.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.io;
-
-import java.io.IOException;
-import java.io.Writer;
-import java.util.Iterator;
-
-import com.google.common.io.Closeables;
-import org.apache.mahout.utils.vectors.TermEntry;
-import org.apache.mahout.utils.vectors.TermInfo;
-
-/**
- * Write {@link TermInfo} to a {@link Writer} in a textual, delimited format 
with header.
- */
/**
 * Write {@link TermInfo} to a {@link Writer} in a textual, delimited format with header.
 *
 * <p>Output shape: the total term count for {@code field} on the first line, a
 * {@code #term<delim>doc freq<delim>idx} header on the second, then one delimited
 * row per term entry.
 *
 * <p>NOTE(review): {@link #write(TermInfo)} closes the underlying writer in its
 * {@code finally} block, while {@link #close()} deliberately does NOT — so the
 * instance is effectively single-use and the close() javadoc is misleading.
 * Callers appear to depend on write() closing, so the behavior is left as-is;
 * confirm before changing ownership semantics.
 */
public class DelimitedTermInfoWriter implements TermInfoWriter {
  
  private final Writer writer;     // destination; closed by write(), not close()
  private final String delimiter;  // column separator
  private final String field;      // field whose term total is reported in the header
  
  public DelimitedTermInfoWriter(Writer writer, String delimiter, String field) {
    this.writer = writer;
    this.delimiter = delimiter;
    this.field = field;
  }
  
  /**
   * Writes the header and all term entries of {@code ti}, then closes the
   * underlying writer (even on failure).
   */
  @Override
  public void write(TermInfo ti) throws IOException {
    
    Iterator<TermEntry> entIter = ti.getAllEntries();
    try {
      writer.write(String.valueOf(ti.totalTerms(field)));
      writer.write('\n');
      writer.write("#term" + delimiter + "doc freq" + delimiter + "idx");
      writer.write('\n');
      while (entIter.hasNext()) {
        TermEntry entry = entIter.next();
        writer.write(entry.getTerm());
        writer.write(delimiter);
        writer.write(String.valueOf(entry.getDocFreq()));
        writer.write(delimiter);
        writer.write(String.valueOf(entry.getTermIdx()));
        writer.write('\n');
      }
    } finally {
      // Swallows any close() failure (second arg false = log, don't throw).
      Closeables.close(writer, false);
    }
  }
  
  /**
   * Does NOT close the underlying writer
   */
  @Override
  public void close() {

  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java
----------------------------------------------------------------------
diff --git 
a/integration/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java
 
b/integration/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java
deleted file mode 100644
index 0d763a1..0000000
--- 
a/integration/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java
+++ /dev/null
@@ -1,75 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.io;
-
-import java.io.IOException;
-
-import com.google.common.io.Closeables;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-
-
-/**
- * Writes out Vectors to a SequenceFile.
- *
- * Closes the writer when done
- */
-public class SequenceFileVectorWriter implements VectorWriter {
-  private final SequenceFile.Writer writer;
-  private long recNum = 0;
-  public SequenceFileVectorWriter(SequenceFile.Writer writer) {
-    this.writer = writer;
-  }
-  
-  @Override
-  public long write(Iterable<Vector> iterable, long maxDocs) throws 
IOException {
-
-    for (Vector point : iterable) {
-      if (recNum >= maxDocs) {
-        break;
-      }
-      if (point != null) {
-        writer.append(new LongWritable(recNum++), new VectorWritable(point));
-      }
-      
-    }
-    return recNum;
-  }
-
-  @Override
-  public void write(Vector vector) throws IOException {
-    writer.append(new LongWritable(recNum++), new VectorWritable(vector));
-
-  }
-
-  @Override
-  public long write(Iterable<Vector> iterable) throws IOException {
-    return write(iterable, Long.MAX_VALUE);
-  }
-  
-  @Override
-  public void close() throws IOException {
-    Closeables.close(writer, false);
-  }
-  
-  public SequenceFile.Writer getWriter() {
-    return writer;
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/io/TermInfoWriter.java
----------------------------------------------------------------------
diff --git 
a/integration/src/main/java/org/apache/mahout/utils/vectors/io/TermInfoWriter.java
 
b/integration/src/main/java/org/apache/mahout/utils/vectors/io/TermInfoWriter.java
deleted file mode 100644
index e165b45..0000000
--- 
a/integration/src/main/java/org/apache/mahout/utils/vectors/io/TermInfoWriter.java
+++ /dev/null
@@ -1,29 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.io;
-
-import java.io.Closeable;
-import java.io.IOException;
-
-import org.apache.mahout.utils.vectors.TermInfo;
-
/**
 * Destination for {@link TermInfo} data; implementations decide the on-disk format.
 */
public interface TermInfoWriter extends Closeable {
  
  /**
   * Writes all entries of the given {@link TermInfo}.
   *
   * @param ti the term info to write
   * @throws IOException if writing fails
   */
  void write(TermInfo ti) throws IOException;

}

Reply via email to