http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java b/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java deleted file mode 100644 index 180a1e1..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java +++ /dev/null @@ -1,155 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.vectors.arff; - -import java.io.BufferedReader; -import java.io.File; -import java.io.IOException; -import java.io.Reader; -import java.io.StringReader; -import java.nio.charset.Charset; -import java.text.DateFormat; -import java.text.SimpleDateFormat; -import java.util.Iterator; -import java.util.Locale; - -import com.google.common.io.Files; -import org.apache.commons.io.Charsets; -import org.apache.mahout.math.Vector; - -/** - * Read in ARFF (http://www.cs.waikato.ac.nz/~ml/weka/arff.html) and create {@link Vector}s - * <p/> - * Attribute type handling: - * <ul> - * <li>Numeric -> As is</li> - * <li>Nominal -> ordinal(value) i.e. @attribute lumber {'\'(-inf-0.5]\'','\'(0.5-inf)\''} - * will convert -inf-0.5 -> 0, and 0.5-inf -> 1</li> - * <li>Dates -> Convert to time as a long</li> - * <li>Strings -> Create a map of String -> long</li> - * </ul> - * NOTE: This class does not set the label bindings on every vector. If you want the label - * bindings, call {@link MapBackedARFFModel#getLabelBindings()}, as they are the same for every vector. 
- */ -public class ARFFVectorIterable implements Iterable<Vector> { - - private final BufferedReader buff; - private final ARFFModel model; - - public ARFFVectorIterable(File file, ARFFModel model) throws IOException { - this(file, Charsets.UTF_8, model); - } - - public ARFFVectorIterable(File file, Charset encoding, ARFFModel model) throws IOException { - this(Files.newReader(file, encoding), model); - } - - public ARFFVectorIterable(String arff, ARFFModel model) throws IOException { - this(new StringReader(arff), model); - } - - public ARFFVectorIterable(Reader reader, ARFFModel model) throws IOException { - if (reader instanceof BufferedReader) { - buff = (BufferedReader) reader; - } else { - buff = new BufferedReader(reader); - } - //grab the attributes, then start the iterator at the first line of data - this.model = model; - - int labelNumber = 0; - String line; - while ((line = buff.readLine()) != null) { - line = line.trim(); - if (!line.startsWith(ARFFModel.ARFF_COMMENT) && !line.isEmpty()) { - Integer labelNumInt = labelNumber; - String[] lineParts = line.split("[\\s\\t]+", 2); - - // is it a relation name? - if (lineParts[0].equalsIgnoreCase(ARFFModel.RELATION)) { - model.setRelation(ARFFType.removeQuotes(lineParts[1])); - } - // or an attribute - else if (lineParts[0].equalsIgnoreCase(ARFFModel.ATTRIBUTE)) { - String label; - ARFFType type; - - // split the name of the attribute and its description - String[] attrParts = lineParts[1].split("[\\s\\t]+", 2); - if (attrParts.length < 2) - throw new UnsupportedOperationException("No type for attribute found: " + lineParts[1]); - - // label is attribute name - label = ARFFType.removeQuotes(attrParts[0].toLowerCase()); - if (attrParts[1].equalsIgnoreCase(ARFFType.NUMERIC.getIndicator())) { - type = ARFFType.NUMERIC; - } else if (attrParts[1].equalsIgnoreCase(ARFFType.INTEGER.getIndicator())) { - type = ARFFType.INTEGER; - } else if (attrParts[1].equalsIgnoreCase(ARFFType.REAL.getIndicator())) { - type = ARFFType.REAL; - } else if (attrParts[1].equalsIgnoreCase(ARFFType.STRING.getIndicator())) { - type = ARFFType.STRING; - } else if (attrParts[1].toLowerCase().startsWith(ARFFType.NOMINAL.getIndicator())) { - type = ARFFType.NOMINAL; - // nominal example: - // @ATTRIBUTE class {Iris-setosa,'Iris versicolor',Iris-virginica} - String[] classes = ARFFIterator.splitCSV(attrParts[1].substring(1, attrParts[1].length() - 1)); - for (int i = 0; i < classes.length; i++) { - model.addNominal(label, ARFFType.removeQuotes(classes[i]), i + 1); - } - } else if (attrParts[1].toLowerCase().startsWith(ARFFType.DATE.getIndicator())) { - type = ARFFType.DATE; - //TODO: DateFormatter map - DateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH); - String formStr = attrParts[1].substring(ARFFType.DATE.getIndicator().length()).trim(); - if (!formStr.isEmpty()) { - if (formStr.startsWith("\"")) { - formStr = formStr.substring(1, formStr.length() - 1); - } - format = new SimpleDateFormat(formStr, Locale.ENGLISH); - } - model.addDateFormat(labelNumInt, format); - //@attribute <name> date [<date-format>] - } else { - throw new UnsupportedOperationException("Invalid attribute: " + attrParts[1]); - } - model.addLabel(label, labelNumInt); - model.addType(labelNumInt, type); - labelNumber++; - } else if (lineParts[0].equalsIgnoreCase(ARFFModel.DATA)) { - break; //skip it - } - } - } - - } - - @Override - public Iterator<Vector> iterator() { - return new ARFFIterator(buff, model); - } - - /** - * Returns info about the ARFF content that was 
parsed. - * - * @return the model - */ - public ARFFModel getModel() { - return model; - } -}
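For reference, a minimal usage sketch of the ARFFVectorIterable removed above. The inline ARFF literal is invented for illustration, and the code only compiles against the tree as it stood before this commit:

    import java.io.IOException;
    import java.util.Map;
    import org.apache.mahout.math.Vector;
    import org.apache.mahout.utils.vectors.arff.ARFFModel;
    import org.apache.mahout.utils.vectors.arff.ARFFVectorIterable;
    import org.apache.mahout.utils.vectors.arff.MapBackedARFFModel;

    public class ArffIterationSketch {
      public static void main(String[] args) throws IOException {
        // Invented sample: one numeric attribute, one nominal attribute.
        String arff = "@RELATION iris\n"
            + "@ATTRIBUTE sepallength NUMERIC\n"
            + "@ATTRIBUTE class {Iris-setosa,Iris-versicolor}\n"
            + "@DATA\n"
            + "5.1,Iris-setosa\n";
        ARFFModel model = new MapBackedARFFModel();
        ARFFVectorIterable iterable = new ARFFVectorIterable(arff, model);
        for (Vector v : iterable) {                      // one Vector per @DATA row
          System.out.println(v.asFormatString());
        }
        // Label bindings are identical for every vector, so fetch them once:
        Map<String, Integer> bindings = iterable.getModel().getLabelBindings();
        System.out.println(bindings);
      }
    }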
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java b/integration/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java deleted file mode 100644 index ccecbb1..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java +++ /dev/null @@ -1,263 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * <p/> - * http://www.apache.org/licenses/LICENSE-2.0 - * <p/> - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.vectors.arff; - -import java.io.File; -import java.io.FilenameFilter; -import java.io.IOException; -import java.io.Writer; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashMap; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Set; - -import com.google.common.io.Files; -import org.apache.commons.cli2.CommandLine; -import org.apache.commons.cli2.Group; -import org.apache.commons.cli2.Option; -import org.apache.commons.cli2.OptionException; -import org.apache.commons.cli2.builder.ArgumentBuilder; -import org.apache.commons.cli2.builder.DefaultOptionBuilder; -import org.apache.commons.cli2.builder.GroupBuilder; -import org.apache.commons.cli2.commandline.Parser; -import org.apache.commons.io.Charsets; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.SequenceFile; -import org.apache.mahout.common.CommandLineUtil; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; -import org.apache.mahout.utils.vectors.io.SequenceFileVectorWriter; -import org.apache.mahout.utils.vectors.io.VectorWriter; -import org.codehaus.jackson.map.ObjectMapper; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public final class Driver { - - private static final Logger log = LoggerFactory.getLogger(Driver.class); - - /** used for JSON serialization/deserialization */ - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - - private Driver() { - } - - public static void main(String[] args) throws IOException { - DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); - ArgumentBuilder abuilder = new ArgumentBuilder(); - GroupBuilder gbuilder = new GroupBuilder(); - - Option inputOpt = obuilder - .withLongName("input") - .withRequired(true) - .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create()) - .withDescription( - "The file or directory containing the ARFF files. 
If it is a directory, all .arff files will be converted") - .withShortName("d").create(); - - Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument( - abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription( - "The output directory. Files will have the same name as the input, but with the extension .mvc") - .withShortName("o").create(); - - Option maxOpt = obuilder.withLongName("max").withRequired(false).withArgument( - abuilder.withName("max").withMinimum(1).withMaximum(1).create()).withDescription( - "The maximum number of vectors to output. If not specified, then it will loop over all docs") - .withShortName("m").create(); - - Option dictOutOpt = obuilder.withLongName("dictOut").withRequired(true).withArgument( - abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create()).withDescription( - "The file to output the label bindings").withShortName("t").create(); - - Option jsonDictonaryOpt = obuilder.withLongName("json-dictonary").withRequired(false) - .withDescription("Write dictonary in JSON format").withShortName("j").create(); - - Option delimiterOpt = obuilder.withLongName("delimiter").withRequired(false).withArgument( - abuilder.withName("delimiter").withMinimum(1).withMaximum(1).create()).withDescription( - "The delimiter for outputing the dictionary").withShortName("l").create(); - - Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h") - .create(); - Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(maxOpt) - .withOption(helpOpt).withOption(dictOutOpt).withOption(jsonDictonaryOpt).withOption(delimiterOpt) - .create(); - - try { - Parser parser = new Parser(); - parser.setGroup(group); - CommandLine cmdLine = parser.parse(args); - - if (cmdLine.hasOption(helpOpt)) { - - CommandLineUtil.printHelp(group); - return; - } - if (cmdLine.hasOption(inputOpt)) { // Lucene case - File input = new File(cmdLine.getValue(inputOpt).toString()); - long maxDocs = Long.MAX_VALUE; - if (cmdLine.hasOption(maxOpt)) { - maxDocs = Long.parseLong(cmdLine.getValue(maxOpt).toString()); - } - if (maxDocs < 0) { - throw new IllegalArgumentException("maxDocs must be >= 0"); - } - String outDir = cmdLine.getValue(outputOpt).toString(); - log.info("Output Dir: {}", outDir); - - String delimiter = cmdLine.hasOption(delimiterOpt) ? 
cmdLine.getValue(delimiterOpt).toString() : "\t"; - File dictOut = new File(cmdLine.getValue(dictOutOpt).toString()); - boolean jsonDictonary = cmdLine.hasOption(jsonDictonaryOpt); - ARFFModel model = new MapBackedARFFModel(); - if (input.exists() && input.isDirectory()) { - File[] files = input.listFiles(new FilenameFilter() { - @Override - public boolean accept(File file, String name) { - return name.endsWith(".arff"); - } - }); - - for (File file : files) { - writeFile(outDir, file, maxDocs, model, dictOut, delimiter, jsonDictonary); - } - } else { - writeFile(outDir, input, maxDocs, model, dictOut, delimiter, jsonDictonary); - } - } - - } catch (OptionException e) { - log.error("Exception", e); - CommandLineUtil.printHelp(group); - } - } - - protected static void writeLabelBindings(File dictOut, ARFFModel arffModel, String delimiter, boolean jsonDictonary) - throws IOException { - try (Writer writer = Files.newWriterSupplier(dictOut, Charsets.UTF_8, true).getOutput()) { - if (jsonDictonary) { - writeLabelBindingsJSON(writer, arffModel); - } else { - writeLabelBindings(writer, arffModel, delimiter); - } - } - } - - protected static void writeLabelBindingsJSON(Writer writer, ARFFModel arffModel) throws IOException { - - // Turn the map of labels into a list order by order of appearance - List<Entry<String, Integer>> attributes = new ArrayList<>(); - attributes.addAll(arffModel.getLabelBindings().entrySet()); - Collections.sort(attributes, new Comparator<Map.Entry<String, Integer>>() { - @Override - public int compare(Entry<String, Integer> t, Entry<String, Integer> t1) { - return t.getValue().compareTo(t1.getValue()); - } - }); - - // write a map for each object - List<Map<String, Object>> jsonObjects = new LinkedList<>(); - for (int i = 0; i < attributes.size(); i++) { - - Entry<String, Integer> modelRepresentation = attributes.get(i); - Map<String, Object> jsonRepresentation = new HashMap<>(); - jsonObjects.add(jsonRepresentation); - // the last one is the class label - jsonRepresentation.put("label", i < (attributes.size() - 1) ? 
String.valueOf(false) : String.valueOf(true)); - String attribute = modelRepresentation.getKey(); - jsonRepresentation.put("attribute", attribute); - Map<String, Integer> nominalValues = arffModel.getNominalMap().get(attribute); - - if (nominalValues != null) { - String[] values = nominalValues.keySet().toArray(new String[1]); - - jsonRepresentation.put("values", values); - jsonRepresentation.put("type", "categorical"); - } else { - jsonRepresentation.put("type", "numerical"); - } - } - writer.write(OBJECT_MAPPER.writeValueAsString(jsonObjects)); - } - - protected static void writeLabelBindings(Writer writer, ARFFModel arffModel, String delimiter) throws IOException { - - Map<String, Integer> labels = arffModel.getLabelBindings(); - writer.write("Label bindings for Relation " + arffModel.getRelation() + '\n'); - for (Map.Entry<String, Integer> entry : labels.entrySet()) { - writer.write(entry.getKey()); - writer.write(delimiter); - writer.write(String.valueOf(entry.getValue())); - writer.write('\n'); - } - writer.write('\n'); - writer.write("Values for nominal attributes\n"); - // emit allowed values for NOMINAL/categorical/enumerated attributes - Map<String, Map<String, Integer>> nominalMap = arffModel.getNominalMap(); - // how many nominal attributes - writer.write(String.valueOf(nominalMap.size()) + "\n"); - - for (Entry<String, Map<String, Integer>> entry : nominalMap.entrySet()) { - // the label of this attribute - writer.write(entry.getKey() + "\n"); - Set<Entry<String, Integer>> attributeValues = entry.getValue().entrySet(); - // how many values does this attribute have - writer.write(attributeValues.size() + "\n"); - for (Map.Entry<String, Integer> value : attributeValues) { - // the value and the value index - writer.write(String.format("%s%s%s\n", value.getKey(), delimiter, value.getValue().toString())); - } - } - } - - protected static void writeFile(String outDir, - File file, - long maxDocs, - ARFFModel arffModel, - File dictOut, - String delimiter, - boolean jsonDictonary) throws IOException { - log.info("Converting File: {}", file); - ARFFModel model = new MapBackedARFFModel(arffModel.getWords(), arffModel.getWordCount() + 1, arffModel - .getNominalMap()); - Iterable<Vector> iteratable = new ARFFVectorIterable(file, model); - String outFile = outDir + '/' + file.getName() + ".mvc"; - - try (VectorWriter vectorWriter = getSeqFileWriter(outFile)) { - long numDocs = vectorWriter.write(iteratable, maxDocs); - writeLabelBindings(dictOut, model, delimiter, jsonDictonary); - log.info("Wrote: {} vectors", numDocs); - } - } - - private static VectorWriter getSeqFileWriter(String outFile) throws IOException { - Path path = new Path(outFile); - Configuration conf = new Configuration(); - FileSystem fs = FileSystem.get(conf); - SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, path, LongWritable.class, - VectorWritable.class); - return new SequenceFileVectorWriter(seqWriter); - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java b/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java deleted file mode 100644 index e911b1a..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java +++ /dev/null @@ -1,282 +0,0 @@ -/** - * 
Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.vectors.arff; - -import java.text.DateFormat; -import java.text.NumberFormat; -import java.text.ParseException; -import java.text.ParsePosition; -import java.text.SimpleDateFormat; -import java.util.Collections; -import java.util.Date; -import java.util.HashMap; -import java.util.Locale; -import java.util.Map; -import java.util.regex.Pattern; - -/** - * Holds ARFF information in {@link Map}. - */ -public class MapBackedARFFModel implements ARFFModel { - - private static final Pattern QUOTE_PATTERN = Pattern.compile("\""); - - private long wordCount = 1; - - private String relation; - - private final Map<String,Integer> labelBindings; - private final Map<Integer,String> idxLabel; - private final Map<Integer,ARFFType> typeMap; // key is the vector index, value is the type - private final Map<Integer,DateFormat> dateMap; - private final Map<String,Map<String,Integer>> nominalMap; - private final Map<String,Long> words; - - public MapBackedARFFModel() { - this(new HashMap<String,Long>(), 1, new HashMap<String,Map<String,Integer>>()); - } - - public MapBackedARFFModel(Map<String,Long> words, long wordCount, Map<String,Map<String,Integer>> nominalMap) { - this.words = words; - this.wordCount = wordCount; - labelBindings = new HashMap<>(); - idxLabel = new HashMap<>(); - typeMap = new HashMap<>(); - dateMap = new HashMap<>(); - this.nominalMap = nominalMap; - - } - - @Override - public String getRelation() { - return relation; - } - - @Override - public void setRelation(String relation) { - this.relation = relation; - } - - /** - * Convert a piece of String data at a specific spot into a value - * - * @param data - * The data to convert - * @param idx - * The position in the ARFF data - * @return A double representing the data - */ - @Override - public double getValue(String data, int idx) { - ARFFType type = typeMap.get(idx); - if (type == null) { - throw new IllegalArgumentException("Attribute type cannot be NULL, attribute index was: " + idx); - } - data = QUOTE_PATTERN.matcher(data).replaceAll(""); - data = data.trim(); - double result; - switch (type) { - case NUMERIC: - case INTEGER: - case REAL: - result = processNumeric(data); - break; - case DATE: - result = processDate(data, idx); - break; - case STRING: - // may have quotes - result = processString(data); - break; - case NOMINAL: - String label = idxLabel.get(idx); - result = processNominal(label, data); - break; - default: - throw new IllegalStateException("Unknown type: " + type); - } - return result; - } - - protected double processNominal(String label, String data) { - double result; - Map<String,Integer> classes = nominalMap.get(label); - if (classes != null) { - Integer ord = 
classes.get(ARFFType.removeQuotes(data)); - if (ord != null) { - result = ord; - } else { - throw new IllegalStateException("Invalid nominal: " + data + " for label: " + label); - } - } else { - throw new IllegalArgumentException("Invalid nominal label: " + label + " Data: " + data); - } - - return result; - } - - // Not sure how scalable this is going to be - protected double processString(String data) { - data = QUOTE_PATTERN.matcher(data).replaceAll(""); - // map it to an long - Long theLong = words.get(data); - if (theLong == null) { - theLong = wordCount++; - words.put(data, theLong); - } - return theLong; - } - - protected static double processNumeric(String data) { - if (isNumeric(data)) { - return Double.parseDouble(data); - } - return Double.NaN; - } - - public static boolean isNumeric(String str) { - NumberFormat formatter = NumberFormat.getInstance(); - ParsePosition parsePosition = new ParsePosition(0); - formatter.parse(str, parsePosition); - return str.length() == parsePosition.getIndex(); - } - - protected double processDate(String data, int idx) { - DateFormat format = dateMap.get(idx); - if (format == null) { - format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH); - } - double result; - try { - Date date = format.parse(data); - result = date.getTime(); // hmmm, what kind of loss casting long to double? - } catch (ParseException e) { - throw new IllegalArgumentException(e); - } - return result; - } - - /** - * The vector attributes (labels in Mahout speak), unmodifiable - * - * @return the map - */ - @Override - public Map<String,Integer> getLabelBindings() { - return Collections.unmodifiableMap(labelBindings); - } - - /** - * The map of types encountered - * - * @return the map - */ - public Map<Integer,ARFFType> getTypeMap() { - return Collections.unmodifiableMap(typeMap); - } - - /** - * Map of Date formatters used - * - * @return the map - */ - public Map<Integer,DateFormat> getDateMap() { - return Collections.unmodifiableMap(dateMap); - } - - /** - * Map nominals to ids. 
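As an invented walk-through of that nominal-to-id lookup via getValue(), with a hypothetical label and ordinals matching the (i + 1) numbering ARFFVectorIterable uses:

    import org.apache.mahout.utils.vectors.arff.ARFFType;
    import org.apache.mahout.utils.vectors.arff.MapBackedARFFModel;

    public class NominalLookupSketch {
      public static void main(String[] args) {
        MapBackedARFFModel model = new MapBackedARFFModel();
        model.addLabel("class", 0);                      // attribute 0 is named "class"
        model.addType(0, ARFFType.NOMINAL);
        model.addNominal("class", "Iris-setosa", 1);
        model.addNominal("class", "Iris-versicolor", 2);
        System.out.println(model.getValue("Iris-versicolor", 0));   // prints 2.0
        // An unregistered value raises IllegalStateException("Invalid nominal: ...").
      }
    }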
Should only be modified by calling {@link ARFFModel#addNominal(String, String, int)} - * - * @return the map - */ - @Override - public Map<String,Map<String,Integer>> getNominalMap() { - return nominalMap; - } - - /** - * Immutable map of words to the long id used for those words - * - * @return The map - */ - @Override - public Map<String,Long> getWords() { - return words; - } - - @Override - public Integer getNominalValue(String label, String nominal) { - return nominalMap.get(label).get(nominal); - } - - @Override - public void addNominal(String label, String nominal, int idx) { - Map<String,Integer> noms = nominalMap.get(label); - if (noms == null) { - noms = new HashMap<>(); - nominalMap.put(label, noms); - } - noms.put(nominal, idx); - } - - @Override - public DateFormat getDateFormat(Integer idx) { - return dateMap.get(idx); - } - - @Override - public void addDateFormat(Integer idx, DateFormat format) { - dateMap.put(idx, format); - } - - @Override - public Integer getLabelIndex(String label) { - return labelBindings.get(label); - } - - @Override - public void addLabel(String label, Integer idx) { - labelBindings.put(label, idx); - idxLabel.put(idx, label); - } - - @Override - public ARFFType getARFFType(Integer idx) { - return typeMap.get(idx); - } - - @Override - public void addType(Integer idx, ARFFType type) { - typeMap.put(idx, type); - } - - /** - * The count of the number of words seen - * - * @return the count - */ - @Override - public long getWordCount() { - return wordCount; - } - - @Override - public int getLabelSize() { - return labelBindings.size(); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java b/integration/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java deleted file mode 100644 index 3c583fd..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java +++ /dev/null @@ -1,69 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.vectors.csv; - -import java.io.IOException; -import java.io.Reader; - -import com.google.common.collect.AbstractIterator; -import org.apache.commons.csv.CSVParser; -import org.apache.commons.csv.CSVStrategy; -import org.apache.mahout.math.DenseVector; -import org.apache.mahout.math.Vector; - -/** - * Iterates a CSV file and produces {@link org.apache.mahout.math.Vector}. - * <br/> - * The Iterator returned throws {@link UnsupportedOperationException} for the {@link java.util.Iterator#remove()} - * method. 
- * <p/> - * Assumes DenseVector for now, but in the future may have the option of mapping columns to sparse format - * <p/> - * The Iterator is not thread-safe. - */ -public class CSVVectorIterator extends AbstractIterator<Vector> { - - private final CSVParser parser; - - public CSVVectorIterator(Reader reader) { - parser = new CSVParser(reader); - } - - public CSVVectorIterator(Reader reader, CSVStrategy strategy) { - parser = new CSVParser(reader, strategy); - } - - @Override - protected Vector computeNext() { - String[] line; - try { - line = parser.getLine(); - } catch (IOException e) { - throw new IllegalStateException(e); - } - if (line == null) { - return endOfData(); - } - Vector result = new DenseVector(line.length); - for (int i = 0; i < line.length; i++) { - result.setQuick(i, Double.parseDouble(line[i])); - } - return result; - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/io/DelimitedTermInfoWriter.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/io/DelimitedTermInfoWriter.java b/integration/src/main/java/org/apache/mahout/utils/vectors/io/DelimitedTermInfoWriter.java deleted file mode 100644 index b5f9f2b..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/vectors/io/DelimitedTermInfoWriter.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.vectors.io; - -import java.io.IOException; -import java.io.Writer; -import java.util.Iterator; - -import com.google.common.io.Closeables; -import org.apache.mahout.utils.vectors.TermEntry; -import org.apache.mahout.utils.vectors.TermInfo; - -/** - * Write {@link TermInfo} to a {@link Writer} in a textual, delimited format with header. 
- */ -public class DelimitedTermInfoWriter implements TermInfoWriter { - - private final Writer writer; - private final String delimiter; - private final String field; - - public DelimitedTermInfoWriter(Writer writer, String delimiter, String field) { - this.writer = writer; - this.delimiter = delimiter; - this.field = field; - } - - @Override - public void write(TermInfo ti) throws IOException { - - Iterator<TermEntry> entIter = ti.getAllEntries(); - try { - writer.write(String.valueOf(ti.totalTerms(field))); - writer.write('\n'); - writer.write("#term" + delimiter + "doc freq" + delimiter + "idx"); - writer.write('\n'); - while (entIter.hasNext()) { - TermEntry entry = entIter.next(); - writer.write(entry.getTerm()); - writer.write(delimiter); - writer.write(String.valueOf(entry.getDocFreq())); - writer.write(delimiter); - writer.write(String.valueOf(entry.getTermIdx())); - writer.write('\n'); - } - } finally { - Closeables.close(writer, false); - } - } - - /** - * Does NOT close the underlying writer - */ - @Override - public void close() { - - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java b/integration/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java deleted file mode 100644 index 0d763a1..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java +++ /dev/null @@ -1,75 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.vectors.io; - -import java.io.IOException; - -import com.google.common.io.Closeables; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.SequenceFile; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; - - -/** - * Writes out Vectors to a SequenceFile. 
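A hedged sketch of how such a writer is wired up, mirroring getSeqFileWriter in the arff Driver above; the output path and sample vector are invented:

    import java.io.IOException;
    import java.util.Collections;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.SequenceFile;
    import org.apache.mahout.math.DenseVector;
    import org.apache.mahout.math.Vector;
    import org.apache.mahout.math.VectorWritable;
    import org.apache.mahout.utils.vectors.io.SequenceFileVectorWriter;
    import org.apache.mahout.utils.vectors.io.VectorWriter;

    public class SeqFileWriteSketch {
      public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf,
            new Path("/tmp/vectors.mvc"),                // invented output path
            LongWritable.class, VectorWritable.class);   // key = record number, value = vector
        Iterable<Vector> vectors =
            Collections.<Vector>singletonList(new DenseVector(new double[] {1.0, 2.0}));
        try (VectorWriter writer = new SequenceFileVectorWriter(seqWriter)) {
          System.out.println(writer.write(vectors));     // close() also closes seqWriter
        }
      }
    }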
- * - * Closes the writer when done - */ -public class SequenceFileVectorWriter implements VectorWriter { - private final SequenceFile.Writer writer; - private long recNum = 0; - public SequenceFileVectorWriter(SequenceFile.Writer writer) { - this.writer = writer; - } - - @Override - public long write(Iterable<Vector> iterable, long maxDocs) throws IOException { - - for (Vector point : iterable) { - if (recNum >= maxDocs) { - break; - } - if (point != null) { - writer.append(new LongWritable(recNum++), new VectorWritable(point)); - } - - } - return recNum; - } - - @Override - public void write(Vector vector) throws IOException { - writer.append(new LongWritable(recNum++), new VectorWritable(vector)); - - } - - @Override - public long write(Iterable<Vector> iterable) throws IOException { - return write(iterable, Long.MAX_VALUE); - } - - @Override - public void close() throws IOException { - Closeables.close(writer, false); - } - - public SequenceFile.Writer getWriter() { - return writer; - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/io/TermInfoWriter.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/io/TermInfoWriter.java b/integration/src/main/java/org/apache/mahout/utils/vectors/io/TermInfoWriter.java deleted file mode 100644 index e165b45..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/vectors/io/TermInfoWriter.java +++ /dev/null @@ -1,29 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.vectors.io; - -import java.io.Closeable; -import java.io.IOException; - -import org.apache.mahout.utils.vectors.TermInfo; - -public interface TermInfoWriter extends Closeable { - - void write(TermInfo ti) throws IOException; - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/io/TextualVectorWriter.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/io/TextualVectorWriter.java b/integration/src/main/java/org/apache/mahout/utils/vectors/io/TextualVectorWriter.java deleted file mode 100644 index cc27d1d..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/vectors/io/TextualVectorWriter.java +++ /dev/null @@ -1,70 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
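Similarly, a hedged usage sketch for the TermInfoWriter contract and DelimitedTermInfoWriter above; the output file, delimiter, and field name are invented:

    import java.io.File;
    import java.io.IOException;
    import java.io.Writer;
    import com.google.common.io.Files;
    import org.apache.commons.io.Charsets;
    import org.apache.mahout.utils.vectors.TermInfo;
    import org.apache.mahout.utils.vectors.io.DelimitedTermInfoWriter;
    import org.apache.mahout.utils.vectors.io.TermInfoWriter;

    public class DictDumpSketch {
      static void dump(TermInfo termInfo) throws IOException {
        Writer out = Files.newWriter(new File("/tmp/dict.txt"), Charsets.UTF_8);
        TermInfoWriter tiw = new DelimitedTermInfoWriter(out, "\t", "body");
        tiw.write(termInfo);   // write() closes the underlying Writer itself
      }
    }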
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.vectors.io; - -import java.io.IOException; -import java.io.Writer; - -import com.google.common.io.Closeables; -import org.apache.mahout.math.Vector; - -/** - * Write out the vectors to any {@link Writer} using {@link Vector#asFormatString()}, - * one per line by default. - */ -public class TextualVectorWriter implements VectorWriter { - - private final Writer writer; - - public TextualVectorWriter(Writer writer) { - this.writer = writer; - } - - protected Writer getWriter() { - return writer; - } - - @Override - public long write(Iterable<Vector> iterable) throws IOException { - return write(iterable, Long.MAX_VALUE); - } - - @Override - public long write(Iterable<Vector> iterable, long maxDocs) throws IOException { - long result = 0; - for (Vector vector : iterable) { - if (result >= maxDocs) { - break; - } - write(vector); - result++; - } - return result; - } - - @Override - public void write(Vector vector) throws IOException { - writer.write(vector.asFormatString()); - writer.write('\n'); - } - - @Override - public void close() throws IOException { - Closeables.close(writer, false); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java b/integration/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java deleted file mode 100644 index 923e270..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java +++ /dev/null @@ -1,52 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.utils.vectors.io; - -import java.io.Closeable; -import java.io.IOException; - -import org.apache.mahout.math.Vector; - -public interface VectorWriter extends Closeable { - /** - * Write all values in the Iterable to the output - * @param iterable The {@link Iterable} to loop over - * @return the number of docs written - * @throws IOException if there was a problem writing - * - */ - long write(Iterable<Vector> iterable) throws IOException; - - /** - * Write out a vector - * - * @param vector The {@link org.apache.mahout.math.Vector} to write - * @throws IOException - */ - void write(Vector vector) throws IOException; - - /** - * Write the first {@code maxDocs} to the output. - * @param iterable The {@link Iterable} to loop over - * @param maxDocs the maximum number of docs to write - * @return The number of docs written - * @throws IOException if there was a problem writing - */ - long write(Iterable<Vector> iterable, long maxDocs) throws IOException; - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java deleted file mode 100644 index ff61a70..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java +++ /dev/null @@ -1,140 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.vectors.lucene; - -import com.google.common.collect.AbstractIterator; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.util.BytesRef; -import org.apache.mahout.math.NamedVector; -import org.apache.mahout.math.Vector; -import org.apache.mahout.utils.Bump125; -import org.apache.mahout.utils.vectors.TermInfo; -import org.apache.mahout.vectorizer.Weight; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; - -/** - * Iterate over a Lucene index, extracting term vectors. - * Subclasses define how much information to retrieve from the Lucene index. 
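The single abstract hook in the class that follows is getVectorName(int); a minimal hypothetical subclass, not part of this commit, could simply name vectors after their Lucene doc ids:

    import java.io.IOException;
    import org.apache.lucene.index.IndexReader;
    import org.apache.mahout.utils.vectors.TermInfo;
    import org.apache.mahout.utils.vectors.lucene.AbstractLuceneIterator;
    import org.apache.mahout.vectorizer.Weight;

    public class DocIdLuceneIterator extends AbstractLuceneIterator {
      public DocIdLuceneIterator(TermInfo terminfo, double normPower, IndexReader reader,
                                 Weight weight, double maxPercentErrorDocs, String field) {
        super(terminfo, normPower, reader, weight, maxPercentErrorDocs, field);
      }

      @Override
      protected String getVectorName(int documentIndex) throws IOException {
        return Integer.toString(documentIndex);   // name = Lucene document index
      }
    }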
- */ -public abstract class AbstractLuceneIterator extends AbstractIterator<Vector> { - private static final Logger log = LoggerFactory.getLogger(LuceneIterator.class); - protected final IndexReader indexReader; - protected final String field; - protected final TermInfo terminfo; - protected final double normPower; - protected final Weight weight; - protected final Bump125 bump = new Bump125(); - protected int nextDocId; - protected int maxErrorDocs; - protected int numErrorDocs; - protected long nextLogRecord = bump.increment(); - protected int skippedErrorMessages; - - public AbstractLuceneIterator(TermInfo terminfo, double normPower, IndexReader indexReader, Weight weight, - double maxPercentErrorDocs, String field) { - this.terminfo = terminfo; - this.normPower = normPower; - this.indexReader = indexReader; - - this.weight = weight; - this.nextDocId = 0; - this.maxErrorDocs = (int) (maxPercentErrorDocs * indexReader.numDocs()); - this.field = field; - } - - /** - * Given the document name, derive a name for the vector. This may involve - * reading the document from Lucene and setting up any other state that the - * subclass wants. This will be called once for each document that the - * iterator processes. - * @param documentIndex the lucene document index. - * @return the name to store in the vector. - */ - protected abstract String getVectorName(int documentIndex) throws IOException; - - @Override - protected Vector computeNext() { - try { - int doc; - Terms termFreqVector; - String name; - - do { - doc = this.nextDocId; - nextDocId++; - - if (doc >= indexReader.maxDoc()) { - return endOfData(); - } - - termFreqVector = indexReader.getTermVector(doc, field); - name = getVectorName(doc); - - if (termFreqVector == null) { - numErrorDocs++; - if (numErrorDocs >= maxErrorDocs) { - log.error("There are too many documents that do not have a term vector for {}", field); - throw new IllegalStateException("There are too many documents that do not have a term vector for " - + field); - } - if (numErrorDocs >= nextLogRecord) { - if (skippedErrorMessages == 0) { - log.warn("{} does not have a term vector for {}", name, field); - } else { - log.warn("{} documents do not have a term vector for {}", numErrorDocs, field); - } - nextLogRecord = bump.increment(); - skippedErrorMessages = 0; - } else { - skippedErrorMessages++; - } - } - } while (termFreqVector == null); - - // The loop exits with termFreqVector and name set. - - TermsEnum te = termFreqVector.iterator(); - BytesRef term; - TFDFMapper mapper = new TFDFMapper(indexReader.numDocs(), weight, this.terminfo); - mapper.setExpectations(field, termFreqVector.size()); - while ((term = te.next()) != null) { - mapper.map(term, (int) te.totalTermFreq()); - } - Vector result = mapper.getVector(); - if (result == null) { - // TODO is this right? 
last version would produce null in the iteration in this case, though it - // seems like that may not be desirable - return null; - } - - if (normPower == LuceneIterable.NO_NORMALIZING) { - result = new NamedVector(result, name); - } else { - result = new NamedVector(result.normalize(normPower), name); - } - return result; - } catch (IOException ioe) { - throw new IllegalStateException(ioe); - } - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java deleted file mode 100644 index 0b59ed6..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java +++ /dev/null @@ -1,79 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.vectors.lucene; - -import java.io.IOException; -import java.util.Iterator; -import java.util.LinkedHashMap; -import java.util.Map; - -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.MultiFields; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.util.BytesRef; -import org.apache.mahout.utils.vectors.TermEntry; -import org.apache.mahout.utils.vectors.TermInfo; - - -/** - * Caches TermEntries from a single field. Materializes all values in the TermEnum to memory (much like FieldCache) - */ -public class CachedTermInfo implements TermInfo { - - private final Map<String, TermEntry> termEntries; - private final String field; - - public CachedTermInfo(IndexReader reader, String field, int minDf, int maxDfPercent) throws IOException { - this.field = field; - Terms t = MultiFields.getTerms(reader, field); - TermsEnum te = t.iterator(); - - int numDocs = reader.numDocs(); - double percent = numDocs * maxDfPercent / 100.0; - //Should we use a linked hash map so that we know terms are in order? 
- termEntries = new LinkedHashMap<>(); - int count = 0; - BytesRef text; - while ((text = te.next()) != null) { - int df = te.docFreq(); - if (df >= minDf && df <= percent) { - TermEntry entry = new TermEntry(text.utf8ToString(), count++, df); - termEntries.put(entry.getTerm(), entry); - } - } - } - - @Override - public int totalTerms(String field) { - return termEntries.size(); - } - - @Override - public TermEntry getTermEntry(String field, String term) { - if (!this.field.equals(field)) { - return null; - } - return termEntries.get(term); - } - - @Override - public Iterator<TermEntry> getAllEntries() { - return termEntries.values().iterator(); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java deleted file mode 100644 index b2568e7..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java +++ /dev/null @@ -1,381 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
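For the CachedTermInfo removed above, a hedged sketch of building a term dictionary from an existing index; the index path, field name, and thresholds are invented:

    import java.io.IOException;
    import java.nio.file.Paths;
    import java.util.Iterator;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.mahout.utils.vectors.TermEntry;
    import org.apache.mahout.utils.vectors.TermInfo;
    import org.apache.mahout.utils.vectors.lucene.CachedTermInfo;

    public class TermDictSketch {
      public static void main(String[] args) throws IOException {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/tmp/index")));
        // minDf = 1, maxDfPercent = 99: keep any term seen at least once,
        // drop terms occurring in more than 99% of documents.
        TermInfo termInfo = new CachedTermInfo(reader, "body", 1, 99);
        Iterator<TermEntry> it = termInfo.getAllEntries();
        while (it.hasNext()) {
          TermEntry e = it.next();
          System.out.println(e.getTerm() + '\t' + e.getDocFreq() + '\t' + e.getTermIdx());
        }
        reader.close();
      }
    }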
- */ - -package org.apache.mahout.utils.vectors.lucene; - -import java.io.File; -import java.io.IOException; -import java.io.OutputStreamWriter; -import java.io.Writer; -import java.nio.file.Paths; -import java.util.Collection; -import java.util.Collections; -import java.util.HashSet; -import java.util.LinkedHashMap; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.TreeSet; - -import com.google.common.io.Closeables; -import com.google.common.io.Files; -import org.apache.commons.cli2.CommandLine; -import org.apache.commons.cli2.Group; -import org.apache.commons.cli2.Option; -import org.apache.commons.cli2.OptionException; -import org.apache.commons.cli2.builder.ArgumentBuilder; -import org.apache.commons.cli2.builder.DefaultOptionBuilder; -import org.apache.commons.cli2.builder.GroupBuilder; -import org.apache.commons.cli2.commandline.Parser; -import org.apache.commons.io.Charsets; -import org.apache.hadoop.fs.Path; -import org.apache.lucene.index.DirectoryReader; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.MultiFields; -import org.apache.lucene.index.PostingsEnum; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.FSDirectory; -import org.apache.lucene.util.Bits; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.FixedBitSet; -import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable; -import org.apache.mahout.common.CommandLineUtil; -import org.apache.mahout.common.commandline.DefaultOptionCreator; -import org.apache.mahout.math.NamedVector; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.stats.LogLikelihood; -import org.apache.mahout.utils.clustering.ClusterDumper; -import org.apache.mahout.utils.vectors.TermEntry; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Get labels for the cluster using Log Likelihood Ratio (LLR). - * <p/> - *"The most useful way to think of this (LLR) is as the percentage of in-cluster documents that have the - * feature (term) versus the percentage out, keeping in mind that both percentages are uncertain since we have - * only a sample of all possible documents." - Ted Dunning - * <p/> - * More about LLR can be found at : http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html - */ -public class ClusterLabels { - - private static final Logger log = LoggerFactory.getLogger(ClusterLabels.class); - - public static final int DEFAULT_MIN_IDS = 50; - public static final int DEFAULT_MAX_LABELS = 25; - - private final String indexDir; - private final String contentField; - private String idField; - private final Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints; - private String output; - private final int minNumIds; - private final int maxLabels; - - public ClusterLabels(Path seqFileDir, - Path pointsDir, - String indexDir, - String contentField, - int minNumIds, - int maxLabels) { - this.indexDir = indexDir; - this.contentField = contentField; - this.minNumIds = minNumIds; - this.maxLabels = maxLabels; - ClusterDumper clusterDumper = new ClusterDumper(seqFileDir, pointsDir); - this.clusterIdToPoints = clusterDumper.getClusterIdToPoints(); - } - - public void getLabels() throws IOException { - - try (Writer writer = (this.output == null) ? 
- new OutputStreamWriter(System.out, Charsets.UTF_8) : Files.newWriter(new File(this.output), Charsets.UTF_8)){
- for (Map.Entry<Integer, List<WeightedPropertyVectorWritable>> integerListEntry : clusterIdToPoints.entrySet()) {
- List<WeightedPropertyVectorWritable> wpvws = integerListEntry.getValue();
- List<TermInfoClusterInOut> termInfos = getClusterLabels(integerListEntry.getKey(), wpvws);
- if (termInfos != null) {
- writer.write('\n');
- writer.write("Top labels for Cluster ");
- writer.write(String.valueOf(integerListEntry.getKey()));
- writer.write(" containing ");
- writer.write(String.valueOf(wpvws.size()));
- writer.write(" vectors");
- writer.write('\n');
- writer.write("Term \t\t LLR \t\t In-ClusterDF \t\t Out-ClusterDF ");
- writer.write('\n');
- for (TermInfoClusterInOut termInfo : termInfos) {
- writer.write(termInfo.getTerm());
- writer.write("\t\t");
- writer.write(String.valueOf(termInfo.getLogLikelihoodRatio()));
- writer.write("\t\t");
- writer.write(String.valueOf(termInfo.getInClusterDF()));
- writer.write("\t\t");
- writer.write(String.valueOf(termInfo.getOutClusterDF()));
- writer.write('\n');
- }
- }
- }
- }
- }
-
- /**
- * Get the list of labels, sorted by best score.
- */
- protected List<TermInfoClusterInOut> getClusterLabels(Integer integer,
- Collection<WeightedPropertyVectorWritable> wpvws) throws IOException {
-
- if (wpvws.size() < minNumIds) {
- log.info("Skipping small cluster {} with size: {}", integer, wpvws.size());
- return null;
- }
-
- log.info("Processing Cluster {} with {} documents", integer, wpvws.size());
- Directory dir = FSDirectory.open(Paths.get(this.indexDir));
- IndexReader reader = DirectoryReader.open(dir);
-
- log.info("# of documents in the index {}", reader.numDocs());
-
- Collection<String> idSet = new HashSet<>();
- for (WeightedPropertyVectorWritable wpvw : wpvws) {
- Vector vector = wpvw.getVector();
- if (vector instanceof NamedVector) {
- idSet.add(((NamedVector) vector).getName());
- }
- }
-
- int numDocs = reader.numDocs();
-
- FixedBitSet clusterDocBitset = getClusterDocBitset(reader, idSet, this.idField);
-
- log.info("Populating term infos from the index");
-
- /*
- * This code is the same as that of CachedTermInfo, with one major change: how the document
- * frequency is obtained.
- *
- * Because we only want to label the cluster, the document frequency for a term should count
- * only the in-cluster documents. The document frequency obtained from the TermsEnum reflects
- * the frequency in the entire index. To get the in-cluster frequency, we intersect the set of
- * documents containing the term with the set of documents in the cluster; the cardinality of
- * that intersection is the in-cluster document frequency.
- */
- Terms t = MultiFields.getTerms(reader, contentField);
- TermsEnum te = t.iterator();
- Map<String, TermEntry> termEntryMap = new LinkedHashMap<>();
- Bits liveDocs = MultiFields.getLiveDocs(reader); //WARNING: returns null if there are no deletions
-
- int count = 0;
- BytesRef term;
- while ((term = te.next()) != null) {
- FixedBitSet termBitset = new FixedBitSet(reader.maxDoc());
- PostingsEnum docsEnum = MultiFields.getTermDocsEnum(reader, contentField, term);
- int docID;
- while ((docID = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
- // liveDocs == null means the index has no deletions; otherwise skip deleted documents
- if (liveDocs == null || liveDocs.get(docID)) {
- termBitset.set(docID);
- }
- }
- // AND the term's bitset with the cluster doc bitset to get the term's in-cluster frequency.
- // This modifies termBitset, but that's fine as we are not using it anywhere else.
- termBitset.and(clusterDocBitset);
- int inclusterDF = (int) termBitset.cardinality();
-
- TermEntry entry = new TermEntry(term.utf8ToString(), count++, inclusterDF);
- termEntryMap.put(entry.getTerm(), entry);
-
- }
-
- List<TermInfoClusterInOut> clusteredTermInfo = new LinkedList<>();
-
- int clusterSize = wpvws.size();
-
- for (TermEntry termEntry : termEntryMap.values()) {
-
- int corpusDF = reader.docFreq(new Term(this.contentField, termEntry.getTerm()));
- int outDF = corpusDF - termEntry.getDocFreq();
- int inDF = termEntry.getDocFreq();
- double logLikelihoodRatio = scoreDocumentFrequencies(inDF, outDF, clusterSize, numDocs);
- TermInfoClusterInOut termInfoCluster =
- new TermInfoClusterInOut(termEntry.getTerm(), inDF, outDF, logLikelihoodRatio);
- clusteredTermInfo.add(termInfoCluster);
- }
-
- Collections.sort(clusteredTermInfo);
- // Cleanup
- Closeables.close(reader, true);
- termEntryMap.clear();
-
- return clusteredTermInfo.subList(0, Math.min(clusteredTermInfo.size(), maxLabels));
- }
-
- private static FixedBitSet getClusterDocBitset(IndexReader reader,
- Collection<String> idSet,
- String idField) throws IOException {
- int numDocs = reader.numDocs();
-
- FixedBitSet bitset = new FixedBitSet(numDocs);
-
- Set<String> idFieldSelector = null;
- if (idField != null) {
- idFieldSelector = new TreeSet<>();
- idFieldSelector.add(idField);
- }
-
- for (int i = 0; i < numDocs; i++) {
- String id;
- // Use Lucene's internal ID if idField is not specified. Else, get it from the document.
- if (idField == null) {
- id = Integer.toString(i);
- } else {
- id = reader.document(i, idFieldSelector).get(idField);
- }
- if (idSet.contains(id)) {
- bitset.set(i);
- }
- }
- log.info("Created bitset for in-cluster documents: {}", bitset.cardinality());
- return bitset;
- }
-
- private static double scoreDocumentFrequencies(long inDF, long outDF, long clusterSize, long corpusSize) {
- long k12 = clusterSize - inDF;
- long k22 = corpusSize - clusterSize - outDF;
-
- return LogLikelihood.logLikelihoodRatio(inDF, k12, outDF, k22);
- }
-
- public String getIdField() {
- return idField;
- }
-
- public void setIdField(String idField) {
- this.idField = idField;
- }
-
- public String getOutput() {
- return output;
- }
-
- public void setOutput(String output) {
- this.output = output;
- }
-
- public static void main(String[] args) {
-
- DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
- ArgumentBuilder abuilder = new ArgumentBuilder();
- GroupBuilder gbuilder = new GroupBuilder();
-
- Option indexOpt = obuilder.withLongName("dir").withRequired(true).withArgument(
- abuilder.withName("dir").withMinimum(1).withMaximum(1).create())
- .withDescription("The Lucene index directory").withShortName("d").create();
-
- Option outputOpt = obuilder.withLongName("output").withRequired(false).withArgument(
- abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
- "The output file. If not specified, the result is printed to the console.").withShortName("o").create();
-
- Option fieldOpt = obuilder.withLongName("field").withRequired(true).withArgument(
- abuilder.withName("field").withMinimum(1).withMaximum(1).create())
- .withDescription("The content field in the index").withShortName("f").create();
-
- Option idFieldOpt = obuilder.withLongName("idField").withRequired(false).withArgument(
- abuilder.withName("idField").withMinimum(1).withMaximum(1).create()).withDescription(
- "The field for the document ID in the index. If null, then the Lucene internal doc "
- + "id is used, which is prone to error if the underlying index changes").withShortName("i").create();
-
- Option seqOpt = obuilder.withLongName("seqFileDir").withRequired(true).withArgument(
- abuilder.withName("seqFileDir").withMinimum(1).withMaximum(1).create()).withDescription(
- "The directory containing Sequence Files for the Clusters").withShortName("s").create();
-
- Option pointsOpt = obuilder.withLongName("pointsDir").withRequired(true).withArgument(
- abuilder.withName("pointsDir").withMinimum(1).withMaximum(1).create()).withDescription(
- "The directory containing points sequence files mapping input vectors to their cluster.")
- .withShortName("p").create();
- Option minClusterSizeOpt = obuilder.withLongName("minClusterSize").withRequired(false).withArgument(
- abuilder.withName("minClusterSize").withMinimum(1).withMaximum(1).create()).withDescription(
- "The minimum number of points required in a cluster to print its labels").withShortName("m").create();
- Option maxLabelsOpt = obuilder.withLongName("maxLabels").withRequired(false).withArgument(
- abuilder.withName("maxLabels").withMinimum(1).withMaximum(1).create()).withDescription(
- "The maximum number of labels to print per cluster").withShortName("x").create();
- Option helpOpt = DefaultOptionCreator.helpOption();
-
- Group group = gbuilder.withName("Options").withOption(indexOpt).withOption(idFieldOpt).withOption(outputOpt)
- .withOption(fieldOpt).withOption(seqOpt).withOption(pointsOpt).withOption(helpOpt)
- .withOption(maxLabelsOpt).withOption(minClusterSizeOpt).create();
-
- try {
- Parser parser = new Parser();
- parser.setGroup(group);
- CommandLine cmdLine = parser.parse(args);
-
- if (cmdLine.hasOption(helpOpt)) {
- CommandLineUtil.printHelp(group);
- return;
- }
-
- Path seqFileDir = new Path(cmdLine.getValue(seqOpt).toString());
- Path pointsDir = new Path(cmdLine.getValue(pointsOpt).toString());
- String indexDir = cmdLine.getValue(indexOpt).toString();
- String contentField = cmdLine.getValue(fieldOpt).toString();
-
- String idField = null;
-
- if (cmdLine.hasOption(idFieldOpt)) {
- idField = cmdLine.getValue(idFieldOpt).toString();
- }
- String output = null;
- if (cmdLine.hasOption(outputOpt)) {
- output = cmdLine.getValue(outputOpt).toString();
- }
- int maxLabels = DEFAULT_MAX_LABELS;
- if (cmdLine.hasOption(maxLabelsOpt)) {
- maxLabels = Integer.parseInt(cmdLine.getValue(maxLabelsOpt).toString());
- }
- int minSize = DEFAULT_MIN_IDS;
- if (cmdLine.hasOption(minClusterSizeOpt)) {
- minSize = Integer.parseInt(cmdLine.getValue(minClusterSizeOpt).toString());
- }
- ClusterLabels clusterLabel = new ClusterLabels(seqFileDir, pointsDir, indexDir, contentField, minSize, maxLabels);
-
- if (idField != null) {
- clusterLabel.setIdField(idField);
- }
- if (output != null) {
- clusterLabel.setOutput(output);
- }
-
- clusterLabel.getLabels();
-
- } catch (OptionException e) {
- log.error("Exception", e);
-
CommandLineUtil.printHelp(group); - } catch (IOException e) { - log.error("Exception", e); - } - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java deleted file mode 100644 index 876816f..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java +++ /dev/null @@ -1,349 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * <p/> - * http://www.apache.org/licenses/LICENSE-2.0 - * <p/> - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.vectors.lucene; - -import java.io.File; -import java.io.IOException; -import java.io.Writer; -import java.nio.file.Paths; -import java.util.Iterator; - -import com.google.common.base.Preconditions; -import com.google.common.io.Files; -import org.apache.commons.cli2.CommandLine; -import org.apache.commons.cli2.Group; -import org.apache.commons.cli2.Option; -import org.apache.commons.cli2.OptionException; -import org.apache.commons.cli2.builder.ArgumentBuilder; -import org.apache.commons.cli2.builder.DefaultOptionBuilder; -import org.apache.commons.cli2.builder.GroupBuilder; -import org.apache.commons.cli2.commandline.Parser; -import org.apache.commons.io.Charsets; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.Text; -import org.apache.lucene.index.DirectoryReader; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.FSDirectory; -import org.apache.mahout.common.CommandLineUtil; -import org.apache.mahout.math.VectorWritable; -import org.apache.mahout.utils.vectors.TermEntry; -import org.apache.mahout.utils.vectors.TermInfo; -import org.apache.mahout.utils.vectors.io.DelimitedTermInfoWriter; -import org.apache.mahout.utils.vectors.io.SequenceFileVectorWriter; -import org.apache.mahout.utils.vectors.io.VectorWriter; -import org.apache.mahout.vectorizer.TF; -import org.apache.mahout.vectorizer.TFIDF; -import org.apache.mahout.vectorizer.Weight; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public final class Driver { - - private static final Logger log = LoggerFactory.getLogger(Driver.class); - - private String luceneDir; - private String outFile; - private String field; - private String idField; - private String dictOut; - private String seqDictOut = ""; - private String weightType = "tfidf"; - private String 
delimiter = "\t"; - private double norm = LuceneIterable.NO_NORMALIZING; - private long maxDocs = Long.MAX_VALUE; - private int minDf = 1; - private int maxDFPercent = 99; - private double maxPercentErrorDocs = 0.0; - - public void dumpVectors() throws IOException { - - File file = new File(luceneDir); - Preconditions.checkArgument(file.isDirectory(), - "Lucene directory: " + file.getAbsolutePath() - + " does not exist or is not a directory"); - Preconditions.checkArgument(maxDocs >= 0, "maxDocs must be >= 0"); - Preconditions.checkArgument(minDf >= 1, "minDf must be >= 1"); - Preconditions.checkArgument(maxDFPercent <= 99, "maxDFPercent must be <= 99"); - - Directory dir = FSDirectory.open(Paths.get(file.getAbsolutePath())); - IndexReader reader = DirectoryReader.open(dir); - - - Weight weight; - if ("tf".equalsIgnoreCase(weightType)) { - weight = new TF(); - } else if ("tfidf".equalsIgnoreCase(weightType)) { - weight = new TFIDF(); - } else { - throw new IllegalArgumentException("Weight type " + weightType + " is not supported"); - } - - TermInfo termInfo = new CachedTermInfo(reader, field, minDf, maxDFPercent); - - LuceneIterable iterable; - if (norm == LuceneIterable.NO_NORMALIZING) { - iterable = new LuceneIterable(reader, idField, field, termInfo, weight, LuceneIterable.NO_NORMALIZING, - maxPercentErrorDocs); - } else { - iterable = new LuceneIterable(reader, idField, field, termInfo, weight, norm, maxPercentErrorDocs); - } - - log.info("Output File: {}", outFile); - - try (VectorWriter vectorWriter = getSeqFileWriter(outFile)) { - long numDocs = vectorWriter.write(iterable, maxDocs); - log.info("Wrote: {} vectors", numDocs); - } - - File dictOutFile = new File(dictOut); - log.info("Dictionary Output file: {}", dictOutFile); - Writer writer = Files.newWriter(dictOutFile, Charsets.UTF_8); - try (DelimitedTermInfoWriter tiWriter = new DelimitedTermInfoWriter(writer, delimiter, field)) { - tiWriter.write(termInfo); - } - - if (!"".equals(seqDictOut)) { - log.info("SequenceFile Dictionary Output file: {}", seqDictOut); - - Path path = new Path(seqDictOut); - Configuration conf = new Configuration(); - FileSystem fs = FileSystem.get(conf); - try (SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, path, Text.class, IntWritable.class)) { - Text term = new Text(); - IntWritable termIndex = new IntWritable(); - Iterator<TermEntry> termEntries = termInfo.getAllEntries(); - while (termEntries.hasNext()) { - TermEntry termEntry = termEntries.next(); - term.set(termEntry.getTerm()); - termIndex.set(termEntry.getTermIdx()); - seqWriter.append(term, termIndex); - } - } - } - } - - public static void main(String[] args) throws IOException { - - DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); - ArgumentBuilder abuilder = new ArgumentBuilder(); - GroupBuilder gbuilder = new GroupBuilder(); - - Option inputOpt = obuilder.withLongName("dir").withRequired(true).withArgument( - abuilder.withName("dir").withMinimum(1).withMaximum(1).create()) - .withDescription("The Lucene directory").withShortName("d").create(); - - Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument( - abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription("The output file") - .withShortName("o").create(); - - Option fieldOpt = obuilder.withLongName("field").withRequired(true).withArgument( - abuilder.withName("field").withMinimum(1).withMaximum(1).create()).withDescription( - "The field in the index").withShortName("f").create(); - - Option 
idFieldOpt = obuilder.withLongName("idField").withRequired(false).withArgument(
- abuilder.withName("idField").withMinimum(1).withMaximum(1).create()).withDescription(
- "The field in the index containing the document ID. If null, then the Lucene internal doc "
- + "id is used, which is prone to error if the underlying index changes").create();
-
- Option dictOutOpt = obuilder.withLongName("dictOut").withRequired(true).withArgument(
- abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create()).withDescription(
- "The output of the dictionary").withShortName("t").create();
-
- Option seqDictOutOpt = obuilder.withLongName("seqDictOut").withRequired(false).withArgument(
- abuilder.withName("seqDictOut").withMinimum(1).withMaximum(1).create()).withDescription(
- "The output of the dictionary as sequence file").withShortName("st").create();
-
- Option weightOpt = obuilder.withLongName("weight").withRequired(false).withArgument(
- abuilder.withName("weight").withMinimum(1).withMaximum(1).create()).withDescription(
- "The kind of weight to use. Currently TF or TFIDF").withShortName("w").create();
-
- Option delimiterOpt = obuilder.withLongName("delimiter").withRequired(false).withArgument(
- abuilder.withName("delimiter").withMinimum(1).withMaximum(1).create()).withDescription(
- "The delimiter for outputting the dictionary").withShortName("l").create();
-
- Option powerOpt = obuilder.withLongName("norm").withRequired(false).withArgument(
- abuilder.withName("norm").withMinimum(1).withMaximum(1).create()).withDescription(
- "The norm to use, expressed as either a double or \"INF\" if you want to use the Infinite norm. "
- + "Must be greater than or equal to 0. The default is not to normalize").withShortName("n").create();
-
- Option maxOpt = obuilder.withLongName("max").withRequired(false).withArgument(
- abuilder.withName("max").withMinimum(1).withMaximum(1).create()).withDescription(
- "The maximum number of vectors to output. If not specified, then it will loop over all docs")
- .withShortName("m").create();
-
- Option minDFOpt = obuilder.withLongName("minDF").withRequired(false).withArgument(
- abuilder.withName("minDF").withMinimum(1).withMaximum(1).create()).withDescription(
- "The minimum document frequency. Default is 1").withShortName("md").create();
-
- Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false).withArgument(
- abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create()).withDescription(
- "The max percentage of docs for the DF. Can be used to remove very high-frequency terms."
- + " Expressed as an integer between 0 and 100. Default is 99.").withShortName("x").create();
-
- Option maxPercentErrorDocsOpt = obuilder.withLongName("maxPercentErrorDocs").withRequired(false).withArgument(
- abuilder.withName("maxPercentErrorDocs").withMinimum(1).withMaximum(1).create()).withDescription(
- "The max percentage of docs that can have a null term vector. These are noise documents and can occur if the "
- + "analyzer used strips out all terms in the target field. This percentage is expressed as a value "
- + "between 0 and 1. 
The default is 0.").withShortName("err").create(); - - Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h") - .create(); - - Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(idFieldOpt).withOption( - outputOpt).withOption(delimiterOpt).withOption(helpOpt).withOption(fieldOpt).withOption(maxOpt) - .withOption(dictOutOpt).withOption(seqDictOutOpt).withOption(powerOpt).withOption(maxDFPercentOpt) - .withOption(weightOpt).withOption(minDFOpt).withOption(maxPercentErrorDocsOpt).create(); - - try { - Parser parser = new Parser(); - parser.setGroup(group); - CommandLine cmdLine = parser.parse(args); - - if (cmdLine.hasOption(helpOpt)) { - - CommandLineUtil.printHelp(group); - return; - } - - if (cmdLine.hasOption(inputOpt)) { // Lucene case - Driver luceneDriver = new Driver(); - luceneDriver.setLuceneDir(cmdLine.getValue(inputOpt).toString()); - - if (cmdLine.hasOption(maxOpt)) { - luceneDriver.setMaxDocs(Long.parseLong(cmdLine.getValue(maxOpt).toString())); - } - - if (cmdLine.hasOption(weightOpt)) { - luceneDriver.setWeightType(cmdLine.getValue(weightOpt).toString()); - } - - luceneDriver.setField(cmdLine.getValue(fieldOpt).toString()); - - if (cmdLine.hasOption(minDFOpt)) { - luceneDriver.setMinDf(Integer.parseInt(cmdLine.getValue(minDFOpt).toString())); - } - - if (cmdLine.hasOption(maxDFPercentOpt)) { - luceneDriver.setMaxDFPercent(Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString())); - } - - if (cmdLine.hasOption(powerOpt)) { - String power = cmdLine.getValue(powerOpt).toString(); - if ("INF".equals(power)) { - luceneDriver.setNorm(Double.POSITIVE_INFINITY); - } else { - luceneDriver.setNorm(Double.parseDouble(power)); - } - } - - if (cmdLine.hasOption(idFieldOpt)) { - luceneDriver.setIdField(cmdLine.getValue(idFieldOpt).toString()); - } - - if (cmdLine.hasOption(maxPercentErrorDocsOpt)) { - luceneDriver.setMaxPercentErrorDocs(Double.parseDouble(cmdLine.getValue(maxPercentErrorDocsOpt).toString())); - } - - luceneDriver.setOutFile(cmdLine.getValue(outputOpt).toString()); - - luceneDriver.setDelimiter(cmdLine.hasOption(delimiterOpt) ? 
cmdLine.getValue(delimiterOpt).toString() : "\t"); - - luceneDriver.setDictOut(cmdLine.getValue(dictOutOpt).toString()); - - if (cmdLine.hasOption(seqDictOutOpt)) { - luceneDriver.setSeqDictOut(cmdLine.getValue(seqDictOutOpt).toString()); - } - - luceneDriver.dumpVectors(); - } - } catch (OptionException e) { - log.error("Exception", e); - CommandLineUtil.printHelp(group); - } - } - - private static VectorWriter getSeqFileWriter(String outFile) throws IOException { - Path path = new Path(outFile); - Configuration conf = new Configuration(); - FileSystem fs = FileSystem.get(conf); - // TODO: Make this parameter driven - - SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, path, LongWritable.class, - VectorWritable.class); - - return new SequenceFileVectorWriter(seqWriter); - } - - public void setLuceneDir(String luceneDir) { - this.luceneDir = luceneDir; - } - - public void setMaxDocs(long maxDocs) { - this.maxDocs = maxDocs; - } - - public void setWeightType(String weightType) { - this.weightType = weightType; - } - - public void setField(String field) { - this.field = field; - } - - public void setMinDf(int minDf) { - this.minDf = minDf; - } - - public void setMaxDFPercent(int maxDFPercent) { - this.maxDFPercent = maxDFPercent; - } - - public void setNorm(double norm) { - this.norm = norm; - } - - public void setIdField(String idField) { - this.idField = idField; - } - - public void setOutFile(String outFile) { - this.outFile = outFile; - } - - public void setDelimiter(String delimiter) { - this.delimiter = delimiter; - } - - public void setDictOut(String dictOut) { - this.dictOut = dictOut; - } - - public void setSeqDictOut(String seqDictOut) { - this.seqDictOut = seqDictOut; - } - - public void setMaxPercentErrorDocs(double maxPercentErrorDocs) { - this.maxPercentErrorDocs = maxPercentErrorDocs; - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java deleted file mode 100644 index 1af0ed0..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java +++ /dev/null @@ -1,80 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import org.apache.lucene.index.IndexReader;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.utils.vectors.TermInfo;
-import org.apache.mahout.vectorizer.Weight;
-
-import java.util.Iterator;
-
-/**
- * {@link Iterable} counterpart to {@link LuceneIterator}.
- */
-public final class LuceneIterable implements Iterable<Vector> {
-
- public static final double NO_NORMALIZING = -1.0;
-
- private final IndexReader indexReader;
- private final String field;
- private final String idField;
- private final TermInfo terminfo;
- private final double normPower;
- private final double maxPercentErrorDocs;
- private final Weight weight;
-
- public LuceneIterable(IndexReader reader, String idField, String field, TermInfo terminfo, Weight weight) {
- this(reader, idField, field, terminfo, weight, NO_NORMALIZING);
- }
-
- public LuceneIterable(IndexReader indexReader, String idField, String field, TermInfo terminfo, Weight weight,
- double normPower) {
- this(indexReader, idField, field, terminfo, weight, normPower, 0);
- }
-
- /**
- * Produce a LuceneIterable that creates {@link Vector}s from the index and can optionally normalize them.
- *
- * @param indexReader {@link org.apache.lucene.index.IndexReader} to read the documents from.
- * @param idField field containing the id. May be null.
- * @param field field to use for the Vector
- * @param terminfo dictionary of terms to include in the Vector
- * @param weight weighting scheme (e.g. TF or TFIDF) applied to each term
- * @param normPower the normalization value. Must be nonnegative, or {@link #NO_NORMALIZING}
- * @param maxPercentErrorDocs the percentage of documents in the Lucene index that can have a null term vector
- */
- public LuceneIterable(IndexReader indexReader,
- String idField,
- String field,
- TermInfo terminfo,
- Weight weight,
- double normPower,
- double maxPercentErrorDocs) {
- this.indexReader = indexReader;
- this.idField = idField;
- this.field = field;
- this.terminfo = terminfo;
- this.normPower = normPower;
- this.maxPercentErrorDocs = maxPercentErrorDocs;
- this.weight = weight;
- }
-
- @Override
- public Iterator<Vector> iterator() {
- return new LuceneIterator(indexReader, idField, field, terminfo, weight, normPower, maxPercentErrorDocs);
- }
-}
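The label scoring in ClusterLabels.scoreDocumentFrequencies above reduces to a 2x2 contingency table over document counts that is handed to Mahout's root log-likelihood ratio. A minimal sketch of that computation, assuming only the org.apache.mahout.math.stats.LogLikelihood helper that ClusterLabels imports; the counts in main are hypothetical illustration values:

import org.apache.mahout.math.stats.LogLikelihood;

public final class LlrLabelScore {

  // Mirrors scoreDocumentFrequencies: k11 = in-cluster docs with the term,
  // k12 = in-cluster docs without it, k21 = out-of-cluster docs with it,
  // k22 = out-of-cluster docs without it.
  static double score(long inDF, long outDF, long clusterSize, long corpusSize) {
    long k12 = clusterSize - inDF;
    long k22 = corpusSize - clusterSize - outDF;
    return LogLikelihood.logLikelihoodRatio(inDF, k12, outDF, k22);
  }

  public static void main(String[] args) {
    // A term in 40 of 50 cluster docs but only 60 of the other 950 docs
    // scores high, which is why it would surface as a cluster label.
    System.out.println(score(40, 60, 50, 1000));
  }
}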

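For reference, wiring the deleted pieces together to stream vectors out of a Lucene index looks roughly like the sketch below. It uses only constructors visible in the diffs above (CachedTermInfo with Driver's defaults of minDf = 1 and maxDFPercent = 99, and the five-argument LuceneIterable); the index path and the "id"/"content" field names are placeholders:

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.mahout.math.Vector;
import org.apache.mahout.utils.vectors.TermInfo;
import org.apache.mahout.utils.vectors.lucene.CachedTermInfo;
import org.apache.mahout.utils.vectors.lucene.LuceneIterable;
import org.apache.mahout.vectorizer.TFIDF;

public final class DumpVectorsSketch {

  public static void main(String[] args) throws Exception {
    // Placeholder index location and field names.
    Directory dir = FSDirectory.open(Paths.get("/path/to/lucene/index"));
    try (IndexReader reader = DirectoryReader.open(dir)) {
      // Same defaults Driver.dumpVectors uses: minDf = 1, maxDFPercent = 99.
      TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 99);
      // Five-argument constructor: no normalization, no error-doc tolerance.
      Iterable<Vector> vectors =
          new LuceneIterable(reader, "id", "content", termInfo, new TFIDF());
      for (Vector vector : vectors) {
        System.out.println(vector.asFormatString());
      }
    }
  }
}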