http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/clustering/AbstractClusterWriter.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/clustering/AbstractClusterWriter.java b/integration/src/main/java/org/apache/mahout/utils/clustering/AbstractClusterWriter.java deleted file mode 100644 index ac884d0..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/clustering/AbstractClusterWriter.java +++ /dev/null @@ -1,160 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.clustering; - -import java.io.IOException; -import java.io.Writer; -import java.util.Collection; -import java.util.Collections; -import java.util.Comparator; -import java.util.Iterator; -import java.util.List; -import java.util.Map; - -import org.apache.commons.lang3.StringUtils; -import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable; -import org.apache.mahout.clustering.iterator.ClusterWritable; -import org.apache.mahout.common.Pair; -import org.apache.mahout.common.distance.DistanceMeasure; -import org.apache.mahout.math.Vector; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.common.collect.Lists; - -/** - * Base class for implementing ClusterWriter - */ -public abstract class AbstractClusterWriter implements ClusterWriter { - - private static final Logger log = LoggerFactory.getLogger(AbstractClusterWriter.class); - - protected final Writer writer; - protected final Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints; - protected final DistanceMeasure measure; - - /** - * - * @param writer The underlying {@link java.io.Writer} to use - * @param clusterIdToPoints The map between cluster ids {@link org.apache.mahout.clustering.Cluster#getId()} and the - * points in the cluster - * @param measure The {@link org.apache.mahout.common.distance.DistanceMeasure} used to calculate the distance. - * Some writers may wish to use it for calculating weights for display. May be null. 
- */ - protected AbstractClusterWriter(Writer writer, Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints, - DistanceMeasure measure) { - this.writer = writer; - this.clusterIdToPoints = clusterIdToPoints; - this.measure = measure; - } - - protected Writer getWriter() { - return writer; - } - - protected Map<Integer, List<WeightedPropertyVectorWritable>> getClusterIdToPoints() { - return clusterIdToPoints; - } - - public static String getTopFeatures(Vector vector, String[] dictionary, int numTerms) { - - StringBuilder sb = new StringBuilder(100); - - for (Pair<String, Double> item : getTopPairs(vector, dictionary, numTerms)) { - String term = item.getFirst(); - sb.append("\n\t\t"); - sb.append(StringUtils.rightPad(term, 40)); - sb.append("=>"); - sb.append(StringUtils.leftPad(item.getSecond().toString(), 20)); - } - return sb.toString(); - } - - public static String getTopTerms(Vector vector, String[] dictionary, int numTerms) { - - StringBuilder sb = new StringBuilder(100); - - for (Pair<String, Double> item : getTopPairs(vector, dictionary, numTerms)) { - String term = item.getFirst(); - sb.append(term).append('_'); - } - sb.deleteCharAt(sb.length() - 1); - return sb.toString(); - } - - @Override - public long write(Iterable<ClusterWritable> iterable) throws IOException { - return write(iterable, Long.MAX_VALUE); - } - - @Override - public void close() throws IOException { - writer.close(); - } - - @Override - public long write(Iterable<ClusterWritable> iterable, long maxDocs) throws IOException { - long result = 0; - Iterator<ClusterWritable> iterator = iterable.iterator(); - while (result < maxDocs && iterator.hasNext()) { - write(iterator.next()); - result++; - } - return result; - } - - private static Collection<Pair<String, Double>> getTopPairs(Vector vector, String[] dictionary, int numTerms) { - List<TermIndexWeight> vectorTerms = Lists.newArrayList(); - - for (Vector.Element elt : vector.nonZeroes()) { - vectorTerms.add(new TermIndexWeight(elt.index(), elt.get())); - } - - // Sort results in reverse order (ie weight in descending order) - Collections.sort(vectorTerms, new Comparator<TermIndexWeight>() { - @Override - public int compare(TermIndexWeight one, TermIndexWeight two) { - return Double.compare(two.weight, one.weight); - } - }); - - Collection<Pair<String, Double>> topTerms = Lists.newLinkedList(); - - for (int i = 0; i < vectorTerms.size() && i < numTerms; i++) { - int index = vectorTerms.get(i).index; - String dictTerm = dictionary[index]; - if (dictTerm == null) { - log.error("Dictionary entry missing for {}", index); - continue; - } - topTerms.add(new Pair<>(dictTerm, vectorTerms.get(i).weight)); - } - - return topTerms; - } - - private static class TermIndexWeight { - private final int index; - private final double weight; - - TermIndexWeight(int index, double weight) { - this.index = index; - this.weight = weight; - } - } -}
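For reference, the writers removed in this commit all follow the same driving pattern: build the cluster-id-to-points map, wrap an output Writer, and stream ClusterWritable values through write(). Below is a minimal sketch, not part of the commit itself, using the signatures from the deleted sources; the command-line paths and the choice of EuclideanDistanceMeasure are illustrative assumptions.

// Sketch only: drives a ClusterWriter implementation the way ClusterDumper does.
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
import org.apache.mahout.clustering.iterator.ClusterWritable;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
import org.apache.mahout.utils.clustering.CSVClusterWriter;
import org.apache.mahout.utils.clustering.ClusterDumper;
import org.apache.mahout.utils.clustering.ClusterWriter;

public class ClusterWriterSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path clustersDir = new Path(args[0]);  // directory holding the clusters-*-final part files
    Path pointsDir = new Path(args[1]);    // clusteredPoints output of the clustering job

    // Group the clustered points by cluster id, as AbstractClusterWriter expects.
    Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints =
        ClusterDumper.readPoints(pointsDir, Long.MAX_VALUE, conf);

    Writer out = new OutputStreamWriter(System.out, StandardCharsets.UTF_8);
    try (ClusterWriter clusterWriter =
             new CSVClusterWriter(out, clusterIdToPoints, new EuclideanDistanceMeasure())) {
      // Stream every ClusterWritable from the part files; one CSV row per cluster.
      long written = clusterWriter.write(new SequenceFileDirValueIterable<ClusterWritable>(
          new Path(clustersDir, "part-*"), PathType.GLOB, conf));
      out.flush();
      System.err.println("Wrote " + written + " clusters");
    }
  }
}

Any of the other removed writers (ClusterDumperWriter, GraphMLClusterWriter, JsonClusterWriter) can be substituted without changing the driving code; ClusterDumper.createClusterWriter() does exactly that based on the --outputFormat option.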
http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/clustering/CSVClusterWriter.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/clustering/CSVClusterWriter.java b/integration/src/main/java/org/apache/mahout/utils/clustering/CSVClusterWriter.java deleted file mode 100644 index 7269016..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/clustering/CSVClusterWriter.java +++ /dev/null @@ -1,69 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.clustering; - -import org.apache.mahout.clustering.Cluster; -import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable; -import org.apache.mahout.clustering.iterator.ClusterWritable; -import org.apache.mahout.common.distance.DistanceMeasure; -import org.apache.mahout.math.NamedVector; -import org.apache.mahout.math.Vector; - -import java.io.IOException; -import java.io.Writer; -import java.util.List; -import java.util.Map; -import java.util.regex.Pattern; - -/** - * Format is adjacency style as put forth at http://gephi.org/users/supported-graph-formats/csv-format/, the centroid - * is the first element and all the rest of the row are the points in that cluster - * - **/ -public class CSVClusterWriter extends AbstractClusterWriter { - - private static final Pattern VEC_PATTERN = Pattern.compile("\\{|\\:|\\,|\\}"); - - public CSVClusterWriter(Writer writer, Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints, - DistanceMeasure measure) { - super(writer, clusterIdToPoints, measure); - } - - @Override - public void write(ClusterWritable clusterWritable) throws IOException { - StringBuilder line = new StringBuilder(); - Cluster cluster = clusterWritable.getValue(); - line.append(cluster.getId()); - List<WeightedPropertyVectorWritable> points = getClusterIdToPoints().get(cluster.getId()); - if (points != null) { - for (WeightedPropertyVectorWritable point : points) { - Vector theVec = point.getVector(); - line.append(','); - if (theVec instanceof NamedVector) { - line.append(((NamedVector)theVec).getName()); - } else { - String vecStr = theVec.asFormatString(); - //do some basic manipulations for display - vecStr = VEC_PATTERN.matcher(vecStr).replaceAll("_"); - line.append(vecStr); - } - } - getWriter().append(line).append("\n"); - } - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java 
b/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java deleted file mode 100644 index 75b5ded..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java +++ /dev/null @@ -1,328 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.clustering; - -import java.io.File; -import java.io.IOException; -import java.io.OutputStreamWriter; -import java.io.Writer; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.TreeMap; - -import com.google.common.io.Closeables; -import com.google.common.io.Files; -import org.apache.commons.io.Charsets; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.mahout.clustering.cdbw.CDbwEvaluator; -import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable; -import org.apache.mahout.clustering.evaluation.ClusterEvaluator; -import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver; -import org.apache.mahout.clustering.iterator.ClusterWritable; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.common.ClassUtils; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.common.Pair; -import org.apache.mahout.common.commandline.DefaultOptionCreator; -import org.apache.mahout.common.distance.DistanceMeasure; -import org.apache.mahout.common.iterator.sequencefile.PathFilters; -import org.apache.mahout.common.iterator.sequencefile.PathType; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable; -import org.apache.mahout.utils.vectors.VectorHelper; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public final class ClusterDumper extends AbstractJob { - - public static final String SAMPLE_POINTS = "samplePoints"; - DistanceMeasure measure; - - public enum OUTPUT_FORMAT { - TEXT, - CSV, - GRAPH_ML, - JSON, - } - - public static final String DICTIONARY_TYPE_OPTION = "dictionaryType"; - public static final String DICTIONARY_OPTION = "dictionary"; - public static final String POINTS_DIR_OPTION = "pointsDir"; - public static final String NUM_WORDS_OPTION = "numWords"; - public static final String SUBSTRING_OPTION = "substring"; - public static final String EVALUATE_CLUSTERS = "evaluate"; - - public static final String OUTPUT_FORMAT_OPT = "outputFormat"; - - private static final Logger log = LoggerFactory.getLogger(ClusterDumper.class); - private Path seqFileDir; - private Path pointsDir; - private long maxPointsPerCluster = Long.MAX_VALUE; - 
private String termDictionary; - private String dictionaryFormat; - private int subString = Integer.MAX_VALUE; - private int numTopFeatures = 10; - private Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints; - private OUTPUT_FORMAT outputFormat = OUTPUT_FORMAT.TEXT; - private boolean runEvaluation; - - public ClusterDumper(Path seqFileDir, Path pointsDir) { - this.seqFileDir = seqFileDir; - this.pointsDir = pointsDir; - init(); - } - - public ClusterDumper() { - setConf(new Configuration()); - } - - public static void main(String[] args) throws Exception { - new ClusterDumper().run(args); - } - - @Override - public int run(String[] args) throws Exception { - addInputOption(); - addOutputOption(); - addOption(OUTPUT_FORMAT_OPT, "of", "The optional output format for the results. Options: TEXT, CSV, JSON or GRAPH_ML", - "TEXT"); - addOption(SUBSTRING_OPTION, "b", "The number of chars of the asFormatString() to print"); - addOption(NUM_WORDS_OPTION, "n", "The number of top terms to print"); - addOption(POINTS_DIR_OPTION, "p", - "The directory containing points sequence files mapping input vectors to their cluster. " - + "If specified, then the program will output the points associated with a cluster"); - addOption(SAMPLE_POINTS, "sp", "Specifies the maximum number of points to include _per_ cluster. The default " - + "is to include all points"); - addOption(DICTIONARY_OPTION, "d", "The dictionary file"); - addOption(DICTIONARY_TYPE_OPTION, "dt", "The dictionary file type (text|sequencefile)", "text"); - addOption(buildOption(EVALUATE_CLUSTERS, "e", "Run ClusterEvaluator and CDbwEvaluator over the input. " - + "The output will be appended to the rest of the output at the end.", false, false, null)); - addOption(DefaultOptionCreator.distanceMeasureOption().create()); - - // output is optional, will print to System.out per default - if (parseArguments(args, false, true) == null) { - return -1; - } - - seqFileDir = getInputPath(); - if (hasOption(POINTS_DIR_OPTION)) { - pointsDir = new Path(getOption(POINTS_DIR_OPTION)); - } - outputFile = getOutputFile(); - if (hasOption(SUBSTRING_OPTION)) { - int sub = Integer.parseInt(getOption(SUBSTRING_OPTION)); - if (sub >= 0) { - subString = sub; - } - } - termDictionary = getOption(DICTIONARY_OPTION); - dictionaryFormat = getOption(DICTIONARY_TYPE_OPTION); - if (hasOption(NUM_WORDS_OPTION)) { - numTopFeatures = Integer.parseInt(getOption(NUM_WORDS_OPTION)); - } - if (hasOption(OUTPUT_FORMAT_OPT)) { - outputFormat = OUTPUT_FORMAT.valueOf(getOption(OUTPUT_FORMAT_OPT)); - } - if (hasOption(SAMPLE_POINTS)) { - maxPointsPerCluster = Long.parseLong(getOption(SAMPLE_POINTS)); - } else { - maxPointsPerCluster = Long.MAX_VALUE; - } - runEvaluation = hasOption(EVALUATE_CLUSTERS); - String distanceMeasureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION); - measure = ClassUtils.instantiateAs(distanceMeasureClass, DistanceMeasure.class); - - init(); - printClusters(null); - return 0; - } - - public void printClusters(String[] dictionary) throws Exception { - Configuration conf = new Configuration(); - - if (this.termDictionary != null) { - if ("text".equals(dictionaryFormat)) { - dictionary = VectorHelper.loadTermDictionary(new File(this.termDictionary)); - } else if ("sequencefile".equals(dictionaryFormat)) { - dictionary = VectorHelper.loadTermDictionary(conf, this.termDictionary); - } else { - throw new IllegalArgumentException("Invalid dictionary format"); - } - } - - Writer writer; - boolean shouldClose; - if (this.outputFile == 
null) { - shouldClose = false; - writer = new OutputStreamWriter(System.out, Charsets.UTF_8); - } else { - shouldClose = true; - if (outputFile.getName().startsWith("s3n://")) { - Path p = outputPath; - FileSystem fs = FileSystem.get(p.toUri(), conf); - writer = new OutputStreamWriter(fs.create(p), Charsets.UTF_8); - } else { - Files.createParentDirs(outputFile); - writer = Files.newWriter(this.outputFile, Charsets.UTF_8); - } - } - ClusterWriter clusterWriter = createClusterWriter(writer, dictionary); - try { - long numWritten = clusterWriter.write(new SequenceFileDirValueIterable<ClusterWritable>(new Path(seqFileDir, - "part-*"), PathType.GLOB, conf)); - - writer.flush(); - if (runEvaluation) { - HadoopUtil.delete(conf, new Path("tmp/representative")); - int numIters = 5; - RepresentativePointsDriver.main(new String[]{ - "--input", seqFileDir.toString(), - "--output", "tmp/representative", - "--clusteredPoints", pointsDir.toString(), - "--distanceMeasure", measure.getClass().getName(), - "--maxIter", String.valueOf(numIters) - }); - conf.set(RepresentativePointsDriver.DISTANCE_MEASURE_KEY, measure.getClass().getName()); - conf.set(RepresentativePointsDriver.STATE_IN_KEY, "tmp/representative/representativePoints-" + numIters); - ClusterEvaluator ce = new ClusterEvaluator(conf, seqFileDir); - writer.append("\n"); - writer.append("Inter-Cluster Density: ").append(String.valueOf(ce.interClusterDensity())).append("\n"); - writer.append("Intra-Cluster Density: ").append(String.valueOf(ce.intraClusterDensity())).append("\n"); - CDbwEvaluator cdbw = new CDbwEvaluator(conf, seqFileDir); - writer.append("CDbw Inter-Cluster Density: ").append(String.valueOf(cdbw.interClusterDensity())).append("\n"); - writer.append("CDbw Intra-Cluster Density: ").append(String.valueOf(cdbw.intraClusterDensity())).append("\n"); - writer.append("CDbw Separation: ").append(String.valueOf(cdbw.separation())).append("\n"); - writer.flush(); - } - log.info("Wrote {} clusters", numWritten); - } finally { - if (shouldClose) { - Closeables.close(clusterWriter, false); - } else { - if (clusterWriter instanceof GraphMLClusterWriter) { - clusterWriter.close(); - } - } - } - } - - ClusterWriter createClusterWriter(Writer writer, String[] dictionary) throws IOException { - ClusterWriter result; - - switch (outputFormat) { - case TEXT: - result = new ClusterDumperWriter(writer, clusterIdToPoints, measure, numTopFeatures, dictionary, subString); - break; - case CSV: - result = new CSVClusterWriter(writer, clusterIdToPoints, measure); - break; - case GRAPH_ML: - result = new GraphMLClusterWriter(writer, clusterIdToPoints, measure, numTopFeatures, dictionary, subString); - break; - case JSON: - result = new JsonClusterWriter(writer, clusterIdToPoints, measure, numTopFeatures, dictionary); - break; - default: - throw new IllegalStateException("Unknown outputformat: " + outputFormat); - } - return result; - } - - /** - * Convenience function to set the output format during testing. 
- */ - public void setOutputFormat(OUTPUT_FORMAT of) { - outputFormat = of; - } - - private void init() { - if (this.pointsDir != null) { - Configuration conf = new Configuration(); - // read in the points - clusterIdToPoints = readPoints(this.pointsDir, maxPointsPerCluster, conf); - } else { - clusterIdToPoints = Collections.emptyMap(); - } - } - - - public int getSubString() { - return subString; - } - - public void setSubString(int subString) { - this.subString = subString; - } - - public Map<Integer, List<WeightedPropertyVectorWritable>> getClusterIdToPoints() { - return clusterIdToPoints; - } - - public String getTermDictionary() { - return termDictionary; - } - - public void setTermDictionary(String termDictionary, String dictionaryType) { - this.termDictionary = termDictionary; - this.dictionaryFormat = dictionaryType; - } - - public void setNumTopFeatures(int num) { - this.numTopFeatures = num; - } - - public int getNumTopFeatures() { - return this.numTopFeatures; - } - - public long getMaxPointsPerCluster() { - return maxPointsPerCluster; - } - - public void setMaxPointsPerCluster(long maxPointsPerCluster) { - this.maxPointsPerCluster = maxPointsPerCluster; - } - - public static Map<Integer, List<WeightedPropertyVectorWritable>> readPoints(Path pointsPathDir, - long maxPointsPerCluster, - Configuration conf) { - Map<Integer, List<WeightedPropertyVectorWritable>> result = new TreeMap<>(); - for (Pair<IntWritable, WeightedPropertyVectorWritable> record - : new SequenceFileDirIterable<IntWritable, WeightedPropertyVectorWritable>(pointsPathDir, PathType.LIST, - PathFilters.logsCRCFilter(), conf)) { - // value is the cluster id as an int, key is the name/id of the - // vector, but that doesn't matter because we only care about printing it - //String clusterId = value.toString(); - int keyValue = record.getFirst().get(); - List<WeightedPropertyVectorWritable> pointList = result.get(keyValue); - if (pointList == null) { - pointList = new ArrayList<>(); - result.put(keyValue, pointList); - } - if (pointList.size() < maxPointsPerCluster) { - pointList.add(record.getSecond()); - } - } - return result; - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumperWriter.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumperWriter.java b/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumperWriter.java deleted file mode 100644 index 31858c4..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumperWriter.java +++ /dev/null @@ -1,100 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.utils.clustering; - -import org.apache.hadoop.io.Text; -import org.apache.mahout.clustering.AbstractCluster; -import org.apache.mahout.clustering.Cluster; -import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable; -import org.apache.mahout.clustering.iterator.ClusterWritable; -import org.apache.mahout.common.distance.DistanceMeasure; - -import java.io.IOException; -import java.io.Writer; -import java.util.Iterator; -import java.util.List; -import java.util.Map; - -/** - * Implements a {@link ClusterWriter} that outputs in the format used by ClusterDumper in Mahout 0.5 - */ -public class ClusterDumperWriter extends AbstractClusterWriter { - - private final int subString; - private final String[] dictionary; - private final int numTopFeatures; - - public ClusterDumperWriter(Writer writer, Map<Integer,List<WeightedPropertyVectorWritable>> clusterIdToPoints, - DistanceMeasure measure, int numTopFeatures, String[] dictionary, int subString) { - super(writer, clusterIdToPoints, measure); - this.numTopFeatures = numTopFeatures; - this.dictionary = dictionary; - this.subString = subString; - } - - @Override - public void write(ClusterWritable clusterWritable) throws IOException { - Cluster cluster = clusterWritable.getValue(); - String fmtStr = cluster.asFormatString(dictionary); - Writer writer = getWriter(); - if (subString > 0 && fmtStr.length() > subString) { - writer.write(':'); - writer.write(fmtStr, 0, Math.min(subString, fmtStr.length())); - } else { - writer.write(fmtStr); - } - - writer.write('\n'); - - if (dictionary != null) { - String topTerms = getTopFeatures(clusterWritable.getValue().getCenter(), dictionary, numTopFeatures); - writer.write("\tTop Terms: "); - writer.write(topTerms); - writer.write('\n'); - } - - Map<Integer,List<WeightedPropertyVectorWritable>> clusterIdToPoints = getClusterIdToPoints(); - List<WeightedPropertyVectorWritable> points = clusterIdToPoints.get(clusterWritable.getValue().getId()); - if (points != null) { - writer.write("\tWeight : [props - optional]: Point:\n\t"); - for (Iterator<WeightedPropertyVectorWritable> iterator = points.iterator(); iterator.hasNext();) { - WeightedPropertyVectorWritable point = iterator.next(); - writer.write(String.valueOf(point.getWeight())); - Map<Text,Text> map = point.getProperties(); - // map can be null since empty maps when written are returned as null - writer.write(" : ["); - if (map != null) { - for (Map.Entry<Text,Text> entry : map.entrySet()) { - writer.write(entry.getKey().toString()); - writer.write("="); - writer.write(entry.getValue().toString()); - } - } - writer.write("]"); - - writer.write(": "); - - writer.write(AbstractCluster.formatVector(point.getVector(), dictionary)); - if (iterator.hasNext()) { - writer.write("\n\t"); - } - } - writer.write('\n'); - } - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterWriter.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterWriter.java b/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterWriter.java deleted file mode 100644 index 70f8f6f..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterWriter.java +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.clustering; - -import java.io.Closeable; -import java.io.IOException; - -import org.apache.mahout.clustering.iterator.ClusterWritable; - -/** - * Writes out clusters - */ -public interface ClusterWriter extends Closeable { - - /** - * Write all values in the Iterable to the output - * - * @param iterable The {@link Iterable} to loop over - * @return the number of docs written - * @throws java.io.IOException if there was a problem writing - */ - long write(Iterable<ClusterWritable> iterable) throws IOException; - - /** - * Write out a Cluster - */ - void write(ClusterWritable clusterWritable) throws IOException; - - /** - * Write the first {@code maxDocs} to the output. - * - * @param iterable The {@link Iterable} to loop over - * @param maxDocs the maximum number of docs to write - * @return The number of docs written - * @throws IOException if there was a problem writing - */ - long write(Iterable<ClusterWritable> iterable, long maxDocs) throws IOException; -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/clustering/GraphMLClusterWriter.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/clustering/GraphMLClusterWriter.java b/integration/src/main/java/org/apache/mahout/utils/clustering/GraphMLClusterWriter.java deleted file mode 100644 index 25e8f3b..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/clustering/GraphMLClusterWriter.java +++ /dev/null @@ -1,216 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.utils.clustering; - -import java.io.IOException; -import java.io.Writer; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Random; -import java.util.regex.Pattern; - -import org.apache.mahout.clustering.Cluster; -import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable; -import org.apache.mahout.clustering.classify.WeightedVectorWritable; -import org.apache.mahout.clustering.iterator.ClusterWritable; -import org.apache.mahout.common.RandomUtils; -import org.apache.mahout.common.StringUtils; -import org.apache.mahout.common.distance.DistanceMeasure; -import org.apache.mahout.math.NamedVector; -import org.apache.mahout.math.Vector; - -/** - * GraphML -- see http://gephi.org/users/supported-graph-formats/graphml-format/ - */ -public class GraphMLClusterWriter extends AbstractClusterWriter { - - private static final Pattern VEC_PATTERN = Pattern.compile("\\{|\\:|\\,|\\}"); - private final Map<Integer, Color> colors = new HashMap<>(); - private Color lastClusterColor; - private float lastX; - private float lastY; - private Random random; - private int posStep; - private final String[] dictionary; - private final int numTopFeatures; - private final int subString; - - public GraphMLClusterWriter(Writer writer, Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints, - DistanceMeasure measure, int numTopFeatures, String[] dictionary, int subString) - throws IOException { - super(writer, clusterIdToPoints, measure); - this.dictionary = dictionary; - this.numTopFeatures = numTopFeatures; - this.subString = subString; - init(writer); - } - - private void init(Writer writer) throws IOException { - writer.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"); - writer.append("<graphml xmlns=\"http://graphml.graphdrawing.org/xmlns\"\n" - + "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n" - + "xsi:schemaLocation=\"http://graphml.graphdrawing.org/xmlns\n" - + "http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd\">"); - //support rgb - writer.append("<key attr.name=\"r\" attr.type=\"int\" for=\"node\" id=\"r\"/>\n" - + "<key attr.name=\"g\" attr.type=\"int\" for=\"node\" id=\"g\"/>\n" - + "<key attr.name=\"b\" attr.type=\"int\" for=\"node\" id=\"b\"/>" - + "<key attr.name=\"size\" attr.type=\"int\" for=\"node\" id=\"size\"/>" - + "<key attr.name=\"weight\" attr.type=\"float\" for=\"edge\" id=\"weight\"/>" - + "<key attr.name=\"x\" attr.type=\"float\" for=\"node\" id=\"x\"/>" - + "<key attr.name=\"y\" attr.type=\"float\" for=\"node\" id=\"y\"/>"); - writer.append("<graph edgedefault=\"undirected\">"); - lastClusterColor = new Color(); - posStep = (int) (0.1 * clusterIdToPoints.size()) + 100; - random = RandomUtils.getRandom(); - } - - /* - <?xml version="1.0" encoding="UTF-8"?> - <graphml xmlns="http://graphml.graphdrawing.org/xmlns" - xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns - http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd"> - <graph id="G" edgedefault="undirected"> - <node id="n0"/> - <node id="n1"/> - <edge id="e1" source="n0" target="n1"/> - </graph> - </graphml> - */ - - @Override - public void write(ClusterWritable clusterWritable) throws IOException { - StringBuilder line = new StringBuilder(); - Cluster cluster = clusterWritable.getValue(); - Color rgb = getColor(cluster.getId()); - - String topTerms = ""; - if (dictionary != null) { - topTerms = getTopTerms(cluster.getCenter(), dictionary, 
numTopFeatures); - } - String clusterLabel = String.valueOf(cluster.getId()) + '_' + topTerms; - //do some positioning so that items are visible and grouped together - //TODO: put in a real layout algorithm - float x = lastX + 1000; - float y = lastY; - if (x > (1000 + posStep)) { - y = lastY + 1000; - x = 0; - } - - line.append(createNode(clusterLabel, rgb, x, y)); - List<WeightedPropertyVectorWritable> points = clusterIdToPoints.get(cluster.getId()); - if (points != null) { - for (WeightedVectorWritable point : points) { - Vector theVec = point.getVector(); - double distance = 1; - if (measure != null) { - //scale the distance - distance = measure.distance(cluster.getCenter().getLengthSquared(), cluster.getCenter(), theVec) * 500; - } - String vecStr; - int angle = random.nextInt(360); //pick an angle at random and then scale along that angle - double angleRads = Math.toRadians(angle); - - float targetX = x + (float) (distance * Math.cos(angleRads)); - float targetY = y + (float) (distance * Math.sin(angleRads)); - if (theVec instanceof NamedVector) { - vecStr = ((NamedVector) theVec).getName(); - } else { - vecStr = theVec.asFormatString(); - //do some basic manipulations for display - vecStr = VEC_PATTERN.matcher(vecStr).replaceAll("_"); - } - if (subString > 0 && vecStr.length() > subString) { - vecStr = vecStr.substring(0, subString); - } - line.append(createNode(vecStr, rgb, targetX, targetY)); - line.append(createEdge(clusterLabel, vecStr, distance)); - } - } - lastClusterColor = rgb; - lastX = x; - lastY = y; - getWriter().append(line).append("\n"); - } - - private Color getColor(int clusterId) { - Color result = colors.get(clusterId); - if (result == null) { - result = new Color(); - //there is probably some better way to color a graph - int incR = 0; - int incG = 0; - int incB = 0; - if (lastClusterColor.r + 20 < 256 && lastClusterColor.g + 20 < 256 && lastClusterColor.b + 20 < 256) { - incR = 20; - incG = 0; - incB = 0; - } else if (lastClusterColor.r + 20 >= 256 && lastClusterColor.g + 20 < 256 && lastClusterColor.b + 20 < 256) { - incG = 20; - incB = 0; - } else if (lastClusterColor.r + 20 >= 256 && lastClusterColor.g + 20 >= 256 && lastClusterColor.b + 20 < 256) { - incB = 20; - } else { - incR += 3; - incG += 3; - incR += 3; - } - result.r = (lastClusterColor.r + incR) % 256; - result.g = (lastClusterColor.g + incG) % 256; - result.b = (lastClusterColor.b + incB) % 256; - colors.put(clusterId, result); - } - return result; - } - - private static String createEdge(String left, String right, double distance) { - left = StringUtils.escapeXML(left); - right = StringUtils.escapeXML(right); - return "<edge id=\"" + left + '_' + right + "\" source=\"" + left + "\" target=\"" + right + "\">" - + "<data key=\"weight\">" + distance + "</data></edge>"; - } - - private static String createNode(String s, Color rgb, float x, float y) { - return "<node id=\"" + StringUtils.escapeXML(s) + "\"><data key=\"r\">" + rgb.r - + "</data>" - + "<data key=\"g\">" + rgb.g - + "</data>" - + "<data key=\"b\">" + rgb.b - + "</data>" - + "<data key=\"x\">" + x - + "</data>" - + "<data key=\"y\">" + y - + "</data>" - + "</node>"; - } - - @Override - public void close() throws IOException { - getWriter().append("</graph>").append("</graphml>"); - super.close(); - } - - private static class Color { - int r; - int g; - int b; - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/clustering/JsonClusterWriter.java 
---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/clustering/JsonClusterWriter.java b/integration/src/main/java/org/apache/mahout/utils/clustering/JsonClusterWriter.java deleted file mode 100644 index d564a73..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/clustering/JsonClusterWriter.java +++ /dev/null @@ -1,188 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.mahout.utils.clustering; - -import java.io.IOException; -import java.io.Writer; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.regex.Pattern; - -import org.apache.mahout.clustering.AbstractCluster; -import org.apache.mahout.clustering.Cluster; -import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable; -import org.apache.mahout.clustering.iterator.ClusterWritable; -import org.apache.mahout.common.distance.DistanceMeasure; -import org.apache.mahout.math.NamedVector; -import org.apache.mahout.math.Vector; -import org.codehaus.jackson.map.ObjectMapper; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Dump cluster info to JSON formatted lines. 
Heavily inspired by - * ClusterDumperWriter.java and CSVClusterWriter.java - * - */ -public class JsonClusterWriter extends AbstractClusterWriter { - private final String[] dictionary; - private final int numTopFeatures; - private final ObjectMapper jxn; - - private static final Logger log = LoggerFactory.getLogger(JsonClusterWriter.class); - private static final Pattern VEC_PATTERN = Pattern.compile("\\{|\\:|\\,|\\}"); - - public JsonClusterWriter(Writer writer, - Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints, - DistanceMeasure measure, int numTopFeatures, String[] dictionary) { - super(writer, clusterIdToPoints, measure); - this.numTopFeatures = numTopFeatures; - this.dictionary = dictionary; - jxn = new ObjectMapper(); - } - - /** - * Generate HashMap with cluster info and write as a single JSON formatted - * line - */ - @Override - public void write(ClusterWritable clusterWritable) throws IOException { - Map<String, Object> res = new HashMap<>(); - - // get top terms - if (dictionary != null) { - List<Object> topTerms = getTopFeaturesList(clusterWritable.getValue() - .getCenter(), dictionary, numTopFeatures); - res.put("top_terms", topTerms); - } else { - res.put("top_terms", new ArrayList<>()); - } - - // get human-readable cluster representation - Cluster cluster = clusterWritable.getValue(); - res.put("cluster_id", cluster.getId()); - - if (dictionary != null) { - Map<String,Object> fmtStr = cluster.asJson(dictionary); - res.put("cluster", fmtStr); - - // get points - List<Object> points = getPoints(cluster, dictionary); - res.put("points", points); - } else { - res.put("cluster", new HashMap<>()); - res.put("points", new ArrayList<>()); - } - - // write JSON - Writer writer = getWriter(); - writer.write(jxn.writeValueAsString(res) + "\n"); - } - - /** - * Create a List of HashMaps containing top terms information - * - * @return List<Object> - */ - public List<Object> getTopFeaturesList(Vector vector, String[] dictionary, - int numTerms) { - - List<TermIndexWeight> vectorTerms = new ArrayList<>(); - - for (Vector.Element elt : vector.nonZeroes()) { - vectorTerms.add(new TermIndexWeight(elt.index(), elt.get())); - } - - // Sort results in reverse order (i.e. 
weight in descending order) - Collections.sort(vectorTerms, new Comparator<TermIndexWeight>() { - @Override - public int compare(TermIndexWeight one, TermIndexWeight two) { - return Double.compare(two.weight, one.weight); - } - }); - - List<Object> topTerms = new ArrayList<>(); - - for (int i = 0; i < vectorTerms.size() && i < numTerms; i++) { - int index = vectorTerms.get(i).index; - String dictTerm = dictionary[index]; - if (dictTerm == null) { - log.error("Dictionary entry missing for {}", index); - continue; - } - Map<String, Object> term_entry = new HashMap<>(); - term_entry.put(dictTerm, vectorTerms.get(i).weight); - topTerms.add(term_entry); - } - - return topTerms; - } - - /** - * Create a List of HashMaps containing Vector point information - * - * @return List<Object> - */ - public List<Object> getPoints(Cluster cluster, String[] dictionary) { - List<Object> vectorObjs = new ArrayList<>(); - List<WeightedPropertyVectorWritable> points = getClusterIdToPoints().get( - cluster.getId()); - - if (points != null) { - for (WeightedPropertyVectorWritable point : points) { - Map<String, Object> entry = new HashMap<>(); - Vector theVec = point.getVector(); - if (theVec instanceof NamedVector) { - entry.put("vector_name", ((NamedVector) theVec).getName()); - } else { - String vecStr = theVec.asFormatString(); - // do some basic manipulations for display - vecStr = VEC_PATTERN.matcher(vecStr).replaceAll("_"); - entry.put("vector_name", vecStr); - } - entry.put("weight", String.valueOf(point.getWeight())); - try { - entry.put("point", - AbstractCluster.formatVectorAsJson(point.getVector(), dictionary)); - } catch (IOException e) { - log.error("IOException: ", e); - } - vectorObjs.add(entry); - } - } - return vectorObjs; - } - - /** - * Convenience class for sorting terms - * - */ - private static class TermIndexWeight { - private final int index; - private final double weight; - - TermIndexWeight(int index, double weight) { - this.index = index; - this.weight = weight; - } - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java b/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java deleted file mode 100644 index 54ad43f..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java +++ /dev/null @@ -1,186 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.utils.email; - -import java.io.File; -import java.nio.charset.Charset; -import java.util.Map; -import java.util.regex.Pattern; - -/** - * Configuration options to be used by {@link MailProcessor}. Includes options controlling the exact output format - * and which mail fields are included (body, to, from, subject, etc.) - */ -public class MailOptions { - - public static final String FROM = "FROM"; - public static final String TO = "TO"; - public static final String REFS = "REFS"; - public static final String SUBJECT = "SUBJECT"; - public static final Pattern DEFAULT_QUOTED_TEXT = Pattern.compile("^(\\||>)"); - - private boolean stripQuotedText; - private File input; - private String outputDir; - private String prefix; - private int chunkSize; - private Charset charset; - private String separator; - private String bodySeparator = "\n"; - private boolean includeBody; - private Pattern[] patternsToMatch; - //maps FROM, TO, REFS, SUBJECT, etc. to the order they appear in patternsToMatch. See MailToRecMapper - private Map<String, Integer> patternOrder; - - //the regular expression to use for identifying quoted text. - private Pattern quotedTextPattern = DEFAULT_QUOTED_TEXT; - - public File getInput() { - return input; - } - - public void setInput(File input) { - this.input = input; - } - - public String getOutputDir() { - return outputDir; - } - - /** - * Sets the output directory where sequence files will be written. - */ - public void setOutputDir(String outputDir) { - this.outputDir = outputDir; - } - - public String getPrefix() { - return prefix; - } - - /** - * Sets the prefix that is combined with the archive name and with message ids to create {@code SequenceFile} keys. - * @param prefix The name of the directory containing the mail archive is commonly used. - */ - public void setPrefix(String prefix) { - this.prefix = prefix; - } - - public int getChunkSize() { - return chunkSize; - } - - /** - * Sets the size of each generated sequence file, in Megabytes. - */ - public void setChunkSize(int chunkSize) { - this.chunkSize = chunkSize; - } - - public Charset getCharset() { - return charset; - } - - /** - * Sets the encoding of the input - */ - public void setCharset(Charset charset) { - this.charset = charset; - } - - public String getSeparator() { - return separator; - } - - /** - * Sets the separator to use in the output between metadata items (to, from, etc.). - */ - public void setSeparator(String separator) { - this.separator = separator; - } - - public String getBodySeparator() { - return bodySeparator; - } - - /** - * Sets the separator to use in the output between lines in the body, the default is "\n". - */ - public void setBodySeparator(String bodySeparator) { - this.bodySeparator = bodySeparator; - } - - public boolean isIncludeBody() { - return includeBody; - } - - /** - * Sets whether mail bodies are included in the output - */ - public void setIncludeBody(boolean includeBody) { - this.includeBody = includeBody; - } - - public Pattern[] getPatternsToMatch() { - return patternsToMatch; - } - - /** - * Sets the list of patterns to be applied in the given order to extract metadata fields (to, from, subject, etc.) 
- * from the input - */ - public void setPatternsToMatch(Pattern[] patternsToMatch) { - this.patternsToMatch = patternsToMatch; - } - - public Map<String, Integer> getPatternOrder() { - return patternOrder; - } - - public void setPatternOrder(Map<String, Integer> patternOrder) { - this.patternOrder = patternOrder; - } - - /** - * - * @return true if we should strip out quoted email text - */ - public boolean isStripQuotedText() { - return stripQuotedText; - } - - /** - * - * Sets whether quoted text such as lines starting with | or > is striped off. - */ - public void setStripQuotedText(boolean stripQuotedText) { - this.stripQuotedText = stripQuotedText; - } - - public Pattern getQuotedTextPattern() { - return quotedTextPattern; - } - - /** - * Sets the {@link java.util.regex.Pattern} to use to identify lines that are quoted text. Default is | and > - * @see #setStripQuotedText(boolean) - */ - public void setQuotedTextPattern(Pattern quotedTextPattern) { - this.quotedTextPattern = quotedTextPattern; - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java b/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java deleted file mode 100644 index 7db836f..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java +++ /dev/null @@ -1,183 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.email; - -import org.apache.mahout.common.iterator.FileLineIterable; -import org.apache.mahout.utils.io.ChunkedWriter; -import org.apache.mahout.utils.io.ChunkedWrapper; -import org.apache.mahout.utils.io.IOWriterWrapper; -import org.apache.mahout.utils.io.WrappedWriter; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.io.Writer; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -/** - * Converts an mbox mail archive into a group of Hadoop Sequence Files with equal size. The archive may optionally be - * gzipped or zipped. 
@see org.apache.mahout.text.SequenceFilesFromMailArchives - */ -public class MailProcessor { - - private static final Pattern MESSAGE_START = Pattern.compile("^From \\S+@\\S.*\\d{4}$", Pattern.CASE_INSENSITIVE); - private static final Pattern MESSAGE_ID_PREFIX = Pattern.compile("^message-id: <(.*)>$", Pattern.CASE_INSENSITIVE); - // regular expressions used to parse individual messages - public static final Pattern SUBJECT_PREFIX = Pattern.compile("^subject: (.*)$", Pattern.CASE_INSENSITIVE); - //we need to have at least one character - public static final Pattern FROM_PREFIX = Pattern.compile("^from: (\\S.*)$", Pattern.CASE_INSENSITIVE); - public static final Pattern REFS_PREFIX = Pattern.compile("^references: (.*)$", Pattern.CASE_INSENSITIVE); - public static final Pattern TO_PREFIX = Pattern.compile("^to: (.*)$", Pattern.CASE_INSENSITIVE); - - private final String prefix; - private final MailOptions options; - private final WrappedWriter writer; - - private static final Logger log = LoggerFactory.getLogger(MailProcessor.class); - - /** - * Creates a {@code MailProcessor} that does not write to sequence files, but to a single text file. - * This constructor is for debugging and testing purposes. - */ - public MailProcessor(MailOptions options, String prefix, Writer writer) { - this.writer = new IOWriterWrapper(writer); - this.options = options; - this.prefix = prefix; - } - - /** - * This is the main constructor of {@code MailProcessor}. - */ - public MailProcessor(MailOptions options, String prefix, ChunkedWriter writer) { - this.writer = new ChunkedWrapper(writer); - this.options = options; - this.prefix = prefix; - } - - /** - * Parses one complete mail archive, writing output to the {@code writer} constructor parameter. - * @param mboxFile mail archive to parse - * @return number of parsed mails - * @throws IOException - */ - public long parseMboxLineByLine(File mboxFile) throws IOException { - long messageCount = 0; - try { - StringBuilder contents = new StringBuilder(); - // tmps used during mail message parsing - StringBuilder body = new StringBuilder(); - Matcher messageIdMatcher = MESSAGE_ID_PREFIX.matcher(""); - Matcher messageBoundaryMatcher = MESSAGE_START.matcher(""); - String[] patternResults = new String[options.getPatternsToMatch().length]; - Matcher[] matchers = new Matcher[options.getPatternsToMatch().length]; - for (int i = 0; i < matchers.length; i++) { - matchers[i] = options.getPatternsToMatch()[i].matcher(""); - } - - String messageId = null; - boolean inBody = false; - Pattern quotedTextPattern = options.getQuotedTextPattern(); - for (String nextLine : new FileLineIterable(mboxFile, options.getCharset(), false)) { - if (options.isStripQuotedText() && quotedTextPattern.matcher(nextLine).find()) { - continue; - } - for (int i = 0; i < matchers.length; i++) { - Matcher matcher = matchers[i]; - matcher.reset(nextLine); - if (matcher.matches()) { - patternResults[i] = matcher.group(1); - } - } - - // only start appending body content after we've seen a message ID - if (messageId != null) { - // first, see if we hit the end of the message - messageBoundaryMatcher.reset(nextLine); - if (messageBoundaryMatcher.matches()) { - // done parsing this message ... 
write it out - String key = generateKey(mboxFile, prefix, messageId); - //if this ordering changes, then also change FromEmailToDictionaryMapper - writeContent(options.getSeparator(), contents, body, patternResults); - writer.write(key, contents.toString()); - contents.setLength(0); // reset the buffer - body.setLength(0); - - messageId = null; - inBody = false; - } else { - if (inBody && options.isIncludeBody()) { - if (!nextLine.isEmpty()) { - body.append(nextLine).append(options.getBodySeparator()); - } - } else { - // first empty line we see after reading the message Id - // indicates that we are in the body ... - inBody = nextLine.isEmpty(); - } - } - } else { - if (nextLine.length() > 14) { - messageIdMatcher.reset(nextLine); - if (messageIdMatcher.matches()) { - messageId = messageIdMatcher.group(1); - ++messageCount; - } - } - } - } - // write the last message in the file if available - if (messageId != null) { - String key = generateKey(mboxFile, prefix, messageId); - writeContent(options.getSeparator(), contents, body, patternResults); - writer.write(key, contents.toString()); - contents.setLength(0); // reset the buffer - } - } catch (FileNotFoundException e) { - // Skip file. - log.warn("Unable to process non-existing file", e); - } - // TODO: report exceptions and continue; - return messageCount; - } - - protected static String generateKey(File mboxFile, String prefix, String messageId) { - return prefix + File.separator + mboxFile.getName() + File.separator + messageId; - } - - public String getPrefix() { - return prefix; - } - - public MailOptions getOptions() { - return options; - } - - private static void writeContent(String separator, StringBuilder contents, CharSequence body, String[] matches) { - for (String match : matches) { - if (match != null) { - contents.append(match).append(separator); - } else { - contents.append(separator); - } - } - contents.append('\n').append(body); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWrapper.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWrapper.java b/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWrapper.java deleted file mode 100644 index 473e86a..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWrapper.java +++ /dev/null @@ -1,42 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.io; - -import java.io.IOException; - -/** - * {@link ChunkedWriter} based implementation of the {@link WrappedWriter} interface. 
- */ -public class ChunkedWrapper implements WrappedWriter { - - private final ChunkedWriter writer; - - public ChunkedWrapper(ChunkedWriter writer) { - this.writer = writer; - } - - @Override - public void write(String key, String value) throws IOException { - writer.write(key, value); - } - - @Override - public void close() throws IOException { - writer.close(); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWriter.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWriter.java b/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWriter.java deleted file mode 100644 index 66cf15f..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWriter.java +++ /dev/null @@ -1,86 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.mahout.utils.io; - -import com.google.common.io.Closeables; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.Text; - -import java.io.Closeable; -import java.io.IOException; - -/** - * Writes data splitted in multiple Hadoop sequence files of approximate equal size. The data must consist - * of key-value pairs, both of them of String type. All sequence files are created in the same - * directory and named "chunk-0", "chunk-1", etc. - */ -public final class ChunkedWriter implements Closeable { - - private final int maxChunkSizeInBytes; - private final Path output; - private SequenceFile.Writer writer; - private int currentChunkID; - private int currentChunkSize; - private final FileSystem fs; - private final Configuration conf; - - /** - * @param conf needed by Hadoop to know what filesystem implementation to use. - * @param chunkSizeInMB approximate size of each file, in Megabytes. - * @param output directory where the sequence files will be created. 
- * @throws IOException - */ - public ChunkedWriter(Configuration conf, int chunkSizeInMB, Path output) throws IOException { - this.output = output; - this.conf = conf; - if (chunkSizeInMB > 1984) { - chunkSizeInMB = 1984; - } - maxChunkSizeInBytes = chunkSizeInMB * 1024 * 1024; - fs = FileSystem.get(output.toUri(), conf); - currentChunkID = 0; - writer = new SequenceFile.Writer(fs, conf, getPath(currentChunkID), Text.class, Text.class); - } - - private Path getPath(int chunkID) { - return new Path(output, "chunk-" + chunkID); - } - - /** Writes a new key-value pair, creating a new sequence file if necessary.*/ - public void write(String key, String value) throws IOException { - if (currentChunkSize > maxChunkSizeInBytes) { - Closeables.close(writer, false); - currentChunkID++; - writer = new SequenceFile.Writer(fs, conf, getPath(currentChunkID), Text.class, Text.class); - currentChunkSize = 0; - } - - Text keyT = new Text(key); - Text valueT = new Text(value); - currentChunkSize += keyT.getBytes().length + valueT.getBytes().length; // Overhead - writer.append(keyT, valueT); - } - - @Override - public void close() throws IOException { - Closeables.close(writer, false); - } -} - http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/io/IOWriterWrapper.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/io/IOWriterWrapper.java b/integration/src/main/java/org/apache/mahout/utils/io/IOWriterWrapper.java deleted file mode 100644 index b7c3d42..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/io/IOWriterWrapper.java +++ /dev/null @@ -1,45 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.io; - -import java.io.IOException; -import java.io.Writer; -/** - * Implementation of the {@link WrappedWriter} interface based on {@link java.io.Writer}. - */ -public class IOWriterWrapper implements WrappedWriter { - - private final Writer writer; - - public IOWriterWrapper(Writer writer) { - this.writer = writer; - } - - /** Writes a new key and value, separating them with one space. 
The value must end with a - * new line or some other delimiter, as it is not automatically added by this method - */ - @Override - public void write(String key, String value) throws IOException { - writer.write(key + ' ' + value); - } - - @Override - public void close() throws IOException { - writer.close(); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/io/WrappedWriter.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/io/WrappedWriter.java b/integration/src/main/java/org/apache/mahout/utils/io/WrappedWriter.java deleted file mode 100644 index b9900e9..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/io/WrappedWriter.java +++ /dev/null @@ -1,31 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.io; - -import java.io.Closeable; -import java.io.IOException; - -/** - * Convenience class for wrapping either a java.io.Writer or a SequenceFile.Writer with some basic functionality - */ -public interface WrappedWriter extends Closeable { - - /** Writes a new key-value pair.*/ - void write(String key, String value) throws IOException; - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java b/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java deleted file mode 100644 index 964c8cc..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java +++ /dev/null @@ -1,78 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.mahout.utils.nlp.collocations.llr; - -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.CharBuffer; -import java.nio.charset.CharsetEncoder; -import java.nio.charset.CodingErrorAction; - -import org.apache.commons.io.Charsets; -import org.apache.hadoop.util.bloom.Filter; -import org.apache.hadoop.util.bloom.Key; -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; - -/** - * Emits tokens based on bloom filter membership. - */ -public final class BloomTokenFilter extends TokenFilter { - - private final Filter filter; - private final CharTermAttribute termAtt; - private final CharsetEncoder encoder; - private final Key key; - private final boolean keepMembers; - - /** - * @param filter tokens will be checked for membership in this bloom filter - * @param in the tokenstream to read. - * @param keepMembers keep memoers of the bloom filter? If true works like - * a whitelist and members found in the list are kept and all others are - * dropped. If false works like a stoplist and members found in the - * filter are dropped all others are kept. - */ - public BloomTokenFilter(Filter filter, boolean keepMembers, TokenStream in) { - super(in); - this.filter = filter; - this.keepMembers = keepMembers; - this.key = new Key(); - this.termAtt = addAttribute(CharTermAttribute.class); - this.encoder = Charsets.UTF_8.newEncoder(). - onMalformedInput(CodingErrorAction.REPORT). - onUnmappableCharacter(CodingErrorAction.REPORT); - } - - @Override - public boolean incrementToken() throws IOException { - while (input.incrementToken()) { - ByteBuffer bytes = encoder.encode(CharBuffer.wrap(termAtt.buffer(), 0, termAtt.length())); - key.set(bytes.array(), 1.0f); - boolean member = filter.membershipTest(key); - if ((keepMembers && member) || (!keepMembers && !member)) { - return true; - } - } - return false; - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java b/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java deleted file mode 100644 index 4585a0a..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.utils.regex; - -import java.io.IOException; -import java.io.StringReader; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.mahout.common.lucene.TokenStreamIterator; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class AnalyzerTransformer implements RegexTransformer { - - private Analyzer analyzer; - private String fieldName = "text"; - - private static final Logger log = LoggerFactory.getLogger(AnalyzerTransformer.class); - - public AnalyzerTransformer() { - this(new StandardAnalyzer(), "text"); - } - - public AnalyzerTransformer(Analyzer analyzer) { - this(analyzer, "text"); - } - - public AnalyzerTransformer(Analyzer analyzer, String fieldName) { - this.analyzer = analyzer; - this.fieldName = fieldName; - } - - @Override - public String transformMatch(String match) { - StringBuilder result = new StringBuilder(); - try (TokenStream ts = analyzer.tokenStream(fieldName, new StringReader(match))) { - ts.addAttribute(CharTermAttribute.class); - ts.reset(); - TokenStreamIterator iter = new TokenStreamIterator(ts); - while (iter.hasNext()) { - result.append(iter.next()).append(' '); - } - ts.end(); - } catch (IOException e) { - throw new IllegalStateException(e); - } - return result.toString(); - } - - public Analyzer getAnalyzer() { - return analyzer; - } - - public void setAnalyzer(Analyzer analyzer) { - this.analyzer = analyzer; - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/regex/ChainTransformer.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/ChainTransformer.java b/integration/src/main/java/org/apache/mahout/utils/regex/ChainTransformer.java deleted file mode 100644 index d3e8e06..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/regex/ChainTransformer.java +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.utils.regex; - -import com.google.common.collect.Lists; - -import java.util.List; - -/** - * Chain together several {@link org.apache.mahout.utils.regex.RegexTransformer} and apply them to the match - * in succession - */ -public class ChainTransformer implements RegexTransformer { - - private List<RegexTransformer> chain = Lists.newArrayList(); - - public ChainTransformer() { - } - - public ChainTransformer(List<RegexTransformer> chain) { - this.chain = chain; - } - - @Override - public String transformMatch(String match) { - String result = match; - for (RegexTransformer transformer : chain) { - result = transformer.transformMatch(result); - } - return result; - } - - public List<RegexTransformer> getChain() { - return chain; - } - - public void setChain(List<RegexTransformer> chain) { - this.chain = chain; - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/regex/FPGFormatter.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/FPGFormatter.java b/integration/src/main/java/org/apache/mahout/utils/regex/FPGFormatter.java deleted file mode 100644 index a0f296d..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/regex/FPGFormatter.java +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.regex; - -import java.util.regex.Pattern; - -/** - * Collapses/converts all whitespace to a single tab - */ -public class FPGFormatter implements RegexFormatter { - - private static final Pattern WHITESPACE = Pattern.compile("\\W+"); - - @Override - public String format(String toFormat) { - return '\t' + WHITESPACE.matcher(toFormat).replaceAll("|"); - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/regex/IdentityFormatter.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/IdentityFormatter.java b/integration/src/main/java/org/apache/mahout/utils/regex/IdentityFormatter.java deleted file mode 100644 index 5c1177c..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/regex/IdentityFormatter.java +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.regex; - -public class IdentityFormatter implements RegexFormatter { - - @Override - public String format(String toFormat) { - return toFormat; - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/regex/IdentityTransformer.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/IdentityTransformer.java b/integration/src/main/java/org/apache/mahout/utils/regex/IdentityTransformer.java deleted file mode 100644 index aea695d..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/regex/IdentityTransformer.java +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.regex; - -/** - * No-op - */ -public final class IdentityTransformer implements RegexTransformer { - - @Override - public String transformMatch(String match) { - return match; - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/regex/RegexConverterDriver.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/RegexConverterDriver.java b/integration/src/main/java/org/apache/mahout/utils/regex/RegexConverterDriver.java deleted file mode 100644 index 53be239..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/regex/RegexConverterDriver.java +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.regex; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; -import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; -import org.apache.hadoop.util.ToolRunner; -import org.apache.lucene.analysis.Analyzer; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.common.commandline.DefaultOptionCreator; - -/** - * Experimental - */ -public class RegexConverterDriver extends AbstractJob { - - @Override - public int run(String[] args) throws Exception { - addInputOption(); - addOutputOption(); - addOption(DefaultOptionCreator.overwriteOption().create()); - addOption("regex", "regex", - "The regular expression to use", true); - addOption("groupsToKeep", "g", - "The number of the capturing groups to keep", false); - addOption("transformerClass", "t", - "The optional class specifying the Regex Transformer", false); - addOption("formatterClass", "t", - "The optional class specifying the Regex Formatter", false); - addOption(DefaultOptionCreator.analyzerOption().create()); - - if (parseArguments(args) == null) { - return -1; - } - - Configuration conf = getConf(); - //TODO: How to deal with command line escaping? - conf.set(RegexMapper.REGEX, getOption("regex")); // - String gtk = getOption("groupsToKeep"); - if (gtk != null) { - conf.set(RegexMapper.GROUP_MATCHERS, gtk); - } - String trans = getOption("transformerClass"); - if (trans != null) { - if ("url".equalsIgnoreCase(trans)) { - trans = URLDecodeTransformer.class.getName(); - } - conf.set(RegexMapper.TRANSFORMER_CLASS, trans); - } - String formatter = getOption("formatterClass"); - if (formatter != null) { - if ("fpg".equalsIgnoreCase(formatter)) { - formatter = FPGFormatter.class.getName(); - } - conf.set(RegexMapper.FORMATTER_CLASS, formatter); - } - Path input = getInputPath(); - Path output = getOutputPath(); - if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { - HadoopUtil.delete(getConf(), output); - } - Class<? extends Analyzer> analyzerClass = getAnalyzerClassFromOption(); - if (analyzerClass != null) { - conf.set(RegexMapper.ANALYZER_NAME, analyzerClass.getName()); - } - Job job = prepareJob(input, output, - TextInputFormat.class, - RegexMapper.class, - LongWritable.class, - Text.class, - TextOutputFormat.class); - boolean succeeded = job.waitForCompletion(true); - return succeeded ? 0 : -1; - } - - public static void main(String[] args) throws Exception { - ToolRunner.run(new RegexConverterDriver(), args); - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/regex/RegexFormatter.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/RegexFormatter.java b/integration/src/main/java/org/apache/mahout/utils/regex/RegexFormatter.java deleted file mode 100644 index 8ef837b..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/regex/RegexFormatter.java +++ /dev/null @@ -1,24 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.regex; - -public interface RegexFormatter { - - String format(String toFormat); - -}
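For anyone pinned to a revision that still ships the removed ChunkedWriter, the class could be used on its own to spread Text/Text key-value pairs across size-capped sequence files. The following is a minimal sketch; the output directory /tmp/chunks, the 64 MB chunk size, and the sample documents are illustrative assumptions, not values taken from the code above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.utils.io.ChunkedWriter;

public class ChunkedWriterSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Illustrative output directory; files are created inside it as chunk-0, chunk-1, ...
    Path output = new Path("/tmp/chunks");
    // 64 MB per chunk is an assumed value (the constructor caps anything above 1984 MB).
    try (ChunkedWriter writer = new ChunkedWriter(conf, 64, output)) {
      // Each call appends a Text/Text pair; a new chunk file is opened once the
      // current one grows past the configured size.
      writer.write("doc-1", "first document body");
      writer.write("doc-2", "second document body");
    }
  }
}

This is the same writer that MailProcessor's main constructor wraps in a ChunkedWrapper when converting mbox archives to sequence files.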

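The removed BloomTokenFilter likewise wrapped any Lucene TokenStream. Below is a minimal whitelist-style sketch (keepMembers = true); the Bloom filter sizing, the seeded terms, and the sample sentence are illustrative assumptions.

import java.io.StringReader;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.util.bloom.BloomFilter;
import org.apache.hadoop.util.bloom.Key;
import org.apache.hadoop.util.hash.Hash;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilter;

public class BloomTokenFilterSketch {
  public static void main(String[] args) throws Exception {
    // Seed a small Bloom filter with the terms to keep (sizing is illustrative).
    BloomFilter bloom = new BloomFilter(1024, 3, Hash.MURMUR_HASH);
    bloom.add(new Key("mahout".getBytes(StandardCharsets.UTF_8)));
    bloom.add(new Key("hadoop".getBytes(StandardCharsets.UTF_8)));

    StandardAnalyzer analyzer = new StandardAnalyzer();
    try (TokenStream ts = new BloomTokenFilter(bloom, true,
        analyzer.tokenStream("text", new StringReader("Apache Mahout runs on Hadoop")))) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(termAtt.toString()); // expected to emit "mahout" and "hadoop"
      }
      ts.end();
    }
    analyzer.close();
  }
}

With keepMembers set to false the same construction behaves as a stop list instead, dropping terms found in the filter and passing everything else through.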