http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/clustering/AbstractClusterWriter.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/clustering/AbstractClusterWriter.java b/integration/src/main/java/org/apache/mahout/utils/clustering/AbstractClusterWriter.java deleted file mode 100644 index ac884d0..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/clustering/AbstractClusterWriter.java +++ /dev/null @@ -1,160 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.clustering; - -import java.io.IOException; -import java.io.Writer; -import java.util.Collection; -import java.util.Collections; -import java.util.Comparator; -import java.util.Iterator; -import java.util.List; -import java.util.Map; - -import org.apache.commons.lang3.StringUtils; -import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable; -import org.apache.mahout.clustering.iterator.ClusterWritable; -import org.apache.mahout.common.Pair; -import org.apache.mahout.common.distance.DistanceMeasure; -import org.apache.mahout.math.Vector; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.common.collect.Lists; - -/** - * Base class for implementing ClusterWriter - */ -public abstract class AbstractClusterWriter implements ClusterWriter { - - private static final Logger log = LoggerFactory.getLogger(AbstractClusterWriter.class); - - protected final Writer writer; - protected final Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints; - protected final DistanceMeasure measure; - - /** - * - * @param writer The underlying {@link java.io.Writer} to use - * @param clusterIdToPoints The map between cluster ids {@link org.apache.mahout.clustering.Cluster#getId()} and the - * points in the cluster - * @param measure The {@link org.apache.mahout.common.distance.DistanceMeasure} used to calculate the distance. - * Some writers may wish to use it for calculating weights for display. May be null. 
- */ - protected AbstractClusterWriter(Writer writer, Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints, - DistanceMeasure measure) { - this.writer = writer; - this.clusterIdToPoints = clusterIdToPoints; - this.measure = measure; - } - - protected Writer getWriter() { - return writer; - } - - protected Map<Integer, List<WeightedPropertyVectorWritable>> getClusterIdToPoints() { - return clusterIdToPoints; - } - - public static String getTopFeatures(Vector vector, String[] dictionary, int numTerms) { - - StringBuilder sb = new StringBuilder(100); - - for (Pair<String, Double> item : getTopPairs(vector, dictionary, numTerms)) { - String term = item.getFirst(); - sb.append("\n\t\t"); - sb.append(StringUtils.rightPad(term, 40)); - sb.append("=>"); - sb.append(StringUtils.leftPad(item.getSecond().toString(), 20)); - } - return sb.toString(); - } - - public static String getTopTerms(Vector vector, String[] dictionary, int numTerms) { - - StringBuilder sb = new StringBuilder(100); - - for (Pair<String, Double> item : getTopPairs(vector, dictionary, numTerms)) { - String term = item.getFirst(); - sb.append(term).append('_'); - } - sb.deleteCharAt(sb.length() - 1); - return sb.toString(); - } - - @Override - public long write(Iterable<ClusterWritable> iterable) throws IOException { - return write(iterable, Long.MAX_VALUE); - } - - @Override - public void close() throws IOException { - writer.close(); - } - - @Override - public long write(Iterable<ClusterWritable> iterable, long maxDocs) throws IOException { - long result = 0; - Iterator<ClusterWritable> iterator = iterable.iterator(); - while (result < maxDocs && iterator.hasNext()) { - write(iterator.next()); - result++; - } - return result; - } - - private static Collection<Pair<String, Double>> getTopPairs(Vector vector, String[] dictionary, int numTerms) { - List<TermIndexWeight> vectorTerms = Lists.newArrayList(); - - for (Vector.Element elt : vector.nonZeroes()) { - vectorTerms.add(new TermIndexWeight(elt.index(), elt.get())); - } - - // Sort results in reverse order (ie weight in descending order) - Collections.sort(vectorTerms, new Comparator<TermIndexWeight>() { - @Override - public int compare(TermIndexWeight one, TermIndexWeight two) { - return Double.compare(two.weight, one.weight); - } - }); - - Collection<Pair<String, Double>> topTerms = Lists.newLinkedList(); - - for (int i = 0; i < vectorTerms.size() && i < numTerms; i++) { - int index = vectorTerms.get(i).index; - String dictTerm = dictionary[index]; - if (dictTerm == null) { - log.error("Dictionary entry missing for {}", index); - continue; - } - topTerms.add(new Pair<>(dictTerm, vectorTerms.get(i).weight)); - } - - return topTerms; - } - - private static class TermIndexWeight { - private final int index; - private final double weight; - - TermIndexWeight(int index, double weight) { - this.index = index; - this.weight = weight; - } - } -}
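For reference, the writers removed in this commit all follow the same driving pattern: build the cluster-id-to-points map, wrap an output Writer, and stream ClusterWritable values through write(). Below is a minimal sketch, not part of the commit itself, using the signatures from the deleted sources; the command-line paths and the choice of EuclideanDistanceMeasure are illustrative assumptions.

// Sketch only: drives a ClusterWriter implementation the way ClusterDumper does.
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
import org.apache.mahout.clustering.iterator.ClusterWritable;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
import org.apache.mahout.utils.clustering.CSVClusterWriter;
import org.apache.mahout.utils.clustering.ClusterDumper;
import org.apache.mahout.utils.clustering.ClusterWriter;

public class ClusterWriterSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path clustersDir = new Path(args[0]);  // directory holding the clusters-*-final part files
    Path pointsDir = new Path(args[1]);    // clusteredPoints output of the clustering job

    // Group the clustered points by cluster id, as AbstractClusterWriter expects.
    Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints =
        ClusterDumper.readPoints(pointsDir, Long.MAX_VALUE, conf);

    Writer out = new OutputStreamWriter(System.out, StandardCharsets.UTF_8);
    try (ClusterWriter clusterWriter =
             new CSVClusterWriter(out, clusterIdToPoints, new EuclideanDistanceMeasure())) {
      // Stream every ClusterWritable from the part files; one CSV row per cluster.
      long written = clusterWriter.write(new SequenceFileDirValueIterable<ClusterWritable>(
          new Path(clustersDir, "part-*"), PathType.GLOB, conf));
      out.flush();
      System.err.println("Wrote " + written + " clusters");
    }
  }
}

Any of the other removed writers (ClusterDumperWriter, GraphMLClusterWriter, JsonClusterWriter) can be substituted without changing the driving code; ClusterDumper.createClusterWriter() does exactly that based on the --outputFormat option.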
http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/clustering/CSVClusterWriter.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/clustering/CSVClusterWriter.java b/integration/src/main/java/org/apache/mahout/utils/clustering/CSVClusterWriter.java deleted file mode 100644 index 7269016..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/clustering/CSVClusterWriter.java +++ /dev/null @@ -1,69 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.clustering; - -import org.apache.mahout.clustering.Cluster; -import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable; -import org.apache.mahout.clustering.iterator.ClusterWritable; -import org.apache.mahout.common.distance.DistanceMeasure; -import org.apache.mahout.math.NamedVector; -import org.apache.mahout.math.Vector; - -import java.io.IOException; -import java.io.Writer; -import java.util.List; -import java.util.Map; -import java.util.regex.Pattern; - -/** - * Format is adjacency style as put forth at http://gephi.org/users/supported-graph-formats/csv-format/, the centroid - * is the first element and all the rest of the row are the points in that cluster - * - **/ -public class CSVClusterWriter extends AbstractClusterWriter { - - private static final Pattern VEC_PATTERN = Pattern.compile("\\{|\\:|\\,|\\}"); - - public CSVClusterWriter(Writer writer, Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints, - DistanceMeasure measure) { - super(writer, clusterIdToPoints, measure); - } - - @Override - public void write(ClusterWritable clusterWritable) throws IOException { - StringBuilder line = new StringBuilder(); - Cluster cluster = clusterWritable.getValue(); - line.append(cluster.getId()); - List<WeightedPropertyVectorWritable> points = getClusterIdToPoints().get(cluster.getId()); - if (points != null) { - for (WeightedPropertyVectorWritable point : points) { - Vector theVec = point.getVector(); - line.append(','); - if (theVec instanceof NamedVector) { - line.append(((NamedVector)theVec).getName()); - } else { - String vecStr = theVec.asFormatString(); - //do some basic manipulations for display - vecStr = VEC_PATTERN.matcher(vecStr).replaceAll("_"); - line.append(vecStr); - } - } - getWriter().append(line).append("\n"); - } - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java 
b/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java deleted file mode 100644 index 75b5ded..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java +++ /dev/null @@ -1,328 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.clustering; - -import java.io.File; -import java.io.IOException; -import java.io.OutputStreamWriter; -import java.io.Writer; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.TreeMap; - -import com.google.common.io.Closeables; -import com.google.common.io.Files; -import org.apache.commons.io.Charsets; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.mahout.clustering.cdbw.CDbwEvaluator; -import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable; -import org.apache.mahout.clustering.evaluation.ClusterEvaluator; -import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver; -import org.apache.mahout.clustering.iterator.ClusterWritable; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.common.ClassUtils; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.common.Pair; -import org.apache.mahout.common.commandline.DefaultOptionCreator; -import org.apache.mahout.common.distance.DistanceMeasure; -import org.apache.mahout.common.iterator.sequencefile.PathFilters; -import org.apache.mahout.common.iterator.sequencefile.PathType; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable; -import org.apache.mahout.utils.vectors.VectorHelper; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public final class ClusterDumper extends AbstractJob { - - public static final String SAMPLE_POINTS = "samplePoints"; - DistanceMeasure measure; - - public enum OUTPUT_FORMAT { - TEXT, - CSV, - GRAPH_ML, - JSON, - } - - public static final String DICTIONARY_TYPE_OPTION = "dictionaryType"; - public static final String DICTIONARY_OPTION = "dictionary"; - public static final String POINTS_DIR_OPTION = "pointsDir"; - public static final String NUM_WORDS_OPTION = "numWords"; - public static final String SUBSTRING_OPTION = "substring"; - public static final String EVALUATE_CLUSTERS = "evaluate"; - - public static final String OUTPUT_FORMAT_OPT = "outputFormat"; - - private static final Logger log = LoggerFactory.getLogger(ClusterDumper.class); - private Path seqFileDir; - private Path pointsDir; - private long maxPointsPerCluster = Long.MAX_VALUE; - 
private String termDictionary; - private String dictionaryFormat; - private int subString = Integer.MAX_VALUE; - private int numTopFeatures = 10; - private Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints; - private OUTPUT_FORMAT outputFormat = OUTPUT_FORMAT.TEXT; - private boolean runEvaluation; - - public ClusterDumper(Path seqFileDir, Path pointsDir) { - this.seqFileDir = seqFileDir; - this.pointsDir = pointsDir; - init(); - } - - public ClusterDumper() { - setConf(new Configuration()); - } - - public static void main(String[] args) throws Exception { - new ClusterDumper().run(args); - } - - @Override - public int run(String[] args) throws Exception { - addInputOption(); - addOutputOption(); - addOption(OUTPUT_FORMAT_OPT, "of", "The optional output format for the results. Options: TEXT, CSV, JSON or GRAPH_ML", - "TEXT"); - addOption(SUBSTRING_OPTION, "b", "The number of chars of the asFormatString() to print"); - addOption(NUM_WORDS_OPTION, "n", "The number of top terms to print"); - addOption(POINTS_DIR_OPTION, "p", - "The directory containing points sequence files mapping input vectors to their cluster. " - + "If specified, then the program will output the points associated with a cluster"); - addOption(SAMPLE_POINTS, "sp", "Specifies the maximum number of points to include _per_ cluster. The default " - + "is to include all points"); - addOption(DICTIONARY_OPTION, "d", "The dictionary file"); - addOption(DICTIONARY_TYPE_OPTION, "dt", "The dictionary file type (text|sequencefile)", "text"); - addOption(buildOption(EVALUATE_CLUSTERS, "e", "Run ClusterEvaluator and CDbwEvaluator over the input. " - + "The output will be appended to the rest of the output at the end.", false, false, null)); - addOption(DefaultOptionCreator.distanceMeasureOption().create()); - - // output is optional, will print to System.out per default - if (parseArguments(args, false, true) == null) { - return -1; - } - - seqFileDir = getInputPath(); - if (hasOption(POINTS_DIR_OPTION)) { - pointsDir = new Path(getOption(POINTS_DIR_OPTION)); - } - outputFile = getOutputFile(); - if (hasOption(SUBSTRING_OPTION)) { - int sub = Integer.parseInt(getOption(SUBSTRING_OPTION)); - if (sub >= 0) { - subString = sub; - } - } - termDictionary = getOption(DICTIONARY_OPTION); - dictionaryFormat = getOption(DICTIONARY_TYPE_OPTION); - if (hasOption(NUM_WORDS_OPTION)) { - numTopFeatures = Integer.parseInt(getOption(NUM_WORDS_OPTION)); - } - if (hasOption(OUTPUT_FORMAT_OPT)) { - outputFormat = OUTPUT_FORMAT.valueOf(getOption(OUTPUT_FORMAT_OPT)); - } - if (hasOption(SAMPLE_POINTS)) { - maxPointsPerCluster = Long.parseLong(getOption(SAMPLE_POINTS)); - } else { - maxPointsPerCluster = Long.MAX_VALUE; - } - runEvaluation = hasOption(EVALUATE_CLUSTERS); - String distanceMeasureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION); - measure = ClassUtils.instantiateAs(distanceMeasureClass, DistanceMeasure.class); - - init(); - printClusters(null); - return 0; - } - - public void printClusters(String[] dictionary) throws Exception { - Configuration conf = new Configuration(); - - if (this.termDictionary != null) { - if ("text".equals(dictionaryFormat)) { - dictionary = VectorHelper.loadTermDictionary(new File(this.termDictionary)); - } else if ("sequencefile".equals(dictionaryFormat)) { - dictionary = VectorHelper.loadTermDictionary(conf, this.termDictionary); - } else { - throw new IllegalArgumentException("Invalid dictionary format"); - } - } - - Writer writer; - boolean shouldClose; - if (this.outputFile == 
null) { - shouldClose = false; - writer = new OutputStreamWriter(System.out, Charsets.UTF_8); - } else { - shouldClose = true; - if (outputFile.getName().startsWith("s3n://")) { - Path p = outputPath; - FileSystem fs = FileSystem.get(p.toUri(), conf); - writer = new OutputStreamWriter(fs.create(p), Charsets.UTF_8); - } else { - Files.createParentDirs(outputFile); - writer = Files.newWriter(this.outputFile, Charsets.UTF_8); - } - } - ClusterWriter clusterWriter = createClusterWriter(writer, dictionary); - try { - long numWritten = clusterWriter.write(new SequenceFileDirValueIterable<ClusterWritable>(new Path(seqFileDir, - "part-*"), PathType.GLOB, conf)); - - writer.flush(); - if (runEvaluation) { - HadoopUtil.delete(conf, new Path("tmp/representative")); - int numIters = 5; - RepresentativePointsDriver.main(new String[]{ - "--input", seqFileDir.toString(), - "--output", "tmp/representative", - "--clusteredPoints", pointsDir.toString(), - "--distanceMeasure", measure.getClass().getName(), - "--maxIter", String.valueOf(numIters) - }); - conf.set(RepresentativePointsDriver.DISTANCE_MEASURE_KEY, measure.getClass().getName()); - conf.set(RepresentativePointsDriver.STATE_IN_KEY, "tmp/representative/representativePoints-" + numIters); - ClusterEvaluator ce = new ClusterEvaluator(conf, seqFileDir); - writer.append("\n"); - writer.append("Inter-Cluster Density: ").append(String.valueOf(ce.interClusterDensity())).append("\n"); - writer.append("Intra-Cluster Density: ").append(String.valueOf(ce.intraClusterDensity())).append("\n"); - CDbwEvaluator cdbw = new CDbwEvaluator(conf, seqFileDir); - writer.append("CDbw Inter-Cluster Density: ").append(String.valueOf(cdbw.interClusterDensity())).append("\n"); - writer.append("CDbw Intra-Cluster Density: ").append(String.valueOf(cdbw.intraClusterDensity())).append("\n"); - writer.append("CDbw Separation: ").append(String.valueOf(cdbw.separation())).append("\n"); - writer.flush(); - } - log.info("Wrote {} clusters", numWritten); - } finally { - if (shouldClose) { - Closeables.close(clusterWriter, false); - } else { - if (clusterWriter instanceof GraphMLClusterWriter) { - clusterWriter.close(); - } - } - } - } - - ClusterWriter createClusterWriter(Writer writer, String[] dictionary) throws IOException { - ClusterWriter result; - - switch (outputFormat) { - case TEXT: - result = new ClusterDumperWriter(writer, clusterIdToPoints, measure, numTopFeatures, dictionary, subString); - break; - case CSV: - result = new CSVClusterWriter(writer, clusterIdToPoints, measure); - break; - case GRAPH_ML: - result = new GraphMLClusterWriter(writer, clusterIdToPoints, measure, numTopFeatures, dictionary, subString); - break; - case JSON: - result = new JsonClusterWriter(writer, clusterIdToPoints, measure, numTopFeatures, dictionary); - break; - default: - throw new IllegalStateException("Unknown outputformat: " + outputFormat); - } - return result; - } - - /** - * Convenience function to set the output format during testing. 
- */ - public void setOutputFormat(OUTPUT_FORMAT of) { - outputFormat = of; - } - - private void init() { - if (this.pointsDir != null) { - Configuration conf = new Configuration(); - // read in the points - clusterIdToPoints = readPoints(this.pointsDir, maxPointsPerCluster, conf); - } else { - clusterIdToPoints = Collections.emptyMap(); - } - } - - - public int getSubString() { - return subString; - } - - public void setSubString(int subString) { - this.subString = subString; - } - - public Map<Integer, List<WeightedPropertyVectorWritable>> getClusterIdToPoints() { - return clusterIdToPoints; - } - - public String getTermDictionary() { - return termDictionary; - } - - public void setTermDictionary(String termDictionary, String dictionaryType) { - this.termDictionary = termDictionary; - this.dictionaryFormat = dictionaryType; - } - - public void setNumTopFeatures(int num) { - this.numTopFeatures = num; - } - - public int getNumTopFeatures() { - return this.numTopFeatures; - } - - public long getMaxPointsPerCluster() { - return maxPointsPerCluster; - } - - public void setMaxPointsPerCluster(long maxPointsPerCluster) { - this.maxPointsPerCluster = maxPointsPerCluster; - } - - public static Map<Integer, List<WeightedPropertyVectorWritable>> readPoints(Path pointsPathDir, - long maxPointsPerCluster, - Configuration conf) { - Map<Integer, List<WeightedPropertyVectorWritable>> result = new TreeMap<>(); - for (Pair<IntWritable, WeightedPropertyVectorWritable> record - : new SequenceFileDirIterable<IntWritable, WeightedPropertyVectorWritable>(pointsPathDir, PathType.LIST, - PathFilters.logsCRCFilter(), conf)) { - // value is the cluster id as an int, key is the name/id of the - // vector, but that doesn't matter because we only care about printing it - //String clusterId = value.toString(); - int keyValue = record.getFirst().get(); - List<WeightedPropertyVectorWritable> pointList = result.get(keyValue); - if (pointList == null) { - pointList = new ArrayList<>(); - result.put(keyValue, pointList); - } - if (pointList.size() < maxPointsPerCluster) { - pointList.add(record.getSecond()); - } - } - return result; - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumperWriter.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumperWriter.java b/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumperWriter.java deleted file mode 100644 index 31858c4..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumperWriter.java +++ /dev/null @@ -1,100 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.utils.clustering; - -import org.apache.hadoop.io.Text; -import org.apache.mahout.clustering.AbstractCluster; -import org.apache.mahout.clustering.Cluster; -import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable; -import org.apache.mahout.clustering.iterator.ClusterWritable; -import org.apache.mahout.common.distance.DistanceMeasure; - -import java.io.IOException; -import java.io.Writer; -import java.util.Iterator; -import java.util.List; -import java.util.Map; - -/** - * Implements a {@link ClusterWriter} that outputs in the format used by ClusterDumper in Mahout 0.5 - */ -public class ClusterDumperWriter extends AbstractClusterWriter { - - private final int subString; - private final String[] dictionary; - private final int numTopFeatures; - - public ClusterDumperWriter(Writer writer, Map<Integer,List<WeightedPropertyVectorWritable>> clusterIdToPoints, - DistanceMeasure measure, int numTopFeatures, String[] dictionary, int subString) { - super(writer, clusterIdToPoints, measure); - this.numTopFeatures = numTopFeatures; - this.dictionary = dictionary; - this.subString = subString; - } - - @Override - public void write(ClusterWritable clusterWritable) throws IOException { - Cluster cluster = clusterWritable.getValue(); - String fmtStr = cluster.asFormatString(dictionary); - Writer writer = getWriter(); - if (subString > 0 && fmtStr.length() > subString) { - writer.write(':'); - writer.write(fmtStr, 0, Math.min(subString, fmtStr.length())); - } else { - writer.write(fmtStr); - } - - writer.write('\n'); - - if (dictionary != null) { - String topTerms = getTopFeatures(clusterWritable.getValue().getCenter(), dictionary, numTopFeatures); - writer.write("\tTop Terms: "); - writer.write(topTerms); - writer.write('\n'); - } - - Map<Integer,List<WeightedPropertyVectorWritable>> clusterIdToPoints = getClusterIdToPoints(); - List<WeightedPropertyVectorWritable> points = clusterIdToPoints.get(clusterWritable.getValue().getId()); - if (points != null) { - writer.write("\tWeight : [props - optional]: Point:\n\t"); - for (Iterator<WeightedPropertyVectorWritable> iterator = points.iterator(); iterator.hasNext();) { - WeightedPropertyVectorWritable point = iterator.next(); - writer.write(String.valueOf(point.getWeight())); - Map<Text,Text> map = point.getProperties(); - // map can be null since empty maps when written are returned as null - writer.write(" : ["); - if (map != null) { - for (Map.Entry<Text,Text> entry : map.entrySet()) { - writer.write(entry.getKey().toString()); - writer.write("="); - writer.write(entry.getValue().toString()); - } - } - writer.write("]"); - - writer.write(": "); - - writer.write(AbstractCluster.formatVector(point.getVector(), dictionary)); - if (iterator.hasNext()) { - writer.write("\n\t"); - } - } - writer.write('\n'); - } - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterWriter.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterWriter.java b/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterWriter.java deleted file mode 100644 index 70f8f6f..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterWriter.java +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.clustering; - -import java.io.Closeable; -import java.io.IOException; - -import org.apache.mahout.clustering.iterator.ClusterWritable; - -/** - * Writes out clusters - */ -public interface ClusterWriter extends Closeable { - - /** - * Write all values in the Iterable to the output - * - * @param iterable The {@link Iterable} to loop over - * @return the number of docs written - * @throws java.io.IOException if there was a problem writing - */ - long write(Iterable<ClusterWritable> iterable) throws IOException; - - /** - * Write out a Cluster - */ - void write(ClusterWritable clusterWritable) throws IOException; - - /** - * Write the first {@code maxDocs} to the output. - * - * @param iterable The {@link Iterable} to loop over - * @param maxDocs the maximum number of docs to write - * @return The number of docs written - * @throws IOException if there was a problem writing - */ - long write(Iterable<ClusterWritable> iterable, long maxDocs) throws IOException; -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/clustering/GraphMLClusterWriter.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/clustering/GraphMLClusterWriter.java b/integration/src/main/java/org/apache/mahout/utils/clustering/GraphMLClusterWriter.java deleted file mode 100644 index 25e8f3b..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/clustering/GraphMLClusterWriter.java +++ /dev/null @@ -1,216 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.utils.clustering; - -import java.io.IOException; -import java.io.Writer; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Random; -import java.util.regex.Pattern; - -import org.apache.mahout.clustering.Cluster; -import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable; -import org.apache.mahout.clustering.classify.WeightedVectorWritable; -import org.apache.mahout.clustering.iterator.ClusterWritable; -import org.apache.mahout.common.RandomUtils; -import org.apache.mahout.common.StringUtils; -import org.apache.mahout.common.distance.DistanceMeasure; -import org.apache.mahout.math.NamedVector; -import org.apache.mahout.math.Vector; - -/** - * GraphML -- see http://gephi.org/users/supported-graph-formats/graphml-format/ - */ -public class GraphMLClusterWriter extends AbstractClusterWriter { - - private static final Pattern VEC_PATTERN = Pattern.compile("\\{|\\:|\\,|\\}"); - private final Map<Integer, Color> colors = new HashMap<>(); - private Color lastClusterColor; - private float lastX; - private float lastY; - private Random random; - private int posStep; - private final String[] dictionary; - private final int numTopFeatures; - private final int subString; - - public GraphMLClusterWriter(Writer writer, Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints, - DistanceMeasure measure, int numTopFeatures, String[] dictionary, int subString) - throws IOException { - super(writer, clusterIdToPoints, measure); - this.dictionary = dictionary; - this.numTopFeatures = numTopFeatures; - this.subString = subString; - init(writer); - } - - private void init(Writer writer) throws IOException { - writer.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"); - writer.append("<graphml xmlns=\"http://graphml.graphdrawing.org/xmlns\"\n" - + "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n" - + "xsi:schemaLocation=\"http://graphml.graphdrawing.org/xmlns\n" - + "http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd\">"); - //support rgb - writer.append("<key attr.name=\"r\" attr.type=\"int\" for=\"node\" id=\"r\"/>\n" - + "<key attr.name=\"g\" attr.type=\"int\" for=\"node\" id=\"g\"/>\n" - + "<key attr.name=\"b\" attr.type=\"int\" for=\"node\" id=\"b\"/>" - + "<key attr.name=\"size\" attr.type=\"int\" for=\"node\" id=\"size\"/>" - + "<key attr.name=\"weight\" attr.type=\"float\" for=\"edge\" id=\"weight\"/>" - + "<key attr.name=\"x\" attr.type=\"float\" for=\"node\" id=\"x\"/>" - + "<key attr.name=\"y\" attr.type=\"float\" for=\"node\" id=\"y\"/>"); - writer.append("<graph edgedefault=\"undirected\">"); - lastClusterColor = new Color(); - posStep = (int) (0.1 * clusterIdToPoints.size()) + 100; - random = RandomUtils.getRandom(); - } - - /* - <?xml version="1.0" encoding="UTF-8"?> - <graphml xmlns="http://graphml.graphdrawing.org/xmlns" - xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns - http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd"> - <graph id="G" edgedefault="undirected"> - <node id="n0"/> - <node id="n1"/> - <edge id="e1" source="n0" target="n1"/> - </graph> - </graphml> - */ - - @Override - public void write(ClusterWritable clusterWritable) throws IOException { - StringBuilder line = new StringBuilder(); - Cluster cluster = clusterWritable.getValue(); - Color rgb = getColor(cluster.getId()); - - String topTerms = ""; - if (dictionary != null) { - topTerms = getTopTerms(cluster.getCenter(), dictionary, 
numTopFeatures); - } - String clusterLabel = String.valueOf(cluster.getId()) + '_' + topTerms; - //do some positioning so that items are visible and grouped together - //TODO: put in a real layout algorithm - float x = lastX + 1000; - float y = lastY; - if (x > (1000 + posStep)) { - y = lastY + 1000; - x = 0; - } - - line.append(createNode(clusterLabel, rgb, x, y)); - List<WeightedPropertyVectorWritable> points = clusterIdToPoints.get(cluster.getId()); - if (points != null) { - for (WeightedVectorWritable point : points) { - Vector theVec = point.getVector(); - double distance = 1; - if (measure != null) { - //scale the distance - distance = measure.distance(cluster.getCenter().getLengthSquared(), cluster.getCenter(), theVec) * 500; - } - String vecStr; - int angle = random.nextInt(360); //pick an angle at random and then scale along that angle - double angleRads = Math.toRadians(angle); - - float targetX = x + (float) (distance * Math.cos(angleRads)); - float targetY = y + (float) (distance * Math.sin(angleRads)); - if (theVec instanceof NamedVector) { - vecStr = ((NamedVector) theVec).getName(); - } else { - vecStr = theVec.asFormatString(); - //do some basic manipulations for display - vecStr = VEC_PATTERN.matcher(vecStr).replaceAll("_"); - } - if (subString > 0 && vecStr.length() > subString) { - vecStr = vecStr.substring(0, subString); - } - line.append(createNode(vecStr, rgb, targetX, targetY)); - line.append(createEdge(clusterLabel, vecStr, distance)); - } - } - lastClusterColor = rgb; - lastX = x; - lastY = y; - getWriter().append(line).append("\n"); - } - - private Color getColor(int clusterId) { - Color result = colors.get(clusterId); - if (result == null) { - result = new Color(); - //there is probably some better way to color a graph - int incR = 0; - int incG = 0; - int incB = 0; - if (lastClusterColor.r + 20 < 256 && lastClusterColor.g + 20 < 256 && lastClusterColor.b + 20 < 256) { - incR = 20; - incG = 0; - incB = 0; - } else if (lastClusterColor.r + 20 >= 256 && lastClusterColor.g + 20 < 256 && lastClusterColor.b + 20 < 256) { - incG = 20; - incB = 0; - } else if (lastClusterColor.r + 20 >= 256 && lastClusterColor.g + 20 >= 256 && lastClusterColor.b + 20 < 256) { - incB = 20; - } else { - incR += 3; - incG += 3; - incR += 3; - } - result.r = (lastClusterColor.r + incR) % 256; - result.g = (lastClusterColor.g + incG) % 256; - result.b = (lastClusterColor.b + incB) % 256; - colors.put(clusterId, result); - } - return result; - } - - private static String createEdge(String left, String right, double distance) { - left = StringUtils.escapeXML(left); - right = StringUtils.escapeXML(right); - return "<edge id=\"" + left + '_' + right + "\" source=\"" + left + "\" target=\"" + right + "\">" - + "<data key=\"weight\">" + distance + "</data></edge>"; - } - - private static String createNode(String s, Color rgb, float x, float y) { - return "<node id=\"" + StringUtils.escapeXML(s) + "\"><data key=\"r\">" + rgb.r - + "</data>" - + "<data key=\"g\">" + rgb.g - + "</data>" - + "<data key=\"b\">" + rgb.b - + "</data>" - + "<data key=\"x\">" + x - + "</data>" - + "<data key=\"y\">" + y - + "</data>" - + "</node>"; - } - - @Override - public void close() throws IOException { - getWriter().append("</graph>").append("</graphml>"); - super.close(); - } - - private static class Color { - int r; - int g; - int b; - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/clustering/JsonClusterWriter.java 
---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/clustering/JsonClusterWriter.java b/integration/src/main/java/org/apache/mahout/utils/clustering/JsonClusterWriter.java deleted file mode 100644 index d564a73..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/clustering/JsonClusterWriter.java +++ /dev/null @@ -1,188 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.mahout.utils.clustering; - -import java.io.IOException; -import java.io.Writer; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.regex.Pattern; - -import org.apache.mahout.clustering.AbstractCluster; -import org.apache.mahout.clustering.Cluster; -import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable; -import org.apache.mahout.clustering.iterator.ClusterWritable; -import org.apache.mahout.common.distance.DistanceMeasure; -import org.apache.mahout.math.NamedVector; -import org.apache.mahout.math.Vector; -import org.codehaus.jackson.map.ObjectMapper; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Dump cluster info to JSON formatted lines. 
Heavily inspired by - * ClusterDumperWriter.java and CSVClusterWriter.java - * - */ -public class JsonClusterWriter extends AbstractClusterWriter { - private final String[] dictionary; - private final int numTopFeatures; - private final ObjectMapper jxn; - - private static final Logger log = LoggerFactory.getLogger(JsonClusterWriter.class); - private static final Pattern VEC_PATTERN = Pattern.compile("\\{|\\:|\\,|\\}"); - - public JsonClusterWriter(Writer writer, - Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints, - DistanceMeasure measure, int numTopFeatures, String[] dictionary) { - super(writer, clusterIdToPoints, measure); - this.numTopFeatures = numTopFeatures; - this.dictionary = dictionary; - jxn = new ObjectMapper(); - } - - /** - * Generate HashMap with cluster info and write as a single JSON formatted - * line - */ - @Override - public void write(ClusterWritable clusterWritable) throws IOException { - Map<String, Object> res = new HashMap<>(); - - // get top terms - if (dictionary != null) { - List<Object> topTerms = getTopFeaturesList(clusterWritable.getValue() - .getCenter(), dictionary, numTopFeatures); - res.put("top_terms", topTerms); - } else { - res.put("top_terms", new ArrayList<>()); - } - - // get human-readable cluster representation - Cluster cluster = clusterWritable.getValue(); - res.put("cluster_id", cluster.getId()); - - if (dictionary != null) { - Map<String,Object> fmtStr = cluster.asJson(dictionary); - res.put("cluster", fmtStr); - - // get points - List<Object> points = getPoints(cluster, dictionary); - res.put("points", points); - } else { - res.put("cluster", new HashMap<>()); - res.put("points", new ArrayList<>()); - } - - // write JSON - Writer writer = getWriter(); - writer.write(jxn.writeValueAsString(res) + "\n"); - } - - /** - * Create a List of HashMaps containing top terms information - * - * @return List<Object> - */ - public List<Object> getTopFeaturesList(Vector vector, String[] dictionary, - int numTerms) { - - List<TermIndexWeight> vectorTerms = new ArrayList<>(); - - for (Vector.Element elt : vector.nonZeroes()) { - vectorTerms.add(new TermIndexWeight(elt.index(), elt.get())); - } - - // Sort results in reverse order (i.e. 
weight in descending order) - Collections.sort(vectorTerms, new Comparator<TermIndexWeight>() { - @Override - public int compare(TermIndexWeight one, TermIndexWeight two) { - return Double.compare(two.weight, one.weight); - } - }); - - List<Object> topTerms = new ArrayList<>(); - - for (int i = 0; i < vectorTerms.size() && i < numTerms; i++) { - int index = vectorTerms.get(i).index; - String dictTerm = dictionary[index]; - if (dictTerm == null) { - log.error("Dictionary entry missing for {}", index); - continue; - } - Map<String, Object> term_entry = new HashMap<>(); - term_entry.put(dictTerm, vectorTerms.get(i).weight); - topTerms.add(term_entry); - } - - return topTerms; - } - - /** - * Create a List of HashMaps containing Vector point information - * - * @return List<Object> - */ - public List<Object> getPoints(Cluster cluster, String[] dictionary) { - List<Object> vectorObjs = new ArrayList<>(); - List<WeightedPropertyVectorWritable> points = getClusterIdToPoints().get( - cluster.getId()); - - if (points != null) { - for (WeightedPropertyVectorWritable point : points) { - Map<String, Object> entry = new HashMap<>(); - Vector theVec = point.getVector(); - if (theVec instanceof NamedVector) { - entry.put("vector_name", ((NamedVector) theVec).getName()); - } else { - String vecStr = theVec.asFormatString(); - // do some basic manipulations for display - vecStr = VEC_PATTERN.matcher(vecStr).replaceAll("_"); - entry.put("vector_name", vecStr); - } - entry.put("weight", String.valueOf(point.getWeight())); - try { - entry.put("point", - AbstractCluster.formatVectorAsJson(point.getVector(), dictionary)); - } catch (IOException e) { - log.error("IOException: ", e); - } - vectorObjs.add(entry); - } - } - return vectorObjs; - } - - /** - * Convenience class for sorting terms - * - */ - private static class TermIndexWeight { - private final int index; - private final double weight; - - TermIndexWeight(int index, double weight) { - this.index = index; - this.weight = weight; - } - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java b/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java deleted file mode 100644 index 54ad43f..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java +++ /dev/null @@ -1,186 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.utils.email; - -import java.io.File; -import java.nio.charset.Charset; -import java.util.Map; -import java.util.regex.Pattern; - -/** - * Configuration options to be used by {@link MailProcessor}. Includes options controlling the exact output format - * and which mail fields are included (body, to, from, subject, etc.) - */ -public class MailOptions { - - public static final String FROM = "FROM"; - public static final String TO = "TO"; - public static final String REFS = "REFS"; - public static final String SUBJECT = "SUBJECT"; - public static final Pattern DEFAULT_QUOTED_TEXT = Pattern.compile("^(\\||>)"); - - private boolean stripQuotedText; - private File input; - private String outputDir; - private String prefix; - private int chunkSize; - private Charset charset; - private String separator; - private String bodySeparator = "\n"; - private boolean includeBody; - private Pattern[] patternsToMatch; - //maps FROM, TO, REFS, SUBJECT, etc. to the order they appear in patternsToMatch. See MailToRecMapper - private Map<String, Integer> patternOrder; - - //the regular expression to use for identifying quoted text. - private Pattern quotedTextPattern = DEFAULT_QUOTED_TEXT; - - public File getInput() { - return input; - } - - public void setInput(File input) { - this.input = input; - } - - public String getOutputDir() { - return outputDir; - } - - /** - * Sets the output directory where sequence files will be written. - */ - public void setOutputDir(String outputDir) { - this.outputDir = outputDir; - } - - public String getPrefix() { - return prefix; - } - - /** - * Sets the prefix that is combined with the archive name and with message ids to create {@code SequenceFile} keys. - * @param prefix The name of the directory containing the mail archive is commonly used. - */ - public void setPrefix(String prefix) { - this.prefix = prefix; - } - - public int getChunkSize() { - return chunkSize; - } - - /** - * Sets the size of each generated sequence file, in Megabytes. - */ - public void setChunkSize(int chunkSize) { - this.chunkSize = chunkSize; - } - - public Charset getCharset() { - return charset; - } - - /** - * Sets the encoding of the input - */ - public void setCharset(Charset charset) { - this.charset = charset; - } - - public String getSeparator() { - return separator; - } - - /** - * Sets the separator to use in the output between metadata items (to, from, etc.). - */ - public void setSeparator(String separator) { - this.separator = separator; - } - - public String getBodySeparator() { - return bodySeparator; - } - - /** - * Sets the separator to use in the output between lines in the body, the default is "\n". - */ - public void setBodySeparator(String bodySeparator) { - this.bodySeparator = bodySeparator; - } - - public boolean isIncludeBody() { - return includeBody; - } - - /** - * Sets whether mail bodies are included in the output - */ - public void setIncludeBody(boolean includeBody) { - this.includeBody = includeBody; - } - - public Pattern[] getPatternsToMatch() { - return patternsToMatch; - } - - /** - * Sets the list of patterns to be applied in the given order to extract metadata fields (to, from, subject, etc.) 
- * from the input - */ - public void setPatternsToMatch(Pattern[] patternsToMatch) { - this.patternsToMatch = patternsToMatch; - } - - public Map<String, Integer> getPatternOrder() { - return patternOrder; - } - - public void setPatternOrder(Map<String, Integer> patternOrder) { - this.patternOrder = patternOrder; - } - - /** - * - * @return true if we should strip out quoted email text - */ - public boolean isStripQuotedText() { - return stripQuotedText; - } - - /** - * - * Sets whether quoted text such as lines starting with | or > is striped off. - */ - public void setStripQuotedText(boolean stripQuotedText) { - this.stripQuotedText = stripQuotedText; - } - - public Pattern getQuotedTextPattern() { - return quotedTextPattern; - } - - /** - * Sets the {@link java.util.regex.Pattern} to use to identify lines that are quoted text. Default is | and > - * @see #setStripQuotedText(boolean) - */ - public void setQuotedTextPattern(Pattern quotedTextPattern) { - this.quotedTextPattern = quotedTextPattern; - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java b/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java deleted file mode 100644 index 7db836f..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java +++ /dev/null @@ -1,183 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.email; - -import org.apache.mahout.common.iterator.FileLineIterable; -import org.apache.mahout.utils.io.ChunkedWriter; -import org.apache.mahout.utils.io.ChunkedWrapper; -import org.apache.mahout.utils.io.IOWriterWrapper; -import org.apache.mahout.utils.io.WrappedWriter; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.io.Writer; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -/** - * Converts an mbox mail archive into a group of Hadoop Sequence Files with equal size. The archive may optionally be - * gzipped or zipped. 
@see org.apache.mahout.text.SequenceFilesFromMailArchives - */ -public class MailProcessor { - - private static final Pattern MESSAGE_START = Pattern.compile("^From \\S+@\\S.*\\d{4}$", Pattern.CASE_INSENSITIVE); - private static final Pattern MESSAGE_ID_PREFIX = Pattern.compile("^message-id: <(.*)>$", Pattern.CASE_INSENSITIVE); - // regular expressions used to parse individual messages - public static final Pattern SUBJECT_PREFIX = Pattern.compile("^subject: (.*)$", Pattern.CASE_INSENSITIVE); - //we need to have at least one character - public static final Pattern FROM_PREFIX = Pattern.compile("^from: (\\S.*)$", Pattern.CASE_INSENSITIVE); - public static final Pattern REFS_PREFIX = Pattern.compile("^references: (.*)$", Pattern.CASE_INSENSITIVE); - public static final Pattern TO_PREFIX = Pattern.compile("^to: (.*)$", Pattern.CASE_INSENSITIVE); - - private final String prefix; - private final MailOptions options; - private final WrappedWriter writer; - - private static final Logger log = LoggerFactory.getLogger(MailProcessor.class); - - /** - * Creates a {@code MailProcessor} that does not write to sequence files, but to a single text file. - * This constructor is for debugging and testing purposes. - */ - public MailProcessor(MailOptions options, String prefix, Writer writer) { - this.writer = new IOWriterWrapper(writer); - this.options = options; - this.prefix = prefix; - } - - /** - * This is the main constructor of {@code MailProcessor}. - */ - public MailProcessor(MailOptions options, String prefix, ChunkedWriter writer) { - this.writer = new ChunkedWrapper(writer); - this.options = options; - this.prefix = prefix; - } - - /** - * Parses one complete mail archive, writing output to the {@code writer} constructor parameter. - * @param mboxFile mail archive to parse - * @return number of parsed mails - * @throws IOException - */ - public long parseMboxLineByLine(File mboxFile) throws IOException { - long messageCount = 0; - try { - StringBuilder contents = new StringBuilder(); - // tmps used during mail message parsing - StringBuilder body = new StringBuilder(); - Matcher messageIdMatcher = MESSAGE_ID_PREFIX.matcher(""); - Matcher messageBoundaryMatcher = MESSAGE_START.matcher(""); - String[] patternResults = new String[options.getPatternsToMatch().length]; - Matcher[] matchers = new Matcher[options.getPatternsToMatch().length]; - for (int i = 0; i < matchers.length; i++) { - matchers[i] = options.getPatternsToMatch()[i].matcher(""); - } - - String messageId = null; - boolean inBody = false; - Pattern quotedTextPattern = options.getQuotedTextPattern(); - for (String nextLine : new FileLineIterable(mboxFile, options.getCharset(), false)) { - if (options.isStripQuotedText() && quotedTextPattern.matcher(nextLine).find()) { - continue; - } - for (int i = 0; i < matchers.length; i++) { - Matcher matcher = matchers[i]; - matcher.reset(nextLine); - if (matcher.matches()) { - patternResults[i] = matcher.group(1); - } - } - - // only start appending body content after we've seen a message ID - if (messageId != null) { - // first, see if we hit the end of the message - messageBoundaryMatcher.reset(nextLine); - if (messageBoundaryMatcher.matches()) { - // done parsing this message ... 
write it out - String key = generateKey(mboxFile, prefix, messageId); - //if this ordering changes, then also change FromEmailToDictionaryMapper - writeContent(options.getSeparator(), contents, body, patternResults); - writer.write(key, contents.toString()); - contents.setLength(0); // reset the buffer - body.setLength(0); - - messageId = null; - inBody = false; - } else { - if (inBody && options.isIncludeBody()) { - if (!nextLine.isEmpty()) { - body.append(nextLine).append(options.getBodySeparator()); - } - } else { - // first empty line we see after reading the message Id - // indicates that we are in the body ... - inBody = nextLine.isEmpty(); - } - } - } else { - if (nextLine.length() > 14) { - messageIdMatcher.reset(nextLine); - if (messageIdMatcher.matches()) { - messageId = messageIdMatcher.group(1); - ++messageCount; - } - } - } - } - // write the last message in the file if available - if (messageId != null) { - String key = generateKey(mboxFile, prefix, messageId); - writeContent(options.getSeparator(), contents, body, patternResults); - writer.write(key, contents.toString()); - contents.setLength(0); // reset the buffer - } - } catch (FileNotFoundException e) { - // Skip file. - log.warn("Unable to process non-existing file", e); - } - // TODO: report exceptions and continue; - return messageCount; - } - - protected static String generateKey(File mboxFile, String prefix, String messageId) { - return prefix + File.separator + mboxFile.getName() + File.separator + messageId; - } - - public String getPrefix() { - return prefix; - } - - public MailOptions getOptions() { - return options; - } - - private static void writeContent(String separator, StringBuilder contents, CharSequence body, String[] matches) { - for (String match : matches) { - if (match != null) { - contents.append(match).append(separator); - } else { - contents.append(separator); - } - } - contents.append('\n').append(body); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWrapper.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWrapper.java b/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWrapper.java deleted file mode 100644 index 473e86a..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWrapper.java +++ /dev/null @@ -1,42 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.io; - -import java.io.IOException; - -/** - * {@link ChunkedWriter} based implementation of the {@link WrappedWriter} interface. 
- */ -public class ChunkedWrapper implements WrappedWriter { - - private final ChunkedWriter writer; - - public ChunkedWrapper(ChunkedWriter writer) { - this.writer = writer; - } - - @Override - public void write(String key, String value) throws IOException { - writer.write(key, value); - } - - @Override - public void close() throws IOException { - writer.close(); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWriter.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWriter.java b/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWriter.java deleted file mode 100644 index 66cf15f..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWriter.java +++ /dev/null @@ -1,86 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.mahout.utils.io; - -import com.google.common.io.Closeables; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.Text; - -import java.io.Closeable; -import java.io.IOException; - -/** - * Writes data splitted in multiple Hadoop sequence files of approximate equal size. The data must consist - * of key-value pairs, both of them of String type. All sequence files are created in the same - * directory and named "chunk-0", "chunk-1", etc. - */ -public final class ChunkedWriter implements Closeable { - - private final int maxChunkSizeInBytes; - private final Path output; - private SequenceFile.Writer writer; - private int currentChunkID; - private int currentChunkSize; - private final FileSystem fs; - private final Configuration conf; - - /** - * @param conf needed by Hadoop to know what filesystem implementation to use. - * @param chunkSizeInMB approximate size of each file, in Megabytes. - * @param output directory where the sequence files will be created. 
- * @throws IOException - */ - public ChunkedWriter(Configuration conf, int chunkSizeInMB, Path output) throws IOException { - this.output = output; - this.conf = conf; - if (chunkSizeInMB > 1984) { - chunkSizeInMB = 1984; - } - maxChunkSizeInBytes = chunkSizeInMB * 1024 * 1024; - fs = FileSystem.get(output.toUri(), conf); - currentChunkID = 0; - writer = new SequenceFile.Writer(fs, conf, getPath(currentChunkID), Text.class, Text.class); - } - - private Path getPath(int chunkID) { - return new Path(output, "chunk-" + chunkID); - } - - /** Writes a new key-value pair, creating a new sequence file if necessary.*/ - public void write(String key, String value) throws IOException { - if (currentChunkSize > maxChunkSizeInBytes) { - Closeables.close(writer, false); - currentChunkID++; - writer = new SequenceFile.Writer(fs, conf, getPath(currentChunkID), Text.class, Text.class); - currentChunkSize = 0; - } - - Text keyT = new Text(key); - Text valueT = new Text(value); - currentChunkSize += keyT.getBytes().length + valueT.getBytes().length; // Overhead - writer.append(keyT, valueT); - } - - @Override - public void close() throws IOException { - Closeables.close(writer, false); - } -} - http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/io/IOWriterWrapper.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/io/IOWriterWrapper.java b/integration/src/main/java/org/apache/mahout/utils/io/IOWriterWrapper.java deleted file mode 100644 index b7c3d42..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/io/IOWriterWrapper.java +++ /dev/null @@ -1,45 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.io; - -import java.io.IOException; -import java.io.Writer; -/** - * Implementation of the {@link WrappedWriter} interface based on {@link java.io.Writer}. - */ -public class IOWriterWrapper implements WrappedWriter { - - private final Writer writer; - - public IOWriterWrapper(Writer writer) { - this.writer = writer; - } - - /** Writes a new key and value, separating them with one space. 
The value must end with a - * new line or some other delimiter, as it is not automatically added by this method - */ - @Override - public void write(String key, String value) throws IOException { - writer.write(key + ' ' + value); - } - - @Override - public void close() throws IOException { - writer.close(); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/io/WrappedWriter.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/io/WrappedWriter.java b/integration/src/main/java/org/apache/mahout/utils/io/WrappedWriter.java deleted file mode 100644 index b9900e9..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/io/WrappedWriter.java +++ /dev/null @@ -1,31 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.io; - -import java.io.Closeable; -import java.io.IOException; - -/** - * Convenience class for wrapping either a java.io.Writer or a SequenceFile.Writer with some basic functionality - */ -public interface WrappedWriter extends Closeable { - - /** Writes a new key-value pair.*/ - void write(String key, String value) throws IOException; - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java b/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java deleted file mode 100644 index 964c8cc..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java +++ /dev/null @@ -1,78 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.mahout.utils.nlp.collocations.llr; - -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.CharBuffer; -import java.nio.charset.CharsetEncoder; -import java.nio.charset.CodingErrorAction; - -import org.apache.commons.io.Charsets; -import org.apache.hadoop.util.bloom.Filter; -import org.apache.hadoop.util.bloom.Key; -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; - -/** - * Emits tokens based on bloom filter membership. - */ -public final class BloomTokenFilter extends TokenFilter { - - private final Filter filter; - private final CharTermAttribute termAtt; - private final CharsetEncoder encoder; - private final Key key; - private final boolean keepMembers; - - /** - * @param filter tokens will be checked for membership in this bloom filter - * @param in the tokenstream to read. - * @param keepMembers keep memoers of the bloom filter? If true works like - * a whitelist and members found in the list are kept and all others are - * dropped. If false works like a stoplist and members found in the - * filter are dropped all others are kept. - */ - public BloomTokenFilter(Filter filter, boolean keepMembers, TokenStream in) { - super(in); - this.filter = filter; - this.keepMembers = keepMembers; - this.key = new Key(); - this.termAtt = addAttribute(CharTermAttribute.class); - this.encoder = Charsets.UTF_8.newEncoder(). - onMalformedInput(CodingErrorAction.REPORT). - onUnmappableCharacter(CodingErrorAction.REPORT); - } - - @Override - public boolean incrementToken() throws IOException { - while (input.incrementToken()) { - ByteBuffer bytes = encoder.encode(CharBuffer.wrap(termAtt.buffer(), 0, termAtt.length())); - key.set(bytes.array(), 1.0f); - boolean member = filter.membershipTest(key); - if ((keepMembers && member) || (!keepMembers && !member)) { - return true; - } - } - return false; - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java b/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java deleted file mode 100644 index 4585a0a..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.utils.regex; - -import java.io.IOException; -import java.io.StringReader; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.mahout.common.lucene.TokenStreamIterator; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class AnalyzerTransformer implements RegexTransformer { - - private Analyzer analyzer; - private String fieldName = "text"; - - private static final Logger log = LoggerFactory.getLogger(AnalyzerTransformer.class); - - public AnalyzerTransformer() { - this(new StandardAnalyzer(), "text"); - } - - public AnalyzerTransformer(Analyzer analyzer) { - this(analyzer, "text"); - } - - public AnalyzerTransformer(Analyzer analyzer, String fieldName) { - this.analyzer = analyzer; - this.fieldName = fieldName; - } - - @Override - public String transformMatch(String match) { - StringBuilder result = new StringBuilder(); - try (TokenStream ts = analyzer.tokenStream(fieldName, new StringReader(match))) { - ts.addAttribute(CharTermAttribute.class); - ts.reset(); - TokenStreamIterator iter = new TokenStreamIterator(ts); - while (iter.hasNext()) { - result.append(iter.next()).append(' '); - } - ts.end(); - } catch (IOException e) { - throw new IllegalStateException(e); - } - return result.toString(); - } - - public Analyzer getAnalyzer() { - return analyzer; - } - - public void setAnalyzer(Analyzer analyzer) { - this.analyzer = analyzer; - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/regex/ChainTransformer.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/ChainTransformer.java b/integration/src/main/java/org/apache/mahout/utils/regex/ChainTransformer.java deleted file mode 100644 index d3e8e06..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/regex/ChainTransformer.java +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.utils.regex; - -import com.google.common.collect.Lists; - -import java.util.List; - -/** - * Chain together several {@link org.apache.mahout.utils.regex.RegexTransformer} and apply them to the match - * in succession - */ -public class ChainTransformer implements RegexTransformer { - - private List<RegexTransformer> chain = Lists.newArrayList(); - - public ChainTransformer() { - } - - public ChainTransformer(List<RegexTransformer> chain) { - this.chain = chain; - } - - @Override - public String transformMatch(String match) { - String result = match; - for (RegexTransformer transformer : chain) { - result = transformer.transformMatch(result); - } - return result; - } - - public List<RegexTransformer> getChain() { - return chain; - } - - public void setChain(List<RegexTransformer> chain) { - this.chain = chain; - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/regex/FPGFormatter.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/FPGFormatter.java b/integration/src/main/java/org/apache/mahout/utils/regex/FPGFormatter.java deleted file mode 100644 index a0f296d..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/regex/FPGFormatter.java +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.regex; - -import java.util.regex.Pattern; - -/** - * Collapses/converts all whitespace to a single tab - */ -public class FPGFormatter implements RegexFormatter { - - private static final Pattern WHITESPACE = Pattern.compile("\\W+"); - - @Override - public String format(String toFormat) { - return '\t' + WHITESPACE.matcher(toFormat).replaceAll("|"); - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/regex/IdentityFormatter.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/IdentityFormatter.java b/integration/src/main/java/org/apache/mahout/utils/regex/IdentityFormatter.java deleted file mode 100644 index 5c1177c..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/regex/IdentityFormatter.java +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.regex; - -public class IdentityFormatter implements RegexFormatter { - - @Override - public String format(String toFormat) { - return toFormat; - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/regex/IdentityTransformer.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/IdentityTransformer.java b/integration/src/main/java/org/apache/mahout/utils/regex/IdentityTransformer.java deleted file mode 100644 index aea695d..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/regex/IdentityTransformer.java +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.regex; - -/** - * No-op - */ -public final class IdentityTransformer implements RegexTransformer { - - @Override - public String transformMatch(String match) { - return match; - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/regex/RegexConverterDriver.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/RegexConverterDriver.java b/integration/src/main/java/org/apache/mahout/utils/regex/RegexConverterDriver.java deleted file mode 100644 index 53be239..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/regex/RegexConverterDriver.java +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.regex; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; -import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; -import org.apache.hadoop.util.ToolRunner; -import org.apache.lucene.analysis.Analyzer; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.common.commandline.DefaultOptionCreator; - -/** - * Experimental - */ -public class RegexConverterDriver extends AbstractJob { - - @Override - public int run(String[] args) throws Exception { - addInputOption(); - addOutputOption(); - addOption(DefaultOptionCreator.overwriteOption().create()); - addOption("regex", "regex", - "The regular expression to use", true); - addOption("groupsToKeep", "g", - "The number of the capturing groups to keep", false); - addOption("transformerClass", "t", - "The optional class specifying the Regex Transformer", false); - addOption("formatterClass", "t", - "The optional class specifying the Regex Formatter", false); - addOption(DefaultOptionCreator.analyzerOption().create()); - - if (parseArguments(args) == null) { - return -1; - } - - Configuration conf = getConf(); - //TODO: How to deal with command line escaping? - conf.set(RegexMapper.REGEX, getOption("regex")); // - String gtk = getOption("groupsToKeep"); - if (gtk != null) { - conf.set(RegexMapper.GROUP_MATCHERS, gtk); - } - String trans = getOption("transformerClass"); - if (trans != null) { - if ("url".equalsIgnoreCase(trans)) { - trans = URLDecodeTransformer.class.getName(); - } - conf.set(RegexMapper.TRANSFORMER_CLASS, trans); - } - String formatter = getOption("formatterClass"); - if (formatter != null) { - if ("fpg".equalsIgnoreCase(formatter)) { - formatter = FPGFormatter.class.getName(); - } - conf.set(RegexMapper.FORMATTER_CLASS, formatter); - } - Path input = getInputPath(); - Path output = getOutputPath(); - if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { - HadoopUtil.delete(getConf(), output); - } - Class<? extends Analyzer> analyzerClass = getAnalyzerClassFromOption(); - if (analyzerClass != null) { - conf.set(RegexMapper.ANALYZER_NAME, analyzerClass.getName()); - } - Job job = prepareJob(input, output, - TextInputFormat.class, - RegexMapper.class, - LongWritable.class, - Text.class, - TextOutputFormat.class); - boolean succeeded = job.waitForCompletion(true); - return succeeded ? 0 : -1; - } - - public static void main(String[] args) throws Exception { - ToolRunner.run(new RegexConverterDriver(), args); - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/regex/RegexFormatter.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/RegexFormatter.java b/integration/src/main/java/org/apache/mahout/utils/regex/RegexFormatter.java deleted file mode 100644 index 8ef837b..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/regex/RegexFormatter.java +++ /dev/null @@ -1,24 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.regex; - -public interface RegexFormatter { - - String format(String toFormat); - -}
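For anyone pinned to a revision that still ships the removed ChunkedWriter, the class could be used on its own to spread Text/Text key-value pairs across size-capped sequence files. The following is a minimal sketch; the output directory /tmp/chunks, the 64 MB chunk size, and the sample documents are illustrative assumptions, not values taken from the code above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.utils.io.ChunkedWriter;

public class ChunkedWriterSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Illustrative output directory; files are created inside it as chunk-0, chunk-1, ...
    Path output = new Path("/tmp/chunks");
    // 64 MB per chunk is an assumed value (the constructor caps anything above 1984 MB).
    try (ChunkedWriter writer = new ChunkedWriter(conf, 64, output)) {
      // Each call appends a Text/Text pair; a new chunk file is opened once the
      // current one grows past the configured size.
      writer.write("doc-1", "first document body");
      writer.write("doc-2", "second document body");
    }
  }
}

This is the same writer that MailProcessor's main constructor wraps in a ChunkedWrapper when converting mbox archives to sequence files.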

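The removed BloomTokenFilter likewise wrapped any Lucene TokenStream. Below is a minimal whitelist-style sketch (keepMembers = true); the Bloom filter sizing, the seeded terms, and the sample sentence are illustrative assumptions.

import java.io.StringReader;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.util.bloom.BloomFilter;
import org.apache.hadoop.util.bloom.Key;
import org.apache.hadoop.util.hash.Hash;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilter;

public class BloomTokenFilterSketch {
  public static void main(String[] args) throws Exception {
    // Seed a small Bloom filter with the terms to keep (sizing is illustrative).
    BloomFilter bloom = new BloomFilter(1024, 3, Hash.MURMUR_HASH);
    bloom.add(new Key("mahout".getBytes(StandardCharsets.UTF_8)));
    bloom.add(new Key("hadoop".getBytes(StandardCharsets.UTF_8)));

    StandardAnalyzer analyzer = new StandardAnalyzer();
    try (TokenStream ts = new BloomTokenFilter(bloom, true,
        analyzer.tokenStream("text", new StringReader("Apache Mahout runs on Hadoop")))) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(termAtt.toString()); // expected to emit "mahout" and "hadoop"
      }
      ts.end();
    }
    analyzer.close();
  }
}

With keepMembers set to false the same construction behaves as a stop list instead, dropping terms found in the filter and passing everything else through.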