http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/IOUtils.java ---------------------------------------------------------------------- diff --git a/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/IOUtils.java b/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/IOUtils.java deleted file mode 100644 index bd1149b..0000000 --- a/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/IOUtils.java +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.streaming.tools; - -import com.google.common.base.Function; -import com.google.common.base.Preconditions; -import com.google.common.collect.Iterables; -import org.apache.mahout.clustering.iterator.ClusterWritable; -import org.apache.mahout.clustering.streaming.mapreduce.CentroidWritable; -import org.apache.mahout.math.Centroid; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; - -public class IOUtils { - - private IOUtils() {} - - /** - * Converts CentroidWritable values in a sequence file into Centroids lazily. - * @param dirIterable the source iterable (comes from a SequenceFileDirIterable). - * @return an Iterable<Centroid> with the converted vectors. - */ - public static Iterable<Centroid> getCentroidsFromCentroidWritableIterable( - Iterable<CentroidWritable> dirIterable) { - return Iterables.transform(dirIterable, new Function<CentroidWritable, Centroid>() { - @Override - public Centroid apply(CentroidWritable input) { - Preconditions.checkNotNull(input); - return input.getCentroid().clone(); - } - }); - } - - /** - * Converts CentroidWritable values in a sequence file into Centroids lazily. - * @param dirIterable the source iterable (comes from a SequenceFileDirIterable). - * @return an Iterable<Centroid> with the converted vectors. - */ - public static Iterable<Centroid> getCentroidsFromClusterWritableIterable(Iterable<ClusterWritable> dirIterable) { - return Iterables.transform(dirIterable, new Function<ClusterWritable, Centroid>() { - int numClusters = 0; - @Override - public Centroid apply(ClusterWritable input) { - Preconditions.checkNotNull(input); - return new Centroid(numClusters++, input.getValue().getCenter().clone(), - input.getValue().getTotalObservations()); - } - }); - } - - /** - * Converts VectorWritable values in a sequence file into Vectors lazily. - * @param dirIterable the source iterable (comes from a SequenceFileDirIterable). - * @return an Iterable<Vector> with the converted vectors. - */ - public static Iterable<Vector> getVectorsFromVectorWritableIterable(Iterable<VectorWritable> dirIterable) { - return Iterables.transform(dirIterable, new Function<VectorWritable, Vector>() { - @Override - public Vector apply(VectorWritable input) { - Preconditions.checkNotNull(input); - return input.get().clone(); - } - }); - } -}
http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java ---------------------------------------------------------------------- diff --git a/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java b/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java deleted file mode 100644 index 083cd8c..0000000 --- a/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java +++ /dev/null @@ -1,125 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.syntheticcontrol.canopy; - -import java.util.List; -import java.util.Map; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.util.ToolRunner; -import org.apache.mahout.clustering.canopy.CanopyDriver; -import org.apache.mahout.clustering.conversion.InputDriver; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.common.ClassUtils; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.common.commandline.DefaultOptionCreator; -import org.apache.mahout.common.distance.DistanceMeasure; -import org.apache.mahout.common.distance.EuclideanDistanceMeasure; -import org.apache.mahout.utils.clustering.ClusterDumper; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -@Deprecated -public final class Job extends AbstractJob { - - private static final String DIRECTORY_CONTAINING_CONVERTED_INPUT = "data"; - - private Job() { - } - - private static final Logger log = LoggerFactory.getLogger(Job.class); - - public static void main(String[] args) throws Exception { - if (args.length > 0) { - log.info("Running with only user-supplied arguments"); - ToolRunner.run(new Configuration(), new Job(), args); - } else { - log.info("Running with default arguments"); - Path output = new Path("output"); - HadoopUtil.delete(new Configuration(), output); - run(new Path("testdata"), output, new EuclideanDistanceMeasure(), 80, 55); - } - } - - /** - * Run the canopy clustering job on an input dataset using the given distance - * measure, t1 and t2 parameters. All output data will be written to the - * output directory, which will be initially deleted if it exists. The - * clustered points will reside in the path <output>/clustered-points. By - * default, the job expects the a file containing synthetic_control.data as - * obtained from - * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series - * resides in a directory named "testdata", and writes output to a directory - * named "output". - * - * @param input - * the String denoting the input directory path - * @param output - * the String denoting the output directory path - * @param measure - * the DistanceMeasure to use - * @param t1 - * the canopy T1 threshold - * @param t2 - * the canopy T2 threshold - */ - private static void run(Path input, Path output, DistanceMeasure measure, - double t1, double t2) throws Exception { - Path directoryContainingConvertedInput = new Path(output, - DIRECTORY_CONTAINING_CONVERTED_INPUT); - InputDriver.runJob(input, directoryContainingConvertedInput, - "org.apache.mahout.math.RandomAccessSparseVector"); - CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, - output, measure, t1, t2, true, 0.0, false); - // run ClusterDumper - ClusterDumper clusterDumper = new ClusterDumper(new Path(output, - "clusters-0-final"), new Path(output, "clusteredPoints")); - clusterDumper.printClusters(null); - } - - @Override - public int run(String[] args) throws Exception { - - addInputOption(); - addOutputOption(); - addOption(DefaultOptionCreator.distanceMeasureOption().create()); - addOption(DefaultOptionCreator.t1Option().create()); - addOption(DefaultOptionCreator.t2Option().create()); - addOption(DefaultOptionCreator.overwriteOption().create()); - - Map<String, List<String>> argMap = parseArguments(args); - if (argMap == null) { - return -1; - } - - Path input = getInputPath(); - Path output = getOutputPath(); - if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { - HadoopUtil.delete(new Configuration(), output); - } - String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION); - double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION)); - double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION)); - DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class); - - run(input, output, measure, t1, t2); - return 0; - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java ---------------------------------------------------------------------- diff --git a/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java b/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java deleted file mode 100644 index 43beb78..0000000 --- a/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java +++ /dev/null @@ -1,144 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.syntheticcontrol.fuzzykmeans; - -import java.util.List; -import java.util.Map; - -import org.apache.commons.cli2.builder.ArgumentBuilder; -import org.apache.commons.cli2.builder.DefaultOptionBuilder; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.util.ToolRunner; -import org.apache.mahout.clustering.canopy.CanopyDriver; -import org.apache.mahout.clustering.conversion.InputDriver; -import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.common.ClassUtils; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.common.commandline.DefaultOptionCreator; -import org.apache.mahout.common.distance.DistanceMeasure; -import org.apache.mahout.common.distance.EuclideanDistanceMeasure; -import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure; -import org.apache.mahout.utils.clustering.ClusterDumper; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public final class Job extends AbstractJob { - - private static final Logger log = LoggerFactory.getLogger(Job.class); - - private static final String DIRECTORY_CONTAINING_CONVERTED_INPUT = "data"; - - private static final String M_OPTION = FuzzyKMeansDriver.M_OPTION; - - private Job() { - } - - public static void main(String[] args) throws Exception { - if (args.length > 0) { - log.info("Running with only user-supplied arguments"); - ToolRunner.run(new Configuration(), new Job(), args); - } else { - log.info("Running with default arguments"); - Path output = new Path("output"); - Configuration conf = new Configuration(); - HadoopUtil.delete(conf, output); - run(conf, new Path("testdata"), output, new EuclideanDistanceMeasure(), 80, 55, 10, 2.0f, 0.5); - } - } - - @Override - public int run(String[] args) throws Exception { - addInputOption(); - addOutputOption(); - addOption(DefaultOptionCreator.distanceMeasureOption().create()); - addOption(DefaultOptionCreator.convergenceOption().create()); - addOption(DefaultOptionCreator.maxIterationsOption().create()); - addOption(DefaultOptionCreator.overwriteOption().create()); - addOption(DefaultOptionCreator.t1Option().create()); - addOption(DefaultOptionCreator.t2Option().create()); - addOption(M_OPTION, M_OPTION, "coefficient normalization factor, must be greater than 1", true); - - Map<String,List<String>> argMap = parseArguments(args); - if (argMap == null) { - return -1; - } - - Path input = getInputPath(); - Path output = getOutputPath(); - String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION); - if (measureClass == null) { - measureClass = SquaredEuclideanDistanceMeasure.class.getName(); - } - double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION)); - int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION)); - float fuzziness = Float.parseFloat(getOption(M_OPTION)); - - addOption(new DefaultOptionBuilder().withLongName(M_OPTION).withRequired(true) - .withArgument(new ArgumentBuilder().withName(M_OPTION).withMinimum(1).withMaximum(1).create()) - .withDescription("coefficient normalization factor, must be greater than 1").withShortName(M_OPTION).create()); - if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { - HadoopUtil.delete(getConf(), output); - } - DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class); - double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION)); - double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION)); - run(getConf(), input, output, measure, t1, t2, maxIterations, fuzziness, convergenceDelta); - return 0; - } - - /** - * Run the kmeans clustering job on an input dataset using the given distance measure, t1, t2 and iteration - * parameters. All output data will be written to the output directory, which will be initially deleted if it exists. - * The clustered points will reside in the path <output>/clustered-points. By default, the job expects the a file - * containing synthetic_control.data as obtained from - * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series resides in a directory named "testdata", - * and writes output to a directory named "output". - * - * @param input - * the String denoting the input directory path - * @param output - * the String denoting the output directory path - * @param t1 - * the canopy T1 threshold - * @param t2 - * the canopy T2 threshold - * @param maxIterations - * the int maximum number of iterations - * @param fuzziness - * the float "m" fuzziness coefficient - * @param convergenceDelta - * the double convergence criteria for iterations - */ - public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1, double t2, - int maxIterations, float fuzziness, double convergenceDelta) throws Exception { - Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT); - log.info("Preparing Input"); - InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector"); - log.info("Running Canopy to get initial clusters"); - Path canopyOutput = new Path(output, "canopies"); - CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, canopyOutput, measure, t1, t2, false, 0.0, false); - log.info("Running FuzzyKMeans"); - FuzzyKMeansDriver.run(directoryContainingConvertedInput, new Path(canopyOutput, "clusters-0-final"), output, - convergenceDelta, maxIterations, fuzziness, true, true, 0.0, false); - // run ClusterDumper - ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-*-final"), new Path(output, "clusteredPoints")); - clusterDumper.printClusters(null); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java ---------------------------------------------------------------------- diff --git a/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java b/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java deleted file mode 100644 index 70c41fe..0000000 --- a/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java +++ /dev/null @@ -1,187 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering.syntheticcontrol.kmeans; - -import java.util.List; -import java.util.Map; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.util.ToolRunner; -import org.apache.mahout.clustering.Cluster; -import org.apache.mahout.clustering.canopy.CanopyDriver; -import org.apache.mahout.clustering.conversion.InputDriver; -import org.apache.mahout.clustering.kmeans.KMeansDriver; -import org.apache.mahout.clustering.kmeans.RandomSeedGenerator; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.common.ClassUtils; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.common.commandline.DefaultOptionCreator; -import org.apache.mahout.common.distance.DistanceMeasure; -import org.apache.mahout.common.distance.EuclideanDistanceMeasure; -import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure; -import org.apache.mahout.utils.clustering.ClusterDumper; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public final class Job extends AbstractJob { - - private static final Logger log = LoggerFactory.getLogger(Job.class); - - private static final String DIRECTORY_CONTAINING_CONVERTED_INPUT = "data"; - - private Job() { - } - - public static void main(String[] args) throws Exception { - if (args.length > 0) { - log.info("Running with only user-supplied arguments"); - ToolRunner.run(new Configuration(), new Job(), args); - } else { - log.info("Running with default arguments"); - Path output = new Path("output"); - Configuration conf = new Configuration(); - HadoopUtil.delete(conf, output); - run(conf, new Path("testdata"), output, new EuclideanDistanceMeasure(), 6, 0.5, 10); - } - } - - @Override - public int run(String[] args) throws Exception { - addInputOption(); - addOutputOption(); - addOption(DefaultOptionCreator.distanceMeasureOption().create()); - addOption(DefaultOptionCreator.numClustersOption().create()); - addOption(DefaultOptionCreator.t1Option().create()); - addOption(DefaultOptionCreator.t2Option().create()); - addOption(DefaultOptionCreator.convergenceOption().create()); - addOption(DefaultOptionCreator.maxIterationsOption().create()); - addOption(DefaultOptionCreator.overwriteOption().create()); - - Map<String,List<String>> argMap = parseArguments(args); - if (argMap == null) { - return -1; - } - - Path input = getInputPath(); - Path output = getOutputPath(); - String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION); - if (measureClass == null) { - measureClass = SquaredEuclideanDistanceMeasure.class.getName(); - } - double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION)); - int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION)); - if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { - HadoopUtil.delete(getConf(), output); - } - DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class); - if (hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) { - int k = Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)); - run(getConf(), input, output, measure, k, convergenceDelta, maxIterations); - } else { - double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION)); - double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION)); - run(getConf(), input, output, measure, t1, t2, convergenceDelta, maxIterations); - } - return 0; - } - - /** - * Run the kmeans clustering job on an input dataset using the given the number of clusters k and iteration - * parameters. All output data will be written to the output directory, which will be initially deleted if it exists. - * The clustered points will reside in the path <output>/clustered-points. By default, the job expects a file - * containing equal length space delimited data that resides in a directory named "testdata", and writes output to a - * directory named "output". - * - * @param conf - * the Configuration to use - * @param input - * the String denoting the input directory path - * @param output - * the String denoting the output directory path - * @param measure - * the DistanceMeasure to use - * @param k - * the number of clusters in Kmeans - * @param convergenceDelta - * the double convergence criteria for iterations - * @param maxIterations - * the int maximum number of iterations - */ - public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, int k, - double convergenceDelta, int maxIterations) throws Exception { - Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT); - log.info("Preparing Input"); - InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector"); - log.info("Running random seed to get initial clusters"); - Path clusters = new Path(output, "random-seeds"); - clusters = RandomSeedGenerator.buildRandom(conf, directoryContainingConvertedInput, clusters, k, measure); - log.info("Running KMeans with k = {}", k); - KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output, convergenceDelta, - maxIterations, true, 0.0, false); - // run ClusterDumper - Path outGlob = new Path(output, "clusters-*-final"); - Path clusteredPoints = new Path(output,"clusteredPoints"); - log.info("Dumping out clusters from clusters: {} and clusteredPoints: {}", outGlob, clusteredPoints); - ClusterDumper clusterDumper = new ClusterDumper(outGlob, clusteredPoints); - clusterDumper.printClusters(null); - } - - /** - * Run the kmeans clustering job on an input dataset using the given distance measure, t1, t2 and iteration - * parameters. All output data will be written to the output directory, which will be initially deleted if it exists. - * The clustered points will reside in the path <output>/clustered-points. By default, the job expects the a file - * containing synthetic_control.data as obtained from - * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series resides in a directory named "testdata", - * and writes output to a directory named "output". - * - * @param conf - * the Configuration to use - * @param input - * the String denoting the input directory path - * @param output - * the String denoting the output directory path - * @param measure - * the DistanceMeasure to use - * @param t1 - * the canopy T1 threshold - * @param t2 - * the canopy T2 threshold - * @param convergenceDelta - * the double convergence criteria for iterations - * @param maxIterations - * the int maximum number of iterations - */ - public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1, double t2, - double convergenceDelta, int maxIterations) throws Exception { - Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT); - log.info("Preparing Input"); - InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector"); - log.info("Running Canopy to get initial clusters"); - Path canopyOutput = new Path(output, "canopies"); - CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, canopyOutput, measure, t1, t2, false, 0.0, - false); - log.info("Running KMeans"); - KMeansDriver.run(conf, directoryContainingConvertedInput, new Path(canopyOutput, Cluster.INITIAL_CLUSTERS_DIR - + "-final"), output, convergenceDelta, maxIterations, true, 0.0, false); - // run ClusterDumper - ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-*-final"), new Path(output, - "clusteredPoints")); - clusterDumper.printClusters(null); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/DeliciousTagsExample.java ---------------------------------------------------------------------- diff --git a/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/DeliciousTagsExample.java b/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/DeliciousTagsExample.java deleted file mode 100644 index 92363e5..0000000 --- a/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/DeliciousTagsExample.java +++ /dev/null @@ -1,94 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.fpm.pfpgrowth; - -import java.io.IOException; - -import org.apache.commons.cli2.CommandLine; -import org.apache.commons.cli2.Group; -import org.apache.commons.cli2.Option; -import org.apache.commons.cli2.OptionException; -import org.apache.commons.cli2.builder.ArgumentBuilder; -import org.apache.commons.cli2.builder.DefaultOptionBuilder; -import org.apache.commons.cli2.builder.GroupBuilder; -import org.apache.commons.cli2.commandline.Parser; -import org.apache.mahout.common.CommandLineUtil; -import org.apache.mahout.common.Parameters; -import org.apache.mahout.common.commandline.DefaultOptionCreator; -import org.apache.mahout.fpm.pfpgrowth.dataset.KeyBasedStringTupleGrouper; - -public final class DeliciousTagsExample { - private DeliciousTagsExample() { } - - public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { - DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); - ArgumentBuilder abuilder = new ArgumentBuilder(); - GroupBuilder gbuilder = new GroupBuilder(); - Option inputDirOpt = DefaultOptionCreator.inputOption().create(); - - Option outputOpt = DefaultOptionCreator.outputOption().create(); - - Option helpOpt = DefaultOptionCreator.helpOption(); - Option recordSplitterOpt = obuilder.withLongName("splitterPattern").withArgument( - abuilder.withName("splitterPattern").withMinimum(1).withMaximum(1).create()).withDescription( - "Regular Expression pattern used to split given line into fields." - + " Default value splits comma or tab separated fields." - + " Default Value: \"[ ,\\t]*\\t[ ,\\t]*\" ").withShortName("regex").create(); - Option encodingOpt = obuilder.withLongName("encoding").withArgument( - abuilder.withName("encoding").withMinimum(1).withMaximum(1).create()).withDescription( - "(Optional) The file encoding. Default value: UTF-8").withShortName("e").create(); - Group group = gbuilder.withName("Options").withOption(inputDirOpt).withOption(outputOpt).withOption( - helpOpt).withOption(recordSplitterOpt).withOption(encodingOpt).create(); - - try { - Parser parser = new Parser(); - parser.setGroup(group); - CommandLine cmdLine = parser.parse(args); - - if (cmdLine.hasOption(helpOpt)) { - CommandLineUtil.printHelp(group); - return; - } - Parameters params = new Parameters(); - if (cmdLine.hasOption(recordSplitterOpt)) { - params.set("splitPattern", (String) cmdLine.getValue(recordSplitterOpt)); - } - - String encoding = "UTF-8"; - if (cmdLine.hasOption(encodingOpt)) { - encoding = (String) cmdLine.getValue(encodingOpt); - } - params.set("encoding", encoding); - String inputDir = (String) cmdLine.getValue(inputDirOpt); - String outputDir = (String) cmdLine.getValue(outputOpt); - params.set("input", inputDir); - params.set("output", outputDir); - params.set("groupingFieldCount", "2"); - params.set("gfield0", "1"); - params.set("gfield1", "2"); - params.set("selectedFieldCount", "1"); - params.set("field0", "3"); - params.set("maxTransactionLength", "100"); - KeyBasedStringTupleGrouper.startJob(params); - - } catch (OptionException ex) { - CommandLineUtil.printHelp(group); - } - - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleCombiner.java ---------------------------------------------------------------------- diff --git a/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleCombiner.java b/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleCombiner.java deleted file mode 100644 index 4c80a31..0000000 --- a/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleCombiner.java +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.fpm.pfpgrowth.dataset; - -import java.io.IOException; -import java.util.HashSet; -import java.util.Set; - -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.mahout.common.StringTuple; - -public class KeyBasedStringTupleCombiner extends Reducer<Text,StringTuple,Text,StringTuple> { - - @Override - protected void reduce(Text key, - Iterable<StringTuple> values, - Context context) throws IOException, InterruptedException { - Set<String> outputValues = new HashSet<>(); - for (StringTuple value : values) { - outputValues.addAll(value.getEntries()); - } - context.write(key, new StringTuple(outputValues)); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleGrouper.java ---------------------------------------------------------------------- diff --git a/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleGrouper.java b/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleGrouper.java deleted file mode 100644 index cd17770..0000000 --- a/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleGrouper.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.fpm.pfpgrowth.dataset; - -import java.io.IOException; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.common.Parameters; -import org.apache.mahout.common.StringTuple; - -public final class KeyBasedStringTupleGrouper { - - private KeyBasedStringTupleGrouper() { } - - public static void startJob(Parameters params) throws IOException, - InterruptedException, - ClassNotFoundException { - Configuration conf = new Configuration(); - - conf.set("job.parameters", params.toString()); - conf.set("mapred.compress.map.output", "true"); - conf.set("mapred.output.compression.type", "BLOCK"); - conf.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec"); - conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," - + "org.apache.hadoop.io.serializer.WritableSerialization"); - - String input = params.get("input"); - Job job = new Job(conf, "Generating dataset based from input" + input); - job.setJarByClass(KeyBasedStringTupleGrouper.class); - - job.setMapOutputKeyClass(Text.class); - job.setMapOutputValueClass(StringTuple.class); - - job.setOutputKeyClass(Text.class); - job.setOutputValueClass(Text.class); - - FileInputFormat.addInputPath(job, new Path(input)); - Path outPath = new Path(params.get("output")); - FileOutputFormat.setOutputPath(job, outPath); - - HadoopUtil.delete(conf, outPath); - - job.setInputFormatClass(TextInputFormat.class); - job.setMapperClass(KeyBasedStringTupleMapper.class); - job.setCombinerClass(KeyBasedStringTupleCombiner.class); - job.setReducerClass(KeyBasedStringTupleReducer.class); - job.setOutputFormatClass(TextOutputFormat.class); - - boolean succeeded = job.waitForCompletion(true); - if (!succeeded) { - throw new IllegalStateException("Job failed!"); - } - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleMapper.java ---------------------------------------------------------------------- diff --git a/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleMapper.java b/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleMapper.java deleted file mode 100644 index 362d1ce..0000000 --- a/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleMapper.java +++ /dev/null @@ -1,90 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.fpm.pfpgrowth.dataset; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; -import java.util.regex.Pattern; - -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.mahout.common.Parameters; -import org.apache.mahout.common.StringTuple; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Splits the line using a {@link Pattern} and outputs key as given by the groupingFields - * - */ -public class KeyBasedStringTupleMapper extends Mapper<LongWritable,Text,Text,StringTuple> { - - private static final Logger log = LoggerFactory.getLogger(KeyBasedStringTupleMapper.class); - - private Pattern splitter; - - private int[] selectedFields; - - private int[] groupingFields; - - @Override - protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { - String[] fields = splitter.split(value.toString()); - if (fields.length != 4) { - log.info("{} {}", fields.length, value.toString()); - context.getCounter("Map", "ERROR").increment(1); - return; - } - Collection<String> oKey = new ArrayList<>(); - for (int groupingField : groupingFields) { - oKey.add(fields[groupingField]); - context.setStatus(fields[groupingField]); - } - - List<String> oValue = new ArrayList<>(); - for (int selectedField : selectedFields) { - oValue.add(fields[selectedField]); - } - - context.write(new Text(oKey.toString()), new StringTuple(oValue)); - - } - - @Override - protected void setup(Context context) throws IOException, InterruptedException { - super.setup(context); - Parameters params = new Parameters(context.getConfiguration().get("job.parameters", "")); - splitter = Pattern.compile(params.get("splitPattern", "[ \t]*\t[ \t]*")); - - int selectedFieldCount = Integer.valueOf(params.get("selectedFieldCount", "0")); - selectedFields = new int[selectedFieldCount]; - for (int i = 0; i < selectedFieldCount; i++) { - selectedFields[i] = Integer.valueOf(params.get("field" + i, "0")); - } - - int groupingFieldCount = Integer.valueOf(params.get("groupingFieldCount", "0")); - groupingFields = new int[groupingFieldCount]; - for (int i = 0; i < groupingFieldCount; i++) { - groupingFields[i] = Integer.valueOf(params.get("gfield" + i, "0")); - } - - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleReducer.java ---------------------------------------------------------------------- diff --git a/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleReducer.java b/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleReducer.java deleted file mode 100644 index a7ef762..0000000 --- a/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleReducer.java +++ /dev/null @@ -1,74 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.fpm.pfpgrowth.dataset; - -import java.io.IOException; -import java.util.Collection; -import java.util.HashSet; - -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.mahout.common.Parameters; -import org.apache.mahout.common.StringTuple; - -public class KeyBasedStringTupleReducer extends Reducer<Text,StringTuple,Text,Text> { - - private int maxTransactionLength = 100; - - @Override - protected void reduce(Text key, Iterable<StringTuple> values, Context context) - throws IOException, InterruptedException { - Collection<String> items = new HashSet<>(); - - for (StringTuple value : values) { - for (String field : value.getEntries()) { - items.add(field); - } - } - if (items.size() > 1) { - int i = 0; - StringBuilder sb = new StringBuilder(); - String sep = ""; - for (String field : items) { - if (i % maxTransactionLength == 0) { - if (i != 0) { - context.write(null, new Text(sb.toString())); - } - sb.replace(0, sb.length(), ""); - sep = ""; - } - - sb.append(sep).append(field); - sep = "\t"; - - i++; - - } - if (sb.length() > 0) { - context.write(null, new Text(sb.toString())); - } - } - } - - @Override - protected void setup(Context context) throws IOException, InterruptedException { - super.setup(context); - Parameters params = new Parameters(context.getConfiguration().get("job.parameters", "")); - maxTransactionLength = Integer.valueOf(params.get("maxTransactionLength", "100")); - } -}
