http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java b/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
deleted file mode 100644
index 8a226a0..0000000
--- a/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
+++ /dev/null
@@ -1,321 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering;
-
-import java.io.IOException;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.mahout.clustering.canopy.Canopy;
-import org.apache.mahout.clustering.canopy.CanopyDriver;
-import org.apache.mahout.clustering.evaluation.ClusterEvaluator;
-import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver;
-import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
-import org.apache.mahout.clustering.kmeans.KMeansDriver;
-import org.apache.mahout.clustering.kmeans.TestKmeansClustering;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.VectorWritable;
-import org.junit.Before;
-import org.junit.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-
-public final class TestClusterEvaluator extends MahoutTestCase {
-
-  private static final double[][] REFERENCE = { {1, 1}, {2, 1}, {1, 2}, {2, 2}, {3, 3}, {4, 4}, {5, 4}, {4, 5}, {5, 5}};
-
-  private List<VectorWritable> referenceData = Lists.newArrayList();
-
-  private final List<VectorWritable> sampleData = Lists.newArrayList();
-
-  private Map<Integer,List<VectorWritable>> representativePoints;
-
-  private List<Cluster> clusters;
-
-  private static final Logger log = LoggerFactory.getLogger(TestClusterEvaluator.class);
-
-  private Configuration conf;
-
-  private FileSystem fs;
-
-  private Path testdata;
-
-  private Path output;
-
-  @Override
-  @Before
-  public void setUp() throws Exception {
-    super.setUp();
-    conf = getConfiguration();
-    fs = FileSystem.get(conf);
-    testdata = getTestTempDirPath("testdata");
-    output = getTestTempDirPath("output");
-    // Create small reference data set
-    referenceData = TestKmeansClustering.getPointsWritable(REFERENCE);
-    // generate larger test data set for the clustering tests to chew on
-    generateSamples();
-  }
-
-  /**
-   * Generate random samples and add them to the sampleData
-   *
-   * @param num
-   *          int number of samples to generate
-   * @param mx
-   *          double x-value of the sample mean
-   * @param my
-   *          double y-value of the sample mean
-   * @param sd
-   *          double standard deviation of the samples
-   */
-  private void generateSamples(int num, double mx, double my, double sd) {
-    log.info("Generating {} samples m=[{}, {}] sd={}", num, mx, my, sd);
-    for (int i = 0; i < num; i++) {
-      sampleData.add(new VectorWritable(new DenseVector(new double[] {UncommonDistributions.rNorm(mx, sd),
-          UncommonDistributions.rNorm(my, sd)})));
-    }
-  }
-
-  private void generateSamples() {
-    generateSamples(500, 1, 1, 3);
-    generateSamples(300, 1, 0, 0.5);
-    generateSamples(300, 0, 2, 0.1);
-  }
-
-  private void printRepPoints(int numIterations) {
-    RepresentativePointsDriver.printRepresentativePoints(output, numIterations);
-  }
-
-  /**
-   * Initialize synthetic data using 4 clusters dC units from the origin, each having 4 representative points dP units
-   * from its center (plus the center itself)
-   *
-   * @param dC
-   *          a double cluster center offset
-   * @param dP
-   *          a double representative point offset
-   * @param measure
-   *          the DistanceMeasure
-   */
-  private void initData(double dC, double dP, DistanceMeasure measure) {
-    clusters = Lists.newArrayList();
-    clusters.add(new Canopy(new DenseVector(new double[] {-dC, -dC}), 1, measure));
-    clusters.add(new Canopy(new DenseVector(new double[] {-dC, dC}), 3, measure));
-    clusters.add(new Canopy(new DenseVector(new double[] {dC, dC}), 5, measure));
-    clusters.add(new Canopy(new DenseVector(new double[] {dC, -dC}), 7, measure));
-    representativePoints = Maps.newHashMap();
-    for (Cluster cluster : clusters) {
-      List<VectorWritable> points = Lists.newArrayList();
-      representativePoints.put(cluster.getId(), points);
-      points.add(new VectorWritable(cluster.getCenter().clone()));
-      points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {dP, dP}))));
-      points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {dP, -dP}))));
-      points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {-dP, -dP}))));
-      points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {-dP, dP}))));
-    }
-  }
-
-  @Test
-  public void testRepresentativePoints() throws Exception {
-    ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    Configuration conf = getConfiguration();
-    // run using MR reference point calculation
-    CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, true, 0.0, true);
-    int numIterations = 2;
-    Path clustersIn = new Path(output, "clusters-0-final");
-    RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure,
-        numIterations, false);
-    printRepPoints(numIterations);
-    ClusterEvaluator evaluatorMR = new ClusterEvaluator(conf, clustersIn);
-    // now run again using sequential reference point calculation
-    HadoopUtil.delete(conf, output);
-    CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, true, 0.0, true);
-    RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure,
-        numIterations, true);
-    printRepPoints(numIterations);
-    ClusterEvaluator evaluatorSeq = new ClusterEvaluator(conf, clustersIn);
-    // compare results
-    assertEquals("InterCluster Density", evaluatorMR.interClusterDensity(), evaluatorSeq.interClusterDensity(), EPSILON);
-    assertEquals("IntraCluster Density", evaluatorMR.intraClusterDensity(), evaluatorSeq.intraClusterDensity(), EPSILON);
-  }
-
-  @Test
-  public void testCluster0() throws IOException {
-    ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    initData(1, 0.25, measure);
-    ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
-    assertEquals("inter cluster density", 0.33333333333333315, evaluator.interClusterDensity(), EPSILON);
-    assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
-  }
-
-  @Test
-  public void testCluster1() throws IOException {
-    ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    initData(1, 0.5, measure);
-    ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
-    assertEquals("inter cluster density", 0.33333333333333315, evaluator.interClusterDensity(), EPSILON);
-    assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
-  }
-
-  @Test
-  public void testCluster2() throws IOException {
-    ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    initData(1, 0.75, measure);
-    ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
-    assertEquals("inter cluster density", 0.33333333333333315, evaluator.interClusterDensity(), EPSILON);
-    assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
-  }
-
-  /**
-   * adding an empty cluster should modify the inter-cluster density but not change the intra-cluster density as that
-   * cluster would have NaN as its intra-cluster density and NaN values are ignored by the evaluator
-   *
-   * @throws IOException
-   */
-  @Test
-  public void testEmptyCluster() throws IOException {
-    ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    initData(1, 0.25, measure);
-    Canopy cluster = new Canopy(new DenseVector(new double[] {10, 10}), 19, measure);
-    clusters.add(cluster);
-    List<VectorWritable> points = Lists.newArrayList();
-    representativePoints.put(cluster.getId(), points);
-    ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
-    assertEquals("inter cluster density", 0.371534146934532, evaluator.interClusterDensity(), EPSILON);
-    assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
-  }
-
-  /**
-   * adding a single-valued cluster should modify the inter-cluster density but not change the intra-cluster density as
-   * that cluster would have NaN as its intra-cluster density and NaN values are ignored by the evaluator
-   *
-   * @throws IOException
-   */
-  @Test
-  public void testSingleValueCluster() throws IOException {
-    ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    initData(1, 0.25, measure);
-    Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure);
-    clusters.add(cluster);
-    List<VectorWritable> points = Lists.newArrayList();
-    points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {1, 1}))));
-    representativePoints.put(cluster.getId(), points);
-    ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
-    assertEquals("inter cluster density", 0.3656854249492381, evaluator.interClusterDensity(), EPSILON);
-    assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
-  }
-
-  /**
-   * Representative points extraction will duplicate the cluster center if the cluster has no assigned points. These
-   * clusters are included in the inter-cluster density but their NaN intra-density values are ignored by the evaluator.
-   *
-   * @throws IOException
-   */
-  @Test
-  public void testAllSameValueCluster() throws IOException {
-    ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    initData(1, 0.25, measure);
-    Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure);
-    clusters.add(cluster);
-    List<VectorWritable> points = Lists.newArrayList();
-    points.add(new VectorWritable(cluster.getCenter()));
-    points.add(new VectorWritable(cluster.getCenter()));
-    points.add(new VectorWritable(cluster.getCenter()));
-    representativePoints.put(cluster.getId(), points);
-    ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
-    assertEquals("inter cluster density", 0.3656854249492381, evaluator.interClusterDensity(), EPSILON);
-    assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
-  }
-
-  @Test
-  public void testCanopy() throws Exception {
-    ClusteringTestUtils.writePointsToFile(sampleData, new Path(testdata, "file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    Configuration conf = getConfiguration();
-    CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, true, 0.0, true);
-    int numIterations = 10;
-    Path clustersIn = new Path(output, "clusters-0-final");
-    RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure,
-        numIterations, true);
-    //printRepPoints(numIterations);
-    ClusterEvaluator evaluator = new ClusterEvaluator(conf, clustersIn);
-    // now print out the Results
-    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
-    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
-  }
-
-  @Test
-  public void testKmeans() throws Exception {
-    ClusteringTestUtils.writePointsToFile(sampleData, new Path(testdata, "file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    // now run the Canopy job to prime kMeans canopies
-    Configuration conf = getConfiguration();
-    CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, false, 0.0, true);
-    // now run the KMeans job
-    Path kmeansOutput = new Path(output, "kmeans");
-    KMeansDriver.run(testdata, new Path(output, "clusters-0-final"), kmeansOutput, 0.001, 10, true, 0.0, true);
-    int numIterations = 10;
-    Path clustersIn = new Path(kmeansOutput, "clusters-2");
-    RepresentativePointsDriver.run(conf, clustersIn, new Path(kmeansOutput, "clusteredPoints"), kmeansOutput, measure,
-        numIterations, true);
-    RepresentativePointsDriver.printRepresentativePoints(kmeansOutput, numIterations);
-    ClusterEvaluator evaluator = new ClusterEvaluator(conf, clustersIn);
-    // now print out the Results
-    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
-    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
-  }
-
-  @Test
-  public void testFuzzyKmeans() throws Exception {
-    ClusteringTestUtils.writePointsToFile(sampleData, new Path(testdata, "file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    // now run the Canopy job to prime kMeans canopies
-    Configuration conf = getConfiguration();
-    CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, false, 0.0, true);
-    Path fuzzyKMeansOutput = new Path(output, "fuzzyk");
-    // now run the FuzzyKMeans job
-    FuzzyKMeansDriver.run(testdata, new Path(output, "clusters-0-final"), fuzzyKMeansOutput, 0.001, 10, 2,
-        true, true, 0, true);
-    int numIterations = 10;
-    Path clustersIn = new Path(fuzzyKMeansOutput, "clusters-4");
-    RepresentativePointsDriver.run(conf, clustersIn, new Path(fuzzyKMeansOutput, "clusteredPoints"), fuzzyKMeansOutput,
-        measure, numIterations, true);
-    RepresentativePointsDriver.printRepresentativePoints(fuzzyKMeansOutput, numIterations);
-    ClusterEvaluator evaluator = new ClusterEvaluator(conf, clustersIn);
-    // now print out the Results
-    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
-    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
-  }
-
-}
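For orientation, the deleted test drives ClusterEvaluator two ways: in memory from a representativePoints map plus a cluster list, and from cluster state left on disk by RepresentativePointsDriver. A minimal sketch of the in-memory path, using only the constructor and methods the test above exercises; the two clusters and offsets here are illustrative values, not fixtures from the test:

    // Sketch: evaluate densities for two hand-built Canopy clusters.
    DistanceMeasure measure = new EuclideanDistanceMeasure();
    List<Cluster> clusters = Lists.newArrayList();
    clusters.add(new Canopy(new DenseVector(new double[] {-1, -1}), 1, measure));
    clusters.add(new Canopy(new DenseVector(new double[] {1, 1}), 3, measure));
    Map<Integer,List<VectorWritable>> repPoints = Maps.newHashMap();
    for (Cluster cluster : clusters) {
      List<VectorWritable> points = Lists.newArrayList();
      points.add(new VectorWritable(cluster.getCenter().clone()));
      points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {0.25, 0.25}))));
      repPoints.put(cluster.getId(), points);
    }
    ClusterEvaluator evaluator = new ClusterEvaluator(repPoints, clusters, measure);
    double intra = evaluator.intraClusterDensity(); // clusters yielding NaN are skipped
    double inter = evaluator.interClusterDensity();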

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java b/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
deleted file mode 100644
index 597ed01..0000000
--- a/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
+++ /dev/null
@@ -1,326 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.cdbw;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.ClusteringTestUtils;
-import org.apache.mahout.clustering.TestClusterEvaluator;
-import org.apache.mahout.clustering.UncommonDistributions;
-import org.apache.mahout.clustering.canopy.Canopy;
-import org.apache.mahout.clustering.canopy.CanopyDriver;
-import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver;
-import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
-import org.apache.mahout.clustering.kmeans.KMeansDriver;
-import org.apache.mahout.clustering.kmeans.TestKmeansClustering;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-import org.junit.Before;
-import org.junit.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public final class TestCDbwEvaluator extends MahoutTestCase {
-
-  private static final double[][] REFERENCE = { {1, 1}, {2, 1}, {1, 2}, {2, 2}, {3, 3}, {4, 4}, {5, 4}, {4, 5}, {5, 5}};
-
-  private static final Logger log = LoggerFactory.getLogger(TestClusterEvaluator.class);
-
-  private Map<Integer,List<VectorWritable>> representativePoints;
-
-  private List<Cluster> clusters;
-
-  private Configuration conf;
-
-  private FileSystem fs;
-
-  private final Collection<VectorWritable> sampleData = new ArrayList<>();
-
-  private List<VectorWritable> referenceData = new ArrayList<>();
-
-  private Path testdata;
-
-  private Path output;
-
-  @Override
-  @Before
-  public void setUp() throws Exception {
-    super.setUp();
-    conf = getConfiguration();
-    fs = FileSystem.get(conf);
-    testdata = getTestTempDirPath("testdata");
-    output = getTestTempDirPath("output");
-    // Create small reference data set
-    referenceData = TestKmeansClustering.getPointsWritable(REFERENCE);
-    // generate larger test data set for the clustering tests to chew on
-    generateSamples();
-  }
-
-  /**
-   * Initialize synthetic data using 4 clusters dC units from the origin, each having 4 representative points dP units
-   * from its center (plus the center itself)
-   *
-   * @param dC
-   *          a double cluster center offset
-   * @param dP
-   *          a double representative point offset
-   * @param measure
-   *          the DistanceMeasure
-   */
-  private void initData(double dC, double dP, DistanceMeasure measure) {
-    clusters = new ArrayList<>();
-    clusters.add(new Canopy(new DenseVector(new double[] {-dC, -dC}), 1, measure));
-    clusters.add(new Canopy(new DenseVector(new double[] {-dC, dC}), 3, measure));
-    clusters.add(new Canopy(new DenseVector(new double[] {dC, dC}), 5, measure));
-    clusters.add(new Canopy(new DenseVector(new double[] {dC, -dC}), 7, measure));
-    representativePoints = new HashMap<>();
-    for (Cluster cluster : clusters) {
-      List<VectorWritable> points = new ArrayList<>();
-      representativePoints.put(cluster.getId(), points);
-      points.add(new VectorWritable(cluster.getCenter().clone()));
-      points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {dP, dP}))));
-      points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {dP, -dP}))));
-      points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {-dP, -dP}))));
-      points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {-dP, dP}))));
-    }
-  }
-
-  /**
-   * Generate random samples and add them to the sampleData
-   *
-   * @param num
-   *          int number of samples to generate
-   * @param mx
-   *          double x-value of the sample mean
-   * @param my
-   *          double y-value of the sample mean
-   * @param sd
-   *          double standard deviation of the samples
-   */
-  private void generateSamples(int num, double mx, double my, double sd) {
-    log.info("Generating {} samples m=[{}, {}] sd={}", num, mx, my, sd);
-    for (int i = 0; i < num; i++) {
-      sampleData.add(new VectorWritable(new DenseVector(new double[] {UncommonDistributions.rNorm(mx, sd),
-          UncommonDistributions.rNorm(my, sd)})));
-    }
-  }
-
-  private void generateSamples() {
-    generateSamples(500, 1, 1, 3);
-    generateSamples(300, 1, 0, 0.5);
-    generateSamples(300, 0, 2, 0.1);
-  }
-
-  @Test
-  public void testCDbw0() throws IOException {
-    ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    initData(1, 0.25, measure);
-    CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
-    System.out.println("CDbw = " + evaluator.getCDbw());
-    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
-    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
-    System.out.println("Separation = " + evaluator.separation());
-  }
-
-  @Test
-  public void testCDbw1() throws IOException {
-    ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    initData(1, 0.5, measure);
-    CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
-    System.out.println("CDbw = " + evaluator.getCDbw());
-    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
-    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
-    System.out.println("Separation = " + evaluator.separation());
-  }
-
-  @Test
-  public void testCDbw2() throws IOException {
-    ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    initData(1, 0.75, measure);
-    CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
-    System.out.println("CDbw = " + evaluator.getCDbw());
-    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
-    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
-    System.out.println("Separation = " + evaluator.separation());
-  }
-
-  @Test
-  public void testEmptyCluster() throws IOException {
-    ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    initData(1, 0.25, measure);
-    Canopy cluster = new Canopy(new DenseVector(new double[] {10, 10}), 19, measure);
-    clusters.add(cluster);
-    List<VectorWritable> points = new ArrayList<>();
-    representativePoints.put(cluster.getId(), points);
-    CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
-    System.out.println("CDbw = " + evaluator.getCDbw());
-    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
-    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
-    System.out.println("Separation = " + evaluator.separation());
-  }
-
-  @Test
-  public void testSingleValueCluster() throws IOException {
-    ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    initData(1, 0.25, measure);
-    Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure);
-    clusters.add(cluster);
-    List<VectorWritable> points = new ArrayList<>();
-    points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {1, 1}))));
-    representativePoints.put(cluster.getId(), points);
-    CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
-    System.out.println("CDbw = " + evaluator.getCDbw());
-    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
-    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
-    System.out.println("Separation = " + evaluator.separation());
-  }
-
-  /**
-   * Representative points extraction will duplicate the cluster center if the cluster has no assigned points. These
-   * clusters should be ignored like empty clusters above
-   *
-   * @throws IOException
-   */
-  @Test
-  public void testAllSameValueCluster() throws IOException {
-    ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    initData(1, 0.25, measure);
-    Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure);
-    clusters.add(cluster);
-    List<VectorWritable> points = new ArrayList<>();
-    points.add(new VectorWritable(cluster.getCenter()));
-    points.add(new VectorWritable(cluster.getCenter()));
-    points.add(new VectorWritable(cluster.getCenter()));
-    representativePoints.put(cluster.getId(), points);
-    CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
-    System.out.println("CDbw = " + evaluator.getCDbw());
-    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
-    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
-    System.out.println("Separation = " + evaluator.separation());
-  }
-
-  /**
-   * Clustering can produce very, very tight clusters that can cause the std calculation to fail. These clusters should
-   * be processed correctly.
-   *
-   * @throws IOException
-   */
-  @Test
-  public void testAlmostSameValueCluster() throws IOException {
-    ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    initData(1, 0.25, measure);
-    Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure);
-    clusters.add(cluster);
-    List<VectorWritable> points = new ArrayList<>();
-    Vector delta = new DenseVector(new double[] {0, Double.MIN_NORMAL});
-    points.add(new VectorWritable(delta.clone()));
-    points.add(new VectorWritable(delta.clone()));
-    points.add(new VectorWritable(delta.clone()));
-    points.add(new VectorWritable(delta.clone()));
-    points.add(new VectorWritable(delta.clone()));
-    representativePoints.put(cluster.getId(), points);
-    CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
-    System.out.println("CDbw = " + evaluator.getCDbw());
-    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
-    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
-    System.out.println("Separation = " + evaluator.separation());
-  }
-
-  @Test
-  public void testCanopy() throws Exception {
-    ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("testdata/file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    CanopyDriver.run(getConfiguration(), testdata, output, measure, 3.1, 2.1, true, 0.0, true);
-    int numIterations = 10;
-    Path clustersIn = new Path(output, "clusters-0-final");
-    RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure,
-        numIterations, true);
-    CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn);
-    // printRepPoints(numIterations);
-    // now print out the Results
-    System.out.println("Canopy CDbw = " + evaluator.getCDbw());
-    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
-    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
-    System.out.println("Separation = " + evaluator.separation());
-  }
-
-  @Test
-  public void testKmeans() throws Exception {
-    ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("testdata/file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    // now run the Canopy job to prime kMeans canopies
-    CanopyDriver.run(getConfiguration(), testdata, output, measure, 3.1, 2.1, false, 0.0, true);
-    // now run the KMeans job
-    Path kmeansOutput = new Path(output, "kmeans");
-    KMeansDriver.run(testdata, new Path(output, "clusters-0-final"), kmeansOutput, 0.001, 10, true, 0.0, true);
-    int numIterations = 10;
-    Path clustersIn = new Path(kmeansOutput, "clusters-10-final");
-    RepresentativePointsDriver.run(conf, clustersIn, new Path(kmeansOutput, "clusteredPoints"), kmeansOutput, measure,
-        numIterations, true);
-    CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn);
-    RepresentativePointsDriver.printRepresentativePoints(kmeansOutput, numIterations);
-    // now print out the Results
-    System.out.println("K-Means CDbw = " + evaluator.getCDbw());
-    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
-    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
-    System.out.println("Separation = " + evaluator.separation());
-  }
-
-  @Test
-  public void testFuzzyKmeans() throws Exception {
-    ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("testdata/file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    // now run the Canopy job to prime kMeans canopies
-    CanopyDriver.run(getConfiguration(), testdata, output, measure, 3.1, 2.1, false, 0.0, true);
-    Path fuzzyKMeansOutput = new Path(output, "fuzzyk");
-    // now run the FuzzyKMeans job
-    FuzzyKMeansDriver.run(testdata, new Path(output, "clusters-0-final"), fuzzyKMeansOutput, 0.001, 10, 2,
-        true, true, 0, true);
-    int numIterations = 10;
-    Path clustersIn = new Path(fuzzyKMeansOutput, "clusters-4");
-    RepresentativePointsDriver.run(conf, clustersIn, new Path(fuzzyKMeansOutput, "clusteredPoints"), fuzzyKMeansOutput,
-        measure, numIterations, true);
-    CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn);
-    RepresentativePointsDriver.printRepresentativePoints(fuzzyKMeansOutput, numIterations);
-    // now print out the Results
-    System.out.println("Fuzzy K-Means CDbw = " + evaluator.getCDbw());
-    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
-    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
-    System.out.println("Separation = " + evaluator.separation());
-  }
-
-}
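The CDbw evaluator mirrors the ClusterEvaluator harness above but reports four quantities rather than two. A hedged sketch of the in-memory path, assuming representativePoints, clusters, and measure are built exactly as in initData above:

    // Sketch: the four CDbw metrics the deleted tests print.
    CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
    double cdbw = evaluator.getCDbw();                     // combined validity index
    double intraDensity = evaluator.intraClusterDensity(); // compactness within clusters
    double interDensity = evaluator.interClusterDensity(); // density between clusters
    double separation = evaluator.separation();            // distance-based separation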

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java b/integration/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java
deleted file mode 100644
index ba73c82..0000000
--- a/integration/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java
+++ /dev/null
@@ -1,66 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.text;
-
-import java.io.Reader;
-import java.io.StringReader;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Test;
-
-/**
- * Unit tests for the MailArchivesClusteringAnalyzer text analyzer.
- */
-public class MailArchivesClusteringAnalyzerTest extends MahoutTestCase {
-
-  @Test
-  public void testAnalysis() throws Exception {
-    Analyzer analyzer = new MailArchivesClusteringAnalyzer();
-
-    String text = "A test message\n"
-        + "atokenthatistoolongtobeusefulforclustertextanalysis\n"
-        + "Mahout is a scalable, machine-learning LIBRARY\n"
-        + "we've added some additional stopwords such as html, mailto, regards\t"
-        + "apache_hadoop provides the foundation for scalability\n"
-        + "www.nabble.com [email protected]\n"
-        + "public void int protected package";
-    Reader reader = new StringReader(text);
-
-    // if you change the text above, then you may need to change this as well
-    // order matters too
-    String[] expectedTokens = {
-        "test", "mahout", "scalabl", "machin", "learn", "librari", "weve", "ad",
-        "stopword", "apache_hadoop", "provid", "foundat", "scalabl"
-    };
-
-    TokenStream tokenStream = analyzer.tokenStream("test", reader);
-    assertNotNull(tokenStream);
-    tokenStream.reset();
-    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
-    int e = 0;
-    while (tokenStream.incrementToken() && e < expectedTokens.length) {
-      assertEquals(expectedTokens[e++], termAtt.toString());
-    }
-    assertEquals(e, expectedTokens.length);
-    tokenStream.end();
-    tokenStream.close();
-  }
-}
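The assertion loop above follows the standard Lucene TokenStream protocol (reset, incrementToken, end, close). A small helper in the same style that collects the analyzed terms into a list; this is a sketch that assumes only the Lucene calls used in the deleted test, plus java.util and java.io imports:

    // Sketch: collect all terms an Analyzer produces for a piece of text.
    static List<String> tokenize(Analyzer analyzer, String text) throws IOException {
      List<String> tokens = new ArrayList<>();
      TokenStream stream = analyzer.tokenStream("test", new StringReader(text));
      CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        tokens.add(termAtt.toString()); // one analyzed term per iteration
      }
      stream.end();
      stream.close();
      return tokens;
    }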

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java b/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java
deleted file mode 100644
index ef2b8a6..0000000
--- a/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java
+++ /dev/null
@@ -1,240 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.text;
-
-import java.io.File;
-import java.io.FileOutputStream;
-import java.util.zip.GZIPOutputStream;
-
-import org.apache.commons.lang3.SystemUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.Text;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator;
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
-
-/**
- * Test case for the SequenceFilesFromMailArchives command-line application.
- */
-public final class SequenceFilesFromMailArchivesTest extends MahoutTestCase {
-
-  private File inputDir;
-
-  /**
-   * Create the input and output directories needed for testing
-   * the SequenceFilesFromMailArchives application.
-   */
-  @Override
-  @Before
-  public void setUp() throws Exception {
-    super.setUp();
-    inputDir = getTestTempDir("mail-archives-in");
-
-    // write test mail messages to a gzipped file in a nested directory
-    File subDir = new File(inputDir, "subdir");
-    subDir.mkdir();
-    File gzFile = new File(subDir, "mail-messages.gz");
-    try (GZIPOutputStream gzOut = new GZIPOutputStream(new FileOutputStream(gzFile))) {
-      gzOut.write(testMailMessages.getBytes("UTF-8"));
-      gzOut.finish();
-    }
-
-    File subDir2 = new File(subDir, "subsubdir");
-    subDir2.mkdir();
-    File gzFile2 = new File(subDir2, "mail-messages-2.gz");
-    try (GZIPOutputStream gzOut = new GZIPOutputStream(new FileOutputStream(gzFile2))) {
-      gzOut.write(testMailMessages.getBytes("UTF-8"));
-      gzOut.finish();
-    }
-  }
-
-  @Test
-  public void testSequential() throws Exception {
-
-    File outputDir = this.getTestTempDir("mail-archives-out");
-
-    String[] args = {
-        "--input", inputDir.getAbsolutePath(),
-        "--output", outputDir.getAbsolutePath(),
-        "--charset", "UTF-8",
-        "--keyPrefix", "TEST",
-        "--method", "sequential",
-        "--body", "--subject", "--separator", ""
-    };
-
-    // run the application's main method
-    SequenceFilesFromMailArchives.main(args);
-
-    // app should create a single SequenceFile named "chunk-0" in the output dir
-    File expectedChunkFile = new File(outputDir, "chunk-0");
-    String expectedChunkPath = expectedChunkFile.getAbsolutePath();
-    Assert.assertTrue("Expected chunk file " + expectedChunkPath + " not found!", expectedChunkFile.isFile());
-
-    Configuration conf = getConfiguration();
-    SequenceFileIterator<Text, Text> iterator = new SequenceFileIterator<>(new Path(expectedChunkPath), true, conf);
-    Assert.assertTrue("First key/value pair not found!", iterator.hasNext());
-    Pair<Text, Text> record = iterator.next();
-
-    File parentFile = new File(new File(new File("TEST"), "subdir"), "mail-messages.gz");
-    Assert.assertEquals(new File(parentFile, testVars[0][0]).toString(), record.getFirst().toString());
-    Assert.assertEquals(testVars[0][1] + testVars[0][2], record.getSecond().toString());
-
-    Assert.assertTrue("Second key/value pair not found!", iterator.hasNext());
-
-    record = iterator.next();
-    Assert.assertEquals(new File(parentFile, testVars[1][0]).toString(), record.getFirst().toString());
-    Assert.assertEquals(testVars[1][1] + testVars[1][2], record.getSecond().toString());
-
-    record = iterator.next();
-    File parentFileSubSubDir = new File(new File(new File(new File("TEST"), "subdir"), "subsubdir"), "mail-messages-2.gz");
-    Assert.assertEquals(new File(parentFileSubSubDir, testVars[0][0]).toString(), record.getFirst().toString());
-    Assert.assertEquals(testVars[0][1] + testVars[0][2], record.getSecond().toString());
-
-    Assert.assertTrue("Fourth key/value pair not found!", iterator.hasNext());
-    record = iterator.next();
-    Assert.assertEquals(new File(parentFileSubSubDir, testVars[1][0]).toString(), record.getFirst().toString());
-    Assert.assertEquals(testVars[1][1] + testVars[1][2], record.getSecond().toString());
-
-    Assert.assertFalse("Only four key/value pairs expected!", iterator.hasNext());
-  }
-
-  @Test
-  public void testMapReduce() throws Exception {
-
-    Path tmpDir = getTestTempDirPath();
-    Path mrOutputDir = new Path(tmpDir, "mail-archives-out-mr");
-    Configuration configuration = getConfiguration();
-    FileSystem fs = FileSystem.get(configuration);
-
-    File expectedInputFile = new File(inputDir.toString());
-
-    String[] args = {
-        "-Dhadoop.tmp.dir=" + configuration.get("hadoop.tmp.dir"),
-        "--input", expectedInputFile.getAbsolutePath(),
-        "--output", mrOutputDir.toString(),
-        "--charset", "UTF-8",
-        "--keyPrefix", "TEST",
-        "--method", "mapreduce",
-        "--body", "--subject", "--separator", ""
-    };
-
-    // run the application's main method
-    SequenceFilesFromMailArchives.main(args);
-
-    // app should create a single SequenceFile named "part-m-00000" in the output dir
-    FileStatus[] fileStatuses = fs.listStatus(mrOutputDir.suffix("/part-m-00000"));
-    assertEquals(1, fileStatuses.length); // only one
-    assertEquals("part-m-00000", fileStatuses[0].getPath().getName());
-    SequenceFileIterator<Text, Text> iterator =
-        new SequenceFileIterator<>(mrOutputDir.suffix("/part-m-00000"), true, configuration);
-
-    Assert.assertTrue("First key/value pair not found!", iterator.hasNext());
-    Pair<Text, Text> record = iterator.next();
-
-    File parentFileSubSubDir = new File(new File(new File(new File("TEST"), "subdir"), "subsubdir"), "mail-messages-2.gz");
-
-    String expected = record.getFirst().toString();
-    if (SystemUtils.IS_OS_WINDOWS) {
-      expected = expected.replace("/", "\\");
-    }
-    Assert.assertEquals(new File(parentFileSubSubDir, testVars[0][0]).toString(), expected);
-    Assert.assertEquals(testVars[0][1] + testVars[0][2], record.getSecond().toString());
-    Assert.assertTrue("Second key/value pair not found!", iterator.hasNext());
-
-    record = iterator.next();
-    expected = record.getFirst().toString();
-    if (SystemUtils.IS_OS_WINDOWS) {
-      expected = expected.replace("/", "\\");
-    }
-    Assert.assertEquals(new File(parentFileSubSubDir, testVars[1][0]).toString(), expected);
-    Assert.assertEquals(testVars[1][1] + testVars[1][2], record.getSecond().toString());
-
-    // test other file
-    File parentFile = new File(new File(new File("TEST"), "subdir"), "mail-messages.gz");
-    record = iterator.next();
-    expected = record.getFirst().toString();
-    if (SystemUtils.IS_OS_WINDOWS) {
-      expected = expected.replace("/", "\\");
-    }
-    Assert.assertEquals(new File(parentFile, testVars[0][0]).toString(), expected);
-    Assert.assertEquals(testVars[0][1] + testVars[0][2], record.getSecond().toString());
-    Assert.assertTrue("Fourth key/value pair not found!", iterator.hasNext());
-
-    record = iterator.next();
-    expected = record.getFirst().toString();
-    if (SystemUtils.IS_OS_WINDOWS) {
-      expected = expected.replace("/", "\\");
-    }
-    Assert.assertEquals(new File(parentFile, testVars[1][0]).toString(), expected);
-    Assert.assertEquals(testVars[1][1] + testVars[1][2], record.getSecond().toString());
-    Assert.assertFalse("Only four key/value pairs expected!", iterator.hasNext());
-  }
-
-  // Messages extracted and made anonymous from the ASF mail archives
-  private static final String[][] testVars = {
-      new String[] {
-          "[email protected]",
-          "Ant task for JDK1.1 collections build option",
-          "\nThis is just a test message\n--\nTesty McTester\n"
-      },
-      new String[] {
-          "[email protected]",
-          "Problem with build files in several directories",
-          "\nHi all,\nThis is another test message.\nRegards,\nAnother Test\n"
-      }
-  };
-
-  private static final String testMailMessages =
-      "From [email protected] Mon Jul 24 19:13:53 2000\n"
-      + "Return-Path: <[email protected]>\n"
-      + "Mailing-List: contact [email protected]; run by ezmlm\n"
-      + "Delivered-To: mailing list [email protected]\n"
-      + "Received: (qmail 49267 invoked from network); 24 Jul 2000 19:13:53 -0000\n"
-      + "Message-ID: <" + testVars[0][0] + ">\n"
-      + "From: \"Testy McTester\" <[email protected]>\n"
-      + "To: <[email protected]>\n"
-      + "Subject: " + testVars[0][1] + '\n'
-      + "Date: Mon, 24 Jul 2000 12:24:56 -0700\n"
-      + "MIME-Version: 1.0\n"
-      + "Content-Type: text/plain;\n"
-      + " charset=\"Windows-1252\"\n"
-      + "Content-Transfer-Encoding: 7bit\n"
-      + "X-Spam-Rating: locus.apache.org 1.6.2 0/1000/N\n"
-      + testVars[0][2] + '\n'
-      + "From [email protected] Wed Jul 26 11:32:16 2000\n"
-      + "Return-Path: <[email protected]>\n"
-      + "Mailing-List: contact [email protected]; run by ezmlm\n"
-      + "Delivered-To: mailing list [email protected]\n"
-      + "Received: (qmail 73966 invoked from network); 26 Jul 2000 11:32:16 -0000\n"
-      + "User-Agent: Microsoft-Outlook-Express-Macintosh-Edition/5.02.2022\n"
-      + "Date: Wed, 26 Jul 2000 13:32:08 +0200\n"
-      + "Subject: " + testVars[1][1] + '\n'
-      + "From: Another Test <[email protected]>\n"
-      + "To: <[email protected]>\n"
-      + "Message-Id: <" + testVars[1][0] + ">\n"
-      + "Mime-Version: 1.0\n"
-      + "Content-Type: text/plain; charset=\"US-ASCII\"\n"
-      + "Content-Transfer-Encoding: 7bit\n"
-      + "X-Spam-Rating: locus.apache.org 1.6.2 0/1000/N\n"
-      + testVars[1][2];
-}
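Both deleted tests read their output back with Mahout's SequenceFileIterator, which the test code uses in try-with-resources elsewhere in this commit. A minimal sketch of that read loop on its own; the chunk path here is an illustrative placeholder, not one of the test's temp paths:

    // Sketch: iterate the key/value pairs of one SequenceFile chunk.
    Configuration conf = new Configuration();
    Path chunk = new Path("mail-archives-out/chunk-0"); // placeholder path
    try (SequenceFileIterator<Text, Text> it = new SequenceFileIterator<>(chunk, true, conf)) {
      while (it.hasNext()) {
        Pair<Text, Text> record = it.next();
        System.out.println(record.getFirst() + " -> "
            + record.getSecond().toString().length() + " chars");
      }
    }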

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/text/TestPathFilter.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/TestPathFilter.java b/integration/src/test/java/org/apache/mahout/text/TestPathFilter.java
deleted file mode 100644
index 227521a..0000000
--- a/integration/src/test/java/org/apache/mahout/text/TestPathFilter.java
+++ /dev/null
@@ -1,32 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.text;
-
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.PathFilter;
-
-/**
- * Dummy Path Filter for testing the MapReduce version of
- * SequenceFilesFromDirectory
- */
-public class TestPathFilter implements PathFilter {
-
-  @Override
-  public boolean accept(Path path) {
-    return path.getName().startsWith("t") || path.getName().startsWith("r") || path.getName().startsWith("f");
-  }
-}
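This filter is handed to the converter job by class name (see the --fileFilterClass argument in the test that follows). The same filter can also be applied directly when listing a directory, since Hadoop's FileSystem.listStatus accepts a PathFilter; a sketch with an illustrative path:

    // Sketch: only paths whose names start with t, r, or f survive the filter.
    FileSystem fs = FileSystem.get(new Configuration());
    FileStatus[] matches = fs.listStatus(new Path("testdata"), new TestPathFilter());
    for (FileStatus status : matches) {
      System.out.println(status.getPath().getName());
    }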
- */ - -package org.apache.mahout.text; - -import java.io.File; -import java.io.IOException; -import java.io.OutputStreamWriter; -import java.util.HashMap; -import java.util.Map; - -import org.apache.commons.io.Charsets; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.Text; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.common.MahoutTestCase; -import org.apache.mahout.common.Pair; -import org.apache.mahout.common.iterator.sequencefile.PathFilters; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public final class TestSequenceFilesFromDirectory extends MahoutTestCase { - - private static final Logger logger = LoggerFactory.getLogger(TestSequenceFilesFromDirectory.class); - - private static final String[][] DATA1 = { - {"test1", "This is the first text."}, - {"test2", "This is the second text."}, - {"test3", "This is the third text."} - }; - - private static final String[][] DATA2 = { - {"recursive_test1", "This is the first text."}, - {"recursive_test2", "This is the second text."}, - {"recursive_test3", "This is the third text."} - }; - - @Test - public void testSequenceFileFromDirectoryBasic() throws Exception { - // parameters - Configuration configuration = getConfiguration(); - - FileSystem fs = FileSystem.get(configuration); - - // create - Path tmpDir = this.getTestTempDirPath(); - Path inputDir = new Path(tmpDir, "inputDir"); - fs.mkdirs(inputDir); - - Path outputDir = new Path(tmpDir, "outputDir"); - Path outputDirRecursive = new Path(tmpDir, "outputDirRecursive"); - - Path inputDirRecursive = new Path(tmpDir, "inputDirRecur"); - fs.mkdirs(inputDirRecursive); - - // prepare input files - createFilesFromArrays(configuration, inputDir, DATA1); - - SequenceFilesFromDirectory.main(new String[]{ - "--input", inputDir.toString(), - "--output", outputDir.toString(), - "--chunkSize", "64", - "--charset", Charsets.UTF_8.name(), - "--keyPrefix", "UID", - "--method", "sequential"}); - - // check output chunk files - checkChunkFiles(configuration, outputDir, DATA1, "UID"); - - createRecursiveDirFilesFromArrays(configuration, inputDirRecursive, DATA2); - - FileStatus fstInputPath = fs.getFileStatus(inputDirRecursive); - String dirs = HadoopUtil.buildDirList(fs, fstInputPath); - - System.out.println("\n\n ----- recursive dirs: " + dirs); - SequenceFilesFromDirectory.main(new String[]{ - "--input", inputDirRecursive.toString(), - "--output", outputDirRecursive.toString(), - "--chunkSize", "64", - "--charset", Charsets.UTF_8.name(), - "--keyPrefix", "UID", - "--method", "sequential"}); - - checkRecursiveChunkFiles(configuration, outputDirRecursive, DATA2, "UID"); - } - - @Test - public void testSequenceFileFromDirectoryMapReduce() throws Exception { - - Configuration conf = getConfiguration(); - - FileSystem fs = FileSystem.get(conf); - - // create - Path tmpDir = this.getTestTempDirPath(); - Path inputDir = new Path(tmpDir, "inputDir"); - fs.mkdirs(inputDir); - - Path inputDirRecur = new Path(tmpDir, "inputDirRecur"); - fs.mkdirs(inputDirRecur); - - Path mrOutputDir = new Path(tmpDir, "mrOutputDir"); - Path mrOutputDirRecur = new Path(tmpDir, "mrOutputDirRecur"); - - createFilesFromArrays(conf, inputDir, DATA1); - - SequenceFilesFromDirectory.main(new String[]{ - "-Dhadoop.tmp.dir=" + conf.get("hadoop.tmp.dir"), - 
"--input", inputDir.toString(), - "--output", mrOutputDir.toString(), - "--chunkSize", "64", - "--charset", Charsets.UTF_8.name(), - "--method", "mapreduce", - "--keyPrefix", "UID", - "--fileFilterClass", "org.apache.mahout.text.TestPathFilter" - }); - - checkMRResultFiles(conf, mrOutputDir, DATA1, "UID"); - - createRecursiveDirFilesFromArrays(conf, inputDirRecur, DATA2); - - FileStatus fst_input_path = fs.getFileStatus(inputDirRecur); - String dirs = HadoopUtil.buildDirList(fs, fst_input_path); - - logger.info("\n\n ---- recursive dirs: {}", dirs); - - SequenceFilesFromDirectory.main(new String[]{ - "-Dhadoop.tmp.dir=" + conf.get("hadoop.tmp.dir"), - "--input", inputDirRecur.toString(), - "--output", mrOutputDirRecur.toString(), - "--chunkSize", "64", - "--charset", Charsets.UTF_8.name(), - "--method", "mapreduce", - "--keyPrefix", "UID", - "--fileFilterClass", "org.apache.mahout.text.TestPathFilter" - }); - - checkMRResultFilesRecursive(conf, mrOutputDirRecur, DATA2, "UID"); - } - - - private static void createFilesFromArrays(Configuration conf, Path inputDir, String[][] data) throws IOException { - FileSystem fs = FileSystem.get(conf); - for (String[] aData : data) { - try (OutputStreamWriter writer = - new OutputStreamWriter(fs.create(new Path(inputDir, aData[0])), Charsets.UTF_8)){ - writer.write(aData[1]); - } - } - } - - private static void createRecursiveDirFilesFromArrays(Configuration configuration, Path inputDir, - String[][] data) throws IOException { - FileSystem fs = FileSystem.get(configuration); - - logger.info("creativeRecursiveDirFilesFromArrays > based on: {}", inputDir.toString()); - Path curPath; - String currentRecursiveDir = inputDir.toString(); - - for (String[] aData : data) { - currentRecursiveDir += "/" + aData[0]; - File subDir = new File(currentRecursiveDir); - subDir.mkdir(); - - curPath = new Path(subDir.toString(), "file.txt"); - logger.info("Created file: {}", curPath.toString()); - - try (OutputStreamWriter writer = new OutputStreamWriter(fs.create(curPath), Charsets.UTF_8)){ - writer.write(aData[1]); - } - } - } - - private static void checkChunkFiles(Configuration configuration, - Path outputDir, - String[][] data, - String prefix) throws IOException { - FileSystem fs = FileSystem.get(configuration); - - // output exists? - FileStatus[] fileStatuses = fs.listStatus(outputDir, PathFilters.logsCRCFilter()); - assertEquals(1, fileStatuses.length); // only one - assertEquals("chunk-0", fileStatuses[0].getPath().getName()); - - Map<String, String> fileToData = new HashMap<>(); - for (String[] aData : data) { - fileToData.put(prefix + Path.SEPARATOR + aData[0], aData[1]); - } - - // read a chunk to check content - try (SequenceFileIterator<Text, Text> iterator = - new SequenceFileIterator<>(fileStatuses[0].getPath(), true, configuration)){ - while (iterator.hasNext()) { - Pair<Text, Text> record = iterator.next(); - String retrievedData = fileToData.get(record.getFirst().toString().trim()); - assertNotNull(retrievedData); - assertEquals(retrievedData, record.getSecond().toString().trim()); - } - } - } - - private static void checkRecursiveChunkFiles(Configuration configuration, - Path outputDir, - String[][] data, - String prefix) throws IOException { - FileSystem fs = FileSystem.get(configuration); - - System.out.println(" ----------- check_Recursive_ChunkFiles ------------"); - - // output exists? 
- FileStatus[] fileStatuses = fs.listStatus(outputDir, PathFilters.logsCRCFilter()); - assertEquals(1, fileStatuses.length); // only one - assertEquals("chunk-0", fileStatuses[0].getPath().getName()); - - - Map<String, String> fileToData = new HashMap<>(); - String currentPath = prefix; - for (String[] aData : data) { - currentPath += Path.SEPARATOR + aData[0]; - fileToData.put(currentPath + Path.SEPARATOR + "file.txt", aData[1]); - } - - // read a chunk to check content - try (SequenceFileIterator<Text, Text> iterator = - new SequenceFileIterator<>(fileStatuses[0].getPath(), true, configuration)) { - while (iterator.hasNext()) { - Pair<Text, Text> record = iterator.next(); - String retrievedData = fileToData.get(record.getFirst().toString().trim()); - System.out.printf("%s >> %s\n", record.getFirst().toString().trim(), record.getSecond().toString().trim()); - - assertNotNull(retrievedData); - assertEquals(retrievedData, record.getSecond().toString().trim()); - System.out.printf(">>> k: %s, v: %s\n", record.getFirst().toString(), record.getSecond().toString()); - } - } - } - - private static void checkMRResultFiles(Configuration conf, Path outputDir, - String[][] data, String prefix) throws IOException { - FileSystem fs = FileSystem.get(conf); - - // output exists? - FileStatus[] fileStatuses = fs.listStatus(outputDir.suffix("/part-m-00000"), PathFilters.logsCRCFilter()); - assertEquals(1, fileStatuses.length); // only one - assertEquals("part-m-00000", fileStatuses[0].getPath().getName()); - Map<String, String> fileToData = new HashMap<>(); - for (String[] aData : data) { - System.out.printf("map.put: %s %s\n", prefix + Path.SEPARATOR + aData[0], aData[1]); - fileToData.put(prefix + Path.SEPARATOR + aData[0], aData[1]); - } - - // read a chunk to check content - try (SequenceFileIterator<Text, Text> iterator = new SequenceFileIterator<>( - fileStatuses[0].getPath(), true, conf)) { - while (iterator.hasNext()) { - Pair<Text, Text> record = iterator.next(); - String retrievedData = fileToData.get(record.getFirst().toString().trim()); - - System.out.printf("MR> %s >> %s\n", record.getFirst().toString().trim(), record.getSecond().toString().trim()); - assertNotNull(retrievedData); - assertEquals(retrievedData, record.getSecond().toString().trim()); - } - } - } - - private static void checkMRResultFilesRecursive(Configuration configuration, Path outputDir, - String[][] data, String prefix) throws IOException { - FileSystem fs = FileSystem.get(configuration); - - // output exists? 
-    FileStatus[] fileStatuses = fs.listStatus(outputDir.suffix("/part-m-00000"), PathFilters.logsCRCFilter());
-    assertEquals(1, fileStatuses.length); // only one
-    assertEquals("part-m-00000", fileStatuses[0].getPath().getName());
-    Map<String, String> fileToData = new HashMap<>();
-    String currentPath = prefix;
-
-    for (String[] aData : data) {
-      currentPath += Path.SEPARATOR + aData[0];
-      fileToData.put(currentPath + Path.SEPARATOR + "file.txt", aData[1]);
-    }
-
-    // read a chunk to check content
-    try (SequenceFileIterator<Text, Text> iterator = new SequenceFileIterator<>(
-        fileStatuses[0].getPath(), true, configuration)) {
-      while (iterator.hasNext()) {
-        Pair<Text, Text> record = iterator.next();
-        System.out.printf("MR-Recur > Trying to check: %s\n", record.getFirst().toString().trim());
-        String retrievedData = fileToData.get(record.getFirst().toString().trim());
-        assertNotNull(retrievedData);
-        assertEquals(retrievedData, record.getSecond().toString().trim());
-      }
-    }
-  }
-}
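The deleted test above drives SequenceFilesFromDirectory end to end through its main() entry point, then asserts that every (key, value) pair read back from the resulting part file matches the original file contents. For reference, a minimal sketch of the same invocation outside the test harness; the paths are placeholders, the flags are the ones the test itself exercises, and with a default local Configuration the "mapreduce" method runs in Hadoop's local mode:

// Hedged sketch: a standalone driver mirroring the deleted test's invocation.
// Input/output paths are hypothetical; flags are those used by the test above.
import org.apache.mahout.text.SequenceFilesFromDirectory;

public class SeqDirSketch {
  public static void main(String[] args) throws Exception {
    SequenceFilesFromDirectory.main(new String[] {
        "--input", "/tmp/docs",        // directory of plain-text files
        "--output", "/tmp/docs-seq",   // destination for the SequenceFile output
        "--chunkSize", "64",
        "--charset", "UTF-8",
        "--method", "mapreduce",
        "--keyPrefix", "UID"
    });
  }
}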
http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/text/doc/MultipleFieldsDocument.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/doc/MultipleFieldsDocument.java b/integration/src/test/java/org/apache/mahout/text/doc/MultipleFieldsDocument.java
deleted file mode 100644
index 7483b2d..0000000
--- a/integration/src/test/java/org/apache/mahout/text/doc/MultipleFieldsDocument.java
+++ /dev/null
@@ -1,58 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.text.doc;
-
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.TextField;
-
-/**
- * Used for testing lucene2seq
- */
-@Deprecated
-public class MultipleFieldsDocument extends SingleFieldDocument {
-
-  public static final String FIELD1 = "field1";
-  public static final String FIELD2 = "field2";
-
-  private String field1;
-  private String field2;
-
-  public MultipleFieldsDocument(String id, String field, String field1, String field2) {
-    super(id, field);
-    this.field1 = field1;
-    this.field2 = field2;
-  }
-
-  public String getField1() {
-    return field1;
-  }
-
-  public String getField2() {
-    return field2;
-  }
-
-  @Override
-  public Document asLuceneDocument() {
-    Document document = super.asLuceneDocument();
-
-    document.add(new TextField(FIELD1, this.field1, Field.Store.YES));
-    document.add(new TextField(FIELD2, this.field2, Field.Store.YES));
-
-    return document;
-  }
-}
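Since MultipleFieldsDocument stores all three text fields (Field.Store.YES throughout), the round trip can be checked without building an index at all; the stored values come straight back from the Lucene Document. A small sketch using only the class as defined above, with illustrative values:

// Hedged sketch: exercising the deleted fixture directly.
import org.apache.lucene.document.Document;

public class MultipleFieldsSketch {
  public static void main(String[] args) {
    MultipleFieldsDocument testDoc =
        new MultipleFieldsDocument("42", "body text", "first extra", "second extra");
    Document luceneDoc = testDoc.asLuceneDocument();
    // Stored fields are readable without indexing via Document.get(name).
    System.out.println(luceneDoc.get(MultipleFieldsDocument.FIELD1));  // first extra
    System.out.println(luceneDoc.get(MultipleFieldsDocument.FIELD2));  // second extra
  }
}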
http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/text/doc/NumericFieldDocument.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/doc/NumericFieldDocument.java b/integration/src/test/java/org/apache/mahout/text/doc/NumericFieldDocument.java
deleted file mode 100644
index e06e8d6..0000000
--- a/integration/src/test/java/org/apache/mahout/text/doc/NumericFieldDocument.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.text.doc;
-
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.IntField;
-import org.apache.lucene.document.StringField;
-import org.apache.lucene.document.TextField;
-
-/**
- * Document with numeric field.
- */
-@Deprecated
-public class NumericFieldDocument extends SingleFieldDocument {
-
-  public static final String NUMERIC_FIELD = "numeric";
-
-  private int numericField;
-
-  public NumericFieldDocument(String id, String field, int numericField) {
-    super(id, field);
-    this.numericField = numericField;
-  }
-
-  @Override
-  public Document asLuceneDocument() {
-    Document document = new Document();
-
-    document.add(new StringField(ID_FIELD, getId(), Field.Store.YES));
-    document.add(new TextField(FIELD, getField(), Field.Store.YES));
-    document.add(new IntField(NUMERIC_FIELD, numericField, Field.Store.YES));
-
-    return document;
-  }
-
-  public int getNumericField() {
-    return numericField;
-  }
-}
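Unlike its siblings, NumericFieldDocument rebuilds the Document from scratch rather than delegating to super.asLuceneDocument(), so the id and text fields keep their usual names while the numeric value rides along in a stored IntField. Reading it back looks roughly like this under the Lucene 4.x API that the IntField import implies (a sketch, not part of the deleted code):

// Hedged sketch: a stored numeric field exposes its value via numericValue().
import org.apache.lucene.document.Document;

public class NumericFieldSketch {
  public static void main(String[] args) {
    Document doc = new NumericFieldDocument("7", "some text", 123).asLuceneDocument();
    int n = doc.getField(NumericFieldDocument.NUMERIC_FIELD).numericValue().intValue();
    System.out.println("numeric field = " + n);  // 123
  }
}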
http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/text/doc/SingleFieldDocument.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/doc/SingleFieldDocument.java b/integration/src/test/java/org/apache/mahout/text/doc/SingleFieldDocument.java
deleted file mode 100644
index 4636a51..0000000
--- a/integration/src/test/java/org/apache/mahout/text/doc/SingleFieldDocument.java
+++ /dev/null
@@ -1,63 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.text.doc;
-
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.StringField;
-import org.apache.lucene.document.TextField;
-
-/**
- * Used for testing lucene2seq
- */
-@Deprecated
-public class SingleFieldDocument implements TestDocument {
-
-  public static final String ID_FIELD = "idField";
-  public static final String FIELD = "field";
-
-  private String id;
-  private String field;
-
-  public SingleFieldDocument(String id, String field) {
-    this.id = id;
-    this.field = field;
-  }
-
-  @Override
-  public String getId() {
-    return id;
-  }
-
-  @Override
-  public String getField() {
-    return field;
-  }
-
-  @Override
-  public Document asLuceneDocument() {
-    Document document = new Document();
-
-    Field idField = new StringField(ID_FIELD, getId(), Field.Store.YES);
-    Field field = new TextField(FIELD, getField(), Field.Store.YES);
-
-    document.add(idField);
-    document.add(field);
-
-    return document;
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/text/doc/TestDocument.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/doc/TestDocument.java b/integration/src/test/java/org/apache/mahout/text/doc/TestDocument.java
deleted file mode 100644
index 7243c71..0000000
--- a/integration/src/test/java/org/apache/mahout/text/doc/TestDocument.java
+++ /dev/null
@@ -1,29 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.text.doc;
-
-import org.apache.lucene.document.Document;
-@Deprecated
-public interface TestDocument {
-
-  String getId();
-
-  String getField();
-
-  Document asLuceneDocument();
-
-}
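TestDocument is the small contract these lucene2seq fixtures share: an id, a text field, and a conversion to a Lucene Document. SingleFieldDocument sets the pattern the other fixtures extend, keying the id with a non-tokenized StringField (exact match) and the body with an analyzed TextField. A hypothetical extra fixture would follow the same shape:

// Hedged sketch: a hypothetical fixture, not part of the deleted code.
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;

public class TitledDocument implements TestDocument {

  private final String id;
  private final String field;
  private final String title;

  public TitledDocument(String id, String field, String title) {
    this.id = id;
    this.field = field;
    this.title = title;
  }

  @Override
  public String getId() {
    return id;
  }

  @Override
  public String getField() {
    return field;
  }

  @Override
  public Document asLuceneDocument() {
    Document document = new Document();
    // Same conventions as SingleFieldDocument: exact-match id, analyzed body.
    document.add(new StringField(SingleFieldDocument.ID_FIELD, id, Field.Store.YES));
    document.add(new TextField(SingleFieldDocument.FIELD, field, Field.Store.YES));
    document.add(new TextField("title", title, Field.Store.YES));
    return document;
  }
}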
http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/text/doc/UnstoredFieldsDocument.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/doc/UnstoredFieldsDocument.java b/integration/src/test/java/org/apache/mahout/text/doc/UnstoredFieldsDocument.java
deleted file mode 100644
index 6eb43f6..0000000
--- a/integration/src/test/java/org/apache/mahout/text/doc/UnstoredFieldsDocument.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.text.doc;
-
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.StringField;
-
-/**
- * Used for testing lucene2seq
- */
-@Deprecated
-public class UnstoredFieldsDocument extends SingleFieldDocument {

-  public static final String UNSTORED_FIELD = "unstored";
-
-  public UnstoredFieldsDocument(String id, String field) {
-    super(id, field);
-  }
-
-  @Override
-  public Document asLuceneDocument() {
-    Document document = super.asLuceneDocument();
-
-    document.add(new StringField(UNSTORED_FIELD, "", Field.Store.NO));
-
-    return document;
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/utils/Bump125Test.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/Bump125Test.java b/integration/src/test/java/org/apache/mahout/utils/Bump125Test.java
deleted file mode 100644
index 65b308f..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/Bump125Test.java
+++ /dev/null
@@ -1,42 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils;
-
-import com.google.common.collect.Lists;
-
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Test;
-
-import java.util.Iterator;
-
-public class Bump125Test extends MahoutTestCase {
-  @Test
-  public void testIncrement() throws Exception {
-    Iterator<Integer> ref = Lists.newArrayList(1, 2, 3, 4, 5, 6, 7,
-        8, 9, 10, 12, 14, 16, 18, 20, 25, 30, 35, 40, 50, 60,
-        70, 80, 100, 120, 140, 160, 180, 200, 250, 300, 350,
-        400, 500, 600, 700, 800, 1000, 1200, 1400, 1600, 1800,
-        2000, 2500, 3000, 3500, 4000, 5000, 6000, 7000)
-        .iterator();
-    Bump125 b = new Bump125();
-    for (int i = 0; i < 50; i++) {
-      long x = b.increment();
-      assertEquals(ref.next().longValue(), x);
-    }
-  }
-}
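The reference list in the deleted test spells out the progression Bump125 produces: a roughly logarithmic 1-2-5 style series (1, 2, ..., 10, 12, 14, ..., 20, 25, 30, ...) that grows with scale. That shape suggests its use for thinning out progress reporting, though this is an inference from the test rather than documented behavior. A usage sketch, with increment() being the only method the test exercises and the loop bound purely illustrative:

// Hedged sketch: reporting progress only at Bump125's thresholds.
import org.apache.mahout.utils.Bump125;

public class BumpSketch {
  public static void main(String[] args) {
    Bump125 bump = new Bump125();
    long next = bump.increment();  // first threshold is 1, per the test above
    for (long item = 1; item <= 10000; item++) {
      if (item == next) {
        System.out.printf("processed %d items%n", item);
        next = bump.increment();
      }
    }
  }
}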
