http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java deleted file mode 100644 index 6a8c659..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java +++ /dev/null @@ -1,99 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.vectors.lucene; - -import java.io.IOException; -import java.util.Set; -import java.util.TreeSet; - -import com.google.common.base.Preconditions; -import org.apache.lucene.index.IndexReader; -import org.apache.mahout.utils.vectors.TermInfo; -import org.apache.mahout.vectorizer.Weight; - -/** - * An {@link java.util.Iterator} over {@link org.apache.mahout.math.Vector}s that uses a Lucene index as the source - * for creating the {@link org.apache.mahout.math.Vector}s. The field used to create the vectors currently must have - * term vectors stored for it. - */ -public class LuceneIterator extends AbstractLuceneIterator { - - protected final Set<String> idFieldSelector; - protected final String idField; - - /** - * Produce a LuceneIterable that can create the Vector plus normalize it. - * - * @param indexReader {@link IndexReader} to read the documents from. - * @param idField field containing the id. May be null. - * @param field field to use for the Vector - * @param termInfo termInfo - * @param weight weight - * @param normPower the normalization value. Must be non-negative, or {@link LuceneIterable#NO_NORMALIZING} - */ - public LuceneIterator(IndexReader indexReader, String idField, String field, TermInfo termInfo, Weight weight, - double normPower) { - this(indexReader, idField, field, termInfo, weight, normPower, 0.0); - } - - /** - * @param indexReader {@link IndexReader} to read the documents from. - * @param idField field containing the id. May be null. - * @param field field to use for the Vector - * @param termInfo termInfo - * @param weight weight - * @param normPower the normalization value. Must be non-negative, or {@link LuceneIterable#NO_NORMALIZING} - * @param maxPercentErrorDocs most documents that will be tolerated without a term freq vector. In [0,1]. 
- * @see #LuceneIterator(org.apache.lucene.index.IndexReader, String, String, org.apache.mahout.utils.vectors.TermInfo, - * org.apache.mahout.vectorizer.Weight, double) - */ - public LuceneIterator(IndexReader indexReader, - String idField, - String field, - TermInfo termInfo, - Weight weight, - double normPower, - double maxPercentErrorDocs) { - super(termInfo, normPower, indexReader, weight, maxPercentErrorDocs, field); - // term docs(null) is a better way of iterating all the docs in Lucene - Preconditions.checkArgument(normPower == LuceneIterable.NO_NORMALIZING || normPower >= 0, - "normPower must be non-negative or -1, but normPower = " + normPower); - Preconditions.checkArgument(maxPercentErrorDocs >= 0.0 && maxPercentErrorDocs <= 1.0, - "Must be: 0.0 <= maxPercentErrorDocs <= 1.0"); - this.idField = idField; - if (idField != null) { - idFieldSelector = new TreeSet<>(); - idFieldSelector.add(idField); - } else { - /*The field in the index containing the index. If null, then the Lucene internal doc id is used - which is prone to error if the underlying index changes*/ - idFieldSelector = null; - } - } - - @Override - protected String getVectorName(int documentIndex) throws IOException { - String name; - if (idField != null) { - name = indexReader.document(documentIndex, idFieldSelector).get(idField); - } else { - name = String.valueOf(documentIndex); - } - return name; - } -}
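For context on what is being removed here: a minimal sketch of how the deleted LuceneIterator was typically driven through LuceneIterable, mirroring the TestClusterDumper usage preserved further down in this patch. The in-memory index, the "id"/"content" field names, and the document-frequency bounds (1, 100) are illustrative assumptions, not part of the removed API contract.

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.FieldType;
    import org.apache.lucene.document.StringField;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexOptions;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.store.RAMDirectory;
    import org.apache.mahout.math.Vector;
    import org.apache.mahout.utils.vectors.TermInfo;
    import org.apache.mahout.utils.vectors.lucene.CachedTermInfo;
    import org.apache.mahout.utils.vectors.lucene.LuceneIterable;
    import org.apache.mahout.vectorizer.TFIDF;
    import org.apache.mahout.vectorizer.Weight;

    public class LuceneVectorizationSketch {
      public static void main(String[] args) throws Exception {
        RAMDirectory directory = new RAMDirectory();
        // The vectorized field must have term vectors stored, as the class Javadoc above requires.
        FieldType contentType = new FieldType();
        contentType.setStored(false);
        contentType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        contentType.setTokenized(true);
        contentType.setStoreTermVectors(true);
        contentType.freeze();
        try (IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new StandardAnalyzer()))) {
          Document doc = new Document();
          doc.add(new StringField("id", "doc_0", Field.Store.YES));          // external id used for vector names
          doc.add(new Field("content", "the quick brown fox", contentType));
          writer.addDocument(doc);
        }
        try (IndexReader reader = DirectoryReader.open(directory)) {
          Weight weight = new TFIDF();                                        // TF-IDF term weighting
          TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);  // assumed min/max document frequency
          Iterable<Vector> vectors = new LuceneIterable(reader, "id", "content", termInfo, weight);
          for (Vector vector : vectors) {
            System.out.println(vector.getNumNondefaultElements() + " weighted terms"); // one sparse vector per document
          }
        }
      }
    }

Because "id" is passed as the id field, each vector is named from that stored field; passing null instead would fall back to the Lucene internal document id, which the removed code warns is fragile if the underlying index changes.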
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java deleted file mode 100644 index 5830ccc..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java +++ /dev/null @@ -1,64 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.vectors.lucene; - -import org.apache.lucene.util.BytesRef; -import org.apache.mahout.math.RandomAccessSparseVector; -import org.apache.mahout.math.Vector; -import org.apache.mahout.utils.vectors.TermEntry; -import org.apache.mahout.utils.vectors.TermInfo; -import org.apache.mahout.vectorizer.Weight; - - -/** - * Not thread-safe - */ -public class TFDFMapper { - - private Vector vector; - - private final Weight weight; - private long numTerms; - private final TermInfo termInfo; - private String field; - private final int numDocs; - - public TFDFMapper(int numDocs, Weight weight, TermInfo termInfo) { - this.weight = weight; - this.termInfo = termInfo; - this.numDocs = numDocs; - } - - public void setExpectations(String field, long numTerms) { - this.field = field; - vector = new RandomAccessSparseVector(termInfo.totalTerms(field)); - this.numTerms = numTerms; - } - - public void map(BytesRef term, int frequency) { - TermEntry entry = termInfo.getTermEntry(field, term.utf8ToString()); - if (entry != null) { - vector.setQuick(entry.getTermIdx(), weight.calculate(frequency, entry.getDocFreq(), (int)numTerms, numDocs)); - } - } - - public Vector getVector() { - return this.vector; - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TermInfoClusterInOut.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TermInfoClusterInOut.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TermInfoClusterInOut.java deleted file mode 100644 index b0311c7..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TermInfoClusterInOut.java +++ /dev/null @@ -1,81 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.vectors.lucene; - -import org.apache.mahout.common.RandomUtils; - -class TermInfoClusterInOut implements Comparable<TermInfoClusterInOut> { - - private final String term; - private final int inClusterDF; - private final int outClusterDF; - private final double logLikelihoodRatio; - - TermInfoClusterInOut(String term, int inClusterDF, int outClusterDF, double logLikelihoodRatio) { - this.term = term; - this.inClusterDF = inClusterDF; - this.outClusterDF = outClusterDF; - this.logLikelihoodRatio = logLikelihoodRatio; - } - - @Override - public int hashCode() { - return term.hashCode() ^ inClusterDF ^ outClusterDF ^ RandomUtils.hashDouble(logLikelihoodRatio); - } - - @Override - public boolean equals(Object o) { - if (!(o instanceof TermInfoClusterInOut)) { - return false; - } - TermInfoClusterInOut other = (TermInfoClusterInOut) o; - return term.equals(other.getTerm()) - && inClusterDF == other.getInClusterDF() - && outClusterDF == other.getOutClusterDF() - && logLikelihoodRatio == other.getLogLikelihoodRatio(); - } - - @Override - public int compareTo(TermInfoClusterInOut that) { - int res = Double.compare(that.logLikelihoodRatio, logLikelihoodRatio); - if (res == 0) { - res = term.compareTo(that.term); - } - return res; - } - - public int getInClusterDiff() { - return this.inClusterDF - this.outClusterDF; - } - - String getTerm() { - return term; - } - - int getInClusterDF() { - return inClusterDF; - } - - int getOutClusterDF() { - return outClusterDF; - } - - double getLogLikelihoodRatio() { - return logLikelihoodRatio; - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarityTest.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarityTest.java b/integration/src/test/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarityTest.java deleted file mode 100644 index 463a45f..0000000 --- a/integration/src/test/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarityTest.java +++ /dev/null @@ -1,79 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.similarity.jdbc; - -import org.apache.mahout.cf.taste.impl.TasteTestCase; -import org.apache.mahout.cf.taste.similarity.ItemSimilarity; -import org.easymock.EasyMock; -import org.junit.Test; - -import javax.sql.DataSource; -import java.sql.Connection; -import java.sql.PreparedStatement; -import java.sql.ResultSet; - -public class MySQLJDBCInMemoryItemSimilarityTest extends TasteTestCase { - - @Test - public void testMemoryLoad() throws Exception { - - DataSource dataSource = EasyMock.createMock(DataSource.class); - Connection connection = EasyMock.createMock(Connection.class); - PreparedStatement statement = EasyMock.createMock(PreparedStatement.class); - ResultSet resultSet = EasyMock.createMock(ResultSet.class); - - EasyMock.expect(dataSource.getConnection()).andReturn(connection); - EasyMock.expect(connection.prepareStatement(MySQLJDBCInMemoryItemSimilarity.DEFAULT_GET_ALL_ITEMSIMILARITIES_SQL, - ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY)).andReturn(statement); - statement.setFetchDirection(ResultSet.FETCH_FORWARD); - EasyMock.expect(statement.executeQuery()).andReturn(resultSet); - - EasyMock.expect(resultSet.next()).andReturn(true); - - EasyMock.expect(resultSet.getLong(1)).andReturn(1L); - EasyMock.expect(resultSet.getLong(2)).andReturn(2L); - EasyMock.expect(resultSet.getDouble(3)).andReturn(0.5); - EasyMock.expect(resultSet.next()).andReturn(true); - - EasyMock.expect(resultSet.getLong(1)).andReturn(1L); - EasyMock.expect(resultSet.getLong(2)).andReturn(3L); - EasyMock.expect(resultSet.getDouble(3)).andReturn(0.4); - EasyMock.expect(resultSet.next()).andReturn(true); - - EasyMock.expect(resultSet.getLong(1)).andReturn(3L); - EasyMock.expect(resultSet.getLong(2)).andReturn(4L); - EasyMock.expect(resultSet.getDouble(3)).andReturn(0.1); - - EasyMock.expect(resultSet.next()).andReturn(false); - - resultSet.close(); - statement.close(); - connection.close(); - - EasyMock.replay(dataSource, connection, statement, resultSet); - - ItemSimilarity similarity = new MySQLJDBCInMemoryItemSimilarity(dataSource); - - assertEquals(0.5, similarity.itemSimilarity(1L, 2L), EPSILON); - assertEquals(0.4, similarity.itemSimilarity(1L, 3L), EPSILON); - assertEquals(0.1, similarity.itemSimilarity(3L, 4L), EPSILON); - assertTrue(Double.isNaN(similarity.itemSimilarity(1L, 4L))); - - EasyMock.verify(dataSource, connection, statement, resultSet); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java b/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java deleted file mode 100644 index 01d46fc..0000000 --- a/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java +++ /dev/null @@ -1,236 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.document.FieldType; -import org.apache.lucene.document.StringField; -import org.apache.lucene.index.DirectoryReader; -import org.apache.lucene.index.IndexOptions; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.store.RAMDirectory; -import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver; -import org.apache.mahout.clustering.kmeans.KMeansDriver; -import org.apache.mahout.clustering.kmeans.RandomSeedGenerator; -import org.apache.mahout.common.MahoutTestCase; -import org.apache.mahout.common.distance.DistanceMeasure; -import org.apache.mahout.common.distance.EuclideanDistanceMeasure; -import org.apache.mahout.math.NamedVector; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; -import org.apache.mahout.utils.clustering.ClusterDumper; -import org.apache.mahout.utils.vectors.TermEntry; -import org.apache.mahout.utils.vectors.TermInfo; -import org.apache.mahout.utils.vectors.lucene.CachedTermInfo; -import org.apache.mahout.utils.vectors.lucene.LuceneIterable; -import org.apache.mahout.vectorizer.TFIDF; -import org.apache.mahout.vectorizer.Weight; -import org.junit.Before; -import org.junit.Test; - -public final class TestClusterDumper extends MahoutTestCase { - - private static final String[] DOCS = { - "The quick red fox jumped over the lazy brown dogs.", - "The quick brown fox jumped over the lazy red dogs.", - "The quick red cat jumped over the lazy brown dogs.", - "The quick brown cat jumped over the lazy red dogs.", - "Mary had a little lamb whose fleece was white as snow.", - "Mary had a little goat whose fleece was white as snow.", - "Mary had a little lamb whose fleece was black as tar.", - "Dick had a little goat whose fleece was white as snow.", - "Moby Dick is a story of a whale and a man obsessed.", - "Moby Bob is a story of a walrus and a man obsessed.", - "Moby Dick is a story of a whale and a crazy man.", - "The robber wore a black fleece jacket and a baseball cap.", - "The robber wore a red fleece jacket and a baseball cap.", - "The robber wore a white fleece jacket and a baseball cap.", - "The English Springer Spaniel is the best of all dogs."}; - - private List<VectorWritable> sampleData; - - private String[] termDictionary; - - @Override - @Before - public void setUp() throws Exception 
{ - super.setUp(); - Configuration conf = getConfiguration(); - FileSystem fs = FileSystem.get(conf); - // Create test data - getSampleData(DOCS); - ClusteringTestUtils.writePointsToFile(sampleData, true, - getTestTempFilePath("testdata/file1"), fs, conf); - } - - private void getSampleData(String[] docs2) throws IOException { - sampleData = new ArrayList<>(); - RAMDirectory directory = new RAMDirectory(); - try (IndexWriter writer = new IndexWriter(directory, - new IndexWriterConfig(new StandardAnalyzer()))){ - for (int i = 0; i < docs2.length; i++) { - Document doc = new Document(); - Field id = new StringField("id", "doc_" + i, Field.Store.YES); - doc.add(id); - // Store both position and offset information - FieldType fieldType = new FieldType(); - fieldType.setStored(false); - fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - fieldType.setTokenized(true); - fieldType.setStoreTermVectors(true); - fieldType.setStoreTermVectorPositions(true); - fieldType.setStoreTermVectorOffsets(true); - fieldType.freeze(); - Field text = new Field("content", docs2[i], fieldType); - doc.add(text); - writer.addDocument(doc); - } - } - - IndexReader reader = DirectoryReader.open(directory); - - Weight weight = new TFIDF(); - TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100); - - int numTerms = 0; - for (Iterator<TermEntry> it = termInfo.getAllEntries(); it.hasNext();) { - it.next(); - numTerms++; - } - termDictionary = new String[numTerms]; - int i = 0; - for (Iterator<TermEntry> it = termInfo.getAllEntries(); it.hasNext();) { - String term = it.next().getTerm(); - termDictionary[i] = term; - System.out.println(i + " " + term); - i++; - } - Iterable<Vector> iterable = new LuceneIterable(reader, "id", "content", - termInfo,weight); - - i = 0; - for (Vector vector : iterable) { - assertNotNull(vector); - NamedVector namedVector; - if (vector instanceof NamedVector) { - // rename it for testing purposes - namedVector = new NamedVector(((NamedVector) vector).getDelegate(), - "P(" + i + ')'); - - } else { - namedVector = new NamedVector(vector, "P(" + i + ')'); - } - System.out.println(AbstractCluster.formatVector(namedVector, - termDictionary)); - sampleData.add(new VectorWritable(namedVector)); - i++; - } - } - - /** - * Return the path to the final iteration's clusters - */ - private static Path finalClusterPath(Configuration conf, Path output, - int maxIterations) throws IOException { - FileSystem fs = FileSystem.get(conf); - for (int i = maxIterations; i >= 0; i--) { - Path clusters = new Path(output, "clusters-" + i + "-final"); - if (fs.exists(clusters)) { - return clusters; - } - } - return null; - } - - @Test - public void testKmeans() throws Exception { - DistanceMeasure measure = new EuclideanDistanceMeasure(); - Path input = getTestTempFilePath("input"); - Path output = getTestTempDirPath("output"); - Path initialPoints = new Path(output, Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX); - Configuration conf = getConfiguration(); - FileSystem fs = FileSystem.get(conf); - // Write test data to file - ClusteringTestUtils.writePointsToFile(sampleData, input, fs, conf); - // Select initial centroids - RandomSeedGenerator.buildRandom(conf, input, initialPoints, 8, measure, 1L); - // Run k-means - Path kMeansOutput = new Path(output, "kmeans"); - KMeansDriver.run(conf, getTestTempDirPath("testdata"), initialPoints, kMeansOutput, 0.001, 10, true, 0.0, false); - // Print out clusters - ClusterDumper clusterDumper = new 
ClusterDumper(finalClusterPath(conf, - output, 10), new Path(kMeansOutput, "clusteredPoints")); - clusterDumper.printClusters(termDictionary); - } - - @Test - public void testJsonClusterDumper() throws Exception { - DistanceMeasure measure = new EuclideanDistanceMeasure(); - Path input = getTestTempFilePath("input"); - Path output = getTestTempDirPath("output"); - Path initialPoints = new Path(output, Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX); - Configuration conf = getConfiguration(); - FileSystem fs = FileSystem.get(conf); - // Write test data to file - ClusteringTestUtils.writePointsToFile(sampleData, input, fs, conf); - // Select initial centroids - RandomSeedGenerator.buildRandom(conf, input, initialPoints, 8, measure, 1L); - // Run k-means - Path kmeansOutput = new Path(output, "kmeans"); - KMeansDriver.run(conf, getTestTempDirPath("testdata"), initialPoints, kmeansOutput, 0.001, 10, true, 0.0, false); - // Print out clusters - ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, - output, 10), new Path(kmeansOutput, "clusteredPoints")); - clusterDumper.setOutputFormat(ClusterDumper.OUTPUT_FORMAT.JSON); - clusterDumper.printClusters(termDictionary); - } - - @Test - public void testFuzzyKmeans() throws Exception { - DistanceMeasure measure = new EuclideanDistanceMeasure(); - Path input = getTestTempFilePath("input"); - Path output = getTestTempDirPath("output"); - Path initialPoints = new Path(output, Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX); - Configuration conf = getConfiguration(); - FileSystem fs = FileSystem.get(conf); - // Write test data to file - ClusteringTestUtils.writePointsToFile(sampleData, input, fs, conf); - // Select initial centroids - RandomSeedGenerator.buildRandom(conf, input, initialPoints, 8, measure, 1L); - // Run k-means - Path kMeansOutput = new Path(output, "kmeans"); - FuzzyKMeansDriver.run(conf, getTestTempDirPath("testdata"), initialPoints, kMeansOutput, 0.001, 10, 1.1f, true, - true, 0, true); - // run ClusterDumper - ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, - output, 10), new Path(kMeansOutput, "clusteredPoints")); - clusterDumper.printClusters(termDictionary); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java b/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java deleted file mode 100644 index 8a226a0..0000000 --- a/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java +++ /dev/null @@ -1,321 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.clustering; - -import java.io.IOException; -import java.util.List; -import java.util.Map; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.mahout.clustering.canopy.Canopy; -import org.apache.mahout.clustering.canopy.CanopyDriver; -import org.apache.mahout.clustering.evaluation.ClusterEvaluator; -import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver; -import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver; -import org.apache.mahout.clustering.kmeans.KMeansDriver; -import org.apache.mahout.clustering.kmeans.TestKmeansClustering; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.common.MahoutTestCase; -import org.apache.mahout.common.distance.DistanceMeasure; -import org.apache.mahout.common.distance.EuclideanDistanceMeasure; -import org.apache.mahout.math.DenseVector; -import org.apache.mahout.math.VectorWritable; -import org.junit.Before; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; - -public final class TestClusterEvaluator extends MahoutTestCase { - - private static final double[][] REFERENCE = { {1, 1}, {2, 1}, {1, 2}, {2, 2}, {3, 3}, {4, 4}, {5, 4}, {4, 5}, {5, 5}}; - - private List<VectorWritable> referenceData = Lists.newArrayList(); - - private final List<VectorWritable> sampleData = Lists.newArrayList(); - - private Map<Integer,List<VectorWritable>> representativePoints; - - private List<Cluster> clusters; - - private static final Logger log = LoggerFactory.getLogger(TestClusterEvaluator.class); - - private Configuration conf; - - private FileSystem fs; - - private Path testdata; - - private Path output; - - @Override - @Before - public void setUp() throws Exception { - super.setUp(); - conf = getConfiguration(); - fs = FileSystem.get(conf); - testdata = getTestTempDirPath("testdata"); - output = getTestTempDirPath("output"); - // Create small reference data set - referenceData = TestKmeansClustering.getPointsWritable(REFERENCE); - // generate larger test data set for the clustering tests to chew on - generateSamples(); - } - - /** - * Generate random samples and add them to the sampleData - * - * @param num - * int number of samples to generate - * @param mx - * double x-value of the sample mean - * @param my - * double y-value of the sample mean - * @param sd - * double standard deviation of the samples - */ - private void generateSamples(int num, double mx, double my, double sd) { - log.info("Generating {} samples m=[{}, {}] sd={}", num, mx, my, sd); - for (int i = 0; i < num; i++) { - sampleData.add(new VectorWritable(new DenseVector(new double[] {UncommonDistributions.rNorm(mx, sd), - UncommonDistributions.rNorm(my, sd)}))); - } - } - - private void generateSamples() { - generateSamples(500, 1, 1, 3); - generateSamples(300, 1, 0, 0.5); - generateSamples(300, 0, 2, 0.1); - } - - private void printRepPoints(int numIterations) { - RepresentativePointsDriver.printRepresentativePoints(output, numIterations); - } - - /** - * Initialize synthetic data using 4 clusters dC units from origin having 4 representative points dP from each center - * - * @param dC - * a double cluster center offset - * @param dP - * a double representative point offset - * @param measure - * the 
DistanceMeasure - */ - private void initData(double dC, double dP, DistanceMeasure measure) { - clusters = Lists.newArrayList(); - clusters.add(new Canopy(new DenseVector(new double[] {-dC, -dC}), 1, measure)); - clusters.add(new Canopy(new DenseVector(new double[] {-dC, dC}), 3, measure)); - clusters.add(new Canopy(new DenseVector(new double[] {dC, dC}), 5, measure)); - clusters.add(new Canopy(new DenseVector(new double[] {dC, -dC}), 7, measure)); - representativePoints = Maps.newHashMap(); - for (Cluster cluster : clusters) { - List<VectorWritable> points = Lists.newArrayList(); - representativePoints.put(cluster.getId(), points); - points.add(new VectorWritable(cluster.getCenter().clone())); - points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {dP, dP})))); - points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {dP, -dP})))); - points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {-dP, -dP})))); - points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {-dP, dP})))); - } - } - - @Test - public void testRepresentativePoints() throws Exception { - ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf); - DistanceMeasure measure = new EuclideanDistanceMeasure(); - Configuration conf = getConfiguration(); - // run using MR reference point calculation - CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, true, 0.0, true); - int numIterations = 2; - Path clustersIn = new Path(output, "clusters-0-final"); - RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure, - numIterations, false); - printRepPoints(numIterations); - ClusterEvaluator evaluatorMR = new ClusterEvaluator(conf, clustersIn); - // now run again using sequential reference point calculation - HadoopUtil.delete(conf, output); - CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, true, 0.0, true); - RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure, - numIterations, true); - printRepPoints(numIterations); - ClusterEvaluator evaluatorSeq = new ClusterEvaluator(conf, clustersIn); - // compare results - assertEquals("InterCluster Density", evaluatorMR.interClusterDensity(), evaluatorSeq.interClusterDensity(), EPSILON); - assertEquals("IntraCluster Density", evaluatorMR.intraClusterDensity(), evaluatorSeq.intraClusterDensity(), EPSILON); - } - - @Test - public void testCluster0() throws IOException { - ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf); - DistanceMeasure measure = new EuclideanDistanceMeasure(); - initData(1, 0.25, measure); - ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure); - assertEquals("inter cluster density", 0.33333333333333315, evaluator.interClusterDensity(), EPSILON); - assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON); - } - - @Test - public void testCluster1() throws IOException { - ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf); - DistanceMeasure measure = new EuclideanDistanceMeasure(); - initData(1, 0.5, measure); - ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure); - assertEquals("inter cluster density", 0.33333333333333315, evaluator.interClusterDensity(), EPSILON); - assertEquals("intra cluster density", 
0.3656854249492381, evaluator.intraClusterDensity(), EPSILON); - } - - @Test - public void testCluster2() throws IOException { - ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf); - DistanceMeasure measure = new EuclideanDistanceMeasure(); - initData(1, 0.75, measure); - ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure); - assertEquals("inter cluster density", 0.33333333333333315, evaluator.interClusterDensity(), EPSILON); - assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON); - } - - /** - * adding an empty cluster should modify the inter cluster density but not change the intra-cluster density as that - * cluster would have NaN as its intra-cluster density and NaN values are ignored by the evaluator - * - * @throws IOException - */ - @Test - public void testEmptyCluster() throws IOException { - ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf); - DistanceMeasure measure = new EuclideanDistanceMeasure(); - initData(1, 0.25, measure); - Canopy cluster = new Canopy(new DenseVector(new double[] {10, 10}), 19, measure); - clusters.add(cluster); - List<VectorWritable> points = Lists.newArrayList(); - representativePoints.put(cluster.getId(), points); - ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure); - assertEquals("inter cluster density", 0.371534146934532, evaluator.interClusterDensity(), EPSILON); - assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON); - } - - /** - * adding an single-valued cluster should modify the inter cluster density but not change the intra-cluster density as - * that cluster would have NaN as its intra-cluster density and NaN values are ignored by the evaluator - * - * @throws IOException - */ - @Test - public void testSingleValueCluster() throws IOException { - ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf); - DistanceMeasure measure = new EuclideanDistanceMeasure(); - initData(1, 0.25, measure); - Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure); - clusters.add(cluster); - List<VectorWritable> points = Lists.newArrayList(); - points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {1, 1})))); - representativePoints.put(cluster.getId(), points); - ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure); - assertEquals("inter cluster density", 0.3656854249492381, evaluator.interClusterDensity(), EPSILON); - assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON); - } - - /** - * Representative points extraction will duplicate the cluster center if the cluster has no assigned points. These - * clusters are included in the inter-cluster density but their NaN intra-density values are ignored by the evaluator. 
- * - * @throws IOException - */ - @Test - public void testAllSameValueCluster() throws IOException { - ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf); - DistanceMeasure measure = new EuclideanDistanceMeasure(); - initData(1, 0.25, measure); - Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure); - clusters.add(cluster); - List<VectorWritable> points = Lists.newArrayList(); - points.add(new VectorWritable(cluster.getCenter())); - points.add(new VectorWritable(cluster.getCenter())); - points.add(new VectorWritable(cluster.getCenter())); - representativePoints.put(cluster.getId(), points); - ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure); - assertEquals("inter cluster density", 0.3656854249492381, evaluator.interClusterDensity(), EPSILON); - assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON); - } - - @Test - public void testCanopy() throws Exception { - ClusteringTestUtils.writePointsToFile(sampleData, new Path(testdata, "file1"), fs, conf); - DistanceMeasure measure = new EuclideanDistanceMeasure(); - Configuration conf = getConfiguration(); - CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, true, 0.0, true); - int numIterations = 10; - Path clustersIn = new Path(output, "clusters-0-final"); - RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure, - numIterations, true); - //printRepPoints(numIterations); - ClusterEvaluator evaluator = new ClusterEvaluator(conf, clustersIn); - // now print out the Results - System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity()); - System.out.println("Inter-cluster density = " + evaluator.interClusterDensity()); - } - - @Test - public void testKmeans() throws Exception { - ClusteringTestUtils.writePointsToFile(sampleData, new Path(testdata, "file1"), fs, conf); - DistanceMeasure measure = new EuclideanDistanceMeasure(); - // now run the Canopy job to prime kMeans canopies - Configuration conf = getConfiguration(); - CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, false, 0.0, true); - // now run the KMeans job - Path kmeansOutput = new Path(output, "kmeans"); - KMeansDriver.run(testdata, new Path(output, "clusters-0-final"), kmeansOutput, 0.001, 10, true, 0.0, true); - int numIterations = 10; - Path clustersIn = new Path(kmeansOutput, "clusters-2"); - RepresentativePointsDriver.run(conf, clustersIn, new Path(kmeansOutput, "clusteredPoints"), kmeansOutput, measure, - numIterations, true); - RepresentativePointsDriver.printRepresentativePoints(kmeansOutput, numIterations); - ClusterEvaluator evaluator = new ClusterEvaluator(conf, clustersIn); - // now print out the Results - System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity()); - System.out.println("Inter-cluster density = " + evaluator.interClusterDensity()); - } - - @Test - public void testFuzzyKmeans() throws Exception { - ClusteringTestUtils.writePointsToFile(sampleData, new Path(testdata, "file1"), fs, conf); - DistanceMeasure measure = new EuclideanDistanceMeasure(); - // now run the Canopy job to prime kMeans canopies - Configuration conf = getConfiguration(); - CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, false, 0.0, true); - Path fuzzyKMeansOutput = new Path(output, "fuzzyk"); - // now run the KMeans job - FuzzyKMeansDriver.run(testdata, new Path(output, "clusters-0-final"), fuzzyKMeansOutput, 0.001, 10, 
2, - true, true, 0, true); - int numIterations = 10; - Path clustersIn = new Path(fuzzyKMeansOutput, "clusters-4"); - RepresentativePointsDriver.run(conf, clustersIn, new Path(fuzzyKMeansOutput, "clusteredPoints"), fuzzyKMeansOutput, - measure, numIterations, true); - RepresentativePointsDriver.printRepresentativePoints(fuzzyKMeansOutput, numIterations); - ClusterEvaluator evaluator = new ClusterEvaluator(conf, clustersIn); - // now print out the Results - System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity()); - System.out.println("Inter-cluster density = " + evaluator.interClusterDensity()); - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java b/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java deleted file mode 100644 index 597ed01..0000000 --- a/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java +++ /dev/null @@ -1,326 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.clustering.cdbw; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.mahout.clustering.Cluster; -import org.apache.mahout.clustering.ClusteringTestUtils; -import org.apache.mahout.clustering.TestClusterEvaluator; -import org.apache.mahout.clustering.UncommonDistributions; -import org.apache.mahout.clustering.canopy.Canopy; -import org.apache.mahout.clustering.canopy.CanopyDriver; -import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver; -import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver; -import org.apache.mahout.clustering.kmeans.KMeansDriver; -import org.apache.mahout.clustering.kmeans.TestKmeansClustering; -import org.apache.mahout.common.MahoutTestCase; -import org.apache.mahout.common.distance.DistanceMeasure; -import org.apache.mahout.common.distance.EuclideanDistanceMeasure; -import org.apache.mahout.math.DenseVector; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; -import org.junit.Before; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public final class TestCDbwEvaluator extends MahoutTestCase { - - private static final double[][] REFERENCE = { {1, 1}, {2, 1}, {1, 2}, {2, 2}, {3, 3}, {4, 4}, {5, 4}, {4, 5}, {5, 5}}; - - private static final Logger log = LoggerFactory.getLogger(TestClusterEvaluator.class); - - private Map<Integer,List<VectorWritable>> representativePoints; - - private List<Cluster> clusters; - - private Configuration conf; - - private FileSystem fs; - - private final Collection<VectorWritable> sampleData = new ArrayList<>(); - - private List<VectorWritable> referenceData = new ArrayList<>(); - - private Path testdata; - - private Path output; - - @Override - @Before - public void setUp() throws Exception { - super.setUp(); - conf = getConfiguration(); - fs = FileSystem.get(conf); - testdata = getTestTempDirPath("testdata"); - output = getTestTempDirPath("output"); - // Create small reference data set - referenceData = TestKmeansClustering.getPointsWritable(REFERENCE); - // generate larger test data set for the clustering tests to chew on - generateSamples(); - } - - /** - * Initialize synthetic data using 4 clusters dC units from origin having 4 representative points dP from each center - * - * @param dC - * a double cluster center offset - * @param dP - * a double representative point offset - * @param measure - * the DistanceMeasure - */ - private void initData(double dC, double dP, DistanceMeasure measure) { - clusters = new ArrayList<>(); - clusters.add(new Canopy(new DenseVector(new double[] {-dC, -dC}), 1, measure)); - clusters.add(new Canopy(new DenseVector(new double[] {-dC, dC}), 3, measure)); - clusters.add(new Canopy(new DenseVector(new double[] {dC, dC}), 5, measure)); - clusters.add(new Canopy(new DenseVector(new double[] {dC, -dC}), 7, measure)); - representativePoints = new HashMap<>(); - for (Cluster cluster : clusters) { - List<VectorWritable> points = new ArrayList<>(); - representativePoints.put(cluster.getId(), points); - points.add(new VectorWritable(cluster.getCenter().clone())); - points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {dP, dP})))); - points.add(new VectorWritable(cluster.getCenter().plus(new 
DenseVector(new double[] {dP, -dP})))); - points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {-dP, -dP})))); - points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {-dP, dP})))); - } - } - - /** - * Generate random samples and add them to the sampleData - * - * @param num - * int number of samples to generate - * @param mx - * double x-value of the sample mean - * @param my - * double y-value of the sample mean - * @param sd - * double standard deviation of the samples - */ - private void generateSamples(int num, double mx, double my, double sd) { - log.info("Generating {} samples m=[{}, {}] sd={}", num, mx, my, sd); - for (int i = 0; i < num; i++) { - sampleData.add(new VectorWritable(new DenseVector(new double[] {UncommonDistributions.rNorm(mx, sd), - UncommonDistributions.rNorm(my, sd)}))); - } - } - - private void generateSamples() { - generateSamples(500, 1, 1, 3); - generateSamples(300, 1, 0, 0.5); - generateSamples(300, 0, 2, 0.1); - } - - @Test - public void testCDbw0() throws IOException { - ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf); - DistanceMeasure measure = new EuclideanDistanceMeasure(); - initData(1, 0.25, measure); - CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure); - System.out.println("CDbw = " + evaluator.getCDbw()); - System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity()); - System.out.println("Inter-cluster density = " + evaluator.interClusterDensity()); - System.out.println("Separation = " + evaluator.separation()); - } - - @Test - public void testCDbw1() throws IOException { - ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf); - DistanceMeasure measure = new EuclideanDistanceMeasure(); - initData(1, 0.5, measure); - CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure); - System.out.println("CDbw = " + evaluator.getCDbw()); - System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity()); - System.out.println("Inter-cluster density = " + evaluator.interClusterDensity()); - System.out.println("Separation = " + evaluator.separation()); - } - - @Test - public void testCDbw2() throws IOException { - ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf); - DistanceMeasure measure = new EuclideanDistanceMeasure(); - initData(1, 0.75, measure); - CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure); - System.out.println("CDbw = " + evaluator.getCDbw()); - System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity()); - System.out.println("Inter-cluster density = " + evaluator.interClusterDensity()); - System.out.println("Separation = " + evaluator.separation()); - } - - @Test - public void testEmptyCluster() throws IOException { - ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf); - DistanceMeasure measure = new EuclideanDistanceMeasure(); - initData(1, 0.25, measure); - Canopy cluster = new Canopy(new DenseVector(new double[] {10, 10}), 19, measure); - clusters.add(cluster); - List<VectorWritable> points = new ArrayList<>(); - representativePoints.put(cluster.getId(), points); - CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure); - System.out.println("CDbw = " + evaluator.getCDbw()); - 
System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity()); - System.out.println("Inter-cluster density = " + evaluator.interClusterDensity()); - System.out.println("Separation = " + evaluator.separation()); - } - - @Test - public void testSingleValueCluster() throws IOException { - ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf); - DistanceMeasure measure = new EuclideanDistanceMeasure(); - initData(1, 0.25, measure); - Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure); - clusters.add(cluster); - List<VectorWritable> points = new ArrayList<>(); - points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {1, 1})))); - representativePoints.put(cluster.getId(), points); - CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure); - System.out.println("CDbw = " + evaluator.getCDbw()); - System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity()); - System.out.println("Inter-cluster density = " + evaluator.interClusterDensity()); - System.out.println("Separation = " + evaluator.separation()); - } - - /** - * Representative points extraction will duplicate the cluster center if the cluster has no assigned points. These - * clusters should be ignored like empty clusters above - * - * @throws IOException - */ - @Test - public void testAllSameValueCluster() throws IOException { - ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf); - DistanceMeasure measure = new EuclideanDistanceMeasure(); - initData(1, 0.25, measure); - Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure); - clusters.add(cluster); - List<VectorWritable> points = new ArrayList<>(); - points.add(new VectorWritable(cluster.getCenter())); - points.add(new VectorWritable(cluster.getCenter())); - points.add(new VectorWritable(cluster.getCenter())); - representativePoints.put(cluster.getId(), points); - CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure); - System.out.println("CDbw = " + evaluator.getCDbw()); - System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity()); - System.out.println("Inter-cluster density = " + evaluator.interClusterDensity()); - System.out.println("Separation = " + evaluator.separation()); - } - - /** - * Clustering can produce very, very tight clusters that can cause the std calculation to fail. These clusters should - * be processed correctly. 
- * - * @throws IOException - */ - @Test - public void testAlmostSameValueCluster() throws IOException { - ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf); - DistanceMeasure measure = new EuclideanDistanceMeasure(); - initData(1, 0.25, measure); - Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure); - clusters.add(cluster); - List<VectorWritable> points = new ArrayList<>(); - Vector delta = new DenseVector(new double[] {0, Double.MIN_NORMAL}); - points.add(new VectorWritable(delta.clone())); - points.add(new VectorWritable(delta.clone())); - points.add(new VectorWritable(delta.clone())); - points.add(new VectorWritable(delta.clone())); - points.add(new VectorWritable(delta.clone())); - representativePoints.put(cluster.getId(), points); - CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure); - System.out.println("CDbw = " + evaluator.getCDbw()); - System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity()); - System.out.println("Inter-cluster density = " + evaluator.interClusterDensity()); - System.out.println("Separation = " + evaluator.separation()); - } - - @Test - public void testCanopy() throws Exception { - ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("testdata/file1"), fs, conf); - DistanceMeasure measure = new EuclideanDistanceMeasure(); - CanopyDriver.run(getConfiguration(), testdata, output, measure, 3.1, 2.1, true, 0.0, true); - int numIterations = 10; - Path clustersIn = new Path(output, "clusters-0-final"); - RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure, - numIterations, true); - CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn); - // printRepPoints(numIterations); - // now print out the Results - System.out.println("Canopy CDbw = " + evaluator.getCDbw()); - System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity()); - System.out.println("Inter-cluster density = " + evaluator.interClusterDensity()); - System.out.println("Separation = " + evaluator.separation()); - } - - @Test - public void testKmeans() throws Exception { - ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("testdata/file1"), fs, conf); - DistanceMeasure measure = new EuclideanDistanceMeasure(); - // now run the Canopy job to prime kMeans canopies - CanopyDriver.run(getConfiguration(), testdata, output, measure, 3.1, 2.1, false, 0.0, true); - // now run the KMeans job - Path kmeansOutput = new Path(output, "kmeans"); - KMeansDriver.run(testdata, new Path(output, "clusters-0-final"), kmeansOutput, 0.001, 10, true, 0.0, true); - int numIterations = 10; - Path clustersIn = new Path(kmeansOutput, "clusters-10-final"); - RepresentativePointsDriver.run(conf, clustersIn, new Path(kmeansOutput, "clusteredPoints"), kmeansOutput, measure, - numIterations, true); - CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn); - RepresentativePointsDriver.printRepresentativePoints(kmeansOutput, numIterations); - // now print out the Results - System.out.println("K-Means CDbw = " + evaluator.getCDbw()); - System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity()); - System.out.println("Inter-cluster density = " + evaluator.interClusterDensity()); - System.out.println("Separation = " + evaluator.separation()); - } - - @Test - public void testFuzzyKmeans() throws Exception { - ClusteringTestUtils.writePointsToFile(sampleData, 
getTestTempFilePath("testdata/file1"), fs, conf); - DistanceMeasure measure = new EuclideanDistanceMeasure(); - // now run the Canopy job to prime kMeans canopies - CanopyDriver.run(getConfiguration(), testdata, output, measure, 3.1, 2.1, false, 0.0, true); - Path fuzzyKMeansOutput = new Path(output, "fuzzyk"); - // now run the KMeans job - FuzzyKMeansDriver.run(testdata, new Path(output, "clusters-0-final"), fuzzyKMeansOutput, 0.001, 10, 2, - true, true, 0, true); - int numIterations = 10; - Path clustersIn = new Path(fuzzyKMeansOutput, "clusters-4"); - RepresentativePointsDriver.run(conf, clustersIn, new Path(fuzzyKMeansOutput, "clusteredPoints"), fuzzyKMeansOutput, - measure, numIterations, true); - CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn); - RepresentativePointsDriver.printRepresentativePoints(fuzzyKMeansOutput, numIterations); - // now print out the Results - System.out.println("Fuzzy K-Means CDbw = " + evaluator.getCDbw()); - System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity()); - System.out.println("Inter-cluster density = " + evaluator.interClusterDensity()); - System.out.println("Separation = " + evaluator.separation()); - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java b/integration/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java deleted file mode 100644 index ba73c82..0000000 --- a/integration/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java +++ /dev/null @@ -1,66 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.text; - -import java.io.Reader; -import java.io.StringReader; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.mahout.common.MahoutTestCase; -import org.junit.Test; - -/** - * Unit tests for the MailArchivesClusteringAnalyzer text analyzer. 
- */ -public class MailArchivesClusteringAnalyzerTest extends MahoutTestCase { - - @Test - public void testAnalysis() throws Exception { - Analyzer analyzer = new MailArchivesClusteringAnalyzer(); - - String text = "A test message\n" - + "atokenthatistoolongtobeusefulforclustertextanalysis\n" - + "Mahout is a scalable, machine-learning LIBRARY\n" - + "we've added some additional stopwords such as html, mailto, regards\t" - + "apache_hadoop provides the foundation for scalability\n" - + "www.nabble.com [email protected]\n" - + "public void int protected package"; - Reader reader = new StringReader(text); - - // if you change the text above, then you may need to change this as well - // order matters too - String[] expectedTokens = { - "test", "mahout", "scalabl", "machin", "learn", "librari", "weve", "ad", - "stopword", "apache_hadoop","provid", "foundat", "scalabl" - }; - - TokenStream tokenStream = analyzer.tokenStream("test", reader); - assertNotNull(tokenStream); - tokenStream.reset(); - CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class); - int e = 0; - while (tokenStream.incrementToken() && e < expectedTokens.length) { - assertEquals(expectedTokens[e++], termAtt.toString()); - } - assertEquals(e, expectedTokens.length); - tokenStream.end(); - tokenStream.close(); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java b/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java deleted file mode 100644 index ef2b8a6..0000000 --- a/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java +++ /dev/null @@ -1,240 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.mahout.text; - -import java.io.File; -import java.io.FileOutputStream; -import java.util.zip.GZIPOutputStream; - -import org.apache.commons.lang3.SystemUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.Text; -import org.apache.mahout.common.MahoutTestCase; -import org.apache.mahout.common.Pair; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; - -/** - * Test case for the SequenceFilesFromMailArchives command-line application. 
- */ -public final class SequenceFilesFromMailArchivesTest extends MahoutTestCase { - - private File inputDir; - - /** - * Create the input and output directories needed for testing - * the SequenceFilesFromMailArchives application. - */ - @Override - @Before - public void setUp() throws Exception { - super.setUp(); - inputDir = getTestTempDir("mail-archives-in"); - - // write test mail messages to a gzipped file in a nested directory - File subDir = new File(inputDir, "subdir"); - subDir.mkdir(); - File gzFile = new File(subDir, "mail-messages.gz"); - try (GZIPOutputStream gzOut = new GZIPOutputStream(new FileOutputStream(gzFile))) { - gzOut.write(testMailMessages.getBytes("UTF-8")); - gzOut.finish(); - } - - File subDir2 = new File(subDir, "subsubdir"); - subDir2.mkdir(); - File gzFile2 = new File(subDir2, "mail-messages-2.gz"); - try (GZIPOutputStream gzOut = new GZIPOutputStream(new FileOutputStream(gzFile2))) { - gzOut.write(testMailMessages.getBytes("UTF-8")); - gzOut.finish(); - } - } - - @Test - public void testSequential() throws Exception { - - File outputDir = this.getTestTempDir("mail-archives-out"); - - String[] args = { - "--input", inputDir.getAbsolutePath(), - "--output", outputDir.getAbsolutePath(), - "--charset", "UTF-8", - "--keyPrefix", "TEST", - "--method", "sequential", - "--body", "--subject", "--separator", "" - }; - - // run the application's main method - SequenceFilesFromMailArchives.main(args); - - // app should create a single SequenceFile named "chunk-0" in the output dir - File expectedChunkFile = new File(outputDir, "chunk-0"); - String expectedChunkPath = expectedChunkFile.getAbsolutePath(); - Assert.assertTrue("Expected chunk file " + expectedChunkPath + " not found!", expectedChunkFile.isFile()); - - Configuration conf = getConfiguration(); - SequenceFileIterator<Text, Text> iterator = new SequenceFileIterator<>(new Path(expectedChunkPath), true, conf); - Assert.assertTrue("First key/value pair not found!", iterator.hasNext()); - Pair<Text, Text> record = iterator.next(); - - File parentFile = new File(new File(new File("TEST"), "subdir"), "mail-messages.gz"); - Assert.assertEquals(new File(parentFile, testVars[0][0]).toString(), record.getFirst().toString()); - Assert.assertEquals(testVars[0][1] + testVars[0][2], record.getSecond().toString()); - - Assert.assertTrue("Second key/value pair not found!", iterator.hasNext()); - - record = iterator.next(); - Assert.assertEquals(new File(parentFile, testVars[1][0]).toString(), record.getFirst().toString()); - Assert.assertEquals(testVars[1][1] + testVars[1][2], record.getSecond().toString()); - - record = iterator.next(); - File parentFileSubSubDir = new File(new File(new File(new File("TEST"), "subdir"), "subsubdir"), "mail-messages-2.gz"); - Assert.assertEquals(new File(parentFileSubSubDir, testVars[0][0]).toString(), record.getFirst().toString()); - Assert.assertEquals(testVars[0][1] + testVars[0][2], record.getSecond().toString()); - - Assert.assertTrue("Second key/value pair not found!", iterator.hasNext()); - record = iterator.next(); - Assert.assertEquals(new File(parentFileSubSubDir, testVars[1][0]).toString(), record.getFirst().toString()); - Assert.assertEquals(testVars[1][1] + testVars[1][2], record.getSecond().toString()); - - Assert.assertFalse("Only two key/value pairs expected!", iterator.hasNext()); - } - - @Test - public void testMapReduce() throws Exception { - - Path tmpDir = getTestTempDirPath(); - Path mrOutputDir = new Path(tmpDir, "mail-archives-out-mr"); - Configuration configuration = 
getConfiguration(); - FileSystem fs = FileSystem.get(configuration); - - File expectedInputFile = new File(inputDir.toString()); - - String[] args = { - "-Dhadoop.tmp.dir=" + configuration.get("hadoop.tmp.dir"), - "--input", expectedInputFile.getAbsolutePath(), - "--output", mrOutputDir.toString(), - "--charset", "UTF-8", - "--keyPrefix", "TEST", - "--method", "mapreduce", - "--body", "--subject", "--separator", "" - }; - - // run the application's main method - SequenceFilesFromMailArchives.main(args); - - // app should create a single SequenceFile named "chunk-0" in the output dir - FileStatus[] fileStatuses = fs.listStatus(mrOutputDir.suffix("/part-m-00000")); - assertEquals(1, fileStatuses.length); // only one - assertEquals("part-m-00000", fileStatuses[0].getPath().getName()); - SequenceFileIterator<Text, Text> iterator = - new SequenceFileIterator<>(mrOutputDir.suffix("/part-m-00000"), true, configuration); - - Assert.assertTrue("First key/value pair not found!", iterator.hasNext()); - Pair<Text, Text> record = iterator.next(); - - File parentFileSubSubDir = new File(new File(new File(new File("TEST"), "subdir"), "subsubdir"), "mail-messages-2.gz"); - - String expected = record.getFirst().toString(); - if (SystemUtils.IS_OS_WINDOWS) { - expected = expected.replace("/", "\\"); - } - Assert.assertEquals(new File(parentFileSubSubDir, testVars[0][0]).toString(), expected); - Assert.assertEquals(testVars[0][1] + testVars[0][2], record.getSecond().toString()); - Assert.assertTrue("Second key/value pair not found!", iterator.hasNext()); - - record = iterator.next(); - expected = record.getFirst().toString(); - if (SystemUtils.IS_OS_WINDOWS) { - expected = expected.replace("/", "\\"); - } - Assert.assertEquals(new File(parentFileSubSubDir, testVars[1][0]).toString(), expected); - Assert.assertEquals(testVars[1][1] + testVars[1][2], record.getSecond().toString()); - - // test other file - File parentFile = new File(new File(new File("TEST"), "subdir"), "mail-messages.gz"); - record = iterator.next(); - expected = record.getFirst().toString(); - if (SystemUtils.IS_OS_WINDOWS) { - expected = expected.replace("/", "\\"); - } - Assert.assertEquals(new File(parentFile, testVars[0][0]).toString(), expected); - Assert.assertEquals(testVars[0][1] + testVars[0][2], record.getSecond().toString()); - Assert.assertTrue("Second key/value pair not found!", iterator.hasNext()); - - record = iterator.next(); - expected = record.getFirst().toString(); - if (SystemUtils.IS_OS_WINDOWS) { - expected = expected.replace("/", "\\"); - } - Assert.assertEquals(new File(parentFile, testVars[1][0]).toString(), expected); - Assert.assertEquals(testVars[1][1] + testVars[1][2], record.getSecond().toString()); - Assert.assertFalse("Only four key/value pairs expected!", iterator.hasNext()); - } - - // Messages extracted and made anonymous from the ASF mail archives - private static final String[][] testVars = { - new String[] { - "[email protected]", - "Ant task for JDK1.1 collections build option", - "\nThis is just a test message\n--\nTesty McTester\n" - }, - new String[] { - "[email protected]", - "Problem with build files in several directories", - "\nHi all,\nThis is another test message.\nRegards,\nAnother Test\n" - } - }; - - private static final String testMailMessages = - "From [email protected] Mon Jul 24 19:13:53 2000\n" - + "Return-Path: <[email protected]>\n" - + "Mailing-List: contact [email protected]; run by ezmlm\n" - + "Delivered-To: mailing list [email protected]\n" - + "Received: (qmail 49267 invoked 
from network); 24 Jul 2000 19:13:53 -0000\n" - + "Message-ID: <" + testVars[0][0] + ">\n" - + "From: \"Testy McTester\" <[email protected]>\n" - + "To: <[email protected]>\n" - + "Subject: " + testVars[0][1] + '\n' - + "Date: Mon, 24 Jul 2000 12:24:56 -0700\n" - + "MIME-Version: 1.0\n" - + "Content-Type: text/plain;\n" - + " charset=\"Windows-1252\"\n" - + "Content-Transfer-Encoding: 7bit\n" - + "X-Spam-Rating: locus.apache.org 1.6.2 0/1000/N\n" - + testVars[0][2] + '\n' - + "From [email protected] Wed Jul 26 11:32:16 2000\n" - + "Return-Path: <[email protected]>\n" - + "Mailing-List: contact [email protected]; run by ezmlm\n" - + "Delivered-To: mailing list [email protected]\n" - + "Received: (qmail 73966 invoked from network); 26 Jul 2000 11:32:16 -0000\n" - + "User-Agent: Microsoft-Outlook-Express-Macintosh-Edition/5.02.2022\n" - + "Date: Wed, 26 Jul 2000 13:32:08 +0200\n" - + "Subject: " + testVars[1][1] + '\n' - + "From: Another Test <[email protected]>\n" - + "To: <[email protected]>\n" - + "Message-Id: <" + testVars[1][0] + ">\n" - + "Mime-Version: 1.0\n" - + "Content-Type: text/plain; charset=\"US-ASCII\"\n" - + "Content-Transfer-Encoding: 7bit\n" - + "X-Spam-Rating: locus.apache.org 1.6.2 0/1000/N\n" - + testVars[1][2]; -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/text/TestPathFilter.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/text/TestPathFilter.java b/integration/src/test/java/org/apache/mahout/text/TestPathFilter.java deleted file mode 100644 index 227521a..0000000 --- a/integration/src/test/java/org/apache/mahout/text/TestPathFilter.java +++ /dev/null @@ -1,32 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.mahout.text; - -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.PathFilter; - -/** - * Dummy Path Filter for testing the MapReduce version of - * SequenceFilesFromDirectory - */ -public class TestPathFilter implements PathFilter { - - @Override - public boolean accept(Path path) { - return path.getName().startsWith("t") || path.getName().startsWith("r") || path.getName().startsWith("f"); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/text/TestSequenceFilesFromDirectory.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/text/TestSequenceFilesFromDirectory.java b/integration/src/test/java/org/apache/mahout/text/TestSequenceFilesFromDirectory.java deleted file mode 100644 index 040c8e4..0000000 --- a/integration/src/test/java/org/apache/mahout/text/TestSequenceFilesFromDirectory.java +++ /dev/null @@ -1,313 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.text; - -import java.io.File; -import java.io.IOException; -import java.io.OutputStreamWriter; -import java.util.HashMap; -import java.util.Map; - -import org.apache.commons.io.Charsets; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.Text; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.common.MahoutTestCase; -import org.apache.mahout.common.Pair; -import org.apache.mahout.common.iterator.sequencefile.PathFilters; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public final class TestSequenceFilesFromDirectory extends MahoutTestCase { - - private static final Logger logger = LoggerFactory.getLogger(TestSequenceFilesFromDirectory.class); - - private static final String[][] DATA1 = { - {"test1", "This is the first text."}, - {"test2", "This is the second text."}, - {"test3", "This is the third text."} - }; - - private static final String[][] DATA2 = { - {"recursive_test1", "This is the first text."}, - {"recursive_test2", "This is the second text."}, - {"recursive_test3", "This is the third text."} - }; - - @Test - public void testSequenceFileFromDirectoryBasic() throws Exception { - // parameters - Configuration configuration = getConfiguration(); - - FileSystem fs = FileSystem.get(configuration); - - // create - Path tmpDir = this.getTestTempDirPath(); - Path inputDir = new Path(tmpDir, "inputDir"); - fs.mkdirs(inputDir); - - Path outputDir = new Path(tmpDir, "outputDir"); - Path outputDirRecursive = new Path(tmpDir, "outputDirRecursive"); - - Path inputDirRecursive = new Path(tmpDir, "inputDirRecur"); - fs.mkdirs(inputDirRecursive); - - // prepare input files - createFilesFromArrays(configuration, inputDir, DATA1); - - SequenceFilesFromDirectory.main(new String[]{ - "--input", inputDir.toString(), - "--output", outputDir.toString(), - "--chunkSize", "64", - "--charset", Charsets.UTF_8.name(), - "--keyPrefix", "UID", - "--method", "sequential"}); - - // check output chunk files - checkChunkFiles(configuration, outputDir, DATA1, "UID"); - - createRecursiveDirFilesFromArrays(configuration, inputDirRecursive, DATA2); - - FileStatus fstInputPath = fs.getFileStatus(inputDirRecursive); - String dirs = HadoopUtil.buildDirList(fs, fstInputPath); - - System.out.println("\n\n ----- recursive dirs: " + dirs); - SequenceFilesFromDirectory.main(new String[]{ - "--input", inputDirRecursive.toString(), - "--output", outputDirRecursive.toString(), - "--chunkSize", "64", - "--charset", Charsets.UTF_8.name(), - "--keyPrefix", "UID", - "--method", "sequential"}); - - checkRecursiveChunkFiles(configuration, outputDirRecursive, DATA2, "UID"); - } - - @Test - public void testSequenceFileFromDirectoryMapReduce() throws Exception { - - Configuration conf = getConfiguration(); - - FileSystem fs = FileSystem.get(conf); - - // create - Path tmpDir = this.getTestTempDirPath(); - Path inputDir = new Path(tmpDir, "inputDir"); - fs.mkdirs(inputDir); - - Path inputDirRecur = new Path(tmpDir, "inputDirRecur"); - fs.mkdirs(inputDirRecur); - - Path mrOutputDir = new Path(tmpDir, "mrOutputDir"); - Path mrOutputDirRecur = new Path(tmpDir, "mrOutputDirRecur"); - - createFilesFromArrays(conf, inputDir, DATA1); - - SequenceFilesFromDirectory.main(new String[]{ - "-Dhadoop.tmp.dir=" + conf.get("hadoop.tmp.dir"), - 
"--input", inputDir.toString(), - "--output", mrOutputDir.toString(), - "--chunkSize", "64", - "--charset", Charsets.UTF_8.name(), - "--method", "mapreduce", - "--keyPrefix", "UID", - "--fileFilterClass", "org.apache.mahout.text.TestPathFilter" - }); - - checkMRResultFiles(conf, mrOutputDir, DATA1, "UID"); - - createRecursiveDirFilesFromArrays(conf, inputDirRecur, DATA2); - - FileStatus fst_input_path = fs.getFileStatus(inputDirRecur); - String dirs = HadoopUtil.buildDirList(fs, fst_input_path); - - logger.info("\n\n ---- recursive dirs: {}", dirs); - - SequenceFilesFromDirectory.main(new String[]{ - "-Dhadoop.tmp.dir=" + conf.get("hadoop.tmp.dir"), - "--input", inputDirRecur.toString(), - "--output", mrOutputDirRecur.toString(), - "--chunkSize", "64", - "--charset", Charsets.UTF_8.name(), - "--method", "mapreduce", - "--keyPrefix", "UID", - "--fileFilterClass", "org.apache.mahout.text.TestPathFilter" - }); - - checkMRResultFilesRecursive(conf, mrOutputDirRecur, DATA2, "UID"); - } - - - private static void createFilesFromArrays(Configuration conf, Path inputDir, String[][] data) throws IOException { - FileSystem fs = FileSystem.get(conf); - for (String[] aData : data) { - try (OutputStreamWriter writer = - new OutputStreamWriter(fs.create(new Path(inputDir, aData[0])), Charsets.UTF_8)){ - writer.write(aData[1]); - } - } - } - - private static void createRecursiveDirFilesFromArrays(Configuration configuration, Path inputDir, - String[][] data) throws IOException { - FileSystem fs = FileSystem.get(configuration); - - logger.info("creativeRecursiveDirFilesFromArrays > based on: {}", inputDir.toString()); - Path curPath; - String currentRecursiveDir = inputDir.toString(); - - for (String[] aData : data) { - currentRecursiveDir += "/" + aData[0]; - File subDir = new File(currentRecursiveDir); - subDir.mkdir(); - - curPath = new Path(subDir.toString(), "file.txt"); - logger.info("Created file: {}", curPath.toString()); - - try (OutputStreamWriter writer = new OutputStreamWriter(fs.create(curPath), Charsets.UTF_8)){ - writer.write(aData[1]); - } - } - } - - private static void checkChunkFiles(Configuration configuration, - Path outputDir, - String[][] data, - String prefix) throws IOException { - FileSystem fs = FileSystem.get(configuration); - - // output exists? - FileStatus[] fileStatuses = fs.listStatus(outputDir, PathFilters.logsCRCFilter()); - assertEquals(1, fileStatuses.length); // only one - assertEquals("chunk-0", fileStatuses[0].getPath().getName()); - - Map<String, String> fileToData = new HashMap<>(); - for (String[] aData : data) { - fileToData.put(prefix + Path.SEPARATOR + aData[0], aData[1]); - } - - // read a chunk to check content - try (SequenceFileIterator<Text, Text> iterator = - new SequenceFileIterator<>(fileStatuses[0].getPath(), true, configuration)){ - while (iterator.hasNext()) { - Pair<Text, Text> record = iterator.next(); - String retrievedData = fileToData.get(record.getFirst().toString().trim()); - assertNotNull(retrievedData); - assertEquals(retrievedData, record.getSecond().toString().trim()); - } - } - } - - private static void checkRecursiveChunkFiles(Configuration configuration, - Path outputDir, - String[][] data, - String prefix) throws IOException { - FileSystem fs = FileSystem.get(configuration); - - System.out.println(" ----------- check_Recursive_ChunkFiles ------------"); - - // output exists? 
- FileStatus[] fileStatuses = fs.listStatus(outputDir, PathFilters.logsCRCFilter()); - assertEquals(1, fileStatuses.length); // only one - assertEquals("chunk-0", fileStatuses[0].getPath().getName()); - - - Map<String, String> fileToData = new HashMap<>(); - String currentPath = prefix; - for (String[] aData : data) { - currentPath += Path.SEPARATOR + aData[0]; - fileToData.put(currentPath + Path.SEPARATOR + "file.txt", aData[1]); - } - - // read a chunk to check content - try (SequenceFileIterator<Text, Text> iterator = - new SequenceFileIterator<>(fileStatuses[0].getPath(), true, configuration)) { - while (iterator.hasNext()) { - Pair<Text, Text> record = iterator.next(); - String retrievedData = fileToData.get(record.getFirst().toString().trim()); - System.out.printf("%s >> %s\n", record.getFirst().toString().trim(), record.getSecond().toString().trim()); - - assertNotNull(retrievedData); - assertEquals(retrievedData, record.getSecond().toString().trim()); - System.out.printf(">>> k: %s, v: %s\n", record.getFirst().toString(), record.getSecond().toString()); - } - } - } - - private static void checkMRResultFiles(Configuration conf, Path outputDir, - String[][] data, String prefix) throws IOException { - FileSystem fs = FileSystem.get(conf); - - // output exists? - FileStatus[] fileStatuses = fs.listStatus(outputDir.suffix("/part-m-00000"), PathFilters.logsCRCFilter()); - assertEquals(1, fileStatuses.length); // only one - assertEquals("part-m-00000", fileStatuses[0].getPath().getName()); - Map<String, String> fileToData = new HashMap<>(); - for (String[] aData : data) { - System.out.printf("map.put: %s %s\n", prefix + Path.SEPARATOR + aData[0], aData[1]); - fileToData.put(prefix + Path.SEPARATOR + aData[0], aData[1]); - } - - // read a chunk to check content - try (SequenceFileIterator<Text, Text> iterator = new SequenceFileIterator<>( - fileStatuses[0].getPath(), true, conf)) { - while (iterator.hasNext()) { - Pair<Text, Text> record = iterator.next(); - String retrievedData = fileToData.get(record.getFirst().toString().trim()); - - System.out.printf("MR> %s >> %s\n", record.getFirst().toString().trim(), record.getSecond().toString().trim()); - assertNotNull(retrievedData); - assertEquals(retrievedData, record.getSecond().toString().trim()); - } - } - } - - private static void checkMRResultFilesRecursive(Configuration configuration, Path outputDir, - String[][] data, String prefix) throws IOException { - FileSystem fs = FileSystem.get(configuration); - - // output exists? 
- FileStatus[] fileStatuses = fs.listStatus(outputDir.suffix("/part-m-00000"), PathFilters.logsCRCFilter()); - assertEquals(1, fileStatuses.length); // only one - assertEquals("part-m-00000", fileStatuses[0].getPath().getName()); - Map<String, String> fileToData = new HashMap<>(); - String currentPath = prefix; - - for (String[] aData : data) { - currentPath += Path.SEPARATOR + aData[0]; - fileToData.put(currentPath + Path.SEPARATOR + "file.txt", aData[1]); - } - - // read a chunk to check content - try (SequenceFileIterator<Text, Text> iterator = new SequenceFileIterator<>( - fileStatuses[0].getPath(), true, configuration)){ - while (iterator.hasNext()) { - Pair<Text, Text> record = iterator.next(); - System.out.printf("MR-Recur > Trying to check: %s\n", record.getFirst().toString().trim()); - String retrievedData = fileToData.get(record.getFirst().toString().trim()); - assertNotNull(retrievedData); - assertEquals(retrievedData, record.getSecond().toString().trim()); - } - } - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/text/doc/MultipleFieldsDocument.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/text/doc/MultipleFieldsDocument.java b/integration/src/test/java/org/apache/mahout/text/doc/MultipleFieldsDocument.java deleted file mode 100644 index 7483b2d..0000000 --- a/integration/src/test/java/org/apache/mahout/text/doc/MultipleFieldsDocument.java +++ /dev/null @@ -1,58 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.mahout.text.doc; - -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.document.TextField; - -/** - * Used for testing lucene2seq - */ -@Deprecated -public class MultipleFieldsDocument extends SingleFieldDocument { - - public static final String FIELD1 = "field1"; - public static final String FIELD2 = "field2"; - - private String field1; - private String field2; - - public MultipleFieldsDocument(String id, String field, String field1, String field2) { - super(id, field); - this.field1 = field1; - this.field2 = field2; - } - - public String getField1() { - return field1; - } - - public String getField2() { - return field2; - } - - @Override - public Document asLuceneDocument() { - Document document = super.asLuceneDocument(); - - document.add(new TextField(FIELD1, this.field1, Field.Store.YES)); - document.add(new TextField(FIELD2, this.field2, Field.Store.YES)); - - return document; - } -}
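The removed MultipleFieldsDocument helper only wrapped a Lucene Document holding an id plus three stored text fields (FIELD1 = "field1", FIELD2 = "field2", and the inherited field). A minimal, self-contained sketch of the same idea is below; the class name, the "id"/"field" field names, the use of StringField for the id, and all literal values are illustrative choices, not taken from the removed sources.

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;

public class MultipleFieldsDocumentSketch {

  public static Document asLuceneDocument(String id, String field, String field1, String field2) {
    Document document = new Document();
    // store the id untokenized (an illustrative choice; the removed superclass is not shown here)
    document.add(new StringField("id", id, Field.Store.YES));
    // tokenized, stored text fields, matching how the removed class added field1 and field2
    document.add(new TextField("field", field, Field.Store.YES));
    document.add(new TextField("field1", field1, Field.Store.YES));
    document.add(new TextField("field2", field2, Field.Store.YES));
    return document;
  }

  public static void main(String[] args) {
    Document doc = asLuceneDocument("doc-1", "main text", "extra text one", "extra text two");
    // stored field values can be read straight back off the Document
    System.out.println(doc.get("field1"));  // prints: extra text one
  }
}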

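The removed directory and mail-archive tests all follow the same pattern: drive the tool through its main() method, then read the resulting SequenceFile back with SequenceFileIterator and compare keys and values. A condensed sketch of that pattern is shown here, with placeholder paths standing in for the MahoutTestCase temp directories used by the deleted tests.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator;
import org.apache.mahout.text.SequenceFilesFromDirectory;

public class SequenceFilesFromDirectorySketch {

  public static void main(String[] args) throws Exception {
    // placeholder paths; the deleted tests used getTestTempDirPath() instead
    String input = "/tmp/text-input";
    String output = "/tmp/seq-output";

    // sequential mode writes a single SequenceFile named chunk-0 under the output dir,
    // keyed by the "UID" prefix plus each input file's relative path
    // (this is what the removed checkChunkFiles() asserted)
    SequenceFilesFromDirectory.main(new String[] {
        "--input", input,
        "--output", output,
        "--chunkSize", "64",
        "--charset", "UTF-8",
        "--keyPrefix", "UID",
        "--method", "sequential"
    });

    // read the chunk back, as the removed tests did
    Configuration conf = new Configuration();
    try (SequenceFileIterator<Text, Text> iterator =
             new SequenceFileIterator<>(new Path(output, "chunk-0"), true, conf)) {
      while (iterator.hasNext()) {
        Pair<Text, Text> record = iterator.next();
        System.out.println(record.getFirst() + " => " + record.getSecond());
      }
    }
  }
}

The mapreduce variant of the same tool writes its output as part-m-00000 rather than chunk-0, which is why the removed MapReduce assertions look for that file name instead.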