Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFConverter.java URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFConverter.java?rev=1094158&r1=1094157&r2=1094158&view=diff ============================================================================== --- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFConverter.java (original) +++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFConverter.java Sun Apr 17 15:30:15 2011 @@ -149,7 +149,8 @@ public final class TFIDFConverter { Path wordCountPath = new Path(output, WORDCOUNT_OUTPUT_FOLDER); startDFCounting(input, wordCountPath, baseConf); - Pair<Long[], List<Path>> datasetFeatures = createDictionaryChunks(wordCountPath, output, baseConf, chunkSizeInMegabytes); + Pair<Long[], List<Path>> datasetFeatures = + createDictionaryChunks(wordCountPath, output, baseConf, chunkSizeInMegabytes); int partialVectorIndex = 0; List<Path> partialVectorPaths = new ArrayList<Path>(); @@ -210,8 +211,13 @@ public final class TFIDFConverter { long featureCount = 0; long vectorCount = Long.MAX_VALUE; Path filesPattern = new Path(featureCountPath, OUTPUT_FILES_PATTERN); - for (Pair<IntWritable,LongWritable> record : - new SequenceFileDirIterable<IntWritable,LongWritable>(filesPattern, PathType.GLOB, null, null, true, conf)) { + for (Pair<IntWritable,LongWritable> record + : new SequenceFileDirIterable<IntWritable,LongWritable>(filesPattern, + PathType.GLOB, + null, + null, + true, + conf)) { if (currentChunkSize > chunkSizeLimit) { freqWriter.close(); @@ -236,7 +242,7 @@ public final class TFIDFConverter { featureCount = Math.max(key.get(), featureCount); } - featureCount++; + featureCount++; freqWriter.close(); Long[] counts = {featureCount, vectorCount}; return new Pair<Long[], List<Path>>(counts, chunkPaths); @@ -323,7 +329,7 @@ public final class TFIDFConverter { + "org.apache.hadoop.io.serializer.WritableSerialization"); Job job = new Job(conf); - job.setJobName("VectorTfIdf Document Frequency Count running over input: " + input.toString()); + job.setJobName("VectorTfIdf Document Frequency Count running over input: " + input); job.setJarByClass(TFIDFConverter.class); job.setOutputKeyClass(IntWritable.class);
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFPartialVectorReducer.java URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFPartialVectorReducer.java?rev=1094158&r1=1094157&r2=1094158&view=diff ============================================================================== --- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFPartialVectorReducer.java (original) +++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFPartialVectorReducer.java Sun Apr 17 15:30:15 2011 @@ -115,8 +115,8 @@ public class TFIDFPartialVectorReducer e Path dictionaryFile = new Path(localFiles[0].getPath()); // key is feature, value is the document frequency - for (Pair<IntWritable,LongWritable> record : - new SequenceFileIterable<IntWritable,LongWritable>(dictionaryFile, true, conf)) { + for (Pair<IntWritable,LongWritable> record + : new SequenceFileIterable<IntWritable,LongWritable>(dictionaryFile, true, conf)) { dictionary.put(record.getFirst().get(), record.getSecond().get()); } } Modified: mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/TasteTestCase.java URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/TasteTestCase.java?rev=1094158&r1=1094157&r2=1094158&view=diff ============================================================================== --- mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/TasteTestCase.java (original) +++ mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/TasteTestCase.java Sun Apr 17 15:30:15 2011 @@ -32,7 +32,6 @@ import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.OutputStreamWriter; import java.io.PrintWriter; -import java.nio.charset.Charset; import java.util.ArrayList; import java.util.List; Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java?rev=1094158&r1=1094157&r2=1094158&view=diff ============================================================================== --- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java (original) +++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java Sun Apr 17 15:30:15 2011 @@ -397,9 +397,10 @@ public final class TestCanopyCreation ex "org.apache.mahout.common.distance.ManhattanDistanceMeasure"); conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1)); conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1)); - DummyRecordWriter<IntWritable, WeightedVectorWritable> writer = new DummyRecordWriter<IntWritable, WeightedVectorWritable>(); - Mapper<WritableComparable<?>, VectorWritable, IntWritable, WeightedVectorWritable>.Context context = DummyRecordWriter - .build(mapper, conf, writer); + DummyRecordWriter<IntWritable, WeightedVectorWritable> writer = + new DummyRecordWriter<IntWritable, WeightedVectorWritable>(); + Mapper<WritableComparable<?>, VectorWritable, IntWritable, WeightedVectorWritable>.Context context = + DummyRecordWriter.build(mapper, conf, writer); mapper.setup(context); Collection<Canopy> canopies = new ArrayList<Canopy>(); @@ -645,7 +646,7 @@ public final class TestCanopyCreation ex Reducer<Text, VectorWritable, Text, Canopy>.Context context = DummyRecordWriter .build(reducer, conf, writer, Text.class, VectorWritable.class); reducer.setup(context); - assertEquals(1.1, reducer.canopyClusterer.t1, EPSILON); - assertEquals(0.1, reducer.canopyClusterer.t2, EPSILON); + assertEquals(1.1, reducer.canopyClusterer.getT1(), EPSILON); + assertEquals(0.1, reducer.canopyClusterer.getT2(), EPSILON); } } Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java?rev=1094158&r1=1094157&r2=1094158&view=diff ============================================================================== --- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java (original) +++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java Sun Apr 17 15:30:15 2011 @@ -520,7 +520,7 @@ public final class TestFuzzyKmeansCluste String clusterId = key.getIdentifier(); List<SoftCluster> values = reducerWriter.getValue(new Text(clusterId)); SoftCluster cluster = values.get(0); - System.out.println("ref= " + key.toString() + " cluster= " + cluster.toString()); + System.out.println("ref= " + key.toString() + " cluster= " + cluster); cluster.computeParameters(); assertEquals("key center: " + AbstractCluster.formatVector(key.getCenter(), null) + " does not equal cluster: " + AbstractCluster.formatVector(cluster.getCenter(), null), key.getCenter(), cluster.getCenter()); Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java?rev=1094158&r1=1094157&r2=1094158&view=diff ============================================================================== --- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java (original) +++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java Sun Apr 17 15:30:15 2011 @@ -34,6 +34,7 @@ import org.apache.hadoop.io.WritableComp import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.util.ToolRunner; +import org.apache.mahout.clustering.Cluster; import org.apache.mahout.clustering.ClusteringTestUtils; import org.apache.mahout.common.DummyRecordWriter; import org.apache.mahout.common.HadoopUtil; @@ -331,12 +332,12 @@ public final class TestMeanShift extends long count = HadoopUtil.countRecords(outPart, conf); assertEquals("count", 3, count); outPart = new Path(output, "clusters-0/part-m-00000"); - Iterator<?> iterator = new SequenceFileValueIterator<Writable>(outPart, true, conf); - // now test the initial clusters to ensure the type of their centers has been retained - while (iterator.hasNext()) { - MeanShiftCanopy canopy = (MeanShiftCanopy) iterator.next(); - assertTrue(canopy.getCenter()instanceof DenseVector); - } + Iterator<?> iterator = new SequenceFileValueIterator<Writable>(outPart, true, conf); + // now test the initial clusters to ensure the type of their centers has been retained + while (iterator.hasNext()) { + Cluster canopy = (Cluster) iterator.next(); + assertTrue(canopy.getCenter() instanceof DenseVector); + } } /** @@ -356,7 +357,7 @@ public final class TestMeanShift extends ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file2"), fs, conf); // now run the Job using the run() command. Other tests can continue to use runJob(). Path output = getTestTempDirPath("output"); - System.out.println("Output Path: " + output.toString()); + System.out.println("Output Path: " + output); //MeanShiftCanopyDriver.runJob(input, output, EuclideanDistanceMeasure.class.getName(), 4, 1, 0.5, 10, false, false); String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION), getTestTempDirPath("testdata").toString(), optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(), optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/minhash/TestMinHashClustering.java URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/minhash/TestMinHashClustering.java?rev=1094158&r1=1094157&r2=1094158&view=diff ============================================================================== --- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/minhash/TestMinHashClustering.java (original) +++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/minhash/TestMinHashClustering.java Sun Apr 17 15:30:15 2011 @@ -147,7 +147,7 @@ public class TestMinHashClustering exten public void testLinearMinHashMRJob() throws Exception { String[] args = makeArguments(2, 3, 20, 3, HashType.LINEAR.toString()); int ret = ToolRunner.run(new Configuration(), new MinHashDriver(), args); - assertEquals("Minhash MR Job failed for " + HashType.LINEAR.toString(), 0, ret); + assertEquals("Minhash MR Job failed for " + HashType.LINEAR, 0, ret); verify(output, 0.2, "Hash Type: LINEAR"); } @@ -155,7 +155,7 @@ public class TestMinHashClustering exten public void testPolynomialMinHashMRJob() throws Exception { String[] args = makeArguments(2, 3, 20, 3, HashType.POLYNOMIAL.toString()); int ret = ToolRunner.run(new Configuration(), new MinHashDriver(), args); - assertEquals("Minhash MR Job failed for " + HashType.POLYNOMIAL.toString(), 0, ret); + assertEquals("Minhash MR Job failed for " + HashType.POLYNOMIAL, 0, ret); verify(output, 0.3, "Hash Type: POLYNOMIAL"); } @@ -163,7 +163,7 @@ public class TestMinHashClustering exten public void testMurmurMinHashMRJob() throws Exception { String[] args = makeArguments(2, 3, 20, 4, HashType.MURMUR.toString()); int ret = ToolRunner.run(new Configuration(), new MinHashDriver(), args); - assertEquals("Minhash MR Job failed for " + HashType.MURMUR.toString(), 0, ret); + assertEquals("Minhash MR Job failed for " + HashType.MURMUR, 0, ret); verify(output, 0.3, "Hash Type: MURMUR"); } Modified: mahout/trunk/core/src/test/java/org/apache/mahout/common/AbstractJobTest.java URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/common/AbstractJobTest.java?rev=1094158&r1=1094157&r2=1094158&view=diff ============================================================================== --- mahout/trunk/core/src/test/java/org/apache/mahout/common/AbstractJobTest.java (original) +++ mahout/trunk/core/src/test/java/org/apache/mahout/common/AbstractJobTest.java Sun Apr 17 15:30:15 2011 @@ -232,5 +232,5 @@ public final class AbstractJobTest exten testInputPath, job.getInputPath().toString()); assertEquals("output command-line option precedes property", testOutputPath, job.getOutputPath().toString()); - } + } } Modified: mahout/trunk/core/src/test/java/org/apache/mahout/common/distance/DefaultDistanceMeasureTest.java URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/common/distance/DefaultDistanceMeasureTest.java?rev=1094158&r1=1094157&r2=1094158&view=diff ============================================================================== --- mahout/trunk/core/src/test/java/org/apache/mahout/common/distance/DefaultDistanceMeasureTest.java (original) +++ mahout/trunk/core/src/test/java/org/apache/mahout/common/distance/DefaultDistanceMeasureTest.java Sun Apr 17 15:30:15 2011 @@ -87,7 +87,7 @@ public abstract class DefaultDistanceMea for (int a = 0; a < 4; a++) { for (int b = 0; b < 4; b++) { assertTrue("Distance between vectors less than zero: " - + distanceMatrix[a][b] + " = " + distanceMeasure.toString() + + + distanceMatrix[a][b] + " = " + distanceMeasure + ".distance("+ vectors[a].asFormatString() + ", " + vectors[b].asFormatString() + ')', distanceMatrix[a][b] >= 0); Modified: mahout/trunk/core/src/test/java/org/apache/mahout/df/mapreduce/partial/MockContext.java URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/df/mapreduce/partial/MockContext.java?rev=1094158&r1=1094157&r2=1094158&view=diff ============================================================================== --- mahout/trunk/core/src/test/java/org/apache/mahout/df/mapreduce/partial/MockContext.java (original) +++ mahout/trunk/core/src/test/java/org/apache/mahout/df/mapreduce/partial/MockContext.java Sun Apr 17 15:30:15 2011 @@ -31,10 +31,10 @@ final class MockContext extends Context private final TreeID[] keys; private final MapredOutput[] values; - private int index ; + private int index; - MockContext(Mapper<?,?,?,?> mapper, Configuration conf, TaskAttemptID taskid, - int nbTrees) throws IOException, InterruptedException { + MockContext(Mapper<?,?,?,?> mapper, Configuration conf, TaskAttemptID taskid, int nbTrees) + throws IOException, InterruptedException { mapper.super(conf, taskid, null, null, null, null, null); keys = new TreeID[nbTrees]; Modified: mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/PFPGrowthRetailDataTest.java URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/PFPGrowthRetailDataTest.java?rev=1094158&r1=1094157&r2=1094158&view=diff ============================================================================== --- mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/PFPGrowthRetailDataTest.java (original) +++ mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/PFPGrowthRetailDataTest.java Sun Apr 17 15:30:15 2011 @@ -18,8 +18,6 @@ package org.apache.mahout.fpm.pfpgrowth; import java.io.File; -import java.io.FileOutputStream; -import java.io.OutputStreamWriter; import java.io.Writer; import java.util.ArrayList; import java.util.Collection; Modified: mahout/trunk/math/src/main/java/org/apache/mahout/math/Algebra.java URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/Algebra.java?rev=1094158&r1=1094157&r2=1094158&view=diff ============================================================================== --- mahout/trunk/math/src/main/java/org/apache/mahout/math/Algebra.java (original) +++ mahout/trunk/math/src/main/java/org/apache/mahout/math/Algebra.java Sun Apr 17 15:30:15 2011 @@ -54,10 +54,10 @@ public final class Algebra { /** * Compute Maximum Absolute Row Sum Norm of input Matrix m * http://mathworld.wolfram.com/MaximumAbsoluteRowSumNorm.html - */ + */ public static double getNorm(Matrix m) { double max = 0.0; - for (int i = 0; i < m.numRows(); i++) { + for (int i = 0; i < m.numRows(); i++) { int sum = 0; Vector cv = m.getRow(i); for (int j = 0; j < cv.size(); j++) { @@ -68,6 +68,6 @@ public final class Algebra { } } return max; - } - + } + } Modified: mahout/trunk/math/src/main/java/org/apache/mahout/math/SingularValueDecomposition.java URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/SingularValueDecomposition.java?rev=1094158&r1=1094157&r2=1094158&view=diff ============================================================================== --- mahout/trunk/math/src/main/java/org/apache/mahout/math/SingularValueDecomposition.java (original) +++ mahout/trunk/math/src/main/java/org/apache/mahout/math/SingularValueDecomposition.java Sun Apr 17 15:30:15 2011 @@ -34,7 +34,7 @@ public class SingularValueDecomposition */ public SingularValueDecomposition(Matrix arg) { if (arg.numRows() < arg.numCols()) { - transpositionNeeded = true; + transpositionNeeded = true; } // Derived from LINPACK code. Modified: mahout/trunk/math/src/main/java/org/apache/mahout/math/VectorView.java URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/VectorView.java?rev=1094158&r1=1094157&r2=1094158&view=diff ============================================================================== --- mahout/trunk/math/src/main/java/org/apache/mahout/math/VectorView.java (original) +++ mahout/trunk/math/src/main/java/org/apache/mahout/math/VectorView.java Sun Apr 17 15:30:15 2011 @@ -153,7 +153,7 @@ public class VectorView extends Abstract } - private class DecoratorElement implements Element { + private final class DecoratorElement implements Element { private final Element decorated; Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/benchmark/VectorBenchmarks.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/benchmark/VectorBenchmarks.java?rev=1094158&r1=1094157&r2=1094158&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/benchmark/VectorBenchmarks.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/benchmark/VectorBenchmarks.java Sun Apr 17 15:30:15 2011 @@ -707,8 +707,8 @@ public class VectorBenchmarks { abuilder.withName("nv").withMinimum(1).withMaximum(1).create()).withDescription( "Number of Vectors to create. Default: 100").withShortName("nv").create(); Option numClustersOpt = obuilder.withLongName("numClusters").withRequired(false).withArgument( - abuilder.withName("vs").withMinimum(1).withMaximum(1).create()).withDescription( - "Number of Vectors to create. Default: 10").withShortName("vs").create(); + abuilder.withName("vs").withMinimum(1).withMaximum(1).create()).withDescription( + "Number of Vectors to create. Default: 10").withShortName("vs").create(); Option loopOpt = obuilder.withLongName("loop").withRequired(false).withArgument( abuilder.withName("loop").withMinimum(1).withMaximum(1).create()).withDescription( "Number of times to loop. Default: 200").withShortName("l").create(); @@ -741,7 +741,7 @@ public class VectorBenchmarks { int numClusters=25; if (cmdLine.hasOption(numClustersOpt)) { - numClusters = Integer.parseInt((String) cmdLine.getValue(numClustersOpt)); + numClusters = Integer.parseInt((String) cmdLine.getValue(numClustersOpt)); } int sparsity = 1000; Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsMapper.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsMapper.java?rev=1094158&r1=1094157&r2=1094158&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsMapper.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsMapper.java Sun Apr 17 15:30:15 2011 @@ -26,12 +26,12 @@ import java.util.Map; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.mapred.OutputLogFilter; import org.apache.hadoop.mapreduce.Mapper; import org.apache.mahout.clustering.WeightedVectorWritable; import org.apache.mahout.common.Pair; import org.apache.mahout.common.distance.DistanceMeasure; import org.apache.mahout.common.distance.EuclideanDistanceMeasure; +import org.apache.mahout.common.iterator.sequencefile.PathFilters; import org.apache.mahout.common.iterator.sequencefile.PathType; import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable; import org.apache.mahout.math.VectorWritable; @@ -107,9 +107,11 @@ public class RepresentativePointsMapper public static Map<Integer, List<VectorWritable>> getRepresentativePoints(Configuration conf, Path statePath) { Map<Integer, List<VectorWritable>> representativePoints = new HashMap<Integer, List<VectorWritable>>(); - for (Pair<IntWritable,VectorWritable> record : - new SequenceFileDirIterable<IntWritable,VectorWritable>( - statePath, PathType.LIST, new OutputLogFilter(), conf)) { + for (Pair<IntWritable,VectorWritable> record + : new SequenceFileDirIterable<IntWritable,VectorWritable>(statePath, + PathType.LIST, + PathFilters.logsCRCFilter(), + conf)) { int keyValue = record.getFirst().get(); List<VectorWritable> repPoints = representativePoints.get(keyValue); if (repPoints == null) { Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java?rev=1094158&r1=1094157&r2=1094158&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java Sun Apr 17 15:30:15 2011 @@ -43,90 +43,90 @@ public class MailArchivesClusteringAnaly // extended set of stop words composed of common mail terms like "hi", // HTML tags, and Java keywords asmany of the messages in the archives // are subversion check-in notifications - private static final String[] STOP_WORDS = { - "3d","7bit","a0","about","above","abstract","across","additional","after", - "afterwards","again","against","align","all","almost","alone","along", - "already","also","although","always","am","among","amongst","amoungst", - "amount","an","and","another","any","anybody","anyhow","anyone","anything", - "anyway","anywhere","are","arial","around","as","ascii","assert","at", - "back","background","base64","bcc","be","became","because","become","becomes", - "becoming","been","before","beforehand","behind","being","below","beside", - "besides","between","beyond","bgcolor","blank","blockquote","body","boolean", - "border","both","br","break","but","by","can","cannot","cant","case","catch", - "cc","cellpadding","cellspacing","center","char","charset","cheers","class", - "co","color","colspan","com","con","const","continue","could","couldnt", - "cry","css","de","dear","default","did","didnt","different","div","do", - "does","doesnt","done","dont","double","down","due","during","each","eg", - "eight","either","else","elsewhere","empty","encoding","enough","enum", - "etc","eu","even","ever","every","everyone","everything","everywhere", - "except","extends","face","family","few","ffffff","final","finally","float", - "font","for","former","formerly","fri","from","further","get","give","go", - "good","got","goto","gt","h1","ha","had","has","hasnt","have","he","head", - "height","hello","helvetica","hence","her","here","hereafter","hereby", - "herein","hereupon","hers","herself","hi","him","himself","his","how", - "however","hr","href","html","http","https","id","ie","if","ill","im", - "image","img","implements","import","in","inc","instanceof","int","interface", - "into","is","isnt","iso-8859-1","it","its","itself","ive","just","keep", - "last","latter","latterly","least","left","less","li","like","long","look", - "lt","ltd","mail","mailto","many","margin","may","me","meanwhile","message", - "meta","might","mill","mine","mon","more","moreover","most","mostly","mshtml", - "mso","much","must","my","myself","name","namely","native","nbsp","need", - "neither","never","nevertheless","new","next","nine","no","nobody","none", - "noone","nor","not","nothing","now","nowhere","null","of","off","often", - "ok","on","once","only","onto","or","org","other","others","otherwise", - "our","ours","ourselves","out","over","own","package","pad","per","perhaps", - "plain","please","pm","printable","private","protected","public","put", - "quot","quote","r1","r2","rather","re","really","regards","reply","return", - "right","said","same","sans","sat","say","saying","see","seem","seemed", - "seeming","seems","serif","serious","several","she","short","should","show", - "side","since","sincere","six","sixty","size","so","solid","some","somehow", - "someone","something","sometime","sometimes","somewhere","span","src", - "static","still","strictfp","string","strong","style","stylesheet","subject", - "such","sun","super","sure","switch","synchronized","table","take","target", - "td","text","th","than","thanks","that","the","their","them","themselves", - "then","thence","there","thereafter","thereby","therefore","therein","thereupon", - "these","they","thick","thin","think","third","this","those","though", - "three","through","throughout","throw","throws","thru","thu","thus","tm", - "to","together","too","top","toward","towards","tr","transfer","transient", - "try","tue","type","ul","un","under","unsubscribe","until","up","upon", - "us","use","used","uses","using","valign","verdana","very","via","void", - "volatile","want","was","we","wed","weight","well","were","what","whatever", - "when","whence","whenever","where","whereafter","whereas","whereby","wherein", - "whereupon","wherever","whether","which","while","whither","who","whoever", - "whole","whom","whose","why","width","will","with","within","without", - "wont","would","wrote","www","yes","yet","you","your","yours","yourself", - "yourselves" - }; + private static final String[] STOP_WORDS = { + "3d","7bit","a0","about","above","abstract","across","additional","after", + "afterwards","again","against","align","all","almost","alone","along", + "already","also","although","always","am","among","amongst","amoungst", + "amount","an","and","another","any","anybody","anyhow","anyone","anything", + "anyway","anywhere","are","arial","around","as","ascii","assert","at", + "back","background","base64","bcc","be","became","because","become","becomes", + "becoming","been","before","beforehand","behind","being","below","beside", + "besides","between","beyond","bgcolor","blank","blockquote","body","boolean", + "border","both","br","break","but","by","can","cannot","cant","case","catch", + "cc","cellpadding","cellspacing","center","char","charset","cheers","class", + "co","color","colspan","com","con","const","continue","could","couldnt", + "cry","css","de","dear","default","did","didnt","different","div","do", + "does","doesnt","done","dont","double","down","due","during","each","eg", + "eight","either","else","elsewhere","empty","encoding","enough","enum", + "etc","eu","even","ever","every","everyone","everything","everywhere", + "except","extends","face","family","few","ffffff","final","finally","float", + "font","for","former","formerly","fri","from","further","get","give","go", + "good","got","goto","gt","h1","ha","had","has","hasnt","have","he","head", + "height","hello","helvetica","hence","her","here","hereafter","hereby", + "herein","hereupon","hers","herself","hi","him","himself","his","how", + "however","hr","href","html","http","https","id","ie","if","ill","im", + "image","img","implements","import","in","inc","instanceof","int","interface", + "into","is","isnt","iso-8859-1","it","its","itself","ive","just","keep", + "last","latter","latterly","least","left","less","li","like","long","look", + "lt","ltd","mail","mailto","many","margin","may","me","meanwhile","message", + "meta","might","mill","mine","mon","more","moreover","most","mostly","mshtml", + "mso","much","must","my","myself","name","namely","native","nbsp","need", + "neither","never","nevertheless","new","next","nine","no","nobody","none", + "noone","nor","not","nothing","now","nowhere","null","of","off","often", + "ok","on","once","only","onto","or","org","other","others","otherwise", + "our","ours","ourselves","out","over","own","package","pad","per","perhaps", + "plain","please","pm","printable","private","protected","public","put", + "quot","quote","r1","r2","rather","re","really","regards","reply","return", + "right","said","same","sans","sat","say","saying","see","seem","seemed", + "seeming","seems","serif","serious","several","she","short","should","show", + "side","since","sincere","six","sixty","size","so","solid","some","somehow", + "someone","something","sometime","sometimes","somewhere","span","src", + "static","still","strictfp","string","strong","style","stylesheet","subject", + "such","sun","super","sure","switch","synchronized","table","take","target", + "td","text","th","than","thanks","that","the","their","them","themselves", + "then","thence","there","thereafter","thereby","therefore","therein","thereupon", + "these","they","thick","thin","think","third","this","those","though", + "three","through","throughout","throw","throws","thru","thu","thus","tm", + "to","together","too","top","toward","towards","tr","transfer","transient", + "try","tue","type","ul","un","under","unsubscribe","until","up","upon", + "us","use","used","uses","using","valign","verdana","very","via","void", + "volatile","want","was","we","wed","weight","well","were","what","whatever", + "when","whence","whenever","where","whereafter","whereas","whereby","wherein", + "whereupon","wherever","whether","which","while","whither","who","whoever", + "whole","whom","whose","why","width","will","with","within","without", + "wont","would","wrote","www","yes","yet","you","your","yours","yourself", + "yourselves" + }; - // Regex used to exclude non-alpha-numeric tokens + // Regex used to exclude non-alpha-numeric tokens private static final Pattern alphaNumeric = Pattern.compile("^[a-z][a-z0-9_]+$"); private final CharArraySet stopSet; - public MailArchivesClusteringAnalyzer() { - stopSet = (CharArraySet)StopFilter.makeStopSet(Arrays.asList(STOP_WORDS)); + public MailArchivesClusteringAnalyzer() { + stopSet = (CharArraySet)StopFilter.makeStopSet(Arrays.asList(STOP_WORDS)); /* - Collection<String> tmp = new java.util.TreeSet<String>(); + Collection<String> tmp = new java.util.TreeSet<String>(); for (Object entry : stopSet) { tmp.add(entry.toString()); } */ - } + } + + public MailArchivesClusteringAnalyzer(CharArraySet stopSet) { + this.stopSet = stopSet; + } - public MailArchivesClusteringAnalyzer(CharArraySet stopSet) { - this.stopSet = stopSet; - } - - @Override - public TokenStream tokenStream(String fieldName, java.io.Reader reader) { - @SuppressWarnings("deprecation") - TokenStream result = new StandardTokenizer(Version.LUCENE_CURRENT, reader); - result = new StandardFilter(result); - result = new LowerCaseFilter(result); + @Override + public TokenStream tokenStream(String fieldName, java.io.Reader reader) { + @SuppressWarnings("deprecation") + TokenStream result = new StandardTokenizer(Version.LUCENE_CURRENT, reader); + result = new StandardFilter(result); + result = new LowerCaseFilter(result); result = new ASCIIFoldingFilter(result); result = new AlphaNumericMaxLengthFilter(result); - result = new StopFilter(false, result, stopSet); - return new PorterStemFilter(result); - } + result = new StopFilter(false, result, stopSet); + return new PorterStemFilter(result); + } /** * Matches alpha-numeric tokens between 2 and 40 chars long. @@ -136,35 +136,35 @@ public class MailArchivesClusteringAnaly private final char[] output = new char[28]; private final Matcher matcher; - AlphaNumericMaxLengthFilter(TokenStream in) { - super(in); - termAtt = addAttribute(TermAttribute.class); - matcher = alphaNumeric.matcher("foo"); - } - - @Override - public final boolean incrementToken() throws IOException { - // return the first alpha-numeric token between 2 and 40 length - while (input.incrementToken()) { - int length = termAtt.termLength(); - if (length >= 2 && length <= 28) { - char[] buf = termAtt.termBuffer(); - int at = 0; - for (int c=0; c < length; c++) { - char ch = buf[c]; - if (ch != '\'') { - output[at++] = ch; - } - } - String term = new String(output, 0, at); - matcher.reset(term); - if (matcher.matches() && !term.startsWith("a0")) { + AlphaNumericMaxLengthFilter(TokenStream in) { + super(in); + termAtt = addAttribute(TermAttribute.class); + matcher = alphaNumeric.matcher("foo"); + } + + @Override + public final boolean incrementToken() throws IOException { + // return the first alpha-numeric token between 2 and 40 length + while (input.incrementToken()) { + int length = termAtt.termLength(); + if (length >= 2 && length <= 28) { + char[] buf = termAtt.termBuffer(); + int at = 0; + for (int c=0; c < length; c++) { + char ch = buf[c]; + if (ch != '\'') { + output[at++] = ch; + } + } + String term = new String(output, 0, at); + matcher.reset(term); + if (matcher.matches() && !term.startsWith("a0")) { termAtt.setTermBuffer(term); - return true; - } - } - } - return false; - } + return true; + } + } + } + return false; + } } } Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/eval/ParallelFactorizationEvaluator.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/eval/ParallelFactorizationEvaluator.java?rev=1094158&r1=1094157&r2=1094158&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/eval/ParallelFactorizationEvaluator.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/eval/ParallelFactorizationEvaluator.java Sun Apr 17 15:30:15 2011 @@ -18,13 +18,11 @@ package org.apache.mahout.utils.eval; import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.DoubleWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; @@ -88,7 +86,7 @@ public class ParallelFactorizationEvalua "--itemFeatures", parsedArgs.get("--itemFeatures"), "--tempDir", tempDir.toString() }); - Job estimationErrors = prepareJob(new Path(parsedArgs.get("--pairs") + "," + predictions.toString()), errors, + Job estimationErrors = prepareJob(new Path(parsedArgs.get("--pairs") + ',' + predictions), errors, TextInputFormat.class, PairsWithRatingMapper.class, IntPairWritable.class, DoubleWritable.class, ErrorReducer.class, DoubleWritable.class, NullWritable.class, SequenceFileOutputFormat.class); estimationErrors.waitForCompletion(true);
