Modified: mahout/trunk/math/src/main/java/org/apache/mahout/math/decomposer/hebbian/HebbianSolver.java URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/decomposer/hebbian/HebbianSolver.java?rev=1072858&r1=1072857&r2=1072858&view=diff ============================================================================== --- mahout/trunk/math/src/main/java/org/apache/mahout/math/decomposer/hebbian/HebbianSolver.java (original) +++ mahout/trunk/math/src/main/java/org/apache/mahout/math/decomposer/hebbian/HebbianSolver.java Mon Feb 21 06:47:02 2011 @@ -46,6 +46,7 @@ import org.slf4j.LoggerFactory; public class HebbianSolver { private static final Logger log = LoggerFactory.getLogger(HebbianSolver.class); + private static final boolean DEBUG = false; private final EigenUpdater updater; private final SingularVectorVerifier verifier; @@ -54,7 +55,6 @@ public class HebbianSolver { private final Random rng = RandomUtils.getRandom(); private int numPasses = 0; - private static final boolean debug = false; /** * Creates a new HebbianSolver @@ -198,13 +198,13 @@ public class HebbianSolver { } } state.setFirstPass(false); - if (debug) { + if (DEBUG) { if (previousEigen == null) { previousEigen = currentEigen.clone(); } else { double dot = currentEigen.dot(previousEigen); - if (dot > 0) { - dot /= (currentEigen.norm(2) * previousEigen.norm(2)); + if (dot > 0.0) { + dot /= currentEigen.norm(2) * previousEigen.norm(2); } // log.info("Current pass * previous pass = {}", dot); } @@ -249,8 +249,7 @@ public class HebbianSolver { double r = rng.nextDouble(); index = (int) (r * corpus.numRows()); v = corpus.getRow(index); - } - while (v == null || v.norm(2) == 0 || v.getNumNondefaultElements() < 5); + } while (v == null || v.norm(2) == 0 || v.getNumNondefaultElements() < 5); return index; } @@ -281,7 +280,7 @@ public class HebbianSolver { currentPseudoEigen.assign(previousEigen, new PlusMult(-state.getHelperVector().get(i))); state.getHelperVector().set(i, 0); } - if (debug && currentPseudoEigen.norm(2) > 0) { + if (DEBUG && currentPseudoEigen.norm(2) > 0) { for (int i = 0; i < state.getNumEigensProcessed(); i++) { Vector previousEigen = previousEigens.getRow(i); log.info("dot with previous: {}", (previousEigen.dot(currentPseudoEigen)) / currentPseudoEigen.norm(2)); @@ -294,10 +293,12 @@ public class HebbianSolver { if (status.inProgress()) { log.info("Verifier not finished, making another pass..."); } else { - log.info("Has 1 - cosAngle: {}, convergence target is: {}", (1 - status.getCosAngle()), convergenceTarget); + log.info("Has 1 - cosAngle: {}, convergence target is: {}", 1.0 - status.getCosAngle(), convergenceTarget); state.getStatusProgress().add(status); } - return (state.getStatusProgress().size() <= maxPassesPerEigen && 1 - status.getCosAngle() > convergenceTarget); + return + state.getStatusProgress().size() <= maxPassesPerEigen + && 1.0 - status.getCosAngle() > convergenceTarget; } protected EigenStatus verify(Matrix corpus, Vector currentPseudoEigen) { @@ -323,16 +324,15 @@ public class HebbianSolver { HebbianUpdater updater = new HebbianUpdater(); SingularVectorVerifier verifier = new AsyncEigenVerifier(); - HebbianSolver solver = new HebbianSolver(updater, - verifier, - convergence, - maxPasses); + HebbianSolver solver = new HebbianSolver(updater, verifier, convergence, maxPasses); Matrix corpus = null; + /* if (numThreads <= 1) { // corpus = new DiskBufferedDoubleMatrix(new File(corpusDir), inBufferSize); } else { // corpus = new ParallelMultiplyingDiskBufferedDoubleMatrix(new File(corpusDir), inBufferSize, numThreads); } + */ long now = System.currentTimeMillis(); TrainingState finalState = solver.solve(corpus, rank); long time = (System.currentTimeMillis() - now) / 1000;
Modified: mahout/trunk/math/src/main/java/org/apache/mahout/math/decomposer/lanczos/LanczosSolver.java URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/decomposer/lanczos/LanczosSolver.java?rev=1072858&r1=1072857&r2=1072858&view=diff ============================================================================== --- mahout/trunk/math/src/main/java/org/apache/mahout/math/decomposer/lanczos/LanczosSolver.java (original) +++ mahout/trunk/math/src/main/java/org/apache/mahout/math/decomposer/lanczos/LanczosSolver.java Mon Feb 21 06:47:02 2011 @@ -27,9 +27,9 @@ import org.apache.mahout.math.Matrix; import org.apache.mahout.math.MatrixSlice; import org.apache.mahout.math.SparseRowMatrix; import org.apache.mahout.math.VectorIterable; +import org.apache.mahout.math.function.Functions; import org.apache.mahout.math.function.PlusMult; import org.apache.mahout.math.function.DoubleFunction; -import static org.apache.mahout.math.function.Functions.*; import org.apache.mahout.math.Vector; import org.apache.mahout.math.matrix.DoubleMatrix1D; import org.apache.mahout.math.matrix.DoubleMatrix2D; @@ -68,7 +68,6 @@ public class LanczosSolver { private static final Logger log = LoggerFactory.getLogger(LanczosSolver.class); public static final double SAFE_MAX = 1.0e150; - private static final double NANOS_IN_MILLI = 1.0e6; public enum TimingSection { @@ -77,7 +76,7 @@ public class LanczosSolver { private final Map<TimingSection, Long> startTimes = new EnumMap<TimingSection, Long>(TimingSection.class); private final Map<TimingSection, Long> times = new EnumMap<TimingSection, Long>(TimingSection.class); - protected double scaleFactor = 0.0; + private double scaleFactor; private static final class Scale implements DoubleFunction { private final double d; @@ -115,7 +114,7 @@ public class LanczosSolver { Vector nextVector = isSymmetric ? corpus.times(currentVector) : corpus.timesSquared(currentVector); log.info("{} passes through the corpus so far...", i); calculateScaleFactor(nextVector); - nextVector.assign(new Scale(1 / scaleFactor)); + nextVector.assign(new Scale(1.0 / scaleFactor)); nextVector.assign(previousVector, new PlusMult(-beta)); // now orthogonalize double alpha = currentVector.dot(nextVector); @@ -170,7 +169,7 @@ public class LanczosSolver { } protected void calculateScaleFactor(Vector nextVector) { - if(scaleFactor == 0) { + if (scaleFactor == 0.0) { scaleFactor = nextVector.norm(2); } } @@ -200,10 +199,10 @@ public class LanczosSolver { if (v == null) { v = new DenseVector(vector.size()).plus(vector); } else { - v.assign(vector, PLUS); + v.assign(vector, Functions.PLUS); } } - v.assign(div(v.norm(2))); + v.assign(Functions.div(v.norm(2))); return v; } Modified: mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/math/Polynomial.java URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/math/Polynomial.java?rev=1072858&r1=1072857&r2=1072858&view=diff ============================================================================== --- mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/math/Polynomial.java (original) +++ mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/math/Polynomial.java Mon Feb 21 06:47:02 2011 @@ -57,7 +57,7 @@ public final class Polynomial { * @param coef the coefficients of the polynomial. * @param N the degree of the polynomial. */ - public static double p1evl(double x, double[] coef, int N) throws ArithmeticException { + public static double p1evl(double x, double[] coef, int N) { double ans = x + coef[0]; @@ -86,7 +86,7 @@ public final class Polynomial { * @param coef the coefficients of the polynomial. * @param N the degree of the polynomial. */ - public static double polevl(double x, double[] coef, int N) throws ArithmeticException { + public static double polevl(double x, double[] coef, int N) { double ans = coef[0]; for (int i = 1; i <= N; i++) { Modified: mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/random/Gamma.java URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/random/Gamma.java?rev=1072858&r1=1072857&r2=1072858&view=diff ============================================================================== --- mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/random/Gamma.java (original) +++ mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/random/Gamma.java Mon Feb 21 06:47:02 2011 @@ -107,12 +107,12 @@ public class Gamma extends AbstractConti if (p <= 1.0) { // Step 2. Case gds <= 1 gds = Math.exp(Math.log(p) / alpha); if (Math.log(randomGenerator.nextDouble()) <= -gds) { - return (gds / rate); + return gds / rate; } } else { // Step 3. Case gds > 1 gds = -Math.log((b - p) / alpha); if (Math.log(randomGenerator.nextDouble()) <= ((alpha - 1.0) * Math.log(gds))) { - return (gds / rate); + return gds / rate; } } } @@ -137,12 +137,12 @@ public class Gamma extends AbstractConti double x = s + 0.5 * t; gds = x * x; if (t >= 0.0) { - return (gds / rate); + return gds / rate; } // Immediate acceptance double u = randomGenerator.nextDouble(); if (d * u <= t * t * t) { - return (gds / rate); + return gds / rate; } // Squeeze acceptance double q0 = 0.0; @@ -159,8 +159,7 @@ public class Gamma extends AbstractConti double q3 = 0.0079849875; double q2 = 0.0208333723; double q1 = 0.0416666664; - q0 = ((((((((q9 * r + q8) * r + q7) * r + q6) * r + q5) * r + q4) * - r + q3) * r + q2) * r + q1) * r; + q0 = ((((((((q9 * r + q8) * r + q7) * r + q6) * r + q5) * r + q4) * r + q3) * r + q2) * r + q1) * r; if (alpha > 3.686) { if (alpha > 13.022) { b = 1.77; @@ -193,11 +192,11 @@ public class Gamma extends AbstractConti if (Math.abs(v) > 0.25) { q = q0 - s * t + 0.25 * t * t + (ss + ss) * Math.log(1.0 + v); } else { - q = q0 + 0.5 * t * t * ((((((((a9 * v + a8) * v + a7) * v + a6) * - v + a5) * v + a4) * v + a3) * v + a2) * v + a1) * v; + q = q0 + 0.5 * t * t * ((((((((a9 * v + a8) * v + a7) * v + a6) + * v + a5) * v + a4) * v + a3) * v + a2) * v + a1) * v; } // Step 7. Quotient acceptance if (Math.log(1.0 - u) <= q) { - return (gds / rate); + return gds / rate; } } @@ -222,8 +221,8 @@ public class Gamma extends AbstractConti if (Math.abs(v) > 0.25) { q = q0 - s * t + 0.25 * t * t + (ss + ss) * Math.log(1.0 + v); } else { - q = q0 + 0.5 * t * t * ((((((((a9 * v + a8) * v + a7) * v + a6) * - v + a5) * v + a4) * v + a3) * v + a2) * v + a1) * v; + q = q0 + 0.5 * t * t * ((((((((a9 * v + a8) * v + a7) * v + a6) + * v + a5) * v + a4) * v + a3) * v + a2) * v + a1) * v; } if (q <= 0.0) { continue; @@ -232,12 +231,11 @@ public class Gamma extends AbstractConti if (q > 0.5) { w = Math.exp(q) - 1.0; } else { - w = ((((((e7 * q + e6) * q + e5) * q + e4) * q + e3) * q + e2) * - q + e1) * q; + w = ((((((e7 * q + e6) * q + e5) * q + e4) * q + e3) * q + e2) * q + e1) * q; } // Step 12. Hat acceptance if (c * u * sign_u <= w * Math.exp(e - 0.5 * t * t)) { x = s + 0.5 * t; - return (x * x / rate); + return x * x / rate; } } } Modified: mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/Gamma.java URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/Gamma.java?rev=1072858&r1=1072857&r2=1072858&view=diff ============================================================================== --- mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/Gamma.java (original) +++ mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/Gamma.java Mon Feb 21 06:47:02 2011 @@ -31,6 +31,8 @@ import org.apache.mahout.math.jet.math.P /** Partially deprecated until unit tests are in place. Until this time, this class/interface is unsupported. */ public final class Gamma { + private static final double MAXSTIR = 143.01608; + private Gamma() { } @@ -664,7 +666,6 @@ public final class Gamma { w = 1.0 + w * Polynomial.polevl(w, coefficients, 4); - double MAXSTIR = 143.01608; if (x > MAXSTIR) { /* Avoid overflow in Math.pow() */ double v = Math.pow(x, 0.5 * x - 0.25); Modified: mahout/trunk/math/src/main/java/org/apache/mahout/math/matrix/linalg/EigenvalueDecomposition.java URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/matrix/linalg/EigenvalueDecomposition.java?rev=1072858&r1=1072857&r2=1072858&view=diff ============================================================================== --- mahout/trunk/math/src/main/java/org/apache/mahout/math/matrix/linalg/EigenvalueDecomposition.java (original) +++ mahout/trunk/math/src/main/java/org/apache/mahout/math/matrix/linalg/EigenvalueDecomposition.java Mon Feb 21 06:47:02 2011 @@ -40,8 +40,8 @@ public final class EigenvalueDecompositi // Complex scalar division. - private transient double cdivr; - private transient double cdivi; + private double cdivr; + private double cdivi; /** * Constructs and returns a new eigenvalue decomposition object; The decomposed matrices can be retrieved via instance Propchange: mahout/trunk/taste-web/ ------------------------------------------------------------------------------ --- svn:ignore (original) +++ svn:ignore Mon Feb 21 06:47:02 2011 @@ -1,10 +1,9 @@ +.pmd .classpath -.settings -*.iml -maven-eclipse.xml -target .externalToolBuilders +.ruleset .project +*.iml .checkstyle -.pmd -.ruleset +maven-eclipse.xml +.settings Modified: mahout/trunk/taste-web/src/main/java/org/apache/mahout/cf/taste/web/RecommenderWrapper.java URL: http://svn.apache.org/viewvc/mahout/trunk/taste-web/src/main/java/org/apache/mahout/cf/taste/web/RecommenderWrapper.java?rev=1072858&r1=1072857&r2=1072858&view=diff ============================================================================== --- mahout/trunk/taste-web/src/main/java/org/apache/mahout/cf/taste/web/RecommenderWrapper.java (original) +++ mahout/trunk/taste-web/src/main/java/org/apache/mahout/cf/taste/web/RecommenderWrapper.java Mon Feb 21 06:47:02 2011 @@ -60,14 +60,12 @@ public abstract class RecommenderWrapper protected abstract Recommender buildRecommender() throws IOException, TasteException; @Override - public List<RecommendedItem> recommend(long userID, int howMany) - throws TasteException { + public List<RecommendedItem> recommend(long userID, int howMany) throws TasteException { return delegate.recommend(userID, howMany); } @Override - public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer) - throws TasteException { + public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException { return delegate.recommend(userID, howMany, rescorer); } @@ -77,8 +75,7 @@ public abstract class RecommenderWrapper } @Override - public void setPreference(long userID, long itemID, float value) - throws TasteException { + public void setPreference(long userID, long itemID, float value) throws TasteException { delegate.setPreference(userID, itemID, value); } Propchange: mahout/trunk/utils/ ------------------------------------------------------------------------------ --- svn:ignore (original) +++ svn:ignore Mon Feb 21 06:47:02 2011 @@ -1,10 +1,9 @@ -output -testdata -*.iml -target -.settings +.pmd .classpath +.ruleset .project +*.iml +output .checkstyle -.pmd -.ruleset +.settings +testdata Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java?rev=1072858&r1=1072857&r2=1072858&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java Mon Feb 21 06:47:02 2011 @@ -65,7 +65,9 @@ public class CDbwEvaluator { * @param measure * an appropriate DistanceMeasure */ - public CDbwEvaluator(Map<Integer, List<VectorWritable>> representativePoints, List<Cluster> clusters, DistanceMeasure measure) { + public CDbwEvaluator(Map<Integer, List<VectorWritable>> representativePoints, + List<Cluster> clusters, + DistanceMeasure measure) { this.representativePoints = representativePoints; this.clusters = clusters; this.measure = measure; @@ -82,8 +84,8 @@ public class CDbwEvaluator { * @param clustersIn * a String path to the input clusters directory */ - public CDbwEvaluator(Configuration conf, Path clustersIn) throws ClassNotFoundException, InstantiationException, - IllegalAccessException, IOException { + public CDbwEvaluator(Configuration conf, Path clustersIn) + throws ClassNotFoundException, InstantiationException, IllegalAccessException, IOException { ClassLoader ccl = Thread.currentThread().getContextClassLoader(); measure = ccl.loadClass(conf.get(RepresentativePointsDriver.DISTANCE_MEASURE_KEY)).asSubclass(DistanceMeasure.class) .newInstance(); @@ -101,8 +103,8 @@ public class CDbwEvaluator { * a String pathname to the directory containing input cluster files * @return a List<Cluster> of the clusters */ - private static List<Cluster> loadClusters(Configuration conf, Path clustersIn) throws InstantiationException, - IllegalAccessException, IOException { + private static List<Cluster> loadClusters(Configuration conf, Path clustersIn) + throws InstantiationException, IllegalAccessException, IOException { List<Cluster> clusters = new ArrayList<Cluster>(); FileSystem fs = clustersIn.getFileSystem(conf); for (FileStatus part : fs.listStatus(clustersIn)) { @@ -246,7 +248,7 @@ public class CDbwEvaluator { for (VectorWritable pt : repPtsI) { // compute f(x, vIJ) (eqn 7) Vector repJ = pt.get(); - double densityIJ = (measure.distance(cluster.getCenter(), repJ) <= stdev ? 1.0 : 0.0); + double densityIJ = measure.distance(cluster.getCenter(), repJ) <= stdev ? 1.0 : 0.0; // accumulate sumJ sumJ += densityIJ / stdev; } Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/ClusterEvaluator.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/ClusterEvaluator.java?rev=1072858&r1=1072857&r2=1072858&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/ClusterEvaluator.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/ClusterEvaluator.java Mon Feb 21 06:47:02 2011 @@ -46,7 +46,7 @@ public class ClusterEvaluator { private final DistanceMeasure measure; - private boolean pruned = false; + private boolean pruned; /** * For testing only @@ -58,7 +58,8 @@ public class ClusterEvaluator { * @param measure * an appropriate DistanceMeasure */ - public ClusterEvaluator(Map<Integer, List<VectorWritable>> representativePoints, List<Cluster> clusters, DistanceMeasure measure) { + public ClusterEvaluator(Map<Integer, List<VectorWritable>> representativePoints, + List<Cluster> clusters, DistanceMeasure measure) { this.representativePoints = representativePoints; this.clusters = clusters; this.measure = measure; @@ -72,8 +73,8 @@ public class ClusterEvaluator { * @param clustersIn * a String path to the input clusters directory */ - public ClusterEvaluator(Configuration conf, Path clustersIn) throws ClassNotFoundException, InstantiationException, - IllegalAccessException, IOException { + public ClusterEvaluator(Configuration conf, Path clustersIn) + throws ClassNotFoundException, InstantiationException, IllegalAccessException, IOException { ClassLoader ccl = Thread.currentThread().getContextClassLoader(); measure = ccl.loadClass(conf.get(RepresentativePointsDriver.DISTANCE_MEASURE_KEY)).asSubclass(DistanceMeasure.class) .newInstance(); @@ -88,8 +89,8 @@ public class ClusterEvaluator { * a String pathname to the directory containing input cluster files * @return a List<Cluster> of the clusters */ - private static List<Cluster> loadClusters(Configuration conf, Path clustersIn) throws InstantiationException, - IllegalAccessException, IOException { + private static List<Cluster> loadClusters(Configuration conf, Path clustersIn) + throws InstantiationException, IllegalAccessException, IOException { List<Cluster> clusters = new ArrayList<Cluster>(); FileSystem fs = clustersIn.getFileSystem(conf); for (FileStatus part : fs.listStatus(clustersIn)) { Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java?rev=1072858&r1=1072857&r2=1072858&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java Mon Feb 21 06:47:02 2011 @@ -112,7 +112,7 @@ public final class RepresentativePointsD } private static void writeInitialState(Path output, Path clustersIn) - throws InstantiationException, IllegalAccessException, IOException, SecurityException { + throws InstantiationException, IllegalAccessException, IOException { Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(output.toUri(), conf); for (FileStatus part : fs.listStatus(clustersIn)) { Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java?rev=1072858&r1=1072857&r2=1072858&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java Mon Feb 21 06:47:02 2011 @@ -64,8 +64,8 @@ public final class SequenceFilesFromDire int chunkSizeInMB, Charset charset, String fileFilterClassName) - throws IllegalArgumentException, InstantiationException, IllegalAccessException, InvocationTargetException, - IOException, SecurityException, NoSuchMethodException, ClassNotFoundException { + throws InstantiationException, IllegalAccessException, InvocationTargetException, IOException, + NoSuchMethodException, ClassNotFoundException { FileSystem fs = FileSystem.get(conf); ChunkedWriter writer = new ChunkedWriter(conf, chunkSizeInMB, output); Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java?rev=1072858&r1=1072857&r2=1072858&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java Mon Feb 21 06:47:02 2011 @@ -55,39 +55,24 @@ import org.slf4j.LoggerFactory; public final class ClusterDumper extends AbstractJob { public static final String OUTPUT_OPTION = "output"; - public static final String DICTIONARY_TYPE_OPTION = "dictionaryType"; - public static final String DICTIONARY_OPTION = "dictionary"; - public static final String POINTS_DIR_OPTION = "pointsDir"; - public static final String JSON_OPTION = "json"; - public static final String NUM_WORDS_OPTION = "numWords"; - public static final String SUBSTRING_OPTION = "substring"; - public static final String SEQ_FILE_DIR_OPTION = "seqFileDir"; private static final Logger log = LoggerFactory.getLogger(ClusterDumper.class); private Path seqFileDir; - private Path pointsDir; - private String termDictionary; - private String dictionaryFormat; - private String outputFile; - private int subString = Integer.MAX_VALUE; - private int numTopFeatures = 10; - private Map<Integer, List<WeightedVectorWritable>> clusterIdToPoints; - private boolean useJSON; public ClusterDumper(Path seqFileDir, Path pointsDir) throws IOException { @@ -306,11 +291,9 @@ public final class ClusterDumper extends return result; } - static class TermIndexWeight { - private int index = -1; - + private static class TermIndexWeight { + private final int index; private final double weight; - TermIndexWeight(int index, double weight) { this.index = index; this.weight = weight; Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java?rev=1072858&r1=1072857&r2=1072858&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java Mon Feb 21 06:47:02 2011 @@ -18,13 +18,26 @@ package org.apache.mahout.utils.vectors; public class TermEntry { - public final String term; - public final int termIdx; - public final int docFreq; + + private final String term; + private final int termIdx; + private final int docFreq; public TermEntry(String term, int termIdx, int docFreq) { this.term = term; this.termIdx = termIdx; this.docFreq = docFreq; } + + public String getTerm() { + return term; + } + + public int getTermIdx() { + return termIdx; + } + + public int getDocFreq() { + return docFreq; + } } Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterTermInfoWriter.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterTermInfoWriter.java?rev=1072858&r1=1072857&r2=1072858&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterTermInfoWriter.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterTermInfoWriter.java Mon Feb 21 06:47:02 2011 @@ -49,11 +49,11 @@ public class JWriterTermInfoWriter imple writer.write("\n"); while (entIter.hasNext()) { TermEntry entry = entIter.next(); - writer.write(entry.term); + writer.write(entry.getTerm()); writer.write(delimiter); - writer.write(String.valueOf(entry.docFreq)); + writer.write(String.valueOf(entry.getDocFreq())); writer.write(delimiter); - writer.write(String.valueOf(entry.termIdx)); + writer.write(String.valueOf(entry.getTermIdx())); writer.write("\n"); } writer.flush(); Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java?rev=1072858&r1=1072857&r2=1072858&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java Mon Feb 21 06:47:02 2011 @@ -55,7 +55,7 @@ public class CachedTermInfo implements T continue; } TermEntry entry = new TermEntry(term.text(), count++, df); - termEntries.put(entry.term, entry); + termEntries.put(entry.getTerm(), entry); } while (te.next()); te.close(); } Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java?rev=1072858&r1=1072857&r2=1072858&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java Mon Feb 21 06:47:02 2011 @@ -70,58 +70,18 @@ import org.slf4j.LoggerFactory; */ public class ClusterLabels { - static class TermInfoClusterInOut implements Comparable<TermInfoClusterInOut> { - private final String term; - - private final int inClusterDF; - - private final int outClusterDF; - - private double logLikelihoodRatio; - - TermInfoClusterInOut(String term, int inClusterDF, int outClusterDF) { - this.term = term; - this.inClusterDF = inClusterDF; - this.outClusterDF = outClusterDF; - } - - @Override - public int compareTo(TermInfoClusterInOut that) { - int res = -Double.compare(logLikelihoodRatio, that.logLikelihoodRatio); - if (res == 0) { - res = term.compareTo(that.term); - } - return res; - } - - public int getInClusterDiff() { - return this.inClusterDF - this.outClusterDF; - } - } - private static final Logger log = LoggerFactory.getLogger(ClusterLabels.class); public static final int DEFAULT_MIN_IDS = 50; - public static final int DEFAULT_MAX_LABELS = 25; - private final Path seqFileDir; - - private final Path pointsDir; - private final String indexDir; - private final String contentField; - private String idField; - - private Map<Integer, List<WeightedVectorWritable>> clusterIdToPoints; - + private final Map<Integer, List<WeightedVectorWritable>> clusterIdToPoints; private String output; - - private int minNumIds = DEFAULT_MIN_IDS; - - private int maxLabels = DEFAULT_MAX_LABELS; + private int minNumIds; + private int maxLabels; public ClusterLabels(Path seqFileDir, Path pointsDir, @@ -129,16 +89,12 @@ public class ClusterLabels { String contentField, int minNumIds, int maxLabels) throws IOException { - this.seqFileDir = seqFileDir; - this.pointsDir = pointsDir; this.indexDir = indexDir; this.contentField = contentField; this.minNumIds = minNumIds; this.maxLabels = maxLabels; - init(); - } - - private void init() throws IOException { + this.minNumIds = DEFAULT_MIN_IDS; + this.maxLabels = DEFAULT_MAX_LABELS; ClusterDumper clusterDumper = new ClusterDumper(seqFileDir, pointsDir); this.clusterIdToPoints = clusterDumper.getClusterIdToPoints(); } @@ -166,13 +122,13 @@ public class ClusterLabels { writer.write("Term \t\t LLR \t\t In-ClusterDF \t\t Out-ClusterDF "); writer.write('\n'); for (TermInfoClusterInOut termInfo : termInfos) { - writer.write(termInfo.term); + writer.write(termInfo.getTerm()); writer.write("\t\t"); - writer.write(String.valueOf(termInfo.logLikelihoodRatio)); + writer.write(String.valueOf(termInfo.getLogLikelihoodRatio())); writer.write("\t\t"); - writer.write(String.valueOf(termInfo.inClusterDF)); + writer.write(String.valueOf(termInfo.getInClusterDF())); writer.write("\t\t"); - writer.write(String.valueOf(termInfo.outClusterDF)); + writer.write(String.valueOf(termInfo.getOutClusterDF())); writer.write('\n'); } } @@ -247,7 +203,7 @@ public class ClusterLabels { int inclusterDF = (int) termBitset.cardinality(); TermEntry entry = new TermEntry(term.text(), count++, inclusterDF); - termEntryMap.put(entry.term, entry); + termEntryMap.put(entry.getTerm(), entry); } while (te.next()); te.close(); @@ -256,11 +212,12 @@ public class ClusterLabels { int clusterSize = wvws.size(); for (TermEntry termEntry : termEntryMap.values()) { - int corpusDF = reader.terms(new Term(this.contentField, termEntry.term)).docFreq(); - int outDF = corpusDF - termEntry.docFreq; - int inDF = termEntry.docFreq; - TermInfoClusterInOut termInfoCluster = new TermInfoClusterInOut(termEntry.term, inDF, outDF); - termInfoCluster.logLikelihoodRatio = scoreDocumentFrequencies(inDF, outDF, clusterSize, numDocs); + int corpusDF = reader.terms(new Term(this.contentField, termEntry.getTerm())).docFreq(); + int outDF = corpusDF - termEntry.getDocFreq(); + int inDF = termEntry.getDocFreq(); + double logLikelihoodRatio = scoreDocumentFrequencies(inDF, outDF, clusterSize, numDocs); + TermInfoClusterInOut termInfoCluster = + new TermInfoClusterInOut(termEntry.getTerm(), inDF, outDF, logLikelihoodRatio); clusteredTermInfo.add(termInfoCluster); } Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java?rev=1072858&r1=1072857&r2=1072858&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java Mon Feb 21 06:47:02 2011 @@ -65,7 +65,7 @@ public class TFDFMapper extends VectorMa public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) { TermEntry entry = termInfo.getTermEntry(field, term); if (entry != null) { - vector.setQuick(entry.termIdx, weight.calculate(frequency, entry.docFreq, numTerms, numDocs)); + vector.setQuick(entry.getTermIdx(), weight.calculate(frequency, entry.getDocFreq(), numTerms, numDocs)); } } Added: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TermInfoClusterInOut.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TermInfoClusterInOut.java?rev=1072858&view=auto ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TermInfoClusterInOut.java (added) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TermInfoClusterInOut.java Mon Feb 21 06:47:02 2011 @@ -0,0 +1,81 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.utils.vectors.lucene; + +import org.apache.mahout.common.RandomUtils; + +class TermInfoClusterInOut implements Comparable<TermInfoClusterInOut> { + + private final String term; + private final int inClusterDF; + private final int outClusterDF; + private final double logLikelihoodRatio; + + TermInfoClusterInOut(String term, int inClusterDF, int outClusterDF, double logLikelihoodRatio) { + this.term = term; + this.inClusterDF = inClusterDF; + this.outClusterDF = outClusterDF; + this.logLikelihoodRatio = logLikelihoodRatio; + } + + @Override + public int hashCode() { + return term.hashCode() ^ inClusterDF ^ outClusterDF ^ RandomUtils.hashDouble(logLikelihoodRatio); + } + + @Override + public boolean equals(Object o) { + if (!(o instanceof TermInfoClusterInOut)) { + return false; + } + TermInfoClusterInOut other = (TermInfoClusterInOut) o; + return term.equals(other.getTerm()) + && inClusterDF == other.getInClusterDF() + && outClusterDF == other.getOutClusterDF() + && logLikelihoodRatio == other.getLogLikelihoodRatio(); + } + + @Override + public int compareTo(TermInfoClusterInOut that) { + int res = -Double.compare(logLikelihoodRatio, that.logLikelihoodRatio); + if (res == 0) { + res = term.compareTo(that.term); + } + return res; + } + + public int getInClusterDiff() { + return this.inClusterDF - this.outClusterDF; + } + + String getTerm() { + return term; + } + + int getInClusterDF() { + return inClusterDF; + } + + int getOutClusterDF() { + return outClusterDF; + } + + double getLogLikelihoodRatio() { + return logLikelihoodRatio; + } +} Modified: mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java?rev=1072858&r1=1072857&r2=1072858&view=diff ============================================================================== --- mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java (original) +++ mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java Mon Feb 21 06:47:02 2011 @@ -124,7 +124,7 @@ public final class TestClusterDumper ext termDictionary = new String[numTerms]; int i = 0; for (Iterator<TermEntry> it = termInfo.getAllEntries(); it.hasNext();) { - String term = it.next().term; + String term = it.next().getTerm(); termDictionary[i] = term; System.out.println(i + " " + term); i++;