Author: srowen
Date: Wed Jun 22 17:05:51 2011
New Revision: 1138553

URL: http://svn.apache.org/viewvc?rev=1138553&view=rev
Log:
MAHOUT-738 treat input to LLR as long to avoid possible overflow

Modified:
    
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarity.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/AbstractDistributedVectorSimilarity.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedCityBlockVectorSimilarity.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedCooccurrenceVectorSimilarity.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedEuclideanDistanceVectorSimilarity.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedLoglikelihoodVectorSimilarity.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedPearsonCorrelationVectorSimilarity.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedTanimotoCoefficientVectorSimilarity.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedUncenteredCosineVectorSimilarity.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedUncenteredZeroAssumingCosineVectorSimilarity.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedVectorSimilarity.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/collocations/llr/LLRReducer.java
    
mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/TestRowSimilarityJob.java
    
mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/collocations/llr/LLRReducerTest.java
    
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
    
mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarity.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarity.java?rev=1138553&r1=1138552&r2=1138553&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarity.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarity.java
 Wed Jun 22 17:05:51 2011
@@ -55,14 +55,14 @@ public final class LogLikelihoodSimilari
     FastIDSet prefs1 = dataModel.getItemIDsFromUser(userID1);
     FastIDSet prefs2 = dataModel.getItemIDsFromUser(userID2);
     
-    int prefs1Size = prefs1.size();
-    int prefs2Size = prefs2.size();
-    int intersectionSize = prefs1Size < prefs2Size ? prefs2.intersectionSize(prefs1) : prefs1
-        .intersectionSize(prefs2);
+    long prefs1Size = prefs1.size();
+    long prefs2Size = prefs2.size();
+    long intersectionSize =
+        prefs1Size < prefs2Size ? prefs2.intersectionSize(prefs1) : prefs1.intersectionSize(prefs2);
     if (intersectionSize == 0) {
       return Double.NaN;
     }
-    int numItems = dataModel.getNumItems();
+    long numItems = dataModel.getNumItems();
     double logLikelihood =
         LogLikelihood.logLikelihoodRatio(intersectionSize,
                                          prefs2Size - intersectionSize,
@@ -74,16 +74,16 @@ public final class LogLikelihoodSimilari
   @Override
   public double itemSimilarity(long itemID1, long itemID2) throws TasteException {
     DataModel dataModel = getDataModel();
-    int preferring1 = dataModel.getNumUsersWithPreferenceFor(itemID1);
-    int numUsers = dataModel.getNumUsers();    
+    long preferring1 = dataModel.getNumUsersWithPreferenceFor(itemID1);
+    long numUsers = dataModel.getNumUsers();
     return doItemSimilarity(itemID1, itemID2, preferring1, numUsers);
   }
 
   @Override
   public double[] itemSimilarities(long itemID1, long[] itemID2s) throws 
TasteException {
     DataModel dataModel = getDataModel();
-    int preferring1 = dataModel.getNumUsersWithPreferenceFor(itemID1);
-    int numUsers = dataModel.getNumUsers();
+    long preferring1 = dataModel.getNumUsersWithPreferenceFor(itemID1);
+    long numUsers = dataModel.getNumUsers();
     int length = itemID2s.length;
     double[] result = new double[length];
     for (int i = 0; i < length; i++) {
@@ -92,13 +92,13 @@ public final class LogLikelihoodSimilari
     return result;
   }
 
-  private double doItemSimilarity(long itemID1, long itemID2, int preferring1, int numUsers) throws TasteException {
+  private double doItemSimilarity(long itemID1, long itemID2, long preferring1, long numUsers) throws TasteException {
     DataModel dataModel = getDataModel();
-    int preferring1and2 = dataModel.getNumUsersWithPreferenceFor(itemID1, itemID2);
+    long preferring1and2 = dataModel.getNumUsersWithPreferenceFor(itemID1, itemID2);
     if (preferring1and2 == 0) {
       return Double.NaN;
     }
-    int preferring2 = dataModel.getNumUsersWithPreferenceFor(itemID2);
+    long preferring2 = dataModel.getNumUsersWithPreferenceFor(itemID2);
     double logLikelihood =
         LogLikelihood.logLikelihoodRatio(preferring1and2,
                                          preferring2 - preferring1and2,

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/AbstractDistributedVectorSimilarity.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/AbstractDistributedVectorSimilarity.java?rev=1138553&r1=1138552&r2=1138553&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/AbstractDistributedVectorSimilarity.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/AbstractDistributedVectorSimilarity.java
 Wed Jun 22 17:05:51 2011
@@ -32,7 +32,7 @@ public abstract class AbstractDistribute
    */
   @Override
   public final double similarity(int rowA, int rowB, Iterable<Cooccurrence> 
cooccurrences, double weightOfVectorA,
-      double weightOfVectorB, int numberOfColumns) {
+      double weightOfVectorB, long numberOfColumns) {
 
     double result = doComputeResult(rowA, rowB, cooccurrences, 
weightOfVectorA, weightOfVectorB, numberOfColumns);
 
@@ -66,14 +66,14 @@ public abstract class AbstractDistribute
   /**
    * do the actual similarity computation
    *
-   * @see DistributedVectorSimilarity#similarity(int, int, Iterable, double, 
double, int)
+   * @see DistributedVectorSimilarity#similarity(int, int, Iterable, double, 
double, long)
    */
   protected abstract double doComputeResult(int rowA,
                                             int rowB,
                                             Iterable<Cooccurrence> 
cooccurrences,
                                             double weightOfVectorA,
                                             double weightOfVectorB,
-                                            int numberOfColumns);
+                                            long numberOfColumns);
 
   /**
    * vectors have no weight (NaN) by default, subclasses may override this

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedCityBlockVectorSimilarity.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedCityBlockVectorSimilarity.java?rev=1138553&r1=1138552&r2=1138553&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedCityBlockVectorSimilarity.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedCityBlockVectorSimilarity.java
 Wed Jun 22 17:05:51 2011
@@ -31,7 +31,7 @@ public final class DistributedCityBlockV
                                    Iterable<Cooccurrence> cooccurrences,
                                    double weightOfVectorA,
                                    double weightOfVectorB,
-                                   int numberOfColumns) {
+                                   long numberOfColumns) {
     int cooccurrenceCount = countElements(cooccurrences);
     if (cooccurrenceCount == 0) {
       return Double.NaN;

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedCooccurrenceVectorSimilarity.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedCooccurrenceVectorSimilarity.java?rev=1138553&r1=1138552&r2=1138553&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedCooccurrenceVectorSimilarity.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedCooccurrenceVectorSimilarity.java
 Wed Jun 22 17:05:51 2011
@@ -36,7 +36,7 @@ public class DistributedCooccurrenceVect
                            Iterable<Cooccurrence> cooccurrences,
                            double weightOfVectorA,
                            double weightOfVectorB,
-                           int numberOfColumns) {
+                           long numberOfColumns) {
     return AbstractDistributedVectorSimilarity.countElements(cooccurrences);
   }
 }

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedEuclideanDistanceVectorSimilarity.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedEuclideanDistanceVectorSimilarity.java?rev=1138553&r1=1138552&r2=1138553&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedEuclideanDistanceVectorSimilarity.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedEuclideanDistanceVectorSimilarity.java
 Wed Jun 22 17:05:51 2011
@@ -26,7 +26,7 @@ public class DistributedEuclideanDistanc
 
   @Override
   protected double doComputeResult(int rowA, int rowB, Iterable<Cooccurrence> 
cooccurrences, double weightOfVectorA,
-      double weightOfVectorB, int numberOfColumns) {
+      double weightOfVectorB, long numberOfColumns) {
 
     double n = 0.0;
     double sumXYdiff2 = 0.0;

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedLoglikelihoodVectorSimilarity.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedLoglikelihoodVectorSimilarity.java?rev=1138553&r1=1138552&r2=1138553&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedLoglikelihoodVectorSimilarity.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedLoglikelihoodVectorSimilarity.java
 Wed Jun 22 17:05:51 2011
@@ -29,15 +29,15 @@ public class DistributedLoglikelihoodVec
 
   @Override
   protected double doComputeResult(int rowA, int rowB, Iterable<Cooccurrence> 
cooccurrences, double weightOfVectorA,
-      double weightOfVectorB, int numberOfColumns) {
+      double weightOfVectorB, long numberOfColumns) {
 
-    int cooccurrenceCount = countElements(cooccurrences);
+    long cooccurrenceCount = countElements(cooccurrences);
     if (cooccurrenceCount == 0) {
       return Double.NaN;
     }
 
-    int occurrencesA = (int) weightOfVectorA;
-    int occurrencesB = (int) weightOfVectorB;
+    long occurrencesA = (long) weightOfVectorA;
+    long occurrencesB = (long) weightOfVectorB;
 
     double logLikelihood =
         LogLikelihood.logLikelihoodRatio(cooccurrenceCount,

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedPearsonCorrelationVectorSimilarity.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedPearsonCorrelationVectorSimilarity.java?rev=1138553&r1=1138552&r2=1138553&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedPearsonCorrelationVectorSimilarity.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedPearsonCorrelationVectorSimilarity.java
 Wed Jun 22 17:05:51 2011
@@ -26,7 +26,7 @@ public class DistributedPearsonCorrelati
 
   @Override
   protected double doComputeResult(int rowA, int rowB, Iterable<Cooccurrence> 
cooccurrences, double weightOfVectorA,
-      double weightOfVectorB, int numberOfColumns) {
+      double weightOfVectorB, long numberOfColumns) {
 
     int count = 0;
     double sumX = 0.0;

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedTanimotoCoefficientVectorSimilarity.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedTanimotoCoefficientVectorSimilarity.java?rev=1138553&r1=1138552&r2=1138553&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedTanimotoCoefficientVectorSimilarity.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedTanimotoCoefficientVectorSimilarity.java
 Wed Jun 22 17:05:51 2011
@@ -27,7 +27,7 @@ public class DistributedTanimotoCoeffici
 
   @Override
   protected double doComputeResult(int rowA, int rowB, Iterable<Cooccurrence> 
cooccurrences, double weightOfVectorA,
-      double weightOfVectorB, int numberOfColumns) {
+      double weightOfVectorB, long numberOfColumns) {
     double cooccurrenceCount = countElements(cooccurrences);
     if (cooccurrenceCount == 0) {
       return Double.NaN;

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedUncenteredCosineVectorSimilarity.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedUncenteredCosineVectorSimilarity.java?rev=1138553&r1=1138552&r2=1138553&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedUncenteredCosineVectorSimilarity.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedUncenteredCosineVectorSimilarity.java
 Wed Jun 22 17:05:51 2011
@@ -26,7 +26,7 @@ public class DistributedUncenteredCosine
 
   @Override
   protected double doComputeResult(int rowA, int rowB, Iterable<Cooccurrence> 
cooccurrences, double weightOfVectorA,
-      double weightOfVectorB, int numberOfColumns) {
+      double weightOfVectorB, long numberOfColumns) {
 
     int n = 0;
     double sumXY = 0.0;

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedUncenteredZeroAssumingCosineVectorSimilarity.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedUncenteredZeroAssumingCosineVectorSimilarity.java?rev=1138553&r1=1138552&r2=1138553&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedUncenteredZeroAssumingCosineVectorSimilarity.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedUncenteredZeroAssumingCosineVectorSimilarity.java
 Wed Jun 22 17:05:51 2011
@@ -29,7 +29,7 @@ public class DistributedUncenteredZeroAs
 
   @Override
   protected double doComputeResult(int rowA, int rowB, Iterable<Cooccurrence> 
cooccurrences, double weightOfVectorA,
-      double weightOfVectorB, int numberOfColumns) {
+      double weightOfVectorB, long numberOfColumns) {
 
     double sumXY = 0.0;
     for (Cooccurrence cooccurrence : cooccurrences) {

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedVectorSimilarity.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedVectorSimilarity.java?rev=1138553&r1=1138552&r2=1138553&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedVectorSimilarity.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedVectorSimilarity.java
 Wed Jun 22 17:05:51 2011
@@ -50,5 +50,5 @@ public interface DistributedVectorSimila
                     Iterable<Cooccurrence> cooccurrences,
                     double weightOfVectorA,
                     double weightOfVectorB,
-                    int numberOfColumns);
+                    long numberOfColumns);
 }

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/collocations/llr/LLRReducer.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/collocations/llr/LLRReducer.java?rev=1138553&r1=1138552&r2=1138553&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/collocations/llr/LLRReducer.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/collocations/llr/LLRReducer.java
 Wed Jun 22 17:05:51 2011
@@ -94,33 +94,31 @@ public class LLRReducer extends Reducer<
       log.warn("Missing head for {}, skipping.", ngram);
       context.getCounter(Skipped.MISSING_HEAD).increment(1);
       return;
-    } else if (gramFreq[1] == -1) {
+    }
+    if (gramFreq[1] == -1) {
       log.warn("Missing tail for {}, skipping", ngram);
       context.getCounter(Skipped.MISSING_TAIL).increment(1);
       return;
     }
 
-    int k11 = ngram.getFrequency(); /* a&b */
-    int k12 = gramFreq[0] - ngram.getFrequency(); /* a&!b */
-    int k21 = gramFreq[1] - ngram.getFrequency(); /* !b&a */
-    int k22 = (int) (ngramTotal - (gramFreq[0] + gramFreq[1] - 
ngram.getFrequency())); /* !a&!b */
+    long k11 = ngram.getFrequency(); /* a&b */
+    long k12 = gramFreq[0] - ngram.getFrequency(); /* a&!b */
+    long k21 = gramFreq[1] - ngram.getFrequency(); /* !b&a */
+    long k22 = ngramTotal - (gramFreq[0] + gramFreq[1] - 
ngram.getFrequency()); /* !a&!b */
 
+    double llr;
     try {
-      double llr = ll.logLikelihoodRatio(k11, k12, k21, k22);
-      if (llr < minLLRValue) {
-        context.getCounter(Skipped.LESS_THAN_MIN_LLR).increment(1);
-        return;
-      }
-      DoubleWritable dd = new DoubleWritable(llr);
-      Text t = new Text(ngram.getString());
-      context.write(t, dd);
+      llr = ll.logLikelihoodRatio(k11, k12, k21, k22);
     } catch (IllegalArgumentException ex) {
       context.getCounter(Skipped.LLR_CALCULATION_ERROR).increment(1);
-      log.error("Problem calculating LLR ratio: " + ex.getMessage());
-      log.error("NGram: " + ngram);
-      log.error("HEAD: " + gram[0] + ':' + gramFreq[0]);
-      log.error("TAIL: " + gram[1] + ':' + gramFreq[1]);
-      log.error("k11: " + k11 + " k12: " + k12 + " k21: " + k21 + " k22: " + 
k22);
+      log.warn("Problem calculating LLR ratio for ngram {}, HEAD {}:{}, TAIL {}:{}, k11/k12/k21/k22: {}/{}/{}/{}",
+          new Object[] {ngram, gram[0], gramFreq[0], gram[1], gramFreq[1], k11, k12, k21, k22}, ex);
+      return;
+    }
+    if (llr < minLLRValue) {
+      context.getCounter(Skipped.LESS_THAN_MIN_LLR).increment(1);
+    } else {
+      context.write(new Text(ngram.getString()), new DoubleWritable(llr));
     }
   }
 
@@ -133,11 +131,8 @@ public class LLRReducer extends Reducer<
 
     this.emitUnigrams = conf.getBoolean(CollocDriver.EMIT_UNIGRAMS, 
CollocDriver.DEFAULT_EMIT_UNIGRAMS);
 
-    if (log.isInfoEnabled()) {
-      log.info("NGram Total is {}", ngramTotal);
-      log.info("Min LLR value is {}", minLLRValue);
-      log.info("Emit Unitgrams is {}", emitUnigrams);
-    }
+    log.info("NGram Total: {}, Min LLR value: {}, Emit Unigrams: {}",
+             new Object[] {ngramTotal, minLLRValue, emitUnigrams});
 
     if (ngramTotal == -1) {
       throw new IllegalStateException("No NGRAM_TOTAL available in job 
config");
@@ -162,13 +157,13 @@ public class LLRReducer extends Reducer<
    * provide interface so the input to the llr calculation can be captured for 
validation in unit testing
    */
   public interface LLCallback {
-    double logLikelihoodRatio(int k11, int k12, int k21, int k22);
+    double logLikelihoodRatio(long k11, long k12, long k21, long k22);
   }
 
   /** concrete implementation delegates to LogLikelihood class */
   public static final class ConcreteLLCallback implements LLCallback {
     @Override
-    public double logLikelihoodRatio(int k11, int k12, int k21, int k22) {
+    public double logLikelihoodRatio(long k11, long k12, long k21, long k22) {
       return LogLikelihood.logLikelihoodRatio(k11, k12, k21, k22);
     }
   }

Modified: 
mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/TestRowSimilarityJob.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/TestRowSimilarityJob.java?rev=1138553&r1=1138552&r2=1138553&view=diff
==============================================================================
--- 
mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/TestRowSimilarityJob.java
 (original)
+++ 
mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/TestRowSimilarityJob.java
 Wed Jun 22 17:05:51 2011
@@ -324,7 +324,7 @@ public final class TestRowSimilarityJob 
 
     @Override
     public double similarity(int rowA, int rowB, Iterable<Cooccurrence> 
cooccurrences, double weightOfVectorA,
-        double weightOfVectorB, int numberOfRows) {
+        double weightOfVectorB, long numberOfRows) {
       if (rowA == rowB) {
         return Double.NaN;
       }

Modified: 
mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/collocations/llr/LLRReducerTest.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/collocations/llr/LLRReducerTest.java?rev=1138553&r1=1138552&r2=1138553&view=diff
==============================================================================
--- 
mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/collocations/llr/LLRReducerTest.java
 (original)
+++ 
mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/collocations/llr/LLRReducerTest.java
 Wed Jun 22 17:05:51 2011
@@ -59,7 +59,7 @@ public final class LLRReducerTest extend
     ll        = EasyMock.createMock(LLCallback.class);
     cl        = new LLCallback() {
       @Override
-      public double logLikelihoodRatio(int k11, int k12, int k21, int k22) {
+      public double logLikelihoodRatio(long k11, long k12, long k21, long k22) 
{
         log.info("k11:{} k12:{} k21:{} k22:{}", new Object[] {k11, k12, k21, 
k22});
         return LogLikelihood.logLikelihoodRatio(k11, k12, k21, k22);
       }

Modified: 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java?rev=1138553&r1=1138552&r2=1138553&view=diff
==============================================================================
--- 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
 (original)
+++ 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
 Wed Jun 22 17:05:51 2011
@@ -260,9 +260,9 @@ public class ClusterLabels {
     return bitset;
   }
 
-  private static double scoreDocumentFrequencies(int inDF, int outDF, int 
clusterSize, int corpusSize) {
-    int k12 = clusterSize - inDF;
-    int k22 = corpusSize - clusterSize - outDF;
+  private static double scoreDocumentFrequencies(long inDF, long outDF, long 
clusterSize, long corpusSize) {
+    long k12 = clusterSize - inDF;
+    long k22 = corpusSize - clusterSize - outDF;
 
     return LogLikelihood.logLikelihoodRatio(inDF, k12, outDF, k22);
   }

Modified: 
mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java?rev=1138553&r1=1138552&r2=1138553&view=diff
==============================================================================
--- 
mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java 
(original)
+++ 
mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java 
Wed Jun 22 17:05:51 2011
@@ -47,10 +47,10 @@ public final class LogLikelihood {
    *
    * @return The entropy value for the elements
    */
-  public static double entropy(int... elements) {
+  public static double entropy(long... elements) {
     double sum = 0.0;
     double result = 0.0;
-    for (int element : elements) {
+    for (long element : elements) {
       if (element < 0) {
         throw new IllegalArgumentException("Should not have negative count for entropy computation: (" + element + ')');
       }
@@ -81,7 +81,7 @@ public final class LogLikelihood {
    * <p/>
    * Credit to 
http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html for the 
table and the descriptions.
    */
-  public static double logLikelihoodRatio(int k11, int k12, int k21, int k22) {
+  public static double logLikelihoodRatio(long k11, long k12, long k21, long k22) {
     // note that we have counts here, not probabilities, and that the entropy 
is not normalized.
     double rowEntropy = entropy(k11, k12) + entropy(k21, k22);
     double columnEntropy = entropy(k11, k21) + entropy(k12, k22);
@@ -95,7 +95,7 @@ public final class LogLikelihood {
   
   /** 
    * Calculates the root log-likelihood ratio for two events.
-   * See {@link #logLikelihoodRatio(int, int, int, int)}.
+   * See {@link #logLikelihoodRatio(long, long, long, long)}.
 
    * @param k11 The number of times the two events occurred together
    * @param k12 The number of times the second event occurred WITHOUT the 
first event
@@ -107,7 +107,7 @@ public final class LogLikelihood {
    * See discussion of raw vs. root LLR at 
    * 
http://www.lucidimagination.com/search/document/6dc8709e65a7ced1/llr_scoring_question
    */
-  public static double rootLogLikelihoodRatio(int k11, int k12, int k21, int 
k22) {
+  public static double rootLogLikelihoodRatio(long k11, long k12, long k21, 
long k22) {
     double llr = logLikelihoodRatio(k11, k12, k21, k22);
     double sqrt = Math.sqrt(llr);
     if ((double) k11 / (k11 + k12) < (double) k21 / (k21 + k22)) {


Reply via email to