Author: srowen
Date: Wed Jun 22 17:05:51 2011
New Revision: 1138553
URL: http://svn.apache.org/viewvc?rev=1138553&view=rev
Log:
MAHOUT-738 treat input to LLR as long to avoid possible overflow
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarity.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/AbstractDistributedVectorSimilarity.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedCityBlockVectorSimilarity.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedCooccurrenceVectorSimilarity.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedEuclideanDistanceVectorSimilarity.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedLoglikelihoodVectorSimilarity.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedPearsonCorrelationVectorSimilarity.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedTanimotoCoefficientVectorSimilarity.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedUncenteredCosineVectorSimilarity.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedUncenteredZeroAssumingCosineVectorSimilarity.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedVectorSimilarity.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/collocations/llr/LLRReducer.java
mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/TestRowSimilarityJob.java
mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/collocations/llr/LLRReducerTest.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarity.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarity.java?rev=1138553&r1=1138552&r2=1138553&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarity.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarity.java
Wed Jun 22 17:05:51 2011
@@ -55,14 +55,14 @@ public final class LogLikelihoodSimilari
FastIDSet prefs1 = dataModel.getItemIDsFromUser(userID1);
FastIDSet prefs2 = dataModel.getItemIDsFromUser(userID2);
- int prefs1Size = prefs1.size();
- int prefs2Size = prefs2.size();
- int intersectionSize = prefs1Size < prefs2Size ? prefs2.intersectionSize(prefs1) : prefs1
- .intersectionSize(prefs2);
+ long prefs1Size = prefs1.size();
+ long prefs2Size = prefs2.size();
+ long intersectionSize =
+ prefs1Size < prefs2Size ? prefs2.intersectionSize(prefs1) : prefs1.intersectionSize(prefs2);
if (intersectionSize == 0) {
return Double.NaN;
}
- int numItems = dataModel.getNumItems();
+ long numItems = dataModel.getNumItems();
double logLikelihood =
LogLikelihood.logLikelihoodRatio(intersectionSize,
prefs2Size - intersectionSize,
@@ -74,16 +74,16 @@ public final class LogLikelihoodSimilari
@Override
public double itemSimilarity(long itemID1, long itemID2) throws
TasteException {
DataModel dataModel = getDataModel();
- int preferring1 = dataModel.getNumUsersWithPreferenceFor(itemID1);
- int numUsers = dataModel.getNumUsers();
+ long preferring1 = dataModel.getNumUsersWithPreferenceFor(itemID1);
+ long numUsers = dataModel.getNumUsers();
return doItemSimilarity(itemID1, itemID2, preferring1, numUsers);
}
@Override
public double[] itemSimilarities(long itemID1, long[] itemID2s) throws
TasteException {
DataModel dataModel = getDataModel();
- int preferring1 = dataModel.getNumUsersWithPreferenceFor(itemID1);
- int numUsers = dataModel.getNumUsers();
+ long preferring1 = dataModel.getNumUsersWithPreferenceFor(itemID1);
+ long numUsers = dataModel.getNumUsers();
int length = itemID2s.length;
double[] result = new double[length];
for (int i = 0; i < length; i++) {
@@ -92,13 +92,13 @@ public final class LogLikelihoodSimilari
return result;
}
- private double doItemSimilarity(long itemID1, long itemID2, int preferring1, int numUsers) throws TasteException {
+ private double doItemSimilarity(long itemID1, long itemID2, long preferring1, long numUsers) throws TasteException {
DataModel dataModel = getDataModel();
- int preferring1and2 = dataModel.getNumUsersWithPreferenceFor(itemID1, itemID2);
+ long preferring1and2 = dataModel.getNumUsersWithPreferenceFor(itemID1, itemID2);
if (preferring1and2 == 0) {
return Double.NaN;
}
- int preferring2 = dataModel.getNumUsersWithPreferenceFor(itemID2);
+ long preferring2 = dataModel.getNumUsersWithPreferenceFor(itemID2);
double logLikelihood =
LogLikelihood.logLikelihoodRatio(preferring1and2,
preferring2 - preferring1and2,
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/AbstractDistributedVectorSimilarity.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/AbstractDistributedVectorSimilarity.java?rev=1138553&r1=1138552&r2=1138553&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/AbstractDistributedVectorSimilarity.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/AbstractDistributedVectorSimilarity.java
Wed Jun 22 17:05:51 2011
@@ -32,7 +32,7 @@ public abstract class AbstractDistribute
*/
@Override
public final double similarity(int rowA, int rowB, Iterable<Cooccurrence>
cooccurrences, double weightOfVectorA,
- double weightOfVectorB, int numberOfColumns) {
+ double weightOfVectorB, long numberOfColumns) {
double result = doComputeResult(rowA, rowB, cooccurrences,
weightOfVectorA, weightOfVectorB, numberOfColumns);
@@ -66,14 +66,14 @@ public abstract class AbstractDistribute
/**
* do the actual similarity computation
*
- * @see DistributedVectorSimilarity#similarity(int, int, Iterable, double,
double, int)
+ * @see DistributedVectorSimilarity#similarity(int, int, Iterable, double,
double, long)
*/
protected abstract double doComputeResult(int rowA,
int rowB,
Iterable<Cooccurrence>
cooccurrences,
double weightOfVectorA,
double weightOfVectorB,
- int numberOfColumns);
+ long numberOfColumns);
/**
* vectors have no weight (NaN) by default, subclasses may override this
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedCityBlockVectorSimilarity.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedCityBlockVectorSimilarity.java?rev=1138553&r1=1138552&r2=1138553&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedCityBlockVectorSimilarity.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedCityBlockVectorSimilarity.java
Wed Jun 22 17:05:51 2011
@@ -31,7 +31,7 @@ public final class DistributedCityBlockV
Iterable<Cooccurrence> cooccurrences,
double weightOfVectorA,
double weightOfVectorB,
- int numberOfColumns) {
+ long numberOfColumns) {
int cooccurrenceCount = countElements(cooccurrences);
if (cooccurrenceCount == 0) {
return Double.NaN;
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedCooccurrenceVectorSimilarity.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedCooccurrenceVectorSimilarity.java?rev=1138553&r1=1138552&r2=1138553&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedCooccurrenceVectorSimilarity.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedCooccurrenceVectorSimilarity.java
Wed Jun 22 17:05:51 2011
@@ -36,7 +36,7 @@ public class DistributedCooccurrenceVect
Iterable<Cooccurrence> cooccurrences,
double weightOfVectorA,
double weightOfVectorB,
- int numberOfColumns) {
+ long numberOfColumns) {
return AbstractDistributedVectorSimilarity.countElements(cooccurrences);
}
}
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedEuclideanDistanceVectorSimilarity.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedEuclideanDistanceVectorSimilarity.java?rev=1138553&r1=1138552&r2=1138553&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedEuclideanDistanceVectorSimilarity.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedEuclideanDistanceVectorSimilarity.java
Wed Jun 22 17:05:51 2011
@@ -26,7 +26,7 @@ public class DistributedEuclideanDistanc
@Override
protected double doComputeResult(int rowA, int rowB, Iterable<Cooccurrence>
cooccurrences, double weightOfVectorA,
- double weightOfVectorB, int numberOfColumns) {
+ double weightOfVectorB, long numberOfColumns) {
double n = 0.0;
double sumXYdiff2 = 0.0;
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedLoglikelihoodVectorSimilarity.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedLoglikelihoodVectorSimilarity.java?rev=1138553&r1=1138552&r2=1138553&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedLoglikelihoodVectorSimilarity.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedLoglikelihoodVectorSimilarity.java
Wed Jun 22 17:05:51 2011
@@ -29,15 +29,15 @@ public class DistributedLoglikelihoodVec
@Override
protected double doComputeResult(int rowA, int rowB, Iterable<Cooccurrence>
cooccurrences, double weightOfVectorA,
- double weightOfVectorB, int numberOfColumns) {
+ double weightOfVectorB, long numberOfColumns) {
- int cooccurrenceCount = countElements(cooccurrences);
+ long cooccurrenceCount = countElements(cooccurrences);
if (cooccurrenceCount == 0) {
return Double.NaN;
}
- int occurrencesA = (int) weightOfVectorA;
- int occurrencesB = (int) weightOfVectorB;
+ long occurrencesA = (long) weightOfVectorA;
+ long occurrencesB = (long) weightOfVectorB;
double logLikelihood =
LogLikelihood.logLikelihoodRatio(cooccurrenceCount,
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedPearsonCorrelationVectorSimilarity.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedPearsonCorrelationVectorSimilarity.java?rev=1138553&r1=1138552&r2=1138553&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedPearsonCorrelationVectorSimilarity.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedPearsonCorrelationVectorSimilarity.java
Wed Jun 22 17:05:51 2011
@@ -26,7 +26,7 @@ public class DistributedPearsonCorrelati
@Override
protected double doComputeResult(int rowA, int rowB, Iterable<Cooccurrence>
cooccurrences, double weightOfVectorA,
- double weightOfVectorB, int numberOfColumns) {
+ double weightOfVectorB, long numberOfColumns) {
int count = 0;
double sumX = 0.0;
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedTanimotoCoefficientVectorSimilarity.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedTanimotoCoefficientVectorSimilarity.java?rev=1138553&r1=1138552&r2=1138553&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedTanimotoCoefficientVectorSimilarity.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedTanimotoCoefficientVectorSimilarity.java
Wed Jun 22 17:05:51 2011
@@ -27,7 +27,7 @@ public class DistributedTanimotoCoeffici
@Override
protected double doComputeResult(int rowA, int rowB, Iterable<Cooccurrence>
cooccurrences, double weightOfVectorA,
- double weightOfVectorB, int numberOfColumns) {
+ double weightOfVectorB, long numberOfColumns) {
double cooccurrenceCount = countElements(cooccurrences);
if (cooccurrenceCount == 0) {
return Double.NaN;
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedUncenteredCosineVectorSimilarity.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedUncenteredCosineVectorSimilarity.java?rev=1138553&r1=1138552&r2=1138553&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedUncenteredCosineVectorSimilarity.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedUncenteredCosineVectorSimilarity.java
Wed Jun 22 17:05:51 2011
@@ -26,7 +26,7 @@ public class DistributedUncenteredCosine
@Override
protected double doComputeResult(int rowA, int rowB, Iterable<Cooccurrence>
cooccurrences, double weightOfVectorA,
- double weightOfVectorB, int numberOfColumns) {
+ double weightOfVectorB, long numberOfColumns) {
int n = 0;
double sumXY = 0.0;
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedUncenteredZeroAssumingCosineVectorSimilarity.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedUncenteredZeroAssumingCosineVectorSimilarity.java?rev=1138553&r1=1138552&r2=1138553&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedUncenteredZeroAssumingCosineVectorSimilarity.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedUncenteredZeroAssumingCosineVectorSimilarity.java
Wed Jun 22 17:05:51 2011
@@ -29,7 +29,7 @@ public class DistributedUncenteredZeroAs
@Override
protected double doComputeResult(int rowA, int rowB, Iterable<Cooccurrence>
cooccurrences, double weightOfVectorA,
- double weightOfVectorB, int numberOfColumns) {
+ double weightOfVectorB, long numberOfColumns) {
double sumXY = 0.0;
for (Cooccurrence cooccurrence : cooccurrences) {
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedVectorSimilarity.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedVectorSimilarity.java?rev=1138553&r1=1138552&r2=1138553&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedVectorSimilarity.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/vector/DistributedVectorSimilarity.java
Wed Jun 22 17:05:51 2011
@@ -50,5 +50,5 @@ public interface DistributedVectorSimila
Iterable<Cooccurrence> cooccurrences,
double weightOfVectorA,
double weightOfVectorB,
- int numberOfColumns);
+ long numberOfColumns);
}
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/collocations/llr/LLRReducer.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/collocations/llr/LLRReducer.java?rev=1138553&r1=1138552&r2=1138553&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/collocations/llr/LLRReducer.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/collocations/llr/LLRReducer.java
Wed Jun 22 17:05:51 2011
@@ -94,33 +94,31 @@ public class LLRReducer extends Reducer<
log.warn("Missing head for {}, skipping.", ngram);
context.getCounter(Skipped.MISSING_HEAD).increment(1);
return;
- } else if (gramFreq[1] == -1) {
+ }
+ if (gramFreq[1] == -1) {
log.warn("Missing tail for {}, skipping", ngram);
context.getCounter(Skipped.MISSING_TAIL).increment(1);
return;
}
- int k11 = ngram.getFrequency(); /* a&b */
- int k12 = gramFreq[0] - ngram.getFrequency(); /* a&!b */
- int k21 = gramFreq[1] - ngram.getFrequency(); /* !b&a */
- int k22 = (int) (ngramTotal - (gramFreq[0] + gramFreq[1] - ngram.getFrequency())); /* !a&!b */
+ long k11 = ngram.getFrequency(); /* a&b */
+ long k12 = gramFreq[0] - ngram.getFrequency(); /* a&!b */
+ long k21 = gramFreq[1] - ngram.getFrequency(); /* !b&a */
+ long k22 = ngramTotal - (gramFreq[0] + gramFreq[1] - ngram.getFrequency()); /* !a&!b */
+ double llr;
try {
- double llr = ll.logLikelihoodRatio(k11, k12, k21, k22);
- if (llr < minLLRValue) {
- context.getCounter(Skipped.LESS_THAN_MIN_LLR).increment(1);
- return;
- }
- DoubleWritable dd = new DoubleWritable(llr);
- Text t = new Text(ngram.getString());
- context.write(t, dd);
+ llr = ll.logLikelihoodRatio(k11, k12, k21, k22);
} catch (IllegalArgumentException ex) {
context.getCounter(Skipped.LLR_CALCULATION_ERROR).increment(1);
- log.error("Problem calculating LLR ratio: " + ex.getMessage());
- log.error("NGram: " + ngram);
- log.error("HEAD: " + gram[0] + ':' + gramFreq[0]);
- log.error("TAIL: " + gram[1] + ':' + gramFreq[1]);
- log.error("k11: " + k11 + " k12: " + k12 + " k21: " + k21 + " k22: " + k22);
+ log.warn("Problem calculating LLR ratio for ngram {}, HEAD {}:{}, TAIL {}:{}, k11/k12/k21/k22: {}/{}/{}/{}",
+ new Object[] {ngram, gram[0], gramFreq[0], gram[1], gramFreq[1], k11, k12, k21, k22}, ex);
+ return;
+ }
+ if (llr < minLLRValue) {
+ context.getCounter(Skipped.LESS_THAN_MIN_LLR).increment(1);
+ } else {
+ context.write(new Text(ngram.getString()), new DoubleWritable(llr));
}
}
@@ -133,11 +131,8 @@ public class LLRReducer extends Reducer<
this.emitUnigrams = conf.getBoolean(CollocDriver.EMIT_UNIGRAMS,
CollocDriver.DEFAULT_EMIT_UNIGRAMS);
- if (log.isInfoEnabled()) {
- log.info("NGram Total is {}", ngramTotal);
- log.info("Min LLR value is {}", minLLRValue);
- log.info("Emit Unitgrams is {}", emitUnigrams);
- }
+ log.info("NGram Total: {}, Min LLR value: {}, Emit Unigrams: {}",
+ new Object[] {ngramTotal, minLLRValue, emitUnigrams});
if (ngramTotal == -1) {
throw new IllegalStateException("No NGRAM_TOTAL available in job config");
@@ -162,13 +157,13 @@ public class LLRReducer extends Reducer<
* provide interface so the input to the llr calculation can be captured for validation in unit testing
*/
public interface LLCallback {
- double logLikelihoodRatio(int k11, int k12, int k21, int k22);
+ double logLikelihoodRatio(long k11, long k12, long k21, long k22);
}
/** concrete implementation delegates to LogLikelihood class */
public static final class ConcreteLLCallback implements LLCallback {
@Override
- public double logLikelihoodRatio(int k11, int k12, int k21, int k22) {
+ public double logLikelihoodRatio(long k11, long k12, long k21, long k22) {
return LogLikelihood.logLikelihoodRatio(k11, k12, k21, k22);
}
}
Modified:
mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/TestRowSimilarityJob.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/TestRowSimilarityJob.java?rev=1138553&r1=1138552&r2=1138553&view=diff
==============================================================================
---
mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/TestRowSimilarityJob.java
(original)
+++
mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/TestRowSimilarityJob.java
Wed Jun 22 17:05:51 2011
@@ -324,7 +324,7 @@ public final class TestRowSimilarityJob
@Override
public double similarity(int rowA, int rowB, Iterable<Cooccurrence>
cooccurrences, double weightOfVectorA,
- double weightOfVectorB, int numberOfRows) {
+ double weightOfVectorB, long numberOfRows) {
if (rowA == rowB) {
return Double.NaN;
}
Modified:
mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/collocations/llr/LLRReducerTest.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/collocations/llr/LLRReducerTest.java?rev=1138553&r1=1138552&r2=1138553&view=diff
==============================================================================
---
mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/collocations/llr/LLRReducerTest.java
(original)
+++
mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/collocations/llr/LLRReducerTest.java
Wed Jun 22 17:05:51 2011
@@ -59,7 +59,7 @@ public final class LLRReducerTest extend
ll = EasyMock.createMock(LLCallback.class);
cl = new LLCallback() {
@Override
- public double logLikelihoodRatio(int k11, int k12, int k21, int k22) {
+ public double logLikelihoodRatio(long k11, long k12, long k21, long k22) {
log.info("k11:{} k12:{} k21:{} k22:{}", new Object[] {k11, k12, k21, k22});
return LogLikelihood.logLikelihoodRatio(k11, k12, k21, k22);
}
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java?rev=1138553&r1=1138552&r2=1138553&view=diff
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
(original)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
Wed Jun 22 17:05:51 2011
@@ -260,9 +260,9 @@ public class ClusterLabels {
return bitset;
}
- private static double scoreDocumentFrequencies(int inDF, int outDF, int clusterSize, int corpusSize) {
- int k12 = clusterSize - inDF;
- int k22 = corpusSize - clusterSize - outDF;
+ private static double scoreDocumentFrequencies(long inDF, long outDF, long clusterSize, long corpusSize) {
+ long k12 = clusterSize - inDF;
+ long k22 = corpusSize - clusterSize - outDF;
return LogLikelihood.logLikelihoodRatio(inDF, k12, outDF, k22);
}
Modified:
mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java?rev=1138553&r1=1138552&r2=1138553&view=diff
==============================================================================
---
mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java
(original)
+++
mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java
Wed Jun 22 17:05:51 2011
@@ -47,10 +47,10 @@ public final class LogLikelihood {
*
* @return The entropy value for the elements
*/
- public static double entropy(int... elements) {
+ public static double entropy(long... elements) {
double sum = 0.0;
double result = 0.0;
- for (int element : elements) {
+ for (long element : elements) {
if (element < 0) {
throw new IllegalArgumentException("Should not have negative count for entropy computation: (" + element + ')');
}
@@ -81,7 +81,7 @@ public final class LogLikelihood {
* <p/>
* Credit to http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html for the table and the descriptions.
*/
- public static double logLikelihoodRatio(int k11, int k12, int k21, int k22) {
+ public static double logLikelihoodRatio(long k11, long k12, long k21, long k22) {
// note that we have counts here, not probabilities, and that the entropy is not normalized.
double rowEntropy = entropy(k11, k12) + entropy(k21, k22);
double columnEntropy = entropy(k11, k21) + entropy(k12, k22);
@@ -95,7 +95,7 @@ public final class LogLikelihood {
/**
* Calculates the root log-likelihood ratio for two events.
- * See {@link #logLikelihoodRatio(int, int, int, int)}.
+ * See {@link #logLikelihoodRatio(long, long, long, long)}.
* @param k11 The number of times the two events occurred together
* @param k12 The number of times the second event occurred WITHOUT the first event
@@ -107,7 +107,7 @@ public final class LogLikelihood {
* See discussion of raw vs. root LLR at
* http://www.lucidimagination.com/search/document/6dc8709e65a7ced1/llr_scoring_question
*/
- public static double rootLogLikelihoodRatio(int k11, int k12, int k21, int k22) {
+ public static double rootLogLikelihoodRatio(long k11, long k12, long k21, long k22) {
double llr = logLikelihoodRatio(k11, k12, k21, k22);
double sqrt = Math.sqrt(llr);
if ((double) k11 / (k11 + k12) < (double) k21 / (k21 + k22)) {