Author: ssc
Date: Thu Sep 29 09:24:47 2011
New Revision: 1177237
URL: http://svn.apache.org/viewvc?rev=1177237&view=rev
Log:
refined size constraints for pruning of similarity candidate pairs
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJob.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CityBlockSimilarity.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CooccurrenceCountSimilarity.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CosineSimilarity.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CountbasedMeasure.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/EuclideanDistanceSimilarity.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/LoglikelihoodSimilarity.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/TanimotoCoefficientSimilarity.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasure.java
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJob.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJob.java?rev=1177237&r1=1177236&r2=1177237&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJob.java Thu Sep 29 09:24:47 2011
@@ -279,8 +279,9 @@ public class RowSimilarityJob extends Ab
int numNonZeroEntriesB = numNonZeroEntries.get(occurrenceB.index());
double maxValueA = maxValues.get(occurrenceA.index());
+ double maxValueB = maxValues.get(occurrenceB.index());
- return similarity.consider(numNonZeroEntriesA, numNonZeroEntriesB, maxValueA, threshold);
+ return similarity.consider(numNonZeroEntriesA, numNonZeroEntriesB, maxValueA, maxValueB, threshold);
}
@Override
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CityBlockSimilarity.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CityBlockSimilarity.java?rev=1177237&r1=1177236&r2=1177237&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CityBlockSimilarity.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CityBlockSimilarity.java Thu Sep 29 09:24:47 2011
@@ -23,9 +23,4 @@ public class CityBlockSimilarity extends
public double similarity(double dots, double normA, double normB, int numberOfColumns) {
return 1.0 / (1.0 + normA + normB - 2 * dots);
}
-
- @Override
- public boolean consider(int numNonZeroEntriesA, int numNonZeroEntriesB, double maxValueA, double threshold) {
- return true;
- }
}
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CooccurrenceCountSimilarity.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CooccurrenceCountSimilarity.java?rev=1177237&r1=1177236&r2=1177237&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CooccurrenceCountSimilarity.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CooccurrenceCountSimilarity.java Thu Sep 29 09:24:47 2011
@@ -25,7 +25,8 @@ public class CooccurrenceCountSimilarity
}
@Override
- public boolean consider(int numNonZeroEntriesA, int numNonZeroEntriesB, double maxValueA, double threshold) {
+ public boolean consider(int numNonZeroEntriesA, int numNonZeroEntriesB, double maxValueA, double maxValueB,
+ double threshold) {
return numNonZeroEntriesA >= threshold && numNonZeroEntriesB >= threshold;
}
}
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CosineSimilarity.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CosineSimilarity.java?rev=1177237&r1=1177236&r2=1177237&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CosineSimilarity.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CosineSimilarity.java Thu Sep 29 09:24:47 2011
@@ -42,7 +42,9 @@ public class CosineSimilarity implements
}
@Override
- public boolean consider(int numNonZeroEntriesA, int numNonZeroEntriesB, double maxValueA, double threshold) {
- return numNonZeroEntriesB >= threshold / maxValueA;
+ public boolean consider(int numNonZeroEntriesA, int numNonZeroEntriesB, double maxValueA, double maxValueB,
+ double threshold) {
+ return numNonZeroEntriesB >= threshold / maxValueA &&
+ numNonZeroEntriesA >= threshold / maxValueB;
}
}
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CountbasedMeasure.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CountbasedMeasure.java?rev=1177237&r1=1177236&r2=1177237&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CountbasedMeasure.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CountbasedMeasure.java Thu Sep 29 09:24:47 2011
@@ -36,4 +36,9 @@ public abstract class CountbasedMeasure
return 1;
}
+ @Override
+ public boolean consider(int numNonZeroEntriesA, int numNonZeroEntriesB, double maxValueA, double maxValueB,
+ double threshold) {
+ return true;
+ }
}
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/EuclideanDistanceSimilarity.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/EuclideanDistanceSimilarity.java?rev=1177237&r1=1177236&r2=1177237&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/EuclideanDistanceSimilarity.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/EuclideanDistanceSimilarity.java Thu Sep 29 09:24:47 2011
@@ -51,7 +51,8 @@ public class EuclideanDistanceSimilarity
}
@Override
- public boolean consider(int numNonZeroEntriesA, int numNonZeroEntriesB, double maxValueA, double threshold) {
+ public boolean consider(int numNonZeroEntriesA, int numNonZeroEntriesB, double maxValueA, double maxValueB,
+ double threshold) {
return true;
}
}
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/LoglikelihoodSimilarity.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/LoglikelihoodSimilarity.java?rev=1177237&r1=1177236&r2=1177237&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/LoglikelihoodSimilarity.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/LoglikelihoodSimilarity.java Thu Sep 29 09:24:47 2011
@@ -29,8 +29,4 @@ public class LoglikelihoodSimilarity ext
return 1.0 - 1.0 / (1.0 + logLikelihood);
}
- @Override
- public boolean consider(int numNonZeroEntriesA, int numNonZeroEntriesB, double maxValueA, double threshold) {
- return true;
- }
}
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/TanimotoCoefficientSimilarity.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/TanimotoCoefficientSimilarity.java?rev=1177237&r1=1177236&r2=1177237&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/TanimotoCoefficientSimilarity.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/TanimotoCoefficientSimilarity.java Thu Sep 29 09:24:47 2011
@@ -25,7 +25,9 @@ public class TanimotoCoefficientSimilari
}
@Override
- public boolean consider(int numNonZeroEntriesA, int numNonZeroEntriesB, double maxValueA, double threshold) {
- return numNonZeroEntriesA >= numNonZeroEntriesB * threshold;
+ public boolean consider(int numNonZeroEntriesA, int numNonZeroEntriesB, double maxValueA, double maxValueB,
+ double threshold) {
+ return numNonZeroEntriesA >= numNonZeroEntriesB * threshold &&
+ numNonZeroEntriesB >= numNonZeroEntriesA * threshold;
}
}
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasure.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasure.java?rev=1177237&r1=1177236&r2=1177237&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasure.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasure.java Thu Sep 29 09:24:47 2011
@@ -27,5 +27,6 @@ public interface VectorSimilarityMeasure
double norm(Vector vector);
double aggregate(double nonZeroValueA, double nonZeroValueB);
double similarity(double summedAggregations, double normA, double normB, int numberOfColumns);
- boolean consider(int numNonZeroEntriesA, int numNonZeroEntriesB, double maxValueA, double threshold);
+ boolean consider(int numNonZeroEntriesA, int numNonZeroEntriesB, double maxValueA, double maxValueB,
+ double threshold);
}