Author: tommaso
Date: Fri Nov 23 11:19:06 2018
New Revision: 1847245

URL: http://svn.apache.org/viewvc?rev=1847245&view=rev
Log:
OAK-7824 - make distance threshold relative to current result set

Modified:
    
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java
    
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java
    
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java

Modified: 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java?rev=1847245&r1=1847244&r2=1847245&view=diff
==============================================================================
--- 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java
 (original)
+++ 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java
 Fri Nov 23 11:19:06 2018
@@ -405,7 +405,6 @@ public class LucenePropertyIndex extends
                                             long fvs = PERF_LOGGER.start();
                                             
SimSearchUtils.bruteForceFVRerank(sp, docs, indexSearcher);
                                             PERF_LOGGER.end(fvs, -1, "fv 
reranking done");
-                                            LOG.info("reranking done");
                                             earlyStop = true;
                                         }
                                     }

Modified: 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java?rev=1847245&r1=1847244&r2=1847245&view=diff
==============================================================================
--- 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java
 (original)
+++ 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java
 Fri Nov 23 11:19:06 2018
@@ -234,7 +234,9 @@ public class SimSearchUtils {
     }
 
     public static void bruteForceFVRerank(List<PropertyDefinition> sp, TopDocs 
docs, IndexSearcher indexSearcher) throws IOException {
-        double farthestDistance = 50d;
+        double distSum = 0d;
+        double counter = 0d;
+        Map<Integer, Double> distances = new HashMap<>();
         int k = 15;
         ScoreDoc inputDoc = docs.scoreDocs[0]; // we assume the input doc is 
the first one returned
         List<Integer> toDiscard = new LinkedList<>();
@@ -247,26 +249,37 @@ public class SimSearchUtils {
                     double[] currentVector = 
toDoubleArray(indexSearcher.doc(docs.scoreDocs[j].doc)
                             .getBinaryValue(fieldName).bytes);
                     double distance = dist(inputVector, currentVector) + 
1e-10; // constant term to avoid division by zero
-
-                    if (distance > farthestDistance) { // a threshold distance 
above which current vector is discarded
-                        toDiscard.add(docs.scoreDocs[j].doc);
-                    }
                     if (Double.isNaN(distance) || Double.isInfinite(distance)) 
{
                         toDiscard.add(docs.scoreDocs[j].doc);
+                    } else {
+                        distSum += distance;
+                        counter++;
+                        distances.put(docs.scoreDocs[j].doc, distance);
+                        docs.scoreDocs[j].score += (float) (1d / distance); // 
additive similarity boosting
                     }
-                    docs.scoreDocs[j].score += (float) (1d / distance); // 
additive similarity boosting
                 }
             }
         }
+
+        // remove docs having invalid distance
         if (!toDiscard.isEmpty()) {
-            docs.scoreDocs = Arrays.stream(docs.scoreDocs).filter(e -> 
!toDiscard.contains(e.doc)).toArray(ScoreDoc[]::new); // remove docs that are 
not close enough
+            docs.scoreDocs = Arrays.stream(docs.scoreDocs).filter(e -> 
!toDiscard.contains(e.doc)).toArray(ScoreDoc[]::new);
         }
-        Arrays.parallelSort(docs.scoreDocs, 0, docs.scoreDocs.length, (o1, o2) 
-> { // rerank scoreDocs
+
+        // remove docs whose distance is one order of magnitude higher than 
average distance
+        final double distanceThreshold = 10 * distSum / counter;
+        docs.scoreDocs = Arrays.stream(docs.scoreDocs).filter(e -> 
distances.containsKey(e.doc) && distances.get(e.doc) < 
distanceThreshold).toArray(ScoreDoc[]::new);
+
+        // rerank scoreDocs
+        Arrays.parallelSort(docs.scoreDocs, 0, docs.scoreDocs.length, (o1, o2) 
-> {
             return -1 * Double.compare(o1.score, o2.score);
         });
+
+        // retain only the top k nearest neighbours
         if (docs.scoreDocs.length > k) {
-            docs.scoreDocs = Arrays.copyOfRange(docs.scoreDocs, 0, k); // 
retain only the top k nearest neighbours
+            docs.scoreDocs = Arrays.copyOfRange(docs.scoreDocs, 0, k);
         }
+
         if (docs.scoreDocs.length > 0) {
             docs.setMaxScore(docs.scoreDocs[0].score);
         }

Modified: 
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java?rev=1847245&r1=1847244&r2=1847245&view=diff
==============================================================================
--- 
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java
 (original)
+++ 
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java
 Fri Nov 23 11:19:06 2018
@@ -393,7 +393,6 @@ public class LucenePropertyIndexTest ext
         root.commit();
 
         String propabQuery = "/jcr:root//element(*, nt:file)";
-        System.out.println(explainXpath(propabQuery));
         assertThat(explainXpath(propabQuery), containsString("nodeType"));
     }
 


Reply via email to