Author: tommaso
Date: Thu Nov 22 15:55:25 2018
New Revision: 1847180
URL: http://svn.apache.org/viewvc?rev=1847180&view=rev
Log:
OAK-7824 - enable reranking by similarity fvs
Modified:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java
Modified:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java?rev=1847180&r1=1847179&r2=1847180&view=diff
==============================================================================
---
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java
(original)
+++
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java
Thu Nov 22 15:55:25 2018
@@ -18,10 +18,7 @@ package org.apache.jackrabbit.oak.plugin
import java.io.IOException;
import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.LinkedList;
-import java.util.List;
+import java.util.*;
import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
import org.apache.jackrabbit.oak.plugins.index.search.PropertyDefinition;
@@ -53,7 +50,7 @@ public class SimSearchUtils {
private static final Logger log =
LoggerFactory.getLogger(SimSearchUtils.class);
public static String toDoubleString(byte[] bytes) {
- Double[] a = toDoubleArray(bytes);
+ double[] a = toDoubleArray(bytes);
StringBuilder builder = new StringBuilder();
for (Double d : a) {
if (builder.length() > 0) {
@@ -64,11 +61,6 @@ public class SimSearchUtils {
return builder.toString();
}
- private static Double[] toDoubleArray(byte[] array) {
- List<Double> doubles = toDoubles(array);
- return doubles.toArray(new Double[doubles.size()]);
- }
-
public static List<Double> toDoubles(byte[] array) {
int blockSize = Double.SIZE / Byte.SIZE;
ByteBuffer wrap = ByteBuffer.wrap(array);
@@ -81,6 +73,18 @@ public class SimSearchUtils {
return doubles;
}
+ private static double[] toDoubleArray(byte[] array) {
+ int blockSize = Double.SIZE / Byte.SIZE;
+ ByteBuffer wrap = ByteBuffer.wrap(array);
+ int capacity = array.length / blockSize;
+ double[] doubles = new double[capacity];
+ for (int i = 0; i < capacity; i++) {
+ double e = wrap.getDouble(i * blockSize);
+ doubles[i] = e;
+ }
+ return doubles;
+ }
+
private static Collection<BytesRef> getTokens(Analyzer analyzer, String
field, String sampleTextString) throws IOException {
Collection<BytesRef> tokens = new LinkedList<>();
TokenStream ts = analyzer.tokenStream(field, sampleTextString);
@@ -153,6 +157,8 @@ public class SimSearchUtils {
Query simQuery =
SimSearchUtils.getSimQuery(analyzer, similarityFieldName, fvString);
booleanQuery.add(new BooleanClause(simQuery,
SHOULD));
log.trace("similarity query generated for {}",
pd.name);
+ } else {
+ log.warn("could not create query for similarity
field {}", fvString);
}
}
}
@@ -227,4 +233,49 @@ public class SimSearchUtils {
return 1;
}
-}
+ public static void bruteForceFVRerank(List<PropertyDefinition> sp, TopDocs
docs, IndexSearcher indexSearcher) throws IOException {
+ double farthestDistance = 50d;
+ int k = 15;
+ ScoreDoc inputDoc = docs.scoreDocs[0]; // we assume the input doc is
the first one returned
+ List<Integer> toDiscard = new LinkedList<>();
+ for (PropertyDefinition pd : sp) {
+ String fieldName =
FieldNames.createBinSimilarityFieldName(pd.name);
+ BytesRef binaryValue =
indexSearcher.doc(inputDoc.doc).getBinaryValue(fieldName);
+ double[] inputVector = toDoubleArray(binaryValue.bytes);
+ for (int j = 0; j < docs.scoreDocs.length; j++) {
+ double[] currentVector =
toDoubleArray(indexSearcher.doc(docs.scoreDocs[j].doc)
+ .getBinaryValue(fieldName).bytes);
+ double distance = dist(inputVector, currentVector) + 1e-10; //
constant term to avoid division by zero
+
+ if (distance > farthestDistance) { // a threshold distance
above which current vector is discarded
+ toDiscard.add(docs.scoreDocs[j].doc);
+ }
+ if (Double.isNaN(distance) || Double.isInfinite(distance)) {
+ toDiscard.add(docs.scoreDocs[j].doc);
+ }
+ docs.scoreDocs[j].score += (float) (1d / distance); //
additive similarity boosting
+ }
+ }
+ if (!toDiscard.isEmpty()) {
+ docs.scoreDocs = Arrays.stream(docs.scoreDocs).filter(e ->
!toDiscard.contains(e.doc)).toArray(ScoreDoc[]::new); // remove docs that are
not close enough
+ }
+ Arrays.parallelSort(docs.scoreDocs, 0, docs.scoreDocs.length, (o1, o2)
-> { // rerank scoreDocs
+ return -1 * Double.compare(o1.score, o2.score);
+ });
+ if (docs.scoreDocs.length > k) {
+ docs.scoreDocs = Arrays.copyOfRange(docs.scoreDocs, 0, k); //
retain only the top k nearest neighbours
+ }
+ if (docs.scoreDocs.length > 0) {
+ docs.setMaxScore(docs.scoreDocs[0].score);
+ }
+ }
+
+ private static double dist(double[] x, double[] y) { // euclidean distance
+ double d = 0;
+ for (int i = 0; i < x.length; i++) {
+ d += Math.pow(y[i] - x[i], 2);
+ }
+ return Math.sqrt(d);
+ }
+
+}
\ No newline at end of file