nitirajrathore commented on a change in pull request #55: URL: https://github.com/apache/lucene/pull/55#discussion_r604607041
########## File path: lucene/test-framework/src/java/org/apache/lucene/util/FullKnn.java ########## @@ -0,0 +1,156 @@ +package org.apache.lucene.util; + +import org.apache.lucene.index.VectorValues; +import org.apache.lucene.util.hnsw.NeighborQueue; + +import java.io.IOException; +import java.nio.ByteOrder; +import java.nio.FloatBuffer; +import java.nio.channels.FileChannel; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; + +/** + * A utility class to calculate the Full KNN / Exact KNN over a set of query vectors and document vectors. + */ +public class FullKnn { + + private final int dim; + private final int topK; + private final VectorValues.SearchStrategy searchStrategy; + private final boolean quiet; + + public FullKnn(int dim, int topK, VectorValues.SearchStrategy searchStrategy, boolean quiet) { + this.dim = dim; + this.topK = topK; + this.searchStrategy = searchStrategy; + this.quiet = quiet; + } + + /** + * internal object to track KNN calculation for one query + */ + private static class KnnJob { + public int currDocIndex; + float[] queryVector; + float[] currDocVector; + int queryIndex; + NeighborQueue queue; + FloatBuffer docVectors; + VectorValues.SearchStrategy searchStrategy; + + public KnnJob(int queryIndex, float[] queryVector, int topK, VectorValues.SearchStrategy searchStrategy) { + this.queryIndex = queryIndex; + this.queryVector = queryVector; + this.currDocVector = new float[queryVector.length]; + queue = new NeighborQueue(topK, searchStrategy.reversed); + this.searchStrategy = searchStrategy; + } + + public void execute() { + while (this.docVectors.hasRemaining()) { + this.docVectors.get(this.currDocVector); + float d = this.searchStrategy.compare(this.queryVector, this.currDocVector); + this.queue.insertWithOverflow(this.currDocIndex, d); + this.currDocIndex++; + } + } + } + + /** + * computes the exact KNN match for each query vector in queryPath for all the document vectors in docPath + * + * @param docPath : path to the file containing the float 32 document vectors in bytes with little-endian byte order + * Throws exception if topK is greater than number of documents in this file + * @param numDocs : number of vectors in the document vector file at docPath + * @param queryPath : path to the file containing the containing 32-bit floating point vectors in little-endian byte order + * @param numIters : number of vectors in the query vector file at queryPath + * @param numThreads : create numThreads to parallelize work + * @return : returns an int 2D array ( int matches[][]) of size 'numIters x topK'. matches[i] is an array containing + * the indexes of the topK most similar document vectors to the ith query vector, and is sorted by similarity, with + * the most similar vector first. Similarity is defined by the searchStrategy used to construct this FullKnn. + * @throws IOException : if topK is greater than number of documents in docPath file Review comment: Here it is topK actually, but I agree other checks like numDocs and numQueries are missing. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org