jtibshirani commented on code in PR #992: URL: https://github.com/apache/lucene/pull/992#discussion_r916054211
########## lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldKnnVectorsFormat.java: ########## @@ -102,9 +104,22 @@ private class FieldsWriter extends KnnVectorsWriter { } @Override - public void writeField(FieldInfo fieldInfo, KnnVectorsReader knnVectorsReader) + public KnnFieldVectorsWriter addField(FieldInfo fieldInfo) throws IOException { + KnnVectorsWriter writer = getInstance(fieldInfo); + return writer.addField(fieldInfo); + } + + @Override + public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { + for (WriterAndSuffix was : formats.values()) { + was.writer.flush(maxDoc, sortMap); + } + } + + @Override + public void mergeOneField(FieldInfo fieldInfo, KnnVectorsReader knnVectorsReader) throws IOException { - getInstance(fieldInfo).writeField(fieldInfo, knnVectorsReader); + getInstance(fieldInfo).mergeOneField(fieldInfo, knnVectorsReader); Review Comment: Small comment, maybe we can throw an `UnsupportedOperationException` here because we expect it never to be called? ########## lucene/core/src/java/org/apache/lucene/codecs/lucene93/Lucene93HnswVectorsWriter.java: ########## @@ -266,65 +470,128 @@ private void writeMeta( } } - private OnHeapHnswGraph writeGraph( - RandomAccessVectorValuesProducer vectorValues, VectorSimilarityFunction similarityFunction) + /** + * Writes the vector values to the output and returns a set of documents that contains vectors. 
+ */ + private static DocsWithFieldSet writeVectorData(IndexOutput output, VectorValues vectors) throws IOException { + DocsWithFieldSet docsWithField = new DocsWithFieldSet(); + for (int docV = vectors.nextDoc(); docV != NO_MORE_DOCS; docV = vectors.nextDoc()) { + // write vector + BytesRef binaryValue = vectors.binaryValue(); + assert binaryValue.length == vectors.dimension() * Float.BYTES; + output.writeBytes(binaryValue.bytes, binaryValue.offset, binaryValue.length); + docsWithField.add(docV); + } + return docsWithField; + } - // build graph - HnswGraphBuilder hnswGraphBuilder = - new HnswGraphBuilder( - vectorValues, similarityFunction, M, beamWidth, HnswGraphBuilder.randSeed); - hnswGraphBuilder.setInfoStream(segmentWriteState.infoStream); - OnHeapHnswGraph graph = hnswGraphBuilder.build(vectorValues.randomAccess()); + @Override + public void close() throws IOException { + IOUtils.close(meta, vectorData, vectorIndex); + } - // write vectors' neighbours on each level into the vectorIndex file - int countOnLevel0 = graph.size(); - for (int level = 0; level < graph.numLevels(); level++) { - int maxConnOnLevel = level == 0 ? 
(M * 2) : M; - NodesIterator nodesOnLevel = graph.getNodesOnLevel(level); - while (nodesOnLevel.hasNext()) { - int node = nodesOnLevel.nextInt(); - NeighborArray neighbors = graph.getNeighbors(level, node); - int size = neighbors.size(); - vectorIndex.writeInt(size); - // Destructively modify; it's ok we are discarding it after this - int[] nnodes = neighbors.node(); - Arrays.sort(nnodes, 0, size); - for (int i = 0; i < size; i++) { - int nnode = nnodes[i]; - assert nnode < countOnLevel0 : "node too large: " + nnode + ">=" + countOnLevel0; - vectorIndex.writeInt(nnode); - } - // if number of connections < maxConn, add bogus values up to maxConn to have predictable - // offsets - for (int i = size; i < maxConnOnLevel; i++) { - vectorIndex.writeInt(0); - } + private static class FieldData extends KnnFieldVectorsWriter { Review Comment: Small comment, we could rename this to `FieldWriter` now since that's its purpose. ########## lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java: ########## @@ -24,28 +24,40 @@ import org.apache.lucene.index.DocIDMerger; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.MergeState; +import org.apache.lucene.index.Sorter; import org.apache.lucene.index.VectorValues; import org.apache.lucene.search.TopDocs; +import org.apache.lucene.util.Accountable; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; /** Writes vectors to an index. 
*/ -public abstract class KnnVectorsWriter implements Closeable { +public abstract class KnnVectorsWriter implements Accountable, Closeable { /** Sole constructor */ protected KnnVectorsWriter() {} - /** Write all values contained in the provided reader */ - public abstract void writeField(FieldInfo fieldInfo, KnnVectorsReader knnVectorsReader) + /** Add new field for indexing */ + public abstract void addField(FieldInfo fieldInfo) throws IOException; + + /** Add new docID with its vector value to the given field for indexing */ + public abstract void addValue(FieldInfo fieldInfo, int docID, float[] vectorValue) + throws IOException; + + /** Flush all buffered data on disk * */ + public abstract void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException; Review Comment: Sorry, right, my suggestion didn't make sense! On a separate note, now that you've refactored things to have `KnnFieldVectorsWriter`, it seems like we should move `flush` to `KnnFieldVectorsWriter`, since that is the object in charge of writing each field's data. Brainstorming, maybe this would help make `Lucene93HnswVectorsWriter` easier to read, because we could separate out the complex sorting logic into a class like `SortingFieldWriter`? ########## lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java: ########## @@ -24,28 +24,40 @@ import org.apache.lucene.index.DocIDMerger; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.MergeState; +import org.apache.lucene.index.Sorter; import org.apache.lucene.index.VectorValues; import org.apache.lucene.search.TopDocs; +import org.apache.lucene.util.Accountable; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; /** Writes vectors to an index. 
*/ -public abstract class KnnVectorsWriter implements Closeable { +public abstract class KnnVectorsWriter implements Accountable, Closeable { /** Sole constructor */ protected KnnVectorsWriter() {} - /** Write all values contained in the provided reader */ - public abstract void writeField(FieldInfo fieldInfo, KnnVectorsReader knnVectorsReader) + /** Add new field for indexing */ + public abstract void addField(FieldInfo fieldInfo) throws IOException; + + /** Add new docID with its vector value to the given field for indexing */ + public abstract void addValue(FieldInfo fieldInfo, int docID, float[] vectorValue) + throws IOException; + + /** Flush all buffered data on disk * */ + public abstract void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException; + + /** Write field for merging */ + public abstract void writeFieldForMerging(FieldInfo fieldInfo, KnnVectorsReader knnVectorsReader) Review Comment: On a related note, maybe it's a bit confusing now that we have two merge-related methods you can override (`mergeOneField` and `merge`). Maybe we could make `merge` final and remove the special handling in `PerFieldKnnVectorsFormat`? I added the ability to override `merge` in https://github.com/apache/lucene/pull/601, but I ended up refactoring it later, so we don't need it anymore. ########## lucene/core/src/java/org/apache/lucene/index/VectorValuesWriter.java: ########## @@ -26,233 +26,153 @@ import org.apache.lucene.codecs.KnnVectorsWriter; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.TopDocs; +import org.apache.lucene.util.Accountable; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.Counter; import org.apache.lucene.util.RamUsageEstimator; /** - * Buffers up pending vector value(s) per doc, then flushes when segment flushes. + * Buffers up pending vector value(s) per doc, then flushes when segment flushes. 
Used for {@code + * SimpleTextKnnVectorsWriter} and for vectors writers before v 9.3 . * * @lucene.experimental */ -class VectorValuesWriter { - - private final FieldInfo fieldInfo; - private final Counter iwBytesUsed; - private final List<float[]> vectors = new ArrayList<>(); - private final DocsWithFieldSet docsWithField; - - private int lastDocID = -1; - - private long bytesUsed; - - VectorValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) { - this.fieldInfo = fieldInfo; - this.iwBytesUsed = iwBytesUsed; - this.docsWithField = new DocsWithFieldSet(); - this.bytesUsed = docsWithField.ramBytesUsed(); - if (iwBytesUsed != null) { - iwBytesUsed.addAndGet(bytesUsed); +public abstract class VectorValuesWriter extends KnnVectorsWriter { Review Comment: I was thinking we would rewrite `SimpleTextKnnVectorsWriter` to implement the new interface directly. For example it'd define its own `KnnFieldVectorsWriter` where `addValue` writes to the vectors data file directly. (When index sorting is enabled, maybe it'd return a special `SortingKnnFieldVectorsWriter` that internally buffers and writes the sorted data?) It seemed most natural/straightforward for this example format to implement the interface directly. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org