jpountz commented on a change in pull request #601:
URL: https://github.com/apache/lucene/pull/601#discussion_r783701570
##########
File path: lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java
##########
@@ -49,7 +43,7 @@ public abstract void writeField(FieldInfo fieldInfo,
KnnVectorsReader knnVectors
public abstract void finish() throws IOException;
/** Merge the vector values from multiple segments, for all fields */
- public void merge(MergeState mergeState) throws IOException {
+ public final void merge(MergeState mergeState) throws IOException {
Review comment:
I suggest we do not make it final for consistency with other file
formats.
##########
File path:
lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90HnswVectorsWriter.java
##########
@@ -145,6 +139,64 @@ public void writeField(FieldInfo fieldInfo,
KnnVectorsReader knnVectorsReader)
throw new IllegalArgumentException(
"Indexing an HNSW graph requires a random access vector values, got
" + vectors);
}
+
+ long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset;
+ long vectorIndexLength = vectorIndex.getFilePointer() - vectorIndexOffset;
+ writeMeta(
+ fieldInfo,
+ vectorDataOffset,
+ vectorDataLength,
+ vectorIndexOffset,
+ vectorIndexLength,
+ count,
+ docIds);
+ writeGraphOffsets(meta, offsets);
+ }
+
+ @Override
+ public void mergeField(FieldInfo fieldInfo, MergeState mergeState) throws
IOException {
+ if (mergeState.infoStream.isEnabled("VV")) {
+ mergeState.infoStream.message("VV", "merging " + mergeState.segmentInfo);
+ }
+
+ writeVectorDataPadding();
+ long vectorDataOffset = vectorData.getFilePointer();
+
+ // write the merged vector data to a temporary file
+ VectorValues vectors = MergedVectorValues.mergeVectorValues(fieldInfo,
mergeState);
+ IndexOutput tempVectorData =
+ segmentWriteState.directory.createTempOutput(
+ vectorData.getName(), "temp", segmentWriteState.context);
+ int[] docIds = writeVectorData(tempVectorData, vectors);
+ tempVectorData.close();
+
+ // copy the temporary file vectors to the actual data file
+ IndexInput vectorDataInput =
+ segmentWriteState.directory.openInput(tempVectorData.getName(),
segmentWriteState.context);
+ vectorData.copyBytes(vectorDataInput, vectorDataInput.length());
+
+ // build the graph using the temporary vector data
+ Lucene90HnswVectorsReader.OffHeapVectorValues offHeapVectors =
+ new Lucene90HnswVectorsReader.OffHeapVectorValues(
+ vectors.dimension(), docIds, vectorDataInput);
+ // count may be < vectors.size() e,g, if some documents were deleted
+ int count = docIds.length;
+ long[] offsets = new long[count];
+ long vectorIndexOffset = vectorIndex.getFilePointer();
+ writeGraph(
+ vectorIndex,
+ offHeapVectors,
+ fieldInfo.getVectorSimilarityFunction(),
+ vectorIndexOffset,
+ offsets,
+ count,
+ maxConn,
+ beamWidth);
+
+ vectorDataInput.close();
+ segmentWriteState.directory.deleteFile(tempVectorData.getName());
Review comment:
can you do this in a finally block to make sure we close the input and
remove the file on all code paths?
##########
File path:
lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90HnswVectorsWriter.java
##########
@@ -145,6 +139,64 @@ public void writeField(FieldInfo fieldInfo,
KnnVectorsReader knnVectorsReader)
throw new IllegalArgumentException(
"Indexing an HNSW graph requires a random access vector values, got
" + vectors);
}
+
+ long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset;
+ long vectorIndexLength = vectorIndex.getFilePointer() - vectorIndexOffset;
+ writeMeta(
+ fieldInfo,
+ vectorDataOffset,
+ vectorDataLength,
+ vectorIndexOffset,
+ vectorIndexLength,
+ count,
+ docIds);
+ writeGraphOffsets(meta, offsets);
+ }
+
+ @Override
+ public void mergeField(FieldInfo fieldInfo, MergeState mergeState) throws
IOException {
+ if (mergeState.infoStream.isEnabled("VV")) {
+ mergeState.infoStream.message("VV", "merging " + mergeState.segmentInfo);
+ }
+
+ writeVectorDataPadding();
+ long vectorDataOffset = vectorData.getFilePointer();
+
+ // write the merged vector data to a temporary file
+ VectorValues vectors = MergedVectorValues.mergeVectorValues(fieldInfo,
mergeState);
+ IndexOutput tempVectorData =
Review comment:
We could avoid writing a temporary file by overriding `merge` rathen
than `mergeField`, first write merged vectors for all fields, and then compute
HNSW graphs for all fields but if it makes things more complicated I prefer
that we use a temporary file instead.
##########
File path:
lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90HnswVectorsWriter.java
##########
@@ -145,6 +139,64 @@ public void writeField(FieldInfo fieldInfo,
KnnVectorsReader knnVectorsReader)
throw new IllegalArgumentException(
"Indexing an HNSW graph requires a random access vector values, got
" + vectors);
}
+
+ long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset;
+ long vectorIndexLength = vectorIndex.getFilePointer() - vectorIndexOffset;
+ writeMeta(
+ fieldInfo,
+ vectorDataOffset,
+ vectorDataLength,
+ vectorIndexOffset,
+ vectorIndexLength,
+ count,
+ docIds);
+ writeGraphOffsets(meta, offsets);
+ }
+
+ @Override
+ public void mergeField(FieldInfo fieldInfo, MergeState mergeState) throws
IOException {
+ if (mergeState.infoStream.isEnabled("VV")) {
+ mergeState.infoStream.message("VV", "merging " + mergeState.segmentInfo);
+ }
+
+ writeVectorDataPadding();
+ long vectorDataOffset = vectorData.getFilePointer();
+
+ // write the merged vector data to a temporary file
+ VectorValues vectors = MergedVectorValues.mergeVectorValues(fieldInfo,
mergeState);
+ IndexOutput tempVectorData =
+ segmentWriteState.directory.createTempOutput(
+ vectorData.getName(), "temp", segmentWriteState.context);
+ int[] docIds = writeVectorData(tempVectorData, vectors);
+ tempVectorData.close();
+
+ // copy the temporary file vectors to the actual data file
+ IndexInput vectorDataInput =
+ segmentWriteState.directory.openInput(tempVectorData.getName(),
segmentWriteState.context);
+ vectorData.copyBytes(vectorDataInput, vectorDataInput.length());
+
+ // build the graph using the temporary vector data
+ Lucene90HnswVectorsReader.OffHeapVectorValues offHeapVectors =
+ new Lucene90HnswVectorsReader.OffHeapVectorValues(
+ vectors.dimension(), docIds, vectorDataInput);
+ // count may be < vectors.size() e,g, if some documents were deleted
Review comment:
Hmm I would have assumed that deleted docs would no longer be a
possibility with this new approach.
##########
File path: lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java
##########
@@ -247,11 +188,6 @@ public BytesRef binaryValue() throws IOException {
return current.values.binaryValue();
}
- @Override
- public RandomAccessVectorValues randomAccess() {
- return new MergerRandomAccess();
- }
Review comment:
\o/
##########
File path:
lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90HnswVectorsWriter.java
##########
@@ -145,6 +139,64 @@ public void writeField(FieldInfo fieldInfo,
KnnVectorsReader knnVectorsReader)
throw new IllegalArgumentException(
"Indexing an HNSW graph requires a random access vector values, got
" + vectors);
}
+
+ long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset;
+ long vectorIndexLength = vectorIndex.getFilePointer() - vectorIndexOffset;
+ writeMeta(
+ fieldInfo,
+ vectorDataOffset,
+ vectorDataLength,
+ vectorIndexOffset,
+ vectorIndexLength,
+ count,
+ docIds);
+ writeGraphOffsets(meta, offsets);
+ }
+
+ @Override
+ public void mergeField(FieldInfo fieldInfo, MergeState mergeState) throws
IOException {
+ if (mergeState.infoStream.isEnabled("VV")) {
+ mergeState.infoStream.message("VV", "merging " + mergeState.segmentInfo);
+ }
+
+ writeVectorDataPadding();
+ long vectorDataOffset = vectorData.getFilePointer();
+
+ // write the merged vector data to a temporary file
+ VectorValues vectors = MergedVectorValues.mergeVectorValues(fieldInfo,
mergeState);
+ IndexOutput tempVectorData =
+ segmentWriteState.directory.createTempOutput(
+ vectorData.getName(), "temp", segmentWriteState.context);
+ int[] docIds = writeVectorData(tempVectorData, vectors);
+ tempVectorData.close();
+
+ // copy the temporary file vectors to the actual data file
+ IndexInput vectorDataInput =
+ segmentWriteState.directory.openInput(tempVectorData.getName(),
segmentWriteState.context);
+ vectorData.copyBytes(vectorDataInput, vectorDataInput.length());
Review comment:
Out of paranoia, I would add codec headers/footers to this file (with
padding after the header to keep things aligned) and verify the checksum here
in order to detecte misbehaving disks or file systems.
##########
File path:
lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90HnswVectorsWriter.java
##########
@@ -155,6 +207,37 @@ public void writeField(FieldInfo fieldInfo,
KnnVectorsReader knnVectorsReader)
count,
docIds);
writeGraphOffsets(meta, offsets);
+ if (mergeState.infoStream.isEnabled("VV")) {
+ mergeState.infoStream.message("VV", "merge done " +
mergeState.segmentInfo);
+ }
+ }
+
+ private void writeVectorDataPadding() throws IOException {
+ long pos = vectorData.getFilePointer();
+ // write floats aligned at 4 bytes. This will not survive CFS, but it
shows a small benefit when
+ // CFS is not used, eg for larger indexes
Review comment:
This comment is no longer correct since @uschindler fixed CFS to align
sub files at 8 bytes. https://issues.apache.org/jira/browse/LUCENE-10019. Let's
remove this method and use `IndexOutput#alignFilePointer` instead?
##########
File path:
lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90HnswVectorsWriter.java
##########
@@ -145,6 +139,64 @@ public void writeField(FieldInfo fieldInfo,
KnnVectorsReader knnVectorsReader)
throw new IllegalArgumentException(
"Indexing an HNSW graph requires a random access vector values, got
" + vectors);
}
+
+ long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset;
+ long vectorIndexLength = vectorIndex.getFilePointer() - vectorIndexOffset;
+ writeMeta(
+ fieldInfo,
+ vectorDataOffset,
+ vectorDataLength,
+ vectorIndexOffset,
+ vectorIndexLength,
+ count,
+ docIds);
+ writeGraphOffsets(meta, offsets);
+ }
+
+ @Override
+ public void mergeField(FieldInfo fieldInfo, MergeState mergeState) throws
IOException {
Review comment:
I wonder if this is the way that we should implement `writeField`
instead, and rely on the default merge implementation. I don't think that it
would slow down flushing of vectors, and then this codec would no longer need
input vectors to implement random access. I'm also fine if you would rather
like to do this as a follow-up.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]