msokolov commented on code in PR #12248: URL: https://github.com/apache/lucene/pull/12248#discussion_r1181072749
########## lucene/core/src/test/org/apache/lucene/util/hnsw/HnswGraphTestCase.java: ########## @@ -265,19 +265,34 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { } } + List<Integer> sortedNodesOnLevel(HnswGraph h, int level) throws IOException { + NodesIterator nodesOnLevel = h.getNodesOnLevel(level); + List<Integer> nodes = new ArrayList<>(); + while (nodesOnLevel.hasNext()) { + nodes.add(nodesOnLevel.next()); + } + Collections.sort(nodes); + return nodes; + } + void assertGraphEqual(HnswGraph g, HnswGraph h) throws IOException { - assertEquals("the number of levels in the graphs are different!", g.numLevels(), h.numLevels()); - assertEquals("the number of nodes in the graphs are different!", g.size(), h.size()); + // construct these up front since they call seek which will mess up our test loop + var prettyG = prettyPrint(g); + var prettyH = prettyPrint(h); + var m1 = + "the number of levels in the graphs are different:%n%s%n%s".formatted(prettyG, prettyH); + assertEquals(m1, g.numLevels(), h.numLevels()); Review Comment: style nits: we have had a bunch of discussion about whether to allow `var`. I think in the end we decided not to forbid it, but there's an undercurrent of feeling against it in this project. Anyway in tests I don't think anyone will care that much, but in this case I find it actually makes the assert harder to read: what is m1 anyway. Maybe call it `theNumberOfLevelsDiffer`? Or just inline it again. It's OK to have statements that span multiple lines. I'm also not sure about `formatted`. We enforce the use of Locale with string formatting and I don't know if formatted supports that? ########## lucene/core/src/test/org/apache/lucene/util/hnsw/HnswGraphTestCase.java: ########## @@ -265,19 +265,34 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { } } + List<Integer> sortedNodesOnLevel(HnswGraph h, int level) throws IOException { + NodesIterator nodesOnLevel = h.getNodesOnLevel(level); + List<Integer> nodes = new ArrayList<>(); + while (nodesOnLevel.hasNext()) { + nodes.add(nodesOnLevel.next()); + } + Collections.sort(nodes); + return nodes; + } + void assertGraphEqual(HnswGraph g, HnswGraph h) throws IOException { - assertEquals("the number of levels in the graphs are different!", g.numLevels(), h.numLevels()); - assertEquals("the number of nodes in the graphs are different!", g.size(), h.size()); + // construct these up front since they call seek which will mess up our test loop + var prettyG = prettyPrint(g); + var prettyH = prettyPrint(h); + var m1 = + "the number of levels in the graphs are different:%n%s%n%s".formatted(prettyG, prettyH); + assertEquals(m1, g.numLevels(), h.numLevels()); + var m2 = "the number of nodes in the graphs are different:%n%s%n%s".formatted(prettyG, prettyH); + assertEquals(m2, g.size(), h.size()); // assert equal nodes on each level for (int level = 0; level < g.numLevels(); level++) { - NodesIterator nodesOnLevel = g.getNodesOnLevel(level); - NodesIterator nodesOnLevel2 = h.getNodesOnLevel(level); - while (nodesOnLevel.hasNext() && nodesOnLevel2.hasNext()) { - int node = nodesOnLevel.nextInt(); - int node2 = nodesOnLevel2.nextInt(); - assertEquals("nodes in the graphs are different", node, node2); - } + var hNodes = sortedNodesOnLevel(h, level); + var gNodes = sortedNodesOnLevel(g, level); + var m3 = Review Comment: same - let's not introduce cryptic variable names ########## lucene/core/src/test/org/apache/lucene/util/hnsw/HnswGraphTestCase.java: ########## @@ -607,13 +620,9 @@ private void assertGraphInitializedFromGraph( // assert the nodes from the previous graph are successfully to levels > 0 in the new graph for (int level = 1; level < g.numLevels(); level++) { - NodesIterator nodesOnLevel = g.getNodesOnLevel(level); - NodesIterator nodesOnLevel2 = h.getNodesOnLevel(level); - while (nodesOnLevel.hasNext() && nodesOnLevel2.hasNext()) { - int node = nodesOnLevel.nextInt(); - int node2 = oldToNewOrdMap.get(nodesOnLevel2.nextInt()); - assertEquals("nodes in the graphs are different", node, node2); - } + var nodesOnLevel = sortedNodesOnLevel(g, level); Review Comment: Please use a typed variable here. If nothing else it aids reader comprehension ########## lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java: ########## @@ -161,11 +162,12 @@ public static NeighborQueue search( similarityFunction, new NeighborQueue(topK, true), new SparseFixedBitSet(vectors.size())); - NeighborQueue results; + NeighborQueue results = new NeighborQueue(topK, false); Review Comment: so this pre-allocation/clear is fine. There is a bunch more we could do here; we recreate an HnswGraphSearcher for every call to search but we don't need to. However let's keep the scope of this commit limited and make small progress. If you want to tackle more optos like this, we can do more commits. ########## lucene/core/src/java/org/apache/lucene/util/hnsw/OnHeapHnswGraph.java: ########## @@ -76,7 +77,7 @@ public NeighborArray getNeighbors(int level, int node) { if (level == 0) { return graphLevel0.get(node); } - TreeMap<Integer, NeighborArray> levelMap = graphUpperLevels.get(level); + var levelMap = graphUpperLevels.get(level); Review Comment: please restore the type name ########## lucene/core/src/java/org/apache/lucene/util/hnsw/OnHeapHnswGraph.java: ########## @@ -204,4 +205,10 @@ public long ramBytesUsed() { } return total; } + + @Override + public String toString() { + return "OnHeapHnswGraph(size=%d, numLevels=%d, entryNode=%d)" + .formatted(size(), numLevels, entryNode); Review Comment: We need to use `String.format` with a `Locale` ########## lucene/core/src/java/org/apache/lucene/codecs/lucene95/Lucene95HnswVectorsWriter.java: ########## @@ -678,10 +678,16 @@ private int[][] writeGraph(OnHeapHnswGraph graph) throws IOException { int[][] offsets = new int[graph.numLevels()][]; for (int level = 0; level < graph.numLevels(); level++) { NodesIterator nodesOnLevel = graph.getNodesOnLevel(level); + int[] sortedNodes = new int[nodesOnLevel.size()]; + int n = 0; + while (nodesOnLevel.hasNext()) { + sortedNodes[n++] = nodesOnLevel.nextInt(); + } + Arrays.sort(sortedNodes); offsets[level] = new int[nodesOnLevel.size()]; int nodeOffsetId = 0; - while (nodesOnLevel.hasNext()) { - int node = nodesOnLevel.nextInt(); + for (n = 0; n < sortedNodes.length; n++) { Review Comment: please use a new variable for the loop iteration. Maybe factor out the copy-to-array above into a function? Or use a for-loop to avoid having `n` bleed into the following scope? ########## lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java: ########## @@ -244,7 +254,7 @@ private NeighborQueue searchLevel( if (results.size() >= topK) { minAcceptedSimilarity = results.topScore(); } - while (candidates.size() > 0 && results.incomplete() == false) { + while (candidates.size() > 0 && !results.incomplete()) { Review Comment: please restore the `== false` it's a style quirk that this project prefers. Same above. You just have to live with IntelliJ's annoying warnings -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org