This is an automated email from the ASF dual-hosted git repository. spmallette pushed a commit to branch TINKERPOP-3158 in repository https://gitbox.apache.org/repos/asf/tinkerpop.git
commit 566c19621bd4dc7d534f4e5957736b48b6b19aef Author: Stephen Mallette <[email protected]> AuthorDate: Tue May 6 13:57:52 2025 -0400 Resize index --- .../reference/implementations-tinkergraph.asciidoc | 43 ++++--- .../tinkergraph/structure/TinkerVectorIndex.java | 133 ++++++++++++++++----- .../structure/TinkerGraphVectorIndexTest.java | 85 ++++++++++++- 3 files changed, 208 insertions(+), 53 deletions(-) diff --git a/docs/src/reference/implementations-tinkergraph.asciidoc b/docs/src/reference/implementations-tinkergraph.asciidoc index c5aeeafa49..6cdcfd30eb 100644 --- a/docs/src/reference/implementations-tinkergraph.asciidoc +++ b/docs/src/reference/implementations-tinkergraph.asciidoc @@ -298,8 +298,7 @@ Here's a complete example: [gremlin-groovy] ---- graph.getServiceRegistry().registerService(new TinkerVectorSearchFactory(graph)) <1> -indexConfig = [:] -indexConfig.put(TinkerVectorIndex.CONFIG_DIMENSION, 3) <2> +indexConfig = [dimension: 3] <2> graph.createIndex(TinkerIndexType.VECTOR, "embedding", Vertex.class, indexConfig) g.addV("person").property("name", "Alice").property("embedding", new float[]{1.0f, 0.0f, 0.0f}).iterate() g.addV("person").property("name", "Bob").property("embedding", new float[]{0.0f, 1.0f, 0.0f}).iterate() @@ -322,6 +321,7 @@ Vector indices can also be created for edges: [gremlin-groovy] ---- +graph.getServiceRegistry().registerService(new TinkerVectorSearchFactory(graph)) graph.createIndex(TinkerIndexType.VECTOR, "embedding", Edge.class, indexConfig) alice = g.V().has("name", "Alice").next() bob = g.V().has("name", "Bob").next() @@ -346,9 +346,8 @@ You can specify the distance function when creating the vector index: [gremlin-groovy] ---- -indexConfig = [:] -indexConfig.put(TinkerVectorIndex.CONFIG_DIMENSION, 3) -indexConfig.put(TinkerVectorIndex.CONFIG_DISTANCE_TYPE, TinkerIndexType.Vector.EUCLIDEAN) +graph.getServiceRegistry().registerService(new TinkerVectorSearchFactory(graph)) +indexConfig = [dimension: 3, distanceType: TinkerIndexType.Vector.EUCLIDEAN] graph.createIndex(TinkerIndexType.VECTOR, "embedding", Vertex.class, indexConfig) ---- @@ -358,29 +357,35 @@ These options are specified when creating a vector index: [width="100%",cols="2,5,2",options="header"] |========================================================= |Configuration Option |Description |Default Value -|`CONFIG_DIMENSION` |The dimension of the vector embeddings. This is a required parameter and must match the length of the vector embeddings stored in the graph. |N/A (Required) -|`CONFIG_DISTANCE_TYPE` |The distance function to use for similarity calculations. Must be one of the `TinkerIndexType.Vector` enum values (COSINE, EUCLIDEAN, MANHATTAN, INNER_PRODUCT, BRAY_CURTIS, CANBERRA, CORRELATION). |COSINE -|`CONFIG_M` |The maximum number of connections per node in the HNSW graph. Higher values provide better search quality at the cost of increased memory usage and index build time. |16 -|`CONFIG_EF_CONSTRUCTION` |The size of the dynamic candidate list during index construction. Higher values improve index quality at the cost of longer build times. |200 -|`CONFIG_EF` |The size of the dynamic candidate list during search. Higher values improve search accuracy at the cost of slower search times. |10 -|`CONFIG_MAX_ITEMS` |The maximum number of items that can be stored in the index. |100 +|`dimension` |The dimension of the vector embeddings. This is a required parameter and must match the length of the vector embeddings stored in the graph. |N/A (Required) +|`distanceType` |The distance function to use for similarity calculations. Must be one of the `TinkerIndexType.Vector` enum values (COSINE, EUCLIDEAN, MANHATTAN, INNER_PRODUCT, BRAY_CURTIS, CANBERRA, CORRELATION). |COSINE +|`growthRate`| The rate at which the index will automatically increase in size once it is full. If set to `0` the index will not grow automatically and will throw `SizeLimitExceededException` when its maximum size is reached. |0.10 +|`m` |The maximum number of connections per node in the HNSW graph. Higher values provide better search quality at the cost of increased memory usage and index build time. |16 +|`efConstruction` |The size of the dynamic candidate list during index construction. Higher values improve index quality at the cost of longer build times. |200 +|`ef` |The size of the dynamic candidate list during search. Higher values improve search accuracy at the cost of slower search times. |10 +|`maxItems` |The maximum number of items expected to be stored in the index. Use in conjuction with `growthRate`. |10000 |========================================================= Here's an example of creating a vector index with custom configuration options: [gremlin-groovy] ---- -// create a vector index with custom configuration -indexConfig = [:] -indexConfig.put(TinkerVectorIndex.CONFIG_DIMENSION, 128) -indexConfig.put(TinkerVectorIndex.CONFIG_DISTANCE_TYPE, TinkerIndexType.Vector.COSINE) -indexConfig.put(TinkerVectorIndex.CONFIG_M, 32) -indexConfig.put(TinkerVectorIndex.CONFIG_EF_CONSTRUCTION, 300) -indexConfig.put(TinkerVectorIndex.CONFIG_EF, 20) -indexConfig.put(TinkerVectorIndex.CONFIG_MAX_ITEMS, 1000) +graph.getServiceRegistry().registerService(new TinkerVectorSearchFactory(graph)) +indexConfig = [ + dimension : 128, + distance : TinkerIndexType.Vector.COSINE, + growthRate : 0.15, + m : 32, + efConstruction : 300, + ef : 20, + maxItems : 1000 +] graph.createIndex(TinkerIndexType.VECTOR, "embedding", Vertex.class, indexConfig) ---- +TIP: Constants for all the configuration values can be found in `TinkerVectorIndex`. They are prefixed with "CONFIG_". +For example, "dimension" can be referenced as `TinkerVectorIndex.CONFIG_DIMENSION`. + [[tinkergraph-gremlin-tx]] === Transactions diff --git a/tinkergraph-gremlin/src/main/java/org/apache/tinkerpop/gremlin/tinkergraph/structure/TinkerVectorIndex.java b/tinkergraph-gremlin/src/main/java/org/apache/tinkerpop/gremlin/tinkergraph/structure/TinkerVectorIndex.java index 90317fb70c..becbaad03f 100644 --- a/tinkergraph-gremlin/src/main/java/org/apache/tinkerpop/gremlin/tinkergraph/structure/TinkerVectorIndex.java +++ b/tinkergraph-gremlin/src/main/java/org/apache/tinkerpop/gremlin/tinkergraph/structure/TinkerVectorIndex.java @@ -21,6 +21,7 @@ package org.apache.tinkerpop.gremlin.tinkergraph.structure; import com.github.jelmerk.hnswlib.core.Item; import com.github.jelmerk.hnswlib.core.SearchResult; import com.github.jelmerk.hnswlib.core.hnsw.HnswIndex; +import com.github.jelmerk.hnswlib.core.hnsw.SizeLimitExceededException; import com.github.jelmerk.hnswlib.core.Index; import org.apache.tinkerpop.gremlin.structure.Element; import org.apache.tinkerpop.gremlin.structure.Graph; @@ -44,7 +45,12 @@ final class TinkerVectorIndex<T extends Element> extends AbstractTinkerVectorInd /** * Map of property key to vector index */ - protected Map<String, Index<Object, float[], ElementItem, Float>> vectorIndices = new ConcurrentHashMap<>(); + private final Map<String, Index<Object, float[], ElementItem, Float>> vectorIndices = new ConcurrentHashMap<>(); + + /** + * Map of property key to growth rate + */ + private final Map<String, Double> growthRates = new ConcurrentHashMap<>(); /** * Default M parameter for HNSW index @@ -64,7 +70,12 @@ final class TinkerVectorIndex<T extends Element> extends AbstractTinkerVectorInd /** * Default maximum number of items in the index */ - private static final int DEFAULT_MAX_ITEMS = 100; + private static final int DEFAULT_MAX_ITEMS = 10000; + + /** + * Default growth rate for the index when it reaches capacity (10%) + */ + private static final double DEFAULT_GROWTH_RATE = 0.1; /** * Configuration key for the dimension of the vector @@ -96,6 +107,11 @@ final class TinkerVectorIndex<T extends Element> extends AbstractTinkerVectorInd */ public static final String CONFIG_DISTANCE_FUNCTION = "distanceFunction"; + /** + * Configuration key for the growth rate of the index when it reaches capacity + */ + public static final String CONFIG_GROWTH_RATE = "growthRate"; + /** * Creates a new vector index for the specified graph and element class. * @@ -178,6 +194,15 @@ final class TinkerVectorIndex<T extends Element> extends AbstractTinkerVectorInd } } + double growthRate = DEFAULT_GROWTH_RATE; + if (configuration.containsKey(CONFIG_GROWTH_RATE)) { + final Object growthObj = configuration.get(CONFIG_GROWTH_RATE); + if (growthObj instanceof Number) { + growthRate = ((Number) growthObj).doubleValue(); + } + } + this.growthRates.put(key, growthRate); + // Create a new HNSW index for this property key final Index<Object, float[], ElementItem, Float> index = HnswIndex .newBuilder(dimension, vector.getDistanceFunction(), Float::compare, maxItems) @@ -217,7 +242,8 @@ final class TinkerVectorIndex<T extends Element> extends AbstractTinkerVectorInd final Index<Object, float[], ElementItem, Float> index = this.vectorIndices.get(key); final ElementItem item = new ElementItem(element.id(), vector, element); - index.add(item); + + addWithResize(key, index, item); } /** @@ -291,7 +317,8 @@ final class TinkerVectorIndex<T extends Element> extends AbstractTinkerVectorInd // If the element is not in the index, just ignore the exception } final ElementItem item = new ElementItem(element.id(), newValue, element); - index.add(item); + + addWithResize(key, index, item); } /** @@ -305,37 +332,11 @@ final class TinkerVectorIndex<T extends Element> extends AbstractTinkerVectorInd this.vectorIndices.remove(key); } - this.indexedKeys.remove(key); - } - - /** - * A class that wraps an element with its vector for use in the HNSW index. - */ - private class ElementItem implements Item<Object, float[]>, Serializable { - private final Object id; - private final float[] vector; - private final T element; - - public ElementItem(final Object id, final float[] vector, final T element) { - this.id = id; - this.vector = vector; - this.element = element; - } - - @Override - public Object id() { - return id; - } - - @Override - public float[] vector() { - return vector; + if (this.growthRates.containsKey(key)) { + this.growthRates.remove(key); } - @Override - public int dimensions() { - return vector.length; - } + this.indexedKeys.remove(key); } // AbstractTinkerIndex implementation methods @@ -375,4 +376,70 @@ final class TinkerVectorIndex<T extends Element> extends AbstractTinkerVectorInd updateIndex(key, (float[]) newValue, element); } } + + /** + * Helper method to add an item to the index with automatic resizing if needed. + * + * @param key the property key + * @param index the vector index + * @param item the item to add + */ + private void addWithResize(final String key, final Index<Object, float[], ElementItem, Float> index, + final ElementItem item) { + try { + index.add(item); + } catch (SizeLimitExceededException e) { + // Get the growth rate for this index + final Double growthRate = this.growthRates.getOrDefault(key, 0.0d); + + // If growth rate is 0 or not set, rethrow the exception + if (growthRate <= 0) { + throw e; + } + + // Calculate new size based on growth rate + final int currentSize = ((HnswIndex<Object, float[], ElementItem, Float>) index).getMaxItemCount(); + final int newSize = currentSize + (int) Math.ceil(currentSize * growthRate); + + // Resize the index + ((HnswIndex<Object, float[], ElementItem, Float>) index).resize(newSize); + + // Try adding the item again + index.add(item); + } catch (Exception e) { + // If it's not a size limit exception, rethrow it + throw e; + } + } + + + /** + * A class that wraps an element with its vector for use in the HNSW index. + */ + private class ElementItem implements Item<Object, float[]>, Serializable { + private final Object id; + private final float[] vector; + private final T element; + + public ElementItem(final Object id, final float[] vector, final T element) { + this.id = id; + this.vector = vector; + this.element = element; + } + + @Override + public Object id() { + return id; + } + + @Override + public float[] vector() { + return vector; + } + + @Override + public int dimensions() { + return vector.length; + } + } } diff --git a/tinkergraph-gremlin/src/test/java/org/apache/tinkerpop/gremlin/tinkergraph/structure/TinkerGraphVectorIndexTest.java b/tinkergraph-gremlin/src/test/java/org/apache/tinkerpop/gremlin/tinkergraph/structure/TinkerGraphVectorIndexTest.java index daa4630766..f84806283b 100644 --- a/tinkergraph-gremlin/src/test/java/org/apache/tinkerpop/gremlin/tinkergraph/structure/TinkerGraphVectorIndexTest.java +++ b/tinkergraph-gremlin/src/test/java/org/apache/tinkerpop/gremlin/tinkergraph/structure/TinkerGraphVectorIndexTest.java @@ -26,6 +26,7 @@ import org.junit.Before; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; +import com.github.jelmerk.hnswlib.core.hnsw.SizeLimitExceededException; import java.util.Arrays; import java.util.Collection; @@ -505,6 +506,88 @@ public class TinkerGraphVectorIndexTest { assertEquals(0, nearest.size()); } + @Test + public void shouldGrowIndexWhenCapacityReached() { + // Skip this test for TinkerTransactionGraph as it handles transactions differently + if (graph instanceof TinkerTransactionGraph) { + return; + } + + final GraphTraversalSource g = traversal().with(graph); + + // Create a small index with only 5 items capacity and 50% growth rate + final Map<String, Object> smallIndexConfig = new HashMap<>(indexConfig); + smallIndexConfig.put(TinkerVectorIndex.CONFIG_MAX_ITEMS, 5); + smallIndexConfig.put(TinkerVectorIndex.CONFIG_GROWTH_RATE, 0.5); // 50% growth + + graph.createIndex(TinkerIndexType.VECTOR, "embedding", Vertex.class, smallIndexConfig); + + // Add 5 vertices (fills the index to capacity) + for (int i = 0; i < 5; i++) { + g.addV("person").property("name", "Person" + i) + .property("embedding", new float[]{(float)i, 0.0f, 0.0f}).iterate(); + } + tryCommitChanges(graph); + + // Add one more vertex with a very distinctive vector - this should trigger the index growth + g.addV("person").property("name", "PersonExtra") + .property("embedding", new float[]{10.0f, 0.0f, 0.0f}).iterate(); + tryCommitChanges(graph); + + // Verify we can find all 6 vertices + final List<TinkerVertex> allVertices = graph.findNearestVerticesOnly("embedding", new float[]{0.0f, 0.0f, 0.0f}, 10); + assertEquals(6, allVertices.size()); + + // Verify the extra vertex exists in the results + boolean foundExtra = false; + for (TinkerVertex vertex : allVertices) { + if ("PersonExtra".equals(vertex.value("name"))) { + foundExtra = true; + break; + } + } + assertThat("Should find the extra vertex", foundExtra, is(true)); + } + + @Test + public void shouldThrowExceptionWhenGrowthRateIsZero() { + // Skip this test for TinkerTransactionGraph as it handles transactions differently + if (graph instanceof TinkerTransactionGraph) { + return; + } + + final GraphTraversalSource g = traversal().with(graph); + + // Create a small index with only 5 items capacity and 0 growth rate + final Map<String, Object> smallIndexConfig = new HashMap<>(indexConfig); + smallIndexConfig.put(TinkerVectorIndex.CONFIG_MAX_ITEMS, 5); + smallIndexConfig.put(TinkerVectorIndex.CONFIG_GROWTH_RATE, 0.0); // No growth + + graph.createIndex(TinkerIndexType.VECTOR, "embedding", Vertex.class, smallIndexConfig); + + // Add 5 vertices (fills the index to capacity) + for (int i = 0; i < 5; i++) { + g.addV("person").property("name", "Person" + i) + .property("embedding", new float[]{(float)i, 0.0f, 0.0f}).iterate(); + } + tryCommitChanges(graph); + + try { + // Add one more vertex - this should throw SizeLimitExceededException + g.addV("person").property("name", "PersonExtra") + .property("embedding", new float[]{5.0f, 0.0f, 0.0f}).iterate(); + tryCommitChanges(graph); + fail("Should have thrown SizeLimitExceededException"); + } catch (Exception e) { + // Verify that the exception is caused by SizeLimitExceededException + Throwable cause = e; + while (cause != null && !(cause instanceof SizeLimitExceededException)) { + cause = cause.getCause(); + } + assertNotNull("Expected SizeLimitExceededException", cause); + } + } + private void tryCommitChanges(final Graph graph) { if (graph.features().graph().supportsTransactions()) graph.tx().commit(); @@ -514,4 +597,4 @@ public class TinkerGraphVectorIndexTest { if (graph.features().graph().supportsTransactions()) graph.tx().rollback(); } -} \ No newline at end of file +}
