This is an automated email from the ASF dual-hosted git repository. spmallette pushed a commit to branch TINKERPOP-3158 in repository https://gitbox.apache.org/repos/asf/tinkerpop.git
commit c9eb7e8fa1a19e422c445c047b564b03f9104abb Author: Stephen Mallette <stepm...@amazon.com> AuthorDate: Tue May 6 13:57:52 2025 -0400 Resize index --- .../reference/implementations-tinkergraph.asciidoc | 43 ++++--- docs/src/upgrade/release-3.8.x.asciidoc | 7 +- .../tinkergraph/structure/TinkerVectorIndex.java | 133 ++++++++++++++++----- .../structure/TinkerGraphVectorIndexTest.java | 85 ++++++++++++- 4 files changed, 212 insertions(+), 56 deletions(-) diff --git a/docs/src/reference/implementations-tinkergraph.asciidoc b/docs/src/reference/implementations-tinkergraph.asciidoc index a3be4ec1c9..35564d7842 100644 --- a/docs/src/reference/implementations-tinkergraph.asciidoc +++ b/docs/src/reference/implementations-tinkergraph.asciidoc @@ -281,8 +281,7 @@ Here's a complete example: [gremlin-groovy] ---- graph.getServiceRegistry().registerService(new TinkerVectorSearchFactory(graph)) <1> -indexConfig = [:] -indexConfig.put(TinkerVectorIndex.CONFIG_DIMENSION, 3) <2> +indexConfig = [dimension: 3] <2> graph.createIndex(TinkerIndexType.VECTOR, "embedding", Vertex.class, indexConfig) g.addV("person").property("name", "Alice").property("embedding", new float[]{1.0f, 0.0f, 0.0f}).iterate() g.addV("person").property("name", "Bob").property("embedding", new float[]{0.0f, 1.0f, 0.0f}).iterate() @@ -305,6 +304,7 @@ Vector indices can also be created for edges: [gremlin-groovy] ---- +graph.getServiceRegistry().registerService(new TinkerVectorSearchFactory(graph)) graph.createIndex(TinkerIndexType.VECTOR, "embedding", Edge.class, indexConfig) alice = g.V().has("name", "Alice").next() bob = g.V().has("name", "Bob").next() @@ -329,9 +329,8 @@ You can specify the distance function when creating the vector index: [gremlin-groovy] ---- -indexConfig = [:] -indexConfig.put(TinkerVectorIndex.CONFIG_DIMENSION, 3) -indexConfig.put(TinkerVectorIndex.CONFIG_DISTANCE_TYPE, TinkerIndexType.Vector.EUCLIDEAN) +graph.getServiceRegistry().registerService(new TinkerVectorSearchFactory(graph)) +indexConfig = [dimension: 3, distanceType: TinkerIndexType.Vector.EUCLIDEAN] graph.createIndex(TinkerIndexType.VECTOR, "embedding", Vertex.class, indexConfig) ---- @@ -341,29 +340,35 @@ These options are specified when creating a vector index: [width="100%",cols="2,5,2",options="header"] |========================================================= |Configuration Option |Description |Default Value -|`CONFIG_DIMENSION` |The dimension of the vector embeddings. This is a required parameter and must match the length of the vector embeddings stored in the graph. |N/A (Required) -|`CONFIG_DISTANCE_TYPE` |The distance function to use for similarity calculations. Must be one of the `TinkerIndexType.Vector` enum values (COSINE, EUCLIDEAN, MANHATTAN, INNER_PRODUCT, BRAY_CURTIS, CANBERRA, CORRELATION). |COSINE -|`CONFIG_M` |The maximum number of connections per node in the HNSW graph. Higher values provide better search quality at the cost of increased memory usage and index build time. |16 -|`CONFIG_EF_CONSTRUCTION` |The size of the dynamic candidate list during index construction. Higher values improve index quality at the cost of longer build times. |200 -|`CONFIG_EF` |The size of the dynamic candidate list during search. Higher values improve search accuracy at the cost of slower search times. |10 -|`CONFIG_MAX_ITEMS` |The maximum number of items that can be stored in the index. |100 +|`dimension` |The dimension of the vector embeddings. This is a required parameter and must match the length of the vector embeddings stored in the graph. |N/A (Required) +|`distanceType` |The distance function to use for similarity calculations. Must be one of the `TinkerIndexType.Vector` enum values (COSINE, EUCLIDEAN, MANHATTAN, INNER_PRODUCT, BRAY_CURTIS, CANBERRA, CORRELATION). |COSINE +|`growthRate`| The rate at which the index will automatically increase in size once it is full. If set to `0` the index will not grow automatically and will throw `SizeLimitExceededException` when its maximum size is reached. |0.10 +|`m` |The maximum number of connections per node in the HNSW graph. Higher values provide better search quality at the cost of increased memory usage and index build time. |16 +|`efConstruction` |The size of the dynamic candidate list during index construction. Higher values improve index quality at the cost of longer build times. |200 +|`ef` |The size of the dynamic candidate list during search. Higher values improve search accuracy at the cost of slower search times. |10 +|`maxItems` |The maximum number of items expected to be stored in the index. Use in conjuction with `growthRate`. |10000 |========================================================= Here's an example of creating a vector index with custom configuration options: [gremlin-groovy] ---- -// create a vector index with custom configuration -indexConfig = [:] -indexConfig.put(TinkerVectorIndex.CONFIG_DIMENSION, 128) -indexConfig.put(TinkerVectorIndex.CONFIG_DISTANCE_TYPE, TinkerIndexType.Vector.COSINE) -indexConfig.put(TinkerVectorIndex.CONFIG_M, 32) -indexConfig.put(TinkerVectorIndex.CONFIG_EF_CONSTRUCTION, 300) -indexConfig.put(TinkerVectorIndex.CONFIG_EF, 20) -indexConfig.put(TinkerVectorIndex.CONFIG_MAX_ITEMS, 1000) +graph.getServiceRegistry().registerService(new TinkerVectorSearchFactory(graph)) +indexConfig = [ + dimension : 128, + distance : TinkerIndexType.Vector.COSINE, + growthRate : 0.15, + m : 32, + efConstruction : 300, + ef : 20, + maxItems : 1000 +] graph.createIndex(TinkerIndexType.VECTOR, "embedding", Vertex.class, indexConfig) ---- +TIP: Constants for all the configuration values can be found in `TinkerVectorIndex`. They are prefixed with "CONFIG_". +For example, "dimension" can be referenced as `TinkerVectorIndex.CONFIG_DIMENSION`. + [[tinkergraph-gremlin-tx]] === Transactions diff --git a/docs/src/upgrade/release-3.8.x.asciidoc b/docs/src/upgrade/release-3.8.x.asciidoc index 415c1094dc..0604fb8fa9 100644 --- a/docs/src/upgrade/release-3.8.x.asciidoc +++ b/docs/src/upgrade/release-3.8.x.asciidoc @@ -200,7 +200,7 @@ link:https://issues.apache.org/jira/browse/TINKERPOP-3047[TINKERPOP-3047] The `SeedStrategy` public constructor has been removed for Java and has been replaced by the builder pattern common to all strategies. This change was made to ensure that the `SeedStrategy` could be constructed in a consistent manner. -=== TinkerGraph Vector Index +==== TinkerGraph Vector Index Vector search enables finding similar items based on their semantic meaning rather than exact keyword matches, which is essential for modern applications like recommendation systems, image search, and natural language processing. @@ -222,8 +222,7 @@ g = traversal().with(graph) graph.getServiceRegistry().registerService(new TinkerVectorSearchFactory(graph)) // Create a vector index with dimension 3 -indexConfig = [:] -indexConfig.put(TinkerVectorIndex.CONFIG_DIMENSION, 3) +indexConfig = [dimension: 3] graph.createIndex(TinkerIndexType.VECTOR, "embedding", Vertex.class, indexConfig) // Add vertices with vector embeddings @@ -244,6 +243,8 @@ The `call()` step returns a list of maps, each containing the matching element a ==>[distance:1.0, element:v[3]] ---- +See: link:https://issues.apache.org/jira/browse/TINKERPOP-3158[TINKERPOP-3158] + ==== Improved Translators The various Java `Translator` implementations allowing conversion of Gremlin traversals to string forms in various diff --git a/tinkergraph-gremlin/src/main/java/org/apache/tinkerpop/gremlin/tinkergraph/structure/TinkerVectorIndex.java b/tinkergraph-gremlin/src/main/java/org/apache/tinkerpop/gremlin/tinkergraph/structure/TinkerVectorIndex.java index 90317fb70c..becbaad03f 100644 --- a/tinkergraph-gremlin/src/main/java/org/apache/tinkerpop/gremlin/tinkergraph/structure/TinkerVectorIndex.java +++ b/tinkergraph-gremlin/src/main/java/org/apache/tinkerpop/gremlin/tinkergraph/structure/TinkerVectorIndex.java @@ -21,6 +21,7 @@ package org.apache.tinkerpop.gremlin.tinkergraph.structure; import com.github.jelmerk.hnswlib.core.Item; import com.github.jelmerk.hnswlib.core.SearchResult; import com.github.jelmerk.hnswlib.core.hnsw.HnswIndex; +import com.github.jelmerk.hnswlib.core.hnsw.SizeLimitExceededException; import com.github.jelmerk.hnswlib.core.Index; import org.apache.tinkerpop.gremlin.structure.Element; import org.apache.tinkerpop.gremlin.structure.Graph; @@ -44,7 +45,12 @@ final class TinkerVectorIndex<T extends Element> extends AbstractTinkerVectorInd /** * Map of property key to vector index */ - protected Map<String, Index<Object, float[], ElementItem, Float>> vectorIndices = new ConcurrentHashMap<>(); + private final Map<String, Index<Object, float[], ElementItem, Float>> vectorIndices = new ConcurrentHashMap<>(); + + /** + * Map of property key to growth rate + */ + private final Map<String, Double> growthRates = new ConcurrentHashMap<>(); /** * Default M parameter for HNSW index @@ -64,7 +70,12 @@ final class TinkerVectorIndex<T extends Element> extends AbstractTinkerVectorInd /** * Default maximum number of items in the index */ - private static final int DEFAULT_MAX_ITEMS = 100; + private static final int DEFAULT_MAX_ITEMS = 10000; + + /** + * Default growth rate for the index when it reaches capacity (10%) + */ + private static final double DEFAULT_GROWTH_RATE = 0.1; /** * Configuration key for the dimension of the vector @@ -96,6 +107,11 @@ final class TinkerVectorIndex<T extends Element> extends AbstractTinkerVectorInd */ public static final String CONFIG_DISTANCE_FUNCTION = "distanceFunction"; + /** + * Configuration key for the growth rate of the index when it reaches capacity + */ + public static final String CONFIG_GROWTH_RATE = "growthRate"; + /** * Creates a new vector index for the specified graph and element class. * @@ -178,6 +194,15 @@ final class TinkerVectorIndex<T extends Element> extends AbstractTinkerVectorInd } } + double growthRate = DEFAULT_GROWTH_RATE; + if (configuration.containsKey(CONFIG_GROWTH_RATE)) { + final Object growthObj = configuration.get(CONFIG_GROWTH_RATE); + if (growthObj instanceof Number) { + growthRate = ((Number) growthObj).doubleValue(); + } + } + this.growthRates.put(key, growthRate); + // Create a new HNSW index for this property key final Index<Object, float[], ElementItem, Float> index = HnswIndex .newBuilder(dimension, vector.getDistanceFunction(), Float::compare, maxItems) @@ -217,7 +242,8 @@ final class TinkerVectorIndex<T extends Element> extends AbstractTinkerVectorInd final Index<Object, float[], ElementItem, Float> index = this.vectorIndices.get(key); final ElementItem item = new ElementItem(element.id(), vector, element); - index.add(item); + + addWithResize(key, index, item); } /** @@ -291,7 +317,8 @@ final class TinkerVectorIndex<T extends Element> extends AbstractTinkerVectorInd // If the element is not in the index, just ignore the exception } final ElementItem item = new ElementItem(element.id(), newValue, element); - index.add(item); + + addWithResize(key, index, item); } /** @@ -305,37 +332,11 @@ final class TinkerVectorIndex<T extends Element> extends AbstractTinkerVectorInd this.vectorIndices.remove(key); } - this.indexedKeys.remove(key); - } - - /** - * A class that wraps an element with its vector for use in the HNSW index. - */ - private class ElementItem implements Item<Object, float[]>, Serializable { - private final Object id; - private final float[] vector; - private final T element; - - public ElementItem(final Object id, final float[] vector, final T element) { - this.id = id; - this.vector = vector; - this.element = element; - } - - @Override - public Object id() { - return id; - } - - @Override - public float[] vector() { - return vector; + if (this.growthRates.containsKey(key)) { + this.growthRates.remove(key); } - @Override - public int dimensions() { - return vector.length; - } + this.indexedKeys.remove(key); } // AbstractTinkerIndex implementation methods @@ -375,4 +376,70 @@ final class TinkerVectorIndex<T extends Element> extends AbstractTinkerVectorInd updateIndex(key, (float[]) newValue, element); } } + + /** + * Helper method to add an item to the index with automatic resizing if needed. + * + * @param key the property key + * @param index the vector index + * @param item the item to add + */ + private void addWithResize(final String key, final Index<Object, float[], ElementItem, Float> index, + final ElementItem item) { + try { + index.add(item); + } catch (SizeLimitExceededException e) { + // Get the growth rate for this index + final Double growthRate = this.growthRates.getOrDefault(key, 0.0d); + + // If growth rate is 0 or not set, rethrow the exception + if (growthRate <= 0) { + throw e; + } + + // Calculate new size based on growth rate + final int currentSize = ((HnswIndex<Object, float[], ElementItem, Float>) index).getMaxItemCount(); + final int newSize = currentSize + (int) Math.ceil(currentSize * growthRate); + + // Resize the index + ((HnswIndex<Object, float[], ElementItem, Float>) index).resize(newSize); + + // Try adding the item again + index.add(item); + } catch (Exception e) { + // If it's not a size limit exception, rethrow it + throw e; + } + } + + + /** + * A class that wraps an element with its vector for use in the HNSW index. + */ + private class ElementItem implements Item<Object, float[]>, Serializable { + private final Object id; + private final float[] vector; + private final T element; + + public ElementItem(final Object id, final float[] vector, final T element) { + this.id = id; + this.vector = vector; + this.element = element; + } + + @Override + public Object id() { + return id; + } + + @Override + public float[] vector() { + return vector; + } + + @Override + public int dimensions() { + return vector.length; + } + } } diff --git a/tinkergraph-gremlin/src/test/java/org/apache/tinkerpop/gremlin/tinkergraph/structure/TinkerGraphVectorIndexTest.java b/tinkergraph-gremlin/src/test/java/org/apache/tinkerpop/gremlin/tinkergraph/structure/TinkerGraphVectorIndexTest.java index daa4630766..f84806283b 100644 --- a/tinkergraph-gremlin/src/test/java/org/apache/tinkerpop/gremlin/tinkergraph/structure/TinkerGraphVectorIndexTest.java +++ b/tinkergraph-gremlin/src/test/java/org/apache/tinkerpop/gremlin/tinkergraph/structure/TinkerGraphVectorIndexTest.java @@ -26,6 +26,7 @@ import org.junit.Before; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; +import com.github.jelmerk.hnswlib.core.hnsw.SizeLimitExceededException; import java.util.Arrays; import java.util.Collection; @@ -505,6 +506,88 @@ public class TinkerGraphVectorIndexTest { assertEquals(0, nearest.size()); } + @Test + public void shouldGrowIndexWhenCapacityReached() { + // Skip this test for TinkerTransactionGraph as it handles transactions differently + if (graph instanceof TinkerTransactionGraph) { + return; + } + + final GraphTraversalSource g = traversal().with(graph); + + // Create a small index with only 5 items capacity and 50% growth rate + final Map<String, Object> smallIndexConfig = new HashMap<>(indexConfig); + smallIndexConfig.put(TinkerVectorIndex.CONFIG_MAX_ITEMS, 5); + smallIndexConfig.put(TinkerVectorIndex.CONFIG_GROWTH_RATE, 0.5); // 50% growth + + graph.createIndex(TinkerIndexType.VECTOR, "embedding", Vertex.class, smallIndexConfig); + + // Add 5 vertices (fills the index to capacity) + for (int i = 0; i < 5; i++) { + g.addV("person").property("name", "Person" + i) + .property("embedding", new float[]{(float)i, 0.0f, 0.0f}).iterate(); + } + tryCommitChanges(graph); + + // Add one more vertex with a very distinctive vector - this should trigger the index growth + g.addV("person").property("name", "PersonExtra") + .property("embedding", new float[]{10.0f, 0.0f, 0.0f}).iterate(); + tryCommitChanges(graph); + + // Verify we can find all 6 vertices + final List<TinkerVertex> allVertices = graph.findNearestVerticesOnly("embedding", new float[]{0.0f, 0.0f, 0.0f}, 10); + assertEquals(6, allVertices.size()); + + // Verify the extra vertex exists in the results + boolean foundExtra = false; + for (TinkerVertex vertex : allVertices) { + if ("PersonExtra".equals(vertex.value("name"))) { + foundExtra = true; + break; + } + } + assertThat("Should find the extra vertex", foundExtra, is(true)); + } + + @Test + public void shouldThrowExceptionWhenGrowthRateIsZero() { + // Skip this test for TinkerTransactionGraph as it handles transactions differently + if (graph instanceof TinkerTransactionGraph) { + return; + } + + final GraphTraversalSource g = traversal().with(graph); + + // Create a small index with only 5 items capacity and 0 growth rate + final Map<String, Object> smallIndexConfig = new HashMap<>(indexConfig); + smallIndexConfig.put(TinkerVectorIndex.CONFIG_MAX_ITEMS, 5); + smallIndexConfig.put(TinkerVectorIndex.CONFIG_GROWTH_RATE, 0.0); // No growth + + graph.createIndex(TinkerIndexType.VECTOR, "embedding", Vertex.class, smallIndexConfig); + + // Add 5 vertices (fills the index to capacity) + for (int i = 0; i < 5; i++) { + g.addV("person").property("name", "Person" + i) + .property("embedding", new float[]{(float)i, 0.0f, 0.0f}).iterate(); + } + tryCommitChanges(graph); + + try { + // Add one more vertex - this should throw SizeLimitExceededException + g.addV("person").property("name", "PersonExtra") + .property("embedding", new float[]{5.0f, 0.0f, 0.0f}).iterate(); + tryCommitChanges(graph); + fail("Should have thrown SizeLimitExceededException"); + } catch (Exception e) { + // Verify that the exception is caused by SizeLimitExceededException + Throwable cause = e; + while (cause != null && !(cause instanceof SizeLimitExceededException)) { + cause = cause.getCause(); + } + assertNotNull("Expected SizeLimitExceededException", cause); + } + } + private void tryCommitChanges(final Graph graph) { if (graph.features().graph().supportsTransactions()) graph.tx().commit(); @@ -514,4 +597,4 @@ public class TinkerGraphVectorIndexTest { if (graph.features().graph().supportsTransactions()) graph.tx().rollback(); } -} \ No newline at end of file +}