adelapena commented on code in PR #2673: URL: https://github.com/apache/cassandra/pull/2673#discussion_r1327395479
########## src/java/org/apache/cassandra/index/sai/VectorContext.java: ########## @@ -0,0 +1,168 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.index.sai; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashSet; +import java.util.NavigableSet; +import java.util.Set; +import java.util.TreeSet; + +import org.apache.cassandra.index.sai.disk.PrimaryKeyMap; +import org.apache.cassandra.index.sai.disk.v1.segment.SegmentMetadata; +import org.apache.cassandra.index.sai.disk.v1.vector.hnsw.CassandraOnDiskHnsw; +import org.apache.cassandra.index.sai.disk.v1.vector.hnsw.CassandraOnHeapHnsw; +import org.apache.cassandra.index.sai.utils.PrimaryKey; +import org.apache.lucene.util.Bits; + +public class VectorContext +{ + public int hnswVectorsAccessed; + public int hnswVectorCacheHits; Review Comment: Can we add some JavaDoc for these attributes? ########## src/java/org/apache/cassandra/index/Index.java: ########## @@ -440,6 +445,19 @@ default boolean filtersMultipleContains() */ public RowFilter getPostIndexQueryFilter(RowFilter filter); + /** + * Return a comparator that reorders query result before sending to client + * + * @param restriction restriction that requires current index + * @param columnIndex idx of the indexed column in returned row + * @param options query options + * @return comparator that for post-query ordering; or null if not supported + */ + default Comparator<List<ByteBuffer>> getPostQueryOrdering(Restriction restriction, int columnIndex, QueryOptions options) Review Comment: Nit: add `@Nullable` ########## src/java/org/apache/cassandra/index/sai/VectorContext.java: ########## @@ -0,0 +1,168 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.index.sai; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashSet; +import java.util.NavigableSet; +import java.util.Set; +import java.util.TreeSet; + +import org.apache.cassandra.index.sai.disk.PrimaryKeyMap; +import org.apache.cassandra.index.sai.disk.v1.segment.SegmentMetadata; +import org.apache.cassandra.index.sai.disk.v1.vector.hnsw.CassandraOnDiskHnsw; +import org.apache.cassandra.index.sai.disk.v1.vector.hnsw.CassandraOnHeapHnsw; +import org.apache.cassandra.index.sai.utils.PrimaryKey; +import org.apache.lucene.util.Bits; + +public class VectorContext Review Comment: Can we add some class-level javaDoc? ########## src/java/org/apache/cassandra/index/sai/disk/v1/segment/SegmentOrdering.java: ########## @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.index.sai.disk.v1.segment; + +import java.io.IOException; + +import org.apache.cassandra.index.sai.QueryContext; +import org.apache.cassandra.index.sai.iterators.KeyRangeIterator; +import org.apache.cassandra.index.sai.plan.Expression; +import org.apache.cassandra.index.sai.postings.PostingList; + +/** + * There are two steps in ordering: + * <p> + * 1. Limit a single sstable's results to the correct keys. At this stage + * we put them back in primary key order to play nice with the rest of + * the query pipeline. + * 2. Merge the results from multiple sstables. Now we leave them in the + * final, correct order. + * <p> + * SegmentOrdering handles the first step. + */ +public interface SegmentOrdering +{ + /** + * Reorder, limit, and put back into original order the results from a single sstable + */ + default KeyRangeIterator limitToTopResults(QueryContext context, PostingList iterator, Expression exp) throws IOException Review Comment: Maybe we could name this `limitToTopKResults`, to be consistent with the many `topK` vars and methods around? ########## src/java/org/apache/cassandra/index/sai/disk/SSTableIndex.java: ########## @@ -121,6 +123,24 @@ public SSTableIndex(SSTableContext sstableContext, IndexContext indexContext) */ public abstract AbstractBounds<PartitionPosition> bounds(); +// /** +// * Perform a search on the index for a single expression and keyRange. +// * <p> +// * The result is a {@link List} of {@link KeyRangeIterator} because there will +// * be a {@link KeyRangeIterator} for each segment in the index. The result +// * will never be null but may be an empty {@link List}. +// * +// * @param expression The {@link Expression} to be searched for +// * @param keyRange The {@link AbstractBounds<PartitionPosition>} defining the +// * token range for the search +// * @param context The {@link QueryContext} holding the per-query state +// * @return a {@link List} of {@link KeyRangeIterator}s containing the results +// * of the search +// */ +// public abstract List<KeyRangeIterator<PrimaryKey>> search(Expression expression, +// AbstractBounds<PartitionPosition> keyRange, +// QueryContext context) throws IOException; Review Comment: This seems accidentally included ########## src/java/org/apache/cassandra/index/sai/disk/RowMapping.java: ########## @@ -163,6 +177,18 @@ public void add(PrimaryKey key, long sstableRowId) throws InMemoryTrie.SpaceExha if (minKey == null) minKey = key; maxKey = key; + count++; + } + + public int get(PrimaryKey key) Review Comment: Nit: I'd add JavaDoc saying that `-1` means not found. ########## src/java/org/apache/cassandra/index/sai/disk/v1/segment/Segment.java: ########## @@ -97,6 +98,19 @@ public long indexFileCacheSize() return index == null ? 0 : index.indexFileCacheSize(); } +// /** +// * Search on-disk index synchronously +// * +// * @param expression to filter on disk index +// * @param context to track per sstable cache and per query metrics +// +// * @return range iterator that matches given expression +// */ +// public KeyRangeIterator<PrimaryKey> search(Expression expression, AbstractBounds<PartitionPosition> keyRange, QueryContext context) throws IOException +// { +// return index.search(expression, keyRange, context); +// } Review Comment: This seems accidentally included. ########## src/java/org/apache/cassandra/index/sai/disk/v1/postings/ReorderingPostingList.java: ########## @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.index.sai.disk.v1.postings; + +import java.io.IOException; +import java.util.PrimitiveIterator; + +import org.apache.cassandra.index.sai.postings.PostingList; +import org.apache.lucene.util.LongHeap; + +public class ReorderingPostingList implements PostingList +{ + private final LongHeap segmentRowIds; + private final int size; + + public ReorderingPostingList(PrimitiveIterator.OfInt source, int estimatedSize) throws IOException Review Comment: Nit: doesn't throw `IOException` ########## src/java/org/apache/cassandra/index/sai/disk/v1/IndexWriterConfig.java: ########## @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.index.sai.disk.v1; + +import java.util.Arrays; +import java.util.Map; +import java.util.stream.Collectors; + +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.lucene.index.VectorSimilarityFunction; + +import static org.apache.cassandra.config.CassandraRelevantProperties.SAI_VECTOR_SEARCH_MAX_TOP_K; + +/** + * Per-index config for storage-attached index writers. + */ +public class IndexWriterConfig +{ + public static final String MAXIMUM_NODE_CONNECTIONS = "maximum_node_connections"; Review Comment: I think the constants related to the same property could be grouped together for easier reading, as in: ```java public static final String MAXIMUM_NODE_CONNECTIONS = "maximum_node_connections"; public static final int MAXIMUM_MAXIMUM_NODE_CONNECTIONS = 512; public static final int DEFAULT_MAXIMUM_NODE_CONNECTIONS = 16; public static final String CONSTRUCTION_BEAM_WIDTH = "construction_beam_width"; public static final int MAXIMUM_CONSTRUCTION_BEAM_WIDTH = 3200; public static final int DEFAULT_CONSTRUCTION_BEAM_WIDTH = 100; ``` ########## src/java/org/apache/cassandra/index/sai/disk/v1/segment/VectorIndexSegmentSearcher.java: ########## @@ -0,0 +1,283 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.index.sai.disk.v1.segment; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.nio.ByteBuffer; +import java.util.Arrays; +import javax.annotation.Nullable; + +import com.google.common.base.MoreObjects; +import com.google.common.base.Preconditions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.agrona.collections.IntArrayList; +import org.apache.cassandra.db.PartitionPosition; +import org.apache.cassandra.db.marshal.VectorType; +import org.apache.cassandra.dht.AbstractBounds; +import org.apache.cassandra.index.sai.IndexContext; +import org.apache.cassandra.index.sai.QueryContext; +import org.apache.cassandra.index.sai.disk.PrimaryKeyMap; +import org.apache.cassandra.index.sai.disk.v1.PerColumnIndexFiles; +import org.apache.cassandra.index.sai.disk.v1.vector.hnsw.CassandraOnDiskHnsw; +import org.apache.cassandra.index.sai.disk.v1.postings.ReorderingPostingList; +import org.apache.cassandra.index.sai.iterators.KeyRangeIterator; +import org.apache.cassandra.index.sai.plan.Expression; +import org.apache.cassandra.index.sai.postings.IntArrayPostingList; +import org.apache.cassandra.index.sai.postings.PostingList; +import org.apache.cassandra.index.sai.utils.RangeUtil; +import org.apache.cassandra.index.sai.utils.TypeUtil; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.SparseFixedBitSet; + +/** + * Executes ann search against the HNSW graph for an individual index segment. + */ +public class VectorIndexSegmentSearcher extends IndexSegmentSearcher +{ + private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + private final CassandraOnDiskHnsw graph; + private final VectorType<float[]> type; + private int maxBruteForceRows; // not final so test can inject its own setting + private final ThreadLocal<SparseFixedBitSet> cachedBitSets; + + VectorIndexSegmentSearcher(PrimaryKeyMap.Factory primaryKeyMapFactory, + PerColumnIndexFiles perIndexFiles, + SegmentMetadata segmentMetadata, + IndexContext indexContext) throws IOException + { + super(primaryKeyMapFactory, perIndexFiles, segmentMetadata, indexContext); + graph = new CassandraOnDiskHnsw(segmentMetadata.componentMetadatas, perIndexFiles, indexContext); + type = (VectorType<float[]>) indexContext.getValidator(); Review Comment: This needs `@SuppressWarnings("unchecked")` ########## src/java/org/apache/cassandra/index/Index.java: ########## @@ -440,6 +445,19 @@ default boolean filtersMultipleContains() */ public RowFilter getPostIndexQueryFilter(RowFilter filter); + /** + * Return a comparator that reorders query result before sending to client + * + * @param restriction restriction that requires current index + * @param columnIndex idx of the indexed column in returned row + * @param options query options + * @return comparator that for post-query ordering; or null if not supported Review Comment: ```suggestion * @return a comparator for post-query ordering; or null if not supported ``` ########## src/java/org/apache/cassandra/index/sai/VectorContext.java: ########## @@ -0,0 +1,168 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.index.sai; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashSet; +import java.util.NavigableSet; +import java.util.Set; +import java.util.TreeSet; + +import org.apache.cassandra.index.sai.disk.PrimaryKeyMap; +import org.apache.cassandra.index.sai.disk.v1.segment.SegmentMetadata; +import org.apache.cassandra.index.sai.disk.v1.vector.hnsw.CassandraOnDiskHnsw; +import org.apache.cassandra.index.sai.disk.v1.vector.hnsw.CassandraOnHeapHnsw; +import org.apache.cassandra.index.sai.utils.PrimaryKey; +import org.apache.lucene.util.Bits; + +public class VectorContext +{ + public int hnswVectorsAccessed; + public int hnswVectorCacheHits; + + private TreeSet<PrimaryKey> shadowedPrimaryKeys; // allocate when needed + + public void recordShadowedPrimaryKey(PrimaryKey primaryKey) + { + if (shadowedPrimaryKeys == null) + shadowedPrimaryKeys = new TreeSet<>(); + shadowedPrimaryKeys.add(primaryKey); + } + + // Returns true if the row ID will be included or false if the row ID will be shadowed + public boolean shouldInclude(long sstableRowId, PrimaryKeyMap primaryKeyMap) + { + return shadowedPrimaryKeys == null || !shadowedPrimaryKeys.contains(primaryKeyMap.primaryKeyFromRowId(sstableRowId)); + } + + public boolean containsShadowedPrimaryKey(PrimaryKey primaryKey) + { + return shadowedPrimaryKeys != null && shadowedPrimaryKeys.contains(primaryKey); + } + + /** + * @return shadowed primary keys, in ascending order + */ + public NavigableSet<PrimaryKey> getShadowedPrimaryKeys() + { + if (shadowedPrimaryKeys == null) + return Collections.emptyNavigableSet(); + return shadowedPrimaryKeys; + } + + public Bits bitsetForShadowedPrimaryKeys(CassandraOnHeapHnsw<PrimaryKey> graph) + { + if (shadowedPrimaryKeys == null) + return null; + + return new IgnoredKeysBits(graph, shadowedPrimaryKeys); + } + + public Bits bitsetForShadowedPrimaryKeys(SegmentMetadata metadata, PrimaryKeyMap primaryKeyMap, CassandraOnDiskHnsw graph) throws IOException + { + Set<Integer> ignoredOrdinals = null; + try (var ordinalsView = graph.getOrdinalsView()) + { + for (PrimaryKey primaryKey : getShadowedPrimaryKeys()) + { + // not in current segment + if (primaryKey.compareTo(metadata.minKey) < 0 || primaryKey.compareTo(metadata.maxKey) > 0) + continue; + + long sstableRowId = primaryKeyMap.rowIdFromPrimaryKey(primaryKey); + if (sstableRowId == Long.MAX_VALUE) // not found + continue; + + int segmentRowId = Math.toIntExact(sstableRowId - metadata.rowIdOffset); + // not in segment yet + if (segmentRowId < 0) + continue; + // end of segment + if (segmentRowId > metadata.maxSSTableRowId) + break; + + int ordinal = ordinalsView.getOrdinalForRowId(segmentRowId); + if (ordinal >= 0) + { + if (ignoredOrdinals == null) + ignoredOrdinals = new HashSet<>(); + ignoredOrdinals.add(ordinal); + } + } + } + + if (ignoredOrdinals == null) + return null; + + return new IgnoringBits(graph, ignoredOrdinals, metadata); + } + + private static class IgnoringBits implements Bits + { + private final CassandraOnDiskHnsw graph; Review Comment: It seems we are not using `graph` for anything, can it be removed? ########## src/java/org/apache/cassandra/index/sai/disk/PrimaryKeyMap.java: ########## @@ -70,6 +73,28 @@ default void close() */ long rowIdFromPrimaryKey(PrimaryKey key); + /** + * Returns the first row Id for a given {@link Token} + * + * @param range the {@link AbstractBounds} to lookup + * @return the first row Id associated with the {@link AbstractBounds} Review Comment: ```suggestion * Returns the first row ID for a given partition range. * * @param range the {@link AbstractBounds} to lookup * @return the first row ID associated with the {@link AbstractBounds} ``` ########## src/java/org/apache/cassandra/index/sai/disk/v1/segment/SegmentOrdering.java: ########## @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.index.sai.disk.v1.segment; + +import java.io.IOException; + +import org.apache.cassandra.index.sai.QueryContext; +import org.apache.cassandra.index.sai.iterators.KeyRangeIterator; +import org.apache.cassandra.index.sai.plan.Expression; +import org.apache.cassandra.index.sai.postings.PostingList; + +/** + * There are two steps in ordering: + * <p> + * 1. Limit a single sstable's results to the correct keys. At this stage + * we put them back in primary key order to play nice with the rest of + * the query pipeline. + * 2. Merge the results from multiple sstables. Now we leave them in the + * final, correct order. + * <p> + * SegmentOrdering handles the first step. Review Comment: Nice JavaDoc. It could be helpful to include a hint on who handles the second step. ########## src/java/org/apache/cassandra/index/sai/disk/v1/segment/VectorIndexSegmentSearcher.java: ########## @@ -0,0 +1,283 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.index.sai.disk.v1.segment; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.nio.ByteBuffer; +import java.util.Arrays; +import javax.annotation.Nullable; + +import com.google.common.base.MoreObjects; +import com.google.common.base.Preconditions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.agrona.collections.IntArrayList; +import org.apache.cassandra.db.PartitionPosition; +import org.apache.cassandra.db.marshal.VectorType; +import org.apache.cassandra.dht.AbstractBounds; +import org.apache.cassandra.index.sai.IndexContext; +import org.apache.cassandra.index.sai.QueryContext; +import org.apache.cassandra.index.sai.disk.PrimaryKeyMap; +import org.apache.cassandra.index.sai.disk.v1.PerColumnIndexFiles; +import org.apache.cassandra.index.sai.disk.v1.vector.hnsw.CassandraOnDiskHnsw; +import org.apache.cassandra.index.sai.disk.v1.postings.ReorderingPostingList; +import org.apache.cassandra.index.sai.iterators.KeyRangeIterator; +import org.apache.cassandra.index.sai.plan.Expression; +import org.apache.cassandra.index.sai.postings.IntArrayPostingList; +import org.apache.cassandra.index.sai.postings.PostingList; +import org.apache.cassandra.index.sai.utils.RangeUtil; +import org.apache.cassandra.index.sai.utils.TypeUtil; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.SparseFixedBitSet; + +/** + * Executes ann search against the HNSW graph for an individual index segment. + */ +public class VectorIndexSegmentSearcher extends IndexSegmentSearcher +{ + private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + private final CassandraOnDiskHnsw graph; + private final VectorType<float[]> type; + private int maxBruteForceRows; // not final so test can inject its own setting Review Comment: Nit: can be `final` ########## src/java/org/apache/cassandra/index/sai/disk/v1/postings/ReorderingPostingList.java: ########## @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.index.sai.disk.v1.postings; + +import java.io.IOException; +import java.util.PrimitiveIterator; + +import org.apache.cassandra.index.sai.postings.PostingList; +import org.apache.lucene.util.LongHeap; + +public class ReorderingPostingList implements PostingList Review Comment: Can we add some JavaDoc saying what this posting list implementation does? ########## src/java/org/apache/cassandra/index/sai/disk/v1/postings/PostingListRangeIterator.java: ########## @@ -83,10 +82,23 @@ public PostingListRangeIterator(IndexContext indexContext, this.indexContext = indexContext; this.primaryKeyMap = primaryKeyMap; this.postingList = searcherContext.postingList; - this.searcherContext = searcherContext; + this.rowIdOffset = searcherContext.segmentRowIdOffset; this.queryContext = searcherContext.context; } + public PostingListRangeIterator(IndexContext indexContext, + PrimaryKeyMap primaryKeyMap, + PostingList postingList, + QueryContext queryContext) + { + super(primaryKeyMap.primaryKeyFromRowId(postingList.minimum()), primaryKeyMap.primaryKeyFromRowId(postingList.maximum()), postingList.size()); Review Comment: Nit: break the long line ```suggestion super(primaryKeyMap.primaryKeyFromRowId(postingList.minimum()), primaryKeyMap.primaryKeyFromRowId(postingList.maximum()), postingList.size()); ``` ########## src/java/org/apache/cassandra/index/sai/disk/v1/IndexWriterConfig.java: ########## @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.index.sai.disk.v1; + +import java.util.Arrays; +import java.util.Map; +import java.util.stream.Collectors; + +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.lucene.index.VectorSimilarityFunction; + +import static org.apache.cassandra.config.CassandraRelevantProperties.SAI_VECTOR_SEARCH_MAX_TOP_K; + +/** + * Per-index config for storage-attached index writers. + */ +public class IndexWriterConfig +{ + public static final String MAXIMUM_NODE_CONNECTIONS = "maximum_node_connections"; + public static final String CONSTRUCTION_BEAM_WIDTH = "construction_beam_width"; + public static final String SIMILARITY_FUNCTION = "similarity_function"; + + public static final int MAXIMUM_MAXIMUM_NODE_CONNECTIONS = 512; + public static final int MAXIMUM_CONSTRUCTION_BEAM_WIDTH = 3200; + + public static final int DEFAULT_MAXIMUM_NODE_CONNECTIONS = 16; + public static final int DEFAULT_CONSTRUCTION_BEAM_WIDTH = 100; + + public static final int MAX_TOP_K = SAI_VECTOR_SEARCH_MAX_TOP_K.getInt(); + + public static final VectorSimilarityFunction DEFAULT_SIMILARITY_FUNCTION = VectorSimilarityFunction.COSINE; + + public static final String validSimilarityFunctions = Arrays.stream(VectorSimilarityFunction.values()) + .map(Enum::name) + .collect(Collectors.joining(", ")); + + private static final IndexWriterConfig EMPTY_CONFIG = new IndexWriterConfig(-1, -1, null); + + private final int maximumNodeConnections; + + private final int constructionBeamWidth; + + private final VectorSimilarityFunction similarityFunction; Review Comment: Can we add some JavaDoc for these attributes? ########## src/java/org/apache/cassandra/index/sai/disk/PrimaryKeyMap.java: ########## @@ -70,6 +73,28 @@ default void close() */ long rowIdFromPrimaryKey(PrimaryKey key); + /** + * Returns the first row Id for a given {@link Token} + * + * @param range the {@link AbstractBounds} to lookup + * @return the first row Id associated with the {@link AbstractBounds} + */ + default long firstRowIdForRange(AbstractBounds<PartitionPosition> range) + { + throw new UnsupportedOperationException(); + } + + /** + * Returns the last row Id for a given {@link Token} + * + * @param range the {@link AbstractBounds} to lookup + * @return the last row Id associated with the {@link AbstractBounds} Review Comment: ```suggestion * Returns the last row ID for a given partition range. * * @param range the {@link AbstractBounds} to lookup * @return the last row ID associated with the {@link AbstractBounds} ``` ########## src/java/org/apache/cassandra/index/sai/disk/v1/segment/VectorIndexSegmentSearcher.java: ########## @@ -0,0 +1,283 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.index.sai.disk.v1.segment; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.nio.ByteBuffer; +import java.util.Arrays; +import javax.annotation.Nullable; + +import com.google.common.base.MoreObjects; +import com.google.common.base.Preconditions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.agrona.collections.IntArrayList; +import org.apache.cassandra.db.PartitionPosition; +import org.apache.cassandra.db.marshal.VectorType; +import org.apache.cassandra.dht.AbstractBounds; +import org.apache.cassandra.index.sai.IndexContext; +import org.apache.cassandra.index.sai.QueryContext; +import org.apache.cassandra.index.sai.disk.PrimaryKeyMap; +import org.apache.cassandra.index.sai.disk.v1.PerColumnIndexFiles; +import org.apache.cassandra.index.sai.disk.v1.vector.hnsw.CassandraOnDiskHnsw; +import org.apache.cassandra.index.sai.disk.v1.postings.ReorderingPostingList; +import org.apache.cassandra.index.sai.iterators.KeyRangeIterator; +import org.apache.cassandra.index.sai.plan.Expression; +import org.apache.cassandra.index.sai.postings.IntArrayPostingList; +import org.apache.cassandra.index.sai.postings.PostingList; +import org.apache.cassandra.index.sai.utils.RangeUtil; +import org.apache.cassandra.index.sai.utils.TypeUtil; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.SparseFixedBitSet; + +/** + * Executes ann search against the HNSW graph for an individual index segment. Review Comment: ```suggestion * Executes ANN search against the HNSW graph for an individual index segment. ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]

