bbotella commented on code in PR #3857: URL: https://github.com/apache/cassandra/pull/3857#discussion_r1939654751
########## src/java/org/apache/cassandra/db/virtual/PrimaryIdTable.java: ########## @@ -0,0 +1,356 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.virtual; + +import java.io.IOException; +import java.math.BigInteger; +import java.nio.ByteBuffer; +import java.util.List; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.function.Consumer; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.Lists; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.cql3.Operator; +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.DataRange; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.DeletionTime; +import org.apache.cassandra.db.PartitionPosition; +import org.apache.cassandra.db.Slice; +import org.apache.cassandra.db.Slices; +import org.apache.cassandra.db.context.CounterContext; +import org.apache.cassandra.db.filter.ClusteringIndexFilter; +import org.apache.cassandra.db.filter.ColumnFilter; +import org.apache.cassandra.db.filter.RowFilter; +import 
org.apache.cassandra.db.marshal.CompositeType; +import org.apache.cassandra.db.marshal.CounterColumnType; +import org.apache.cassandra.db.marshal.IntegerType; +import org.apache.cassandra.db.marshal.UTF8Type; +import org.apache.cassandra.db.partitions.PartitionUpdate; +import org.apache.cassandra.db.partitions.SingletonUnfilteredPartitionIterator; +import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator; +import org.apache.cassandra.db.rows.AbstractUnfilteredRowIterator; +import org.apache.cassandra.db.rows.BTreeRow; +import org.apache.cassandra.db.rows.BufferCell; +import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.EncodingStats; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.db.rows.Rows; +import org.apache.cassandra.db.rows.Unfiltered; +import org.apache.cassandra.db.rows.UnfilteredRowIterator; +import org.apache.cassandra.db.rows.UnfilteredRowIterators; +import org.apache.cassandra.dht.AbstractBounds; +import org.apache.cassandra.dht.Bounds; +import org.apache.cassandra.dht.LocalPartitioner; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.io.sstable.KeyReader; +import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.schema.KeyspaceMetadata; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.TableMetadata; + +import static org.apache.cassandra.cql3.statements.RequestValidations.invalidRequest; + +/** + * A virtual table for querying primary IDs of SSTables in a specific keyspace. + * + * <p>This table is implemented as a virtual table in Cassandra, meaning it does not + * store data persistently on disk but instead derives its data from live metadata. 
+ * + * <p>The CQL equivalent of this virtual table is: + * <pre> + * CREATE TABLE system_views.primary_ids ( + * keyspace_name TEXT, + * table_name TEXT, + * token_value INT, + * key TEXT, + * size_estimate COUNTER, + * sstables COUNTER, + * PRIMARY KEY ((keyspace_name, table_name), token_value, key) + * ); + * </pre> + * + * <p>Note: + * <ul> + * <li>The `size_estimate` and `sstables` columns represent aggregate information about SSTable sizes and counts.</li> + * <li>Range queries across multiple tables and updates are not supported as this is a read-only table.</li> + * </ul> + */ +public class PrimaryIdTable implements VirtualTable Review Comment: Do we want to extend `AbstractVirtualTable` instead? ########## src/java/org/apache/cassandra/db/virtual/PrimaryIdTable.java: ########## @@ -0,0 +1,356 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.db.virtual; + +import java.io.IOException; +import java.math.BigInteger; +import java.nio.ByteBuffer; +import java.util.List; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.function.Consumer; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.Lists; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.cql3.Operator; +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.DataRange; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.DeletionTime; +import org.apache.cassandra.db.PartitionPosition; +import org.apache.cassandra.db.Slice; +import org.apache.cassandra.db.Slices; +import org.apache.cassandra.db.context.CounterContext; +import org.apache.cassandra.db.filter.ClusteringIndexFilter; +import org.apache.cassandra.db.filter.ColumnFilter; +import org.apache.cassandra.db.filter.RowFilter; +import org.apache.cassandra.db.marshal.CompositeType; +import org.apache.cassandra.db.marshal.CounterColumnType; +import org.apache.cassandra.db.marshal.IntegerType; +import org.apache.cassandra.db.marshal.UTF8Type; +import org.apache.cassandra.db.partitions.PartitionUpdate; +import org.apache.cassandra.db.partitions.SingletonUnfilteredPartitionIterator; +import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator; +import org.apache.cassandra.db.rows.AbstractUnfilteredRowIterator; +import org.apache.cassandra.db.rows.BTreeRow; +import org.apache.cassandra.db.rows.BufferCell; +import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.EncodingStats; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.db.rows.Rows; +import org.apache.cassandra.db.rows.Unfiltered; +import org.apache.cassandra.db.rows.UnfilteredRowIterator; +import org.apache.cassandra.db.rows.UnfilteredRowIterators; +import 
org.apache.cassandra.dht.AbstractBounds; +import org.apache.cassandra.dht.Bounds; +import org.apache.cassandra.dht.LocalPartitioner; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.io.sstable.KeyReader; +import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.schema.KeyspaceMetadata; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.TableMetadata; + +import static org.apache.cassandra.cql3.statements.RequestValidations.invalidRequest; + +/** + * A virtual table for querying primary IDs of SSTables in a specific keyspace. + * + * <p>This table is implemented as a virtual table in Cassandra, meaning it does not + * store data persistently on disk but instead derives its data from live metadata. + * + * <p>The CQL equivalent of this virtual table is: + * <pre> + * CREATE TABLE system_views.primary_ids ( + * keyspace_name TEXT, + * table_name TEXT, + * token_value INT, + * key TEXT, + * size_estimate COUNTER, + * sstables COUNTER, + * PRIMARY KEY ((keyspace_name, table_name), token_value, key) + * ); + * </pre> + * + * <p>Note: + * <ul> + * <li>The `size_estimate` and `sstables` columns represent aggregate information about SSTable sizes and counts.</li> + * <li>Range queries across multiple tables and updates are not supported as this is a read-only table.</li> + * </ul> + */ +public class PrimaryIdTable implements VirtualTable +{ + private static final Logger logger = LoggerFactory.getLogger(PrimaryIdTable.class); + public static final String NAME = "primary_ids"; + + private static final String TABLE_READ_ONLY_ERROR = "The specified table is read-only."; + private static final String UNSUPPORTED_RANGE_QUERY_ERROR = "Range queries are not supported. 
Please provide both a keyspace and a table name."; + private static final String REVERSED_QUERY_ERROR = "Reversed queries are not supported."; + private static final String KEYSPACE_NOT_EXIST_ERROR = "The keyspace '%s' does not exist."; + private static final String TABLE_NOT_EXIST_ERROR = "The table '%s' does not exist in the keyspace '%s'."; + private static final String KEY_ONLY_EQUALS_ERROR = "The 'key' column can only be used in an equality query for this virtual table."; + private static final String KEY_NOT_WITHIN_BOUNDS_ERROR = "The specified 'key' is not within the provided token value bounds."; + private static final String PARTITIONER_NOT_SUPPORTED = "Partitioner '%s' for table '%s' in keyspace '%s' is not supported."; Review Comment: Maybe not a refactor for this PR, but some of these seem generic enough to be promoted to `VirtualTable` or `AbstractVirtualTable` and be reused across virtual tables. ########## src/java/org/apache/cassandra/db/virtual/PrimaryIdTable.java: ########## @@ -0,0 +1,356 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.db.virtual; + +import java.io.IOException; +import java.math.BigInteger; +import java.nio.ByteBuffer; +import java.util.List; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.function.Consumer; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.Lists; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.cql3.Operator; +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.DataRange; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.DeletionTime; +import org.apache.cassandra.db.PartitionPosition; +import org.apache.cassandra.db.Slice; +import org.apache.cassandra.db.Slices; +import org.apache.cassandra.db.context.CounterContext; +import org.apache.cassandra.db.filter.ClusteringIndexFilter; +import org.apache.cassandra.db.filter.ColumnFilter; +import org.apache.cassandra.db.filter.RowFilter; +import org.apache.cassandra.db.marshal.CompositeType; +import org.apache.cassandra.db.marshal.CounterColumnType; +import org.apache.cassandra.db.marshal.IntegerType; +import org.apache.cassandra.db.marshal.UTF8Type; +import org.apache.cassandra.db.partitions.PartitionUpdate; +import org.apache.cassandra.db.partitions.SingletonUnfilteredPartitionIterator; +import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator; +import org.apache.cassandra.db.rows.AbstractUnfilteredRowIterator; +import org.apache.cassandra.db.rows.BTreeRow; +import org.apache.cassandra.db.rows.BufferCell; +import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.EncodingStats; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.db.rows.Rows; +import org.apache.cassandra.db.rows.Unfiltered; +import org.apache.cassandra.db.rows.UnfilteredRowIterator; +import org.apache.cassandra.db.rows.UnfilteredRowIterators; +import 
org.apache.cassandra.dht.AbstractBounds; +import org.apache.cassandra.dht.Bounds; +import org.apache.cassandra.dht.LocalPartitioner; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.io.sstable.KeyReader; +import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.schema.KeyspaceMetadata; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.TableMetadata; + +import static org.apache.cassandra.cql3.statements.RequestValidations.invalidRequest; + +/** + * A virtual table for querying primary IDs of SSTables in a specific keyspace. + * + * <p>This table is implemented as a virtual table in Cassandra, meaning it does not + * store data persistently on disk but instead derives its data from live metadata. + * + * <p>The CQL equivalent of this virtual table is: + * <pre> + * CREATE TABLE system_views.primary_ids ( + * keyspace_name TEXT, + * table_name TEXT, + * token_value INT, + * key TEXT, + * size_estimate COUNTER, + * sstables COUNTER, + * PRIMARY KEY ((keyspace_name, table_name), token_value, key) + * ); + * </pre> + * + * <p>Note: + * <ul> + * <li>The `size_estimate` and `sstables` columns represent aggregate information about SSTable sizes and counts.</li> + * <li>Range queries across multiple tables and updates are not supported as this is a read-only table.</li> + * </ul> + */ +public class PrimaryIdTable implements VirtualTable +{ + private static final Logger logger = LoggerFactory.getLogger(PrimaryIdTable.class); + public static final String NAME = "primary_ids"; + + private static final String TABLE_READ_ONLY_ERROR = "The specified table is read-only."; + private static final String UNSUPPORTED_RANGE_QUERY_ERROR = "Range queries are not supported. 
Please provide both a keyspace and a table name."; + private static final String REVERSED_QUERY_ERROR = "Reversed queries are not supported."; + private static final String KEYSPACE_NOT_EXIST_ERROR = "The keyspace '%s' does not exist."; + private static final String TABLE_NOT_EXIST_ERROR = "The table '%s' does not exist in the keyspace '%s'."; + private static final String KEY_ONLY_EQUALS_ERROR = "The 'key' column can only be used in an equality query for this virtual table."; + private static final String KEY_NOT_WITHIN_BOUNDS_ERROR = "The specified 'key' is not within the provided token value bounds."; + private static final String PARTITIONER_NOT_SUPPORTED = "Partitioner '%s' for table '%s' in keyspace '%s' is not supported."; + + private static final String COLUMN_KEYSPACE_NAME = "keyspace_name"; + private static final String COLUMN_TABLE_NAME = "table_name"; + private static final String COLUMN_TOKEN_VALUE = "token_value"; + private static final String COLUMN_KEY = "key"; + private static final String COLUMN_SIZE_ESTIMATE = "size_estimate"; + private static final String COLUMN_SSTABLES = "sstables"; + + private final TableMetadata metadata; + private final ColumnMetadata sizeEstimateColumn; + private final ColumnMetadata sstablesColumn; + + @VisibleForTesting + final CopyOnWriteArrayList<Consumer<DecoratedKey>> readListener = new CopyOnWriteArrayList<>(); + + public PrimaryIdTable(String keyspace) + { + this.metadata = TableMetadata.builder(keyspace, NAME) + .kind(TableMetadata.Kind.VIRTUAL) + .partitioner(new LocalPartitioner(CompositeType.getInstance(UTF8Type.instance, UTF8Type.instance))) + .addPartitionKeyColumn(COLUMN_KEYSPACE_NAME, UTF8Type.instance) + .addPartitionKeyColumn(COLUMN_TABLE_NAME, UTF8Type.instance) + .addClusteringColumn(COLUMN_TOKEN_VALUE, IntegerType.instance) + .addClusteringColumn(COLUMN_KEY, UTF8Type.instance) + .addRegularColumn(COLUMN_SIZE_ESTIMATE, CounterColumnType.instance) + .addRegularColumn(COLUMN_SSTABLES, 
CounterColumnType.instance) + .build(); + sizeEstimateColumn = metadata.regularColumns().getSimple(0); + sstablesColumn = metadata.regularColumns().getSimple(1); + } + + @Override + public UnfilteredPartitionIterator select(DecoratedKey partitionKey, ClusteringIndexFilter clusteringIndexFilter, ColumnFilter columnFilter, RowFilter rowFilter) + { + if (clusteringIndexFilter.isReversed()) + throw new InvalidRequestException(REVERSED_QUERY_ERROR); + + ByteBuffer[] key = ((CompositeType) this.metadata.partitionKeyType).split(partitionKey.getKey()); + String keyspace = UTF8Type.instance.getString(key[0]); + String table = UTF8Type.instance.getString(key[1]); + + KeyspaceMetadata ksm = Schema.instance.getKeyspaceMetadata(keyspace); + if (ksm == null) + throw invalidRequest(KEYSPACE_NOT_EXIST_ERROR, keyspace); + + TableMetadata metadata = ksm.getTableOrViewNullable(table); + if (metadata == null) + throw invalidRequest(TABLE_NOT_EXIST_ERROR, table, keyspace); + + if (!metadata.partitioner.supportsSplitting()) + throw invalidRequest(PARTITIONER_NOT_SUPPORTED, metadata.partitioner.getClass().getName(), table, keyspace); + + AbstractBounds<PartitionPosition> range = getBounds(metadata, clusteringIndexFilter, rowFilter); + return new SingletonUnfilteredPartitionIterator(select(partitionKey, metadata, clusteringIndexFilter, range)); + } + + private List<SSTableReader> getSStables(TableMetadata metadata, AbstractBounds<PartitionPosition> range) + { + return Lists.newArrayList(ColumnFamilyStore.getIfExists(metadata).getTracker().getView().liveSSTablesInBounds(range.left, range.right)); + } + + private UnfilteredRowIterator select(DecoratedKey partitionKey, TableMetadata metadata, ClusteringIndexFilter clusteringIndexFilter, AbstractBounds<PartitionPosition> range) + { + List<SSTableReader> sstables = getSStables(metadata, range); + if (sstables.isEmpty()) + return UnfilteredRowIterators.noRowsIterator(metadata, partitionKey, Rows.EMPTY_STATIC_ROW, DeletionTime.LIVE, false); + + 
List<UnfilteredRowIterator> sstableIterators = Lists.newArrayList(); + for (SSTableReader sstable : sstables) + sstableIterators.add(getSStableRowIterator(metadata, partitionKey, sstable, clusteringIndexFilter, range)); + + return UnfilteredRowIterators.merge(sstableIterators); + } + + private UnfilteredRowIterator getSStableRowIterator(TableMetadata target, DecoratedKey partitionKey, SSTableReader sstable, ClusteringIndexFilter filter, AbstractBounds<PartitionPosition> range) + { + final KeyReader reader; + try + { + // ignore warning on try-with-resources, the reader will be closed on endOfData or close + reader = sstable.keyReader(range.left); + } + catch (IOException e) + { + logger.error("Error generating keyReader for SSTable: {}", sstable, e); + throw new RuntimeException(e); + } + + return new AbstractUnfilteredRowIterator(metadata, partitionKey, DeletionTime.LIVE, + metadata.regularAndStaticColumns(), Rows.EMPTY_STATIC_ROW, + false, EncodingStats.NO_STATS) + { + public Unfiltered endOfData() + { + reader.close(); + return super.endOfData(); + } + + public void close() + { + reader.close(); + } + + private Row buildRow(Clustering<?> clustering, long size) + { + Row.Builder row = BTreeRow.sortedBuilder(); + row.newRow(clustering); + row.addCell(cell(sizeEstimateColumn, CounterContext.instance().createUpdate(size))); + row.addCell(cell(sstablesColumn, CounterContext.instance().createUpdate(1))); + return row.build(); + } + + @Override + protected Unfiltered computeNext() + { + while (!reader.isExhausted()) + { + DecoratedKey key = target.partitioner.decorateKey(reader.key()); + + for (Consumer<DecoratedKey> listener : readListener) + listener.accept(key); + + // Store the reader's current data position to calculate size later + long lastPosition = reader.dataPosition(); + try + { + // Advance the reader to the next key for the next iteration. 
Also by moving to next key + // we move the dataPosition to the start of the next key for calculating size + reader.advance(); + } + catch (IOException e) + { + logger.error("Error advancing reader for SSTable: {}", sstable, e); + return endOfData(); + } + + // Calculate the size of the current key. If EOF use the length of the file + long current = reader.dataPosition() == -1 ? sstable.uncompressedLength() : reader.dataPosition(); + long size = current - lastPosition; + + String keyString = target.partitionKeyType.asCQL3Type().toCQLLiteral(key.getKey()); + + // Check if the current key is outside the queried range; if so, stop + if (range.right.compareTo(key) < 0) + return endOfData(); + + // Convert the token to a string and create a clustering object + String tokenString = key.getToken().toString(); + Clustering<?> clustering = Clustering.make( + IntegerType.instance.decompose(new BigInteger(tokenString)), + UTF8Type.instance.decompose(keyString) + ); + + // Check if the current clustering matches the filter; if so, return the row + if (filter.selects(clustering)) + return buildRow(clustering, size); + } + return endOfData(); + } + }; + } + + /** + * This converts the clustering token/key into the partition level token/key for the target table. Also provides an + * optimization from RowFilter when a `key` is specified with or without the clustering `token` being set. 
+ */ + private AbstractBounds<PartitionPosition> getBounds(TableMetadata target, ClusteringIndexFilter clusteringIndexFilter, RowFilter rowFilter) + { + Slices s = clusteringIndexFilter.getSlices(target); + Token startToken = target.partitioner.getMinimumToken(); + Token endToken = target.partitioner.getMaximumToken(); + BigInteger startTokenValue = new BigInteger(endToken.getTokenValue().toString(), 10); + BigInteger endTokenValue = new BigInteger(startToken.getTokenValue().toString(), 10); + + // find min/max token values from the clustering key + for (int i = 0; i < s.size(); i++) + { + Slice slice = s.get(i); + if (!slice.start().isEmpty()) + { + startTokenValue = startTokenValue.min(IntegerType.instance.compose(slice.start().bufferAt(0))); + startToken = target.partitioner.getTokenFactory().fromString(startTokenValue.toString()); + } + if (!slice.end().isEmpty()) + { + endTokenValue = endTokenValue.max(IntegerType.instance.compose(slice.end().bufferAt(0))); + endToken = target.partitioner.getTokenFactory().fromString(endTokenValue.toString()); + } + } + + // override min/max of token if the `key` is specified + for (RowFilter.Expression expression : rowFilter.getExpressions()) + { + if (expression.column().name.toString().equals(COLUMN_KEY)) + { + if (expression.operator() != Operator.EQ) + throw new InvalidRequestException(KEY_ONLY_EQUALS_ERROR); + + String keyString = UTF8Type.instance.compose(expression.getIndexValue()); + ByteBuffer keyAsBB = target.partitionKeyType.asCQL3Type().fromCQLLiteral(keyString); + Token keyToken = target.partitioner.decorateKey(keyAsBB).getToken(); + + if (!DataRange.forKeyRange(new Range<>(startToken.minKeyBound(), endToken.maxKeyBound())).contains(keyToken.minKeyBound())) + throw new InvalidRequestException(KEY_NOT_WITHIN_BOUNDS_ERROR); + + return Bounds.bounds(target.partitioner.decorateKey(keyAsBB), true, target.partitioner.decorateKey(keyAsBB), true); Review Comment: Nit: ```suggestion DecoratedKey decoratedKey = 
target.partitioner.decorateKey(keyAsBB); if (!DataRange.forKeyRange(new Range<>(startToken.minKeyBound(), endToken.maxKeyBound())).contains(decoratedKey.getToken().minKeyBound())) throw new InvalidRequestException(KEY_NOT_WITHIN_BOUNDS_ERROR); return Bounds.bounds(decoratedKey, true, decoratedKey, true); ``` ########## test/unit/org/apache/cassandra/db/virtual/PrimaryIdTableTest.java: ########## @@ -0,0 +1,322 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.db.virtual; + +import java.math.BigInteger; +import java.nio.ByteBuffer; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; + +import com.google.common.collect.ImmutableList; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; + +import com.datastax.driver.core.ResultSet; +import com.datastax.driver.core.Row; +import com.datastax.driver.core.exceptions.InvalidQueryException; +import org.apache.cassandra.Util; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.cql3.CQLTester; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.io.sstable.format.bti.BtiFormat; +import org.bouncycastle.util.encoders.Hex; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +@RunWith(Parameterized.class) +public class PrimaryIdTableTest extends CQLTester +{ + private static final String KS_NAME = "vts"; + private String table; + private AtomicInteger scanned; + + private final boolean useBtiFormat; + + @Parameters(name = "Use BtiFormat = {0}") + public static Collection<Object[]> parameters() + { + return Arrays.asList(new Object[][]{ { false }, { true } }); + } + + public PrimaryIdTableTest(boolean useBtiFormat) + { + this.useBtiFormat = useBtiFormat; + } + + @Before + public void before() + { + if (useBtiFormat) + DatabaseDescriptor.setSelectedSSTableFormat(new BtiFormat.BtiFormatFactory().getInstance(Collections.emptyMap())); + + PrimaryIdTable primaryIdTable = new PrimaryIdTable(KS_NAME); + scanned = new AtomicInteger(); + VirtualKeyspaceRegistry.instance.register(new VirtualKeyspace(KS_NAME, ImmutableList.of(primaryIdTable))); + + table = createTable("CREATE TABLE %s (key blob PRIMARY KEY, value blob)"); + + 
ByteBuffer value = ByteBuffer.wrap(new byte[1]); + for (int i = -10; i < 1000; i++) + { + ByteBuffer key = Murmur3Partitioner.LongToken.keyForToken(i); + execute("INSERT INTO %s (key, value) VALUES (?, ?)", key, value); + } + Util.flushTable(KEYSPACE, table); + primaryIdTable.readListener.add(unused -> scanned.incrementAndGet()); + } + + @Test + public void testPrimaryIdTable() + { + ResultSet rs = executeNetWithPaging("SELECT * FROM vts.primary_ids WHERE keyspace_name = ? AND table_name = ?", + 10, KEYSPACE, table); + List<Row> all = rs.all(); + assertEquals(1010, all.size()); + assertResults(all, -10, 1000); + // 1010 + 100 for the 1 per 10 page, +1 for the last + assertEquals(1111, scanned.get()); + } + + @Test + public void testTokenValueGreaterThanZero() + { + ResultSet rs = executeNetWithPaging("SELECT * FROM vts.primary_ids WHERE keyspace_name = ? AND table_name = ? AND token_value > 0", + 10, KEYSPACE, table); + List<Row> all = rs.all(); + assertEquals(999, all.size()); + assertResults(all, 1, 1000); + assertEquals(1099, scanned.get()); + } + + @Test + public void testTokenValueGreaterThanNegativeFive() + { + ResultSet rs = executeNetWithPaging("SELECT * FROM vts.primary_ids WHERE keyspace_name = ? AND table_name = ? AND token_value > -5", + 10, KEYSPACE, table); + List<Row> all = rs.all(); + assertEquals(1004, all.size()); + assertResults(all, -4, 1000); + // 1004 + 100 for the 1 per 10 page, +1 for the last + assertEquals(1105, scanned.get()); + } + + @Test + public void testTokenValueLessThanOrEqualToFive() + { + ResultSet rs = executeNetWithPaging("SELECT * FROM vts.primary_ids WHERE keyspace_name = ? AND table_name = ? AND token_value <= 5", + 10, KEYSPACE, table); + List<Row> all = rs.all(); + assertEquals(16, all.size()); + assertResults(all, -10, 5); + assertEquals(18, scanned.get()); + } + + @Test + public void testTokenValueEqualToZero() + { + ResultSet rs = executeNetWithPaging("SELECT * FROM vts.primary_ids WHERE keyspace_name = ? 
AND table_name = ? AND token_value = 0", + 10, KEYSPACE, table); + List<Row> all = rs.all(); + assertEquals(1, all.size()); + Row row = all.get(0); + assertEquals(BigInteger.valueOf(0), row.get("token_value", BigInteger.class)); + assertEquals(2, scanned.get()); + } + + @Test + public void testTokenValueBounds() + { + ResultSet rs = executeNetWithPaging("SELECT * FROM vts.primary_ids WHERE keyspace_name = ? AND table_name = ? AND token_value > 0 AND token_value < 15", + 10, KEYSPACE, table); + List<Row> all = rs.all(); + assertEquals(14, all.size()); + assertResults(all, 1, 14); + // 0->10 = 11, 10->16 = 7 + assertEquals(18, scanned.get()); + } + + @Test + public void testTokenValueBoundsWithBetween() + { + ResultSet rs = executeNetWithPaging("SELECT * FROM vts.primary_ids WHERE keyspace_name = ? AND table_name = ? AND token_value BETWEEN 0 AND 15", + 10, KEYSPACE, table); + List<Row> all = rs.all(); + assertEquals(16, all.size()); + assertResults(all, 0, 15); + assertEquals(18, scanned.get()); + } + + @Test + public void testTokenValueBoundsWithIn() + { + ResultSet rs = executeNetWithPaging("SELECT * FROM vts.primary_ids WHERE keyspace_name = ? AND table_name = ? AND token_value IN (1,3,6)", + 10, KEYSPACE, table); + List<Row> all = rs.all(); + assertEquals(3, all.size()); + assertEquals(BigInteger.valueOf(1), all.get(0).get("token_value", BigInteger.class)); + assertEquals(BigInteger.valueOf(3), all.get(1).get("token_value", BigInteger.class)); + assertEquals(BigInteger.valueOf(6), all.get(2).get("token_value", BigInteger.class)); + assertEquals(7, scanned.get()); + } + + @Test + public void testTokenValueBoundsWithKey() + { + ByteBuffer ten = Murmur3Partitioner.LongToken.keyForToken(10); + String key = "0x" + Hex.toHexString(ten.array()); + ResultSet rs = executeNetWithPaging("SELECT * FROM vts.primary_ids WHERE keyspace_name = ? AND table_name = ? 
AND token_value > 0 AND token_value < 15 AND key = ?", + 10, KEYSPACE, table, key); + List<Row> all = rs.all(); + assertEquals(1, all.size()); + Row row = all.get(0); + assertEquals(BigInteger.valueOf(10), row.get("token_value", BigInteger.class)); + assertEquals(2, scanned.get()); + } + + @Test + public void testByKey() + { + ByteBuffer ten = Murmur3Partitioner.LongToken.keyForToken(10); + String key = "0x" + Hex.toHexString(ten.array()); + ResultSet rs = executeNetWithPaging("SELECT * FROM vts.primary_ids WHERE keyspace_name = ? AND table_name = ? AND key = ?", + 10, KEYSPACE, table, key); + List<Row> all = rs.all(); + assertEquals(1, all.size()); + Row row = all.get(0); + assertEquals(BigInteger.valueOf(10), row.get("token_value", BigInteger.class)); + assertEquals(2, scanned.get()); + } + + @Test + public void testIgnoreSStableOutOfRange() + { + ByteBuffer twok = Murmur3Partitioner.LongToken.keyForToken(2000); + execute("INSERT INTO %s (key, value) VALUES (?, ?)", twok, ByteBuffer.wrap(new byte[1])); + Util.flushTable(KEYSPACE, table); + ResultSet rs = executeNetWithPaging("SELECT * FROM vts.primary_ids WHERE keyspace_name = ? AND table_name = ? AND token_value > 1500", + 10, KEYSPACE, table); + List<Row> all = rs.all(); + assertEquals(1, all.size()); + Row row = all.get(0); + assertEquals(BigInteger.valueOf(2000), row.get("token_value", BigInteger.class)); + assertEquals(1L, row.get("sstables", Long.class).longValue()); + assertEquals(1, scanned.get()); + } + + @Test + public void testNoResults() + { + ResultSet rs = executeNetWithPaging("SELECT * FROM vts.primary_ids WHERE keyspace_name = ? AND table_name = ? 
AND token_value < -1000", + 10, KEYSPACE, table); + List<Row> all = rs.all(); + assertEquals(0, all.size()); + assertEquals(0, scanned.get()); // sstables shouldn't even of been touched + } + + @Test(expected = InvalidQueryException.class) + public void testNonExistantKeyspace() + { + ResultSet rs = executeNetWithPaging("SELECT * FROM vts.primary_ids WHERE keyspace_name = 'non_existent' AND table_name = ?", + 10, table); + List<Row> all = rs.all(); + assertEquals(0, all.size()); + assertEquals(0, scanned.get()); + } + + @Test + public void testNoResultsWithSSTables() + { + ByteBuffer o1 = Murmur3Partitioner.LongToken.keyForToken(10000); + ByteBuffer o2 = Murmur3Partitioner.LongToken.keyForToken(10002); + ByteBuffer value = ByteBuffer.wrap(new byte[10]); + execute("INSERT INTO %s (key, value) VALUES (?, ?)", o1, value); + execute("INSERT INTO %s (key, value) VALUES (?, ?)", o2, value); + Util.flushTable(KEYSPACE, table); + + ResultSet rs = executeNetWithPaging("SELECT * FROM vts.primary_ids WHERE keyspace_name = ? AND table_name = ? AND token_value = 10001", + 10, KEYSPACE, table); + List<Row> all = rs.all(); + assertEquals(0, all.size()); + assertEquals(1, scanned.get()); + } + + @Test + public void testPrimaryIdTableDuplicates() + { + // 0xc25f118f072d6ba5cab7fb1468ace617 hashes to 1563004846366 + ByteBuffer dup = Murmur3Partitioner.LongToken.keyForToken(1563004846366L); + // -19, 68, -61 (0xed44c3) hashes to 1563004846366 + ByteBuffer dup2 = ByteBuffer.wrap(new byte[]{ -19, 68, -61 }); + ByteBuffer value = ByteBuffer.wrap(new byte[10]); + execute("INSERT INTO %s (key, value) VALUES (?, ?)", dup, value); + execute("INSERT INTO %s (key, value) VALUES (?, ?)", dup2, value); + Util.flushTable(KEYSPACE, table); + + ResultSet rs = executeNetWithPaging("SELECT * FROM vts.primary_ids WHERE keyspace_name = ? AND table_name = ? 
AND token_value = 1563004846366", + 10, KEYSPACE, table); + List<Row> all = rs.all(); + assertEquals(2, all.size()); + assertEquals(BigInteger.valueOf(1563004846366L), all.get(0).get("token_value", BigInteger.class)); + assertEquals(BigInteger.valueOf(1563004846366L), all.get(1).get("token_value", BigInteger.class)); + assertEquals("0xc25f118f072d6ba5cab7fb1468ace617", all.get(0).getString("key")); + assertEquals("0xed44c3", all.get(1).getString("key")); + assertEquals(2, scanned.get()); + } + + @Test + public void testSameKeyInMultipleSSTables() + { + String table = createTable("CREATE TABLE %s (key blob PRIMARY KEY, value blob)"); + + ByteBuffer key = Murmur3Partitioner.LongToken.keyForToken(1); + ByteBuffer value = ByteBuffer.wrap(new byte[10]); + execute("INSERT INTO %s (key, value) VALUES (?, ?)", key, value); + Util.flushTable(KEYSPACE, table); + value = ByteBuffer.wrap(new byte[100]); + execute("INSERT INTO %s (key, value) VALUES (?, ?)", key, value); + Util.flushTable(KEYSPACE, table); + + ResultSet rs = executeNetWithPaging("SELECT * FROM vts.primary_ids WHERE keyspace_name = ? AND table_name = ?", + 10, KEYSPACE, table); + List<Row> all = rs.all(); + assertEquals(1, all.size()); + Row row = all.get(0); + assertEquals(BigInteger.valueOf(1), row.get("token_value", BigInteger.class)); + long size = row.get("size_estimate", Long.class); + assertTrue(size >= 110 && size < 200); Review Comment: I'm curious about this particular range for the estimate check. From my tests, size is always 162. Should we just check for `size == 126` with a nice comment explaining why that is happening? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]

