This is an automated email from the ASF dual-hosted git repository.
mck pushed a commit to branch cassandra-3.0
in repository https://gitbox.apache.org/repos/asf/cassandra.git
The following commit(s) were added to refs/heads/cassandra-3.0 by this push:
new f8500ee Fix skipping on pre-3.0 created compact storage sstables due
to missing primary key liveness
f8500ee is described below
commit f8500ee911343eb8826f9c44bb6db2ab780f6327
Author: Caleb Rackliffe <[email protected]>
AuthorDate: Fri Oct 23 15:38:57 2020 -0500
Fix skipping on pre-3.0 created compact storage sstables due to missing
primary key liveness
Check primary key liveness information only if it exists, and fall back to
checking cell contents, which makes skipping possible for COMPACT STORAGE
tables after an upgrade to 3.0+
patch by Caleb Rackliffe; reviewed by Alex Petrov, Mick Semb Wever for
CASSANDRA-16226
---
CHANGES.txt | 1 +
.../cassandra/db/SinglePartitionReadCommand.java | 37 +++-
.../upgrade/CompactStorage2to3UpgradeTest.java | 6 +-
.../miscellaneous/SSTablesIteratedTest.java | 238 ++++++++++++++++++++-
.../cql3/validation/operations/UpdateTest.java | 1 -
5 files changed, 266 insertions(+), 17 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 53883bc..54456c1 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,5 @@
3.0.24:
+ * Fix skipping on pre-3.0 created compact storage sstables due to missing
primary key liveness (CASSANDRA-16226)
* Fix DecimalDeserializer#toString OOM (CASSANDRA-14925)
* Extend the exclusion of replica filtering protection to other indices
instead of just SASI (CASSANDRA-16311)
* Synchronize transaction logs for JBOD (CASSANDRA-16225)
diff --git a/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java
b/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java
index ca4e8e3..1e1953b 100644
--- a/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java
+++ b/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java
@@ -27,8 +27,6 @@ import com.google.common.collect.Sets;
import org.apache.cassandra.cache.IRowCacheEntry;
import org.apache.cassandra.cache.RowCacheKey;
import org.apache.cassandra.cache.RowCacheSentinel;
-import org.apache.cassandra.concurrent.Stage;
-import org.apache.cassandra.concurrent.StageManager;
import org.apache.cassandra.config.CFMetaData;
import org.apache.cassandra.config.ColumnDefinition;
import org.apache.cassandra.config.DatabaseDescriptor;
@@ -1004,14 +1002,14 @@ public class SinglePartitionReadCommand extends
ReadCommand
if (!columns.statics.isEmpty())
{
Row staticRow = searchIter.next(Clustering.STATIC_CLUSTERING);
- removeStatic = staticRow != null && canRemoveRow(staticRow,
columns.statics, sstableTimestamp);
+ removeStatic = staticRow != null && isRowComplete(staticRow,
columns.statics, sstableTimestamp);
}
NavigableSet<Clustering> toRemove = null;
for (Clustering clustering : clusterings)
{
Row row = searchIter.next(clustering);
- if (row == null || !canRemoveRow(row, columns.regulars,
sstableTimestamp))
+ if (row == null || !isRowComplete(row, columns.regulars,
sstableTimestamp))
continue;
if (toRemove == null)
@@ -1037,21 +1035,40 @@ public class SinglePartitionReadCommand extends
ReadCommand
return new ClusteringIndexNamesFilter(clusterings,
filter.isReversed());
}
- private boolean canRemoveRow(Row row, Columns requestedColumns, long
sstableTimestamp)
+ /**
+ * We can stop reading row data from disk if what we've already read is
more recent than the max timestamp
+ * of the next newest SSTable that might have data for the query. We care
about 1.) the row timestamp (since
+ * every query cares if the row exists or not), 2.) the timestamps of the
requested cells, and 3.) whether or
+ * not any of the cells we've read have actual data.
+ *
+ * @param row a potentially incomplete {@link Row}
+ * @param requestedColumns the columns requested by the query
+ * @param sstableTimestamp the max timestamp of the next newest SSTable to
read
+ *
+ * @return true if the supplied {@link Row} is complete and its data more
recent than the supplied timestamp
+ */
+ private boolean isRowComplete(Row row, Columns requestedColumns, long
sstableTimestamp)
{
- // We can remove a row if it has data that is more recent that the
next sstable to consider for the data that the query
- // cares about. And the data we care about is 1) the row timestamp
(since every query cares if the row exists or not)
- // and 2) the requested columns.
- if (row.primaryKeyLivenessInfo().isEmpty() ||
row.primaryKeyLivenessInfo().timestamp() <= sstableTimestamp)
+ // Note that compact tables will always have an empty primary key
liveness info.
+ if (!row.primaryKeyLivenessInfo().isEmpty() &&
row.primaryKeyLivenessInfo().timestamp() <= sstableTimestamp)
return false;
+ boolean hasLiveCell = false;
+
for (ColumnDefinition column : requestedColumns)
{
Cell cell = row.getCell(column);
+
if (cell == null || cell.timestamp() <= sstableTimestamp)
return false;
+
+ if (!cell.isTombstone())
+ hasLiveCell = true;
}
- return true;
+
+ // If we've gotten here w/ a compact table or at least one
non-tombstone cell, the row is considered
+ // complete and we can avoid any further searching of older SSTables.
+ return hasLiveCell || !metadata().isCQLTable();
}
@Override
diff --git
a/test/distributed/org/apache/cassandra/distributed/upgrade/CompactStorage2to3UpgradeTest.java
b/test/distributed/org/apache/cassandra/distributed/upgrade/CompactStorage2to3UpgradeTest.java
index c6235d3..4f5d1bc 100644
---
a/test/distributed/org/apache/cassandra/distributed/upgrade/CompactStorage2to3UpgradeTest.java
+++
b/test/distributed/org/apache/cassandra/distributed/upgrade/CompactStorage2to3UpgradeTest.java
@@ -29,7 +29,10 @@ import org.apache.cassandra.distributed.api.ICoordinator;
import org.apache.cassandra.distributed.api.IMessageFilters;
import org.apache.cassandra.distributed.api.NodeToolResult;
import org.apache.cassandra.distributed.shared.Versions;
-import static org.apache.cassandra.distributed.shared.AssertUtils.*;
+
+import static org.apache.cassandra.distributed.shared.AssertUtils.assertRows;
+import static org.apache.cassandra.distributed.shared.AssertUtils.row;
+import static org.junit.Assert.assertEquals;
public class CompactStorage2to3UpgradeTest extends UpgradeTestBase
{
@@ -314,7 +317,6 @@ public class CompactStorage2to3UpgradeTest extends
UpgradeTestBase
}).run();
}
-
private void runQueries(ICoordinator coordinator, ResultsRecorder helper,
String[] queries)
{
for (String query : queries)
diff --git
a/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/SSTablesIteratedTest.java
b/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/SSTablesIteratedTest.java
index 2cf518a..599711e 100644
---
a/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/SSTablesIteratedTest.java
+++
b/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/SSTablesIteratedTest.java
@@ -22,12 +22,13 @@ package org.apache.cassandra.cql3.validation.miscellaneous;
import org.junit.Test;
-import static org.junit.Assert.assertEquals;
import org.apache.cassandra.cql3.CQLTester;
import org.apache.cassandra.cql3.UntypedResultSet;
import org.apache.cassandra.db.ColumnFamilyStore;
import org.apache.cassandra.metrics.ClearableHistogram;
+import static org.junit.Assert.assertEquals;
+
/**
* Tests for checking how many sstables we access during cql queries.
*/
@@ -108,10 +109,10 @@ public class SSTablesIteratedTest extends CQLTester
executeAndCheck("SELECT * FROM %s WHERE pk = 2 AND c > 20 ORDER BY c
DESC", 2,
row(2, 40, "42"));
- // Test with only 2 of the 3 SSTables being merged and a Name filter
+ // Test with only 1 of the 3 SSTables being merged and a Name filter
// This test checks the
SinglePartitionReadCommand::queryMemtableAndSSTablesInTimestampOrder which is
only
// used for ClusteringIndexNamesFilter when there are no multi-cell
columns
- executeAndCheck("SELECT * FROM %s WHERE pk = 2 AND c = 10", 2,
+ executeAndCheck("SELECT * FROM %s WHERE pk = 2 AND c = 10", 1,
row(2, 10, "12"));
// For partition range queries the metric must not be updated. The
reason being that range queries simply
@@ -133,4 +134,233 @@ public class SSTablesIteratedTest extends CQLTester
assertInvalidMessage("ORDER BY is only supported when the partition
key is restricted by an EQ or an IN",
"SELECT * FROM %s WHERE token(pk) = token(1)
ORDER BY C DESC");
}
-}
\ No newline at end of file
+
+ @Test
+ public void testNonCompactTableRowDeletion() throws Throwable
+ {
+ createTable("CREATE TABLE %s (pk int, ck int, v text, PRIMARY KEY (pk,
ck))");
+
+ execute("INSERT INTO %s (pk, ck, v) VALUES (1, 1, '1')");
+ flush();
+
+ execute("DELETE FROM %s WHERE pk = 1 AND ck = 1");
+ flush();
+
+ executeAndCheck("SELECT * FROM %s WHERE pk = 1 AND ck = 1", 2);
+ }
+
+ @Test
+ public void testNonCompactTableRangeDeletion() throws Throwable
+ {
+ createTable("CREATE TABLE %s (a int, b int, c int, d int, PRIMARY KEY
(a, b, c))");
+
+ execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 1, 1);
+ flush();
+
+ execute("DELETE FROM %s WHERE a=? AND b=?", 1, 1);
+ flush();
+
+ executeAndCheck("SELECT * FROM %s WHERE a=1 AND b=1 AND c=1", 2);
+ }
+
+ @Test
+ public void testNonCompactTableCellsDeletion() throws Throwable
+ {
+ createTable("CREATE TABLE %s (pk int, ck int, v1 text, v2 text,
PRIMARY KEY (pk, ck))");
+
+ execute("INSERT INTO %s (pk, ck, v1, v2) VALUES (1, 1, '1', '1')");
+ flush();
+
+ execute("DELETE v1 FROM %s WHERE pk = 1 AND ck = 1");
+ execute("DELETE v2 FROM %s WHERE pk = 1 AND ck = 1");
+ flush();
+
+ executeAndCheck("SELECT * FROM %s WHERE pk = 1 AND ck = 1", 2, row(1,
1, null, null));
+ }
+
+ @Test
+ public void testCompactTableSkipping() throws Throwable
+ {
+ createTable("CREATE TABLE %s (pk int, ck int, v text, PRIMARY KEY (pk,
ck)) WITH COMPACT STORAGE");
+
+ execute("INSERT INTO %s (pk, ck, v) VALUES (1, 1, '1') USING TIMESTAMP
1000000");
+ execute("INSERT INTO %s (pk, ck, v) VALUES (1, 50, '2') USING
TIMESTAMP 1000001");
+ execute("INSERT INTO %s (pk, ck, v) VALUES (1, 100, '3') USING
TIMESTAMP 1000002");
+ flush();
+
+ execute("INSERT INTO %s (pk, ck, v) VALUES (1, 2, '4') USING TIMESTAMP
2000000");
+ execute("INSERT INTO %s (pk, ck, v) VALUES (1, 51, '5') USING
TIMESTAMP 2000001");
+ execute("INSERT INTO %s (pk, ck, v) VALUES (1, 101, '6') USING
TIMESTAMP 2000002");
+ flush();
+
+ executeAndCheck("SELECT * FROM %s WHERE pk = 1 AND ck = 51", 1, row(1,
51, "5"));
+
+ execute("ALTER TABLE %s DROP COMPACT STORAGE");
+ executeAndCheck("SELECT * FROM %s WHERE pk = 1 AND ck = 51", 1, row(1,
51, "5"));
+ }
+
+ @Test
+ public void testCompactTableSkippingPkOnly() throws Throwable
+ {
+ createTable("CREATE TABLE %s (pk int, ck int, PRIMARY KEY (pk, ck))
WITH COMPACT STORAGE");
+
+ execute("INSERT INTO %s (pk, ck) VALUES (1, 1) USING TIMESTAMP
1000000");
+ execute("INSERT INTO %s (pk, ck) VALUES (1, 50) USING TIMESTAMP
1000001");
+ execute("INSERT INTO %s (pk, ck) VALUES (1, 100) USING TIMESTAMP
1000002");
+ flush();
+
+ execute("INSERT INTO %s (pk, ck) VALUES (1, 2) USING TIMESTAMP
2000000");
+ execute("INSERT INTO %s (pk, ck) VALUES (1, 51) USING TIMESTAMP
2000001");
+ execute("INSERT INTO %s (pk, ck) VALUES (1, 101) USING TIMESTAMP
2000002");
+ flush();
+
+ executeAndCheck("SELECT * FROM %s WHERE pk = 1 AND ck = 51", 1, row(1,
51));
+
+ execute("ALTER TABLE %s DROP COMPACT STORAGE");
+ executeAndCheck("SELECT * FROM %s WHERE pk = 1 AND ck = 51", 1, row(1,
51));
+ }
+
+ @Test
+ public void testCompactTableCellDeletion() throws Throwable
+ {
+ createTable("CREATE TABLE %s (pk int, ck int, v text, PRIMARY KEY (pk,
ck)) WITH COMPACT STORAGE");
+
+ execute("INSERT INTO %s (pk, ck, v) VALUES (1, 1, '1')");
+ flush();
+
+ execute("DELETE v FROM %s WHERE pk = 1 AND ck = 1");
+ flush();
+
+ executeAndCheck("SELECT * FROM %s WHERE pk = 1 AND ck = 1", 1);
+
+ // Dropping compact storage forces us to hit an extra SSTable, since
we can't rely on the isDense flag
+ // to determine that a row with a complete set of column deletes is
complete.
+ execute("ALTER TABLE %s DROP COMPACT STORAGE");
+ executeAndCheck("SELECT * FROM %s WHERE pk = 1 AND ck = 1", 2);
+ }
+
+ @Test
+ public void testCompactTableRowDeletion() throws Throwable
+ {
+ createTable("CREATE TABLE %s (pk int, ck int, v text, PRIMARY KEY (pk,
ck)) WITH COMPACT STORAGE");
+
+ execute("INSERT INTO %s (pk, ck, v) VALUES (1, 1, '1')");
+ flush();
+
+ execute("DELETE FROM %s WHERE pk = 1 AND ck = 1");
+ flush();
+
+ executeAndCheck("SELECT * FROM %s WHERE pk = 1 AND ck = 1", 1);
+
+ // Dropping compact storage forces us to hit an extra SSTable, since
we can't rely on the isDense flag
+ // to determine that a row with a complete set of column deletes is
complete.
+ execute("ALTER TABLE %s DROP COMPACT STORAGE");
+ executeAndCheck("SELECT * FROM %s WHERE pk = 1 AND ck = 1", 2);
+ }
+
+ @Test
+ public void testCompactTableRangeDeletion() throws Throwable
+ {
+ createTable("CREATE TABLE %s (a int, b int, c int, d int, PRIMARY KEY
(a, b, c)) WITH COMPACT STORAGE");
+
+ execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 1, 1);
+ execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 2, 1);
+ execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 2, 1, 1);
+ flush();
+
+ execute("DELETE FROM %s WHERE a=? AND b=?", 1, 1);
+ flush();
+
+ // Even with a compact table, we can't short-circuit for a range
deletion rather than a cell tombstone.
+ executeAndCheck("SELECT * FROM %s WHERE a=1 AND b=1 AND c=1", 2);
+
+ execute("ALTER TABLE %s DROP COMPACT STORAGE");
+ executeAndCheck("SELECT * FROM %s WHERE a=1 AND b=1 AND c=1", 2);
+ }
+
+ @Test
+ public void testCompactTableRangeOverRowDeletion() throws Throwable
+ {
+ createTable("CREATE TABLE %s (a int, b int, c int, d int, PRIMARY KEY
(a, b, c)) WITH COMPACT STORAGE");
+
+ execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 1, 1);
+ execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 2, 1);
+ execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 2, 1, 1);
+ flush();
+
+ execute("DELETE FROM %s WHERE a=? AND b=? AND c=?", 1, 1, 1);
+ flush();
+
+ execute("DELETE FROM %s WHERE a=? AND b=?", 1, 1);
+ flush();
+
+ // The range delete will subsume the row delete, and the latter will
not factor into skipping decisions.
+ executeAndCheck("SELECT * FROM %s WHERE a=1 AND b=1 AND c=1", 3);
+
+ execute("ALTER TABLE %s DROP COMPACT STORAGE");
+ executeAndCheck("SELECT * FROM %s WHERE a=1 AND b=1 AND c=1", 3);
+ }
+
+ @Test
+ public void testCompactTableRowOverRangeDeletion() throws Throwable
+ {
+ createTable("CREATE TABLE %s (a int, b int, c int, d int, PRIMARY KEY
(a, b, c)) WITH COMPACT STORAGE");
+
+ execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 1, 1);
+ execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 2, 1);
+ execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 2, 1, 1);
+ flush();
+
+ execute("DELETE FROM %s WHERE a=? AND b=?", 1, 1);
+ flush();
+
+ execute("DELETE FROM %s WHERE a=? AND b=? AND c=?", 1, 1, 1);
+ flush();
+
+ // The row delete provides a tombstone, which is enough information to
short-circuit after the first SSTable.
+ executeAndCheck("SELECT * FROM %s WHERE a=1 AND b=1 AND c=1", 1);
+
+ execute("ALTER TABLE %s DROP COMPACT STORAGE");
+ executeAndCheck("SELECT * FROM %s WHERE a=1 AND b=1 AND c=1", 3);
+ }
+
+ @Test
+ public void testCompactTableCellUpdate() throws Throwable
+ {
+ createTable("CREATE TABLE %s (pk int, ck int, v text, PRIMARY KEY (pk,
ck)) WITH COMPACT STORAGE");
+
+ execute("INSERT INTO %s (pk, ck, v) VALUES (1, 1, '1')");
+ flush();
+
+ execute("UPDATE %s SET v = '2' WHERE pk = 1 AND ck = 1");
+ flush();
+
+ executeAndCheck("SELECT * FROM %s WHERE pk = 1 AND ck = 1", 1, row(1,
1, "2"));
+
+ execute("ALTER TABLE %s DROP COMPACT STORAGE");
+ executeAndCheck("SELECT * FROM %s WHERE pk = 1 AND ck = 1", 1, row(1,
1, "2"));
+ }
+
+ @Test
+ public void testCompactTableDeleteOverlappingSSTables() throws Throwable
+ {
+ createTable("CREATE TABLE %s (pk int, ck int, PRIMARY KEY (pk, ck))
WITH COMPACT STORAGE");
+
+ execute("INSERT INTO %s (pk, ck) VALUES (1, 51) USING TIMESTAMP
1000002");
+ flush();
+ execute("DELETE FROM %s WHERE pk = 1 AND ck = 51");
+ flush();
+
+ execute("INSERT INTO %s (pk, ck) VALUES (1, 51) USING TIMESTAMP
1000001");
+ execute("INSERT INTO %s (pk, ck) VALUES (2, 51)");
+ flush();
+
+ // If it weren't for the write to pk = 2, ck = 51, we could skip the
third SSTable too and hit only one here.
+ executeAndCheck("SELECT * FROM %s WHERE pk = 1 AND ck = 51", 2);
+
+ // Dropping compact storage forces us to hit an extra SSTable, since
we can't rely on the isDense flag
+ // to determine that a row with a complete set of column deletes is
complete.
+ execute("ALTER TABLE %s DROP COMPACT STORAGE");
+ executeAndCheck("SELECT * FROM %s WHERE pk = 1 AND ck = 51", 3);
+ }
+}
diff --git
a/test/unit/org/apache/cassandra/cql3/validation/operations/UpdateTest.java
b/test/unit/org/apache/cassandra/cql3/validation/operations/UpdateTest.java
index 8a9be19..42a8560 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/operations/UpdateTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/operations/UpdateTest.java
@@ -20,7 +20,6 @@ package org.apache.cassandra.cql3.validation.operations;
import java.util.Arrays;
-import org.junit.Assert;
import org.junit.Test;
import static org.apache.commons.lang3.StringUtils.isEmpty;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]