Repository: cassandra Updated Branches: refs/heads/cassandra-3.11 ec9ce3dfb -> f55cb88ab refs/heads/trunk 8b74ae4b6 -> a87b15d1d
Possible AssertionError in UnfilteredRowIteratorWithLowerBound patch by Sylvain Lebresne; reviewed by Stefania for CASSANDRA-13366 Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/f55cb88a Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/f55cb88a Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/f55cb88a Branch: refs/heads/cassandra-3.11 Commit: f55cb88ab595ccb941ebb4a088ab90f860f463d5 Parents: ec9ce3d Author: Sylvain Lebresne <sylv...@datastax.com> Authored: Wed Mar 22 15:41:49 2017 +0100 Committer: Sylvain Lebresne <sylv...@datastax.com> Committed: Thu Mar 23 10:26:37 2017 +0100 ---------------------------------------------------------------------- CHANGES.txt | 1 + .../db/SinglePartitionReadCommand.java | 4 +-- .../db/compaction/CompactionController.java | 2 +- .../UnfilteredRowIteratorWithLowerBound.java | 31 ++++++++++++++--- .../io/sstable/format/SSTableReader.java | 35 ++++++++------------ 5 files changed, 44 insertions(+), 29 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/cassandra/blob/f55cb88a/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index 8386c20..f4e48ff 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 3.11.0 + * Possible AssertionError in UnfilteredRowIteratorWithLowerBound (CASSANDRA-13366) * Support unaligned memory access for AArch64 (CASSANDRA-13326) * Improve SASI range iterator efficiency on intersection with an empty range (CASSANDRA-12915). * Fix equality comparisons of columns using the duration type (CASSANDRA-13174) http://git-wip-us.apache.org/repos/asf/cassandra/blob/f55cb88a/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java b/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java index f6d10f5..724f59e 100644 --- a/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java +++ b/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java @@ -584,7 +584,7 @@ public class SinglePartitionReadCommand extends ReadCommand if (!shouldInclude(sstable)) { nonIntersectingSSTables++; - if (sstable.hasTombstones()) + if (sstable.mayHaveTombstones()) { // if sstable has tombstones we need to check after one pass if it can be safely skipped if (skippedSSTablesWithTombstones == null) skippedSSTablesWithTombstones = new ArrayList<>(); @@ -773,7 +773,7 @@ public class SinglePartitionReadCommand extends ReadCommand // however: if it is set, it impacts everything and must be included. Getting that top-level partition deletion costs us // some seek in general however (unless the partition is indexed and is in the key cache), so we first check if the sstable // has any tombstone at all as a shortcut. - if (!sstable.hasTombstones()) + if (!sstable.mayHaveTombstones()) continue; // no tombstone at all, we can skip that sstable // We need to get the partition deletion and include it if it's live. In any case though, we're done with that sstable. http://git-wip-us.apache.org/repos/asf/cassandra/blob/f55cb88a/src/java/org/apache/cassandra/db/compaction/CompactionController.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionController.java b/src/java/org/apache/cassandra/db/compaction/CompactionController.java index 64c35d9..bf3647a 100644 --- a/src/java/org/apache/cassandra/db/compaction/CompactionController.java +++ b/src/java/org/apache/cassandra/db/compaction/CompactionController.java @@ -297,7 +297,7 @@ public class CompactionController implements AutoCloseable { if (reader.isMarkedSuspect() || reader.getMaxTimestamp() <= minTimestamp || - tombstoneOnly && !reader.hasTombstones()) + tombstoneOnly && !reader.mayHaveTombstones()) return null; RowIndexEntry<?> position = reader.getPosition(key, SSTableReader.Operator.EQ); if (position == null) http://git-wip-us.apache.org/repos/asf/cassandra/blob/f55cb88a/src/java/org/apache/cassandra/db/rows/UnfilteredRowIteratorWithLowerBound.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/db/rows/UnfilteredRowIteratorWithLowerBound.java b/src/java/org/apache/cassandra/db/rows/UnfilteredRowIteratorWithLowerBound.java index 14730ac..4536036 100644 --- a/src/java/org/apache/cassandra/db/rows/UnfilteredRowIteratorWithLowerBound.java +++ b/src/java/org/apache/cassandra/db/rows/UnfilteredRowIteratorWithLowerBound.java @@ -159,7 +159,7 @@ public class UnfilteredRowIteratorWithLowerBound extends LazilyInitializedUnfilt @Override public DeletionTime partitionLevelDeletion() { - if (!sstable.hasTombstones()) + if (!sstable.mayHaveTombstones()) return DeletionTime.LIVE; return super.partitionLevelDeletion(); @@ -210,13 +210,34 @@ public class UnfilteredRowIteratorWithLowerBound extends LazilyInitializedUnfilt } /** - * @return true if we can use the clustering values in the stats of the sstable: - * - we need the latest stats file format (or else the clustering values create clusterings with the wrong size) - * - we cannot create tombstone bounds from these values only and so we rule out sstables with tombstones + * Whether we can use the clustering values in the stats of the sstable to build the lower bound. + * <p> + * Currently, the clustering values of the stats file records for each clustering component the min and max + * value seen, null excluded. In other words, having a non-null value for a component in those min/max clustering + * values does _not_ guarantee that there isn't an unfiltered in the sstable whose clustering has either no value for + * that component (it's a prefix) or a null value. + * <p> + * This is problematic as this means we can't in general build a lower bound from those values since the "min" + * values doesn't actually guarantee minimality. + * <p> + * However, we can use those values if we can guarantee that no clustering in the sstable 1) is a true prefix and + * 2) uses null values. Nat having true prefixes means having no range tombstone markers since rows use + * {@link Clustering} which is always "full" (all components are always present). As for null values, we happen to + * only allow those in compact tables (for backward compatibility), so we can simply exclude those tables. + * <p> + * Note that the information we currently have at our disposal make this condition less precise that it could be. + * In particular, {@link SSTableReader#mayHaveTombstones} could return {@code true} (making us not use the stats) + * because of cell tombstone or even expiring cells even if the sstable has no range tombstone markers, even though + * it's really only markers we want to exclude here (more precisely, as said above, we want to exclude anything + * whose clustering is not "full", but that's only markers). It wouldn't be very hard to collect whether a sstable + * has any range tombstone marker however so it's a possible improvement. */ private boolean canUseMetadataLowerBound() { - return !sstable.hasTombstones() && sstable.descriptor.version.hasNewStatsFile(); + // Side-note: pre-2.1 sstable stat file had clustering value arrays whose size may not match the comparator size + // and that would break getMetadataLowerBound. We don't support upgrade from 2.0 to 3.0 directly however so it's + // not a true concern. Besides, !sstable.mayHaveTombstones already ensure this is a 3.0 sstable anyway. + return !sstable.mayHaveTombstones() && !sstable.metadata.isCompactTable(); } /** http://git-wip-us.apache.org/repos/asf/cassandra/blob/f55cb88a/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java index 56609b3..321abc7 100644 --- a/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java +++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java @@ -51,6 +51,7 @@ import org.apache.cassandra.config.Schema; import org.apache.cassandra.config.SchemaConstants; import org.apache.cassandra.db.*; import org.apache.cassandra.db.filter.ColumnFilter; +import org.apache.cassandra.db.rows.Cell; import org.apache.cassandra.db.rows.EncodingStats; import org.apache.cassandra.db.rows.UnfilteredRowIterator; import org.apache.cassandra.dht.AbstractBounds; @@ -1881,12 +1882,20 @@ public abstract class SSTableReader extends SSTable implements SelfRefCounted<SS return sstableMetadata.maxLocalDeletionTime; } - /** sstable contains no tombstones if minLocalDeletionTime == Integer.MAX_VALUE */ - public boolean hasTombstones() + /** + * Whether the sstable may contain tombstones or if it is guaranteed to not contain any. + * <p> + * Note that having that method return {@code false} guarantees the sstable has no tombstones whatsoever (so no + * cell tombstone, no range tombstone maker and no expiring columns), but having it return {@code true} doesn't + * guarantee it contains any as 1) it may simply have non-expired cells and 2) old-format sstables didn't contain + * enough information to decide this and so always return {@code true}. + */ + public boolean mayHaveTombstones() { - // sstable contains no tombstone if minLocalDeletionTime is still set to the default value Integer.MAX_VALUE - // which is bigger than any valid deletion times - return getMinLocalDeletionTime() != Integer.MAX_VALUE; + // A sstable is guaranteed to have no tombstones if it properly tracked the minLocalDeletionTime (which we only + // do since 3.0 - see CASSANDRA-13366) and that value is still set to its default, Cell.NO_DELETION_TIME, which + // is bigger than any valid deletion times. + return !descriptor.version.storeRows() || getMinLocalDeletionTime() != Cell.NO_DELETION_TIME; } public int getMinTTL() @@ -2008,22 +2017,6 @@ public abstract class SSTableReader extends SSTable implements SelfRefCounted<SS readMeter.mark(); } - /** - * Checks if this sstable can overlap with another one based on the min/man clustering values. - * If this methods return false, we're guarantee that {@code this} and {@code other} have no overlapping - * data, i.e. no cells to reconcile. - */ - public boolean mayOverlapsWith(SSTableReader other) - { - StatsMetadata m1 = getSSTableMetadata(); - StatsMetadata m2 = other.getSSTableMetadata(); - - if (m1.minClusteringValues.isEmpty() || m1.maxClusteringValues.isEmpty() || m2.minClusteringValues.isEmpty() || m2.maxClusteringValues.isEmpty()) - return true; - - return !(compare(m1.maxClusteringValues, m2.minClusteringValues) < 0 || compare(m1.minClusteringValues, m2.maxClusteringValues) > 0); - } - private int compare(List<ByteBuffer> values1, List<ByteBuffer> values2) { ClusteringComparator comparator = metadata.comparator;