Merge branch 'cassandra-2.0' into cassandra-2.1 Conflicts: CHANGES.txt src/java/org/apache/cassandra/db/DeletionTime.java src/java/org/apache/cassandra/db/RangeTombstone.java
Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/655f0569 Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/655f0569 Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/655f0569 Branch: refs/heads/trunk Commit: 655f0569874b7f1997214cb9fe0bda64c7cdf0d5 Parents: b9a89a3 b0dbea3 Author: Sylvain Lebresne <sylv...@datastax.com> Authored: Wed Jun 3 14:22:44 2015 +0200 Committer: Sylvain Lebresne <sylv...@datastax.com> Committed: Wed Jun 3 14:22:44 2015 +0200 ---------------------------------------------------------------------- CHANGES.txt | 1 + .../org/apache/cassandra/db/DeletionTime.java | 5 + .../org/apache/cassandra/db/RangeTombstone.java | 194 +++++++++++++------ .../db/compaction/LazilyCompactedRow.java | 3 + 4 files changed, 148 insertions(+), 55 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/cassandra/blob/655f0569/CHANGES.txt ---------------------------------------------------------------------- diff --cc CHANGES.txt index 57bbfcc,16ce060..f5c3b41 --- a/CHANGES.txt +++ b/CHANGES.txt @@@ -1,38 -1,5 +1,39 @@@ -2.0.16: +2.1.6 + * Ensure truncate without snapshot cannot produce corrupt responses (CASSANDRA-9388) + * Consistent error message when a table mixes counter and non-counter + columns (CASSANDRA-9492) + * Avoid getting unreadable keys during anticompaction (CASSANDRA-9508) + * (cqlsh) Better float precision by default (CASSANDRA-9224) + * Improve estimated row count (CASSANDRA-9107) + * Optimize range tombstone memory footprint (CASSANDRA-8603) + * Use configured gcgs in anticompaction (CASSANDRA-9397) + * Warn on misuse of unlogged batches (CASSANDRA-9282) + * Failure detector detects and ignores local pauses (CASSANDRA-9183) + * Add utility class to support for rate limiting a given log statement (CASSANDRA-9029) + * Add missing consistency levels to cassandra-stess (CASSANDRA-9361) + * 
Fix commitlog getCompletedTasks to not increment (CASSANDRA-9339) + * Fix for harmless exceptions logged as ERROR (CASSANDRA-8564) + * Delete processed sstables in sstablesplit/sstableupgrade (CASSANDRA-8606) + * Improve sstable exclusion from partition tombstones (CASSANDRA-9298) + * Validate the indexed column rather than the cell's contents for 2i (CASSANDRA-9057) + * Add support for top-k custom 2i queries (CASSANDRA-8717) + * Fix error when dropping table during compaction (CASSANDRA-9251) + * cassandra-stress supports validation operations over user profiles (CASSANDRA-8773) + * Add support for rate limiting log messages (CASSANDRA-9029) + * Log the partition key with tombstone warnings (CASSANDRA-8561) + * Reduce runWithCompactionsDisabled poll interval to 1ms (CASSANDRA-9271) + * Fix PITR commitlog replay (CASSANDRA-9195) + * GCInspector logs very different times (CASSANDRA-9124) + * Fix deleting from an empty list (CASSANDRA-9198) + * Update tuple and collection types that use a user-defined type when that UDT + is modified (CASSANDRA-9148, CASSANDRA-9192) + * Use higher timeout for prepair and snapshot in repair (CASSANDRA-9261) + * Fix anticompaction blocking ANTI_ENTROPY stage (CASSANDRA-9151) + * Repair waits for anticompaction to finish (CASSANDRA-9097) + * Fix streaming not holding ref when stream error (CASSANDRA-9295) + * Fix canonical view returning early opened SSTables (CASSANDRA-9396) +Merged from 2.0: + * Don't accumulate more range than necessary in RangeTombstone.Tracker (CASSANDRA-9486) * Add broadcast and rpc addresses to system.local (CASSANDRA-9436) * Always mark sstable suspect when corrupted (CASSANDRA-9478) * Add database users and permissions to CQL3 documentation (CASSANDRA-7558) http://git-wip-us.apache.org/repos/asf/cassandra/blob/655f0569/src/java/org/apache/cassandra/db/DeletionTime.java ---------------------------------------------------------------------- diff --cc src/java/org/apache/cassandra/db/DeletionTime.java index 
0e5f13f,b39d681..7165417 --- a/src/java/org/apache/cassandra/db/DeletionTime.java +++ b/src/java/org/apache/cassandra/db/DeletionTime.java @@@ -113,19 -109,25 +113,24 @@@ public class DeletionTime implements Co return localDeletionTime < gcBefore; } - public boolean isDeleted(Column column) + public boolean isDeleted(OnDiskAtom atom) { - return column.timestamp() <= markedForDeleteAt; + return atom.timestamp() <= markedForDeleteAt; } + public boolean supersedes(DeletionTime dt) + { + return this.markedForDeleteAt > dt.markedForDeleteAt; + } + - public long memorySize() + public long unsharedHeapSize() { - long fields = TypeSizes.NATIVE.sizeof(markedForDeleteAt) + TypeSizes.NATIVE.sizeof(localDeletionTime); - return ObjectSizes.getFieldSize(fields); + return EMPTY_SIZE; } - private static class Serializer implements ISerializer<DeletionTime> + public static class Serializer implements ISerializer<DeletionTime> { - public void serialize(DeletionTime delTime, DataOutput out) throws IOException + public void serialize(DeletionTime delTime, DataOutputPlus out) throws IOException { out.writeInt(delTime.localDeletionTime); out.writeLong(delTime.markedForDeleteAt); http://git-wip-us.apache.org/repos/asf/cassandra/blob/655f0569/src/java/org/apache/cassandra/db/RangeTombstone.java ---------------------------------------------------------------------- diff --cc src/java/org/apache/cassandra/db/RangeTombstone.java index feeadbb,fe9da20..590b005 --- a/src/java/org/apache/cassandra/db/RangeTombstone.java +++ b/src/java/org/apache/cassandra/db/RangeTombstone.java @@@ -94,40 -114,57 +94,62 @@@ public class RangeTombstone extends Int return comparator.compare(min, rt.min) <= 0 && comparator.compare(max, rt.max) >= 0; } + public boolean includes(Comparator<Composite> comparator, Composite name) + { + return comparator.compare(name, min) >= 0 && comparator.compare(name, max) <= 0; + } + + /** + * Tracks opened RangeTombstones when iterating over a partition. 
+ * <p> + * This tracker must be provided all the atoms of a given partition in + * order (to the {@code update} method). Given this, it keeps enough + * information to be able to decide if an atom is deleted (shadowed) + * by a previously open RT. Once the tracker can prove a given range + * tombstone cannot be useful anymore (that is, as soon as we've seen an + * atom that is after the end of that RT), it discards this RT. In other + * words, the maximum memory used by this object should be proportional to + * the maximum number of RTs that can be simultaneously open (and this + * should be fairly low in practice). + */ public static class Tracker { - private final Comparator<ByteBuffer> comparator; + private final Comparator<Composite> comparator; - private final Deque<RangeTombstone> ranges = new ArrayDeque<RangeTombstone>(); - private final SortedSet<RangeTombstone> maxOrderingSet = new TreeSet<RangeTombstone>(new Comparator<RangeTombstone>() - { - public int compare(RangeTombstone t1, RangeTombstone t2) - { - return comparator.compare(t1.max, t2.max); - } - }); - public final Set<RangeTombstone> expired = new HashSet<RangeTombstone>(); + + // A list of the currently open RTs. We keep the list sorted in order of growing end bounds as for a + // new atom, this allows us to efficiently find the RTs that are now useless (if any). Also note that because + // atoms are passed to the tracker in order, any RT that is tracked can be assumed as opened, i.e. we + // never have to test the RT's start since it's always assumed to be less than what we have. + // Also note that this will store expired RTs (#7810). Those will be of type ExpiredRangeTombstone and + // will be ignored by writeOpenedMarker. + private final List<RangeTombstone> openedTombstones = new LinkedList<RangeTombstone>(); + + // Total number of atoms written by writeOpenedMarker(). private int atomCount; + /** + * Creates a new tracker given the table comparator. 
+ * + * @param comparator the comparator for the table this will track atoms + * for. The tracker assumes that atoms will be later provided to the + * tracker in {@code comparator} order. + */ - public Tracker(Comparator<ByteBuffer> comparator) + public Tracker(Comparator<Composite> comparator) { this.comparator = comparator; } /** - * Compute RangeTombstone that are needed at the beginning of an index + * Computes the RangeTombstones that are needed at the beginning of an index * block starting with {@code firstColumn}. - * Returns the total serialized size of said tombstones and write them - * to {@code out} it if isn't null. + * + * @return the total serialized size of said tombstones and writes them to + * {@code out} if it isn't null. */ - public long writeOpenedMarker(OnDiskAtom firstColumn, DataOutput out, OnDiskAtom.Serializer atomSerializer) throws IOException + public long writeOpenedMarker(OnDiskAtom firstColumn, DataOutputPlus out, OnDiskAtom.Serializer atomSerializer) throws IOException { long size = 0; - if (ranges.isEmpty()) + if (openedTombstones.isEmpty()) return size; /* @@@ -228,16 -290,41 +275,41 @@@ } } + /** + * Adds the provided {@code tombstone} _before_ the last element returned by {@code iterator.next()}. + * <p> + * This method assumes that {@code iterator.next()} has been called prior to this method call, i.e. that + * {@code iterator.hasPrevious() == true}. + */ + private static void insertBefore(RangeTombstone tombstone, ListIterator<RangeTombstone> iterator) + { + assert iterator.hasPrevious(); + iterator.previous(); + iterator.add(tombstone); + iterator.next(); + } + + /** + * Tests if the provided column is deleted by one of the tombstones + * tracked by this tracker. + * <p> + * This method should be called on columns in the same order as for the update() + * method. 
Note that this method does not update the tracker so the update() method + * should still be called on {@code column} (it doesn't matter if update is called + * before or after this call). + */ - public boolean isDeleted(Column column) + public boolean isDeleted(Cell cell) { - for (RangeTombstone tombstone : ranges) + // We know every tombstone kept are "open", start before the column. So the + // column is deleted if any of the tracked tombstone ends after the column + // (this will be the case of every RT if update() has been called before this + // method, but we might have a few RT to skip otherwise) and the RT deletion is + // actually more recent than the column timestamp. + for (RangeTombstone tombstone : openedTombstones) { - if (comparator.compare(cell.name(), tombstone.min) >= 0 - && comparator.compare(cell.name(), tombstone.max) <= 0 - if (comparator.compare(column.name(), tombstone.max) <= 0 - && tombstone.maxTimestamp() >= column.timestamp()) ++ if (comparator.compare(cell.name(), tombstone.max) <= 0 + && tombstone.timestamp() >= cell.timestamp()) - { return true; - } } return false; } http://git-wip-us.apache.org/repos/asf/cassandra/blob/655f0569/src/java/org/apache/cassandra/db/compaction/LazilyCompactedRow.java ---------------------------------------------------------------------- diff --cc src/java/org/apache/cassandra/db/compaction/LazilyCompactedRow.java index 56a4ede,43801c6..941557b --- a/src/java/org/apache/cassandra/db/compaction/LazilyCompactedRow.java +++ b/src/java/org/apache/cassandra/db/compaction/LazilyCompactedRow.java @@@ -291,29 -290,23 +291,32 @@@ public class LazilyCompactedRow extend { // when we clear() the container, it removes the deletion info, so this needs to be reset each time container.delete(maxRowTombstone); - ColumnFamily purged = PrecompactedRow.removeDeleted(key, shouldPurge, controller, container); - if (purged == null || !purged.iterator().hasNext()) + Iterator<Cell> iter = container.iterator(); + Cell c = 
iter.next(); + boolean shouldPurge = c.getLocalDeletionTime() < Integer.MAX_VALUE && c.timestamp() < getMaxPurgeableTimestamp(); + removeDeleted(container, shouldPurge, key, controller); + iter = container.iterator(); + if (!iter.hasNext()) { // don't call clear() because that resets the deletion time. See CASSANDRA-7808. - container = ArrayBackedSortedColumns.factory.create(emptyColumnFamily.metadata());; + container = ArrayBackedSortedColumns.factory.create(emptyColumnFamily.metadata()); return null; } - Column reduced = purged.iterator().next(); + + int localDeletionTime = container.deletionInfo().getTopLevelDeletion().localDeletionTime; + if (localDeletionTime < Integer.MAX_VALUE) + tombstones.update(localDeletionTime); + + Cell reduced = iter.next(); container = ArrayBackedSortedColumns.factory.create(emptyColumnFamily.metadata()); - // PrecompactedRow.removeDeleted has only checked the top-level CF deletion times, - // not the range tombstones. For that we use the columnIndexer tombstone tracker. + // removeDeleted has only checked the top-level CF deletion times, + // not the range tombstones. For that we use the columnIndexer tombstone tracker. if (indexBuilder.tombstoneTracker().isDeleted(reduced)) { + // We skip that column so it won't be passed to the tracker by the index builder. So pass it now to + // make sure we still discard potentially un-needed RTs as soon as possible. + indexBuilder.tombstoneTracker().update(reduced, false); indexer.remove(reduced); return null; }