Optimize mostRecentTombstone check in CC.collectAllData patch by slebresne; reviewed by jbellis for CASSANDRA-4883
Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/53943180 Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/53943180 Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/53943180 Branch: refs/heads/trunk Commit: 53943180a4a41ecc5e145d1e08b0ea4a9d849c8c Parents: 9118764 Author: Sylvain Lebresne <[email protected]> Authored: Tue Nov 13 08:54:37 2012 +0100 Committer: Sylvain Lebresne <[email protected]> Committed: Tue Nov 13 08:54:37 2012 +0100 ---------------------------------------------------------------------- CHANGES.txt | 1 + .../org/apache/cassandra/config/CFMetaData.java | 3 +- .../apache/cassandra/db/CollationController.java | 33 +++++++-------- src/java/org/apache/cassandra/db/DataTracker.java | 31 ++++++-------- .../cassandra/db/CollationControllerTest.java | 20 +++++---- 5 files changed, 43 insertions(+), 45 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/cassandra/blob/53943180/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index 43fc53a..77cc8a8 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -8,6 +8,7 @@ * Separate tracing from Log4J (CASSANDRA-4861) * Exclude gcable tombstones from merkle-tree computation (CASSANDRA-4905) * Better printing of AbstractBounds for tracing (CASSANDRA-4931) + * Optimize mostRecentTomstone check in CC.collectAllData (CASSANDRA-4883) 1.2-beta2 http://git-wip-us.apache.org/repos/asf/cassandra/blob/53943180/src/java/org/apache/cassandra/config/CFMetaData.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/config/CFMetaData.java b/src/java/org/apache/cassandra/config/CFMetaData.java index 921242a..c3a00b1 100644 --- a/src/java/org/apache/cassandra/config/CFMetaData.java +++ 
b/src/java/org/apache/cassandra/config/CFMetaData.java @@ -181,7 +181,8 @@ public final class CFMetaData + "thrift_version text," + "cql_version text," + "data_center text," - + "rack text" + + "rack text," + + "partitioner text," + ") WITH COMMENT='information about the local node'"); public static final CFMetaData TraceSessionsCf = compile(14, "CREATE TABLE " + Tracing.SESSIONS_CF + " (" http://git-wip-us.apache.org/repos/asf/cassandra/blob/53943180/src/java/org/apache/cassandra/db/CollationController.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/db/CollationController.java b/src/java/org/apache/cassandra/db/CollationController.java index 80d0a2c..73c675f 100644 --- a/src/java/org/apache/cassandra/db/CollationController.java +++ b/src/java/org/apache/cassandra/db/CollationController.java @@ -241,13 +241,27 @@ public class CollationController } } + /* + * We can't eliminate full sstables based on the timestamp of what we've already read like + * in collectTimeOrderedData, but we still want to eliminate sstables whose maxTimestamp < mostRecentTombstone + * we've read. We still rely on the sstable ordering by maxTimestamp since if + * maxTimestamp_s1 > maxTimestamp_s0, + * we're guaranteed that s1 cannot have a row tombstone such that + * timestamp(tombstone) > maxTimestamp_s0 + * since we necessarily have + * timestamp(tombstone) <= maxTimestamp_s1 + * In other words, iterating in maxTimestamp order allows us to do our mostRecentTombstone elimination + * in one pass, and minimizes the number of sstables for which we read a rowTombstone. 
+ */ + Collections.sort(view.sstables, SSTable.maxTimestampComparator); + long mostRecentRowTombstone = Long.MIN_VALUE; for (SSTableReader sstable : view.sstables) { // if we've already seen a row tombstone with a timestamp greater // than the most recent update to this sstable, we can skip it if (sstable.getMaxTimestamp() < mostRecentRowTombstone) - continue; + break; OnDiskAtomIterator iter = filter.getSSTableColumnIterator(sstable); iterators.add(iter); @@ -262,23 +276,6 @@ public class CollationController } } - // If we saw a row tombstone, do a second pass through the iterators we - // obtained from the sstables and drop any whose maxTimestamp < that of the - // row tombstone - { - Iterator<OnDiskAtomIterator> it = iterators.iterator(); - while (it.hasNext()) - { - OnDiskAtomIterator iter = it.next(); - if ((iter instanceof ISSTableColumnIterator) - && ((ISSTableColumnIterator) iter).getSStable().getMaxTimestamp() < mostRecentRowTombstone) - { - FileUtils.closeQuietly(iter); - it.remove(); - } - } - } - // we need to distinguish between "there is no data at all for this row" (BF will let us rebuild that efficiently) // and "there used to be data, but it's gone now" (we should cache the empty CF so we don't need to rebuild that slower) if (iterators.isEmpty()) http://git-wip-us.apache.org/repos/asf/cassandra/blob/53943180/src/java/org/apache/cassandra/db/DataTracker.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/db/DataTracker.java b/src/java/org/apache/cassandra/db/DataTracker.java index 85da449..7e8887f 100644 --- a/src/java/org/apache/cassandra/db/DataTracker.java +++ b/src/java/org/apache/cassandra/db/DataTracker.java @@ -64,7 +64,7 @@ public class DataTracker return view.get().memtablesPendingFlush; } - public List<SSTableReader> getSSTables() + public Set<SSTableReader> getSSTables() { return view.get().sstables; } @@ -300,7 +300,7 @@ public class DataTracker { view.set(new View(new 
Memtable(cfstore), Collections.<Memtable>emptySet(), - Collections.<SSTableReader>emptyList(), + Collections.<SSTableReader>emptySet(), Collections.<SSTableReader>emptySet(), SSTableIntervalTree.empty())); } @@ -456,15 +456,10 @@ public class DataTracker public final Memtable memtable; public final Set<Memtable> memtablesPendingFlush; public final Set<SSTableReader> compacting; - // We can't use a SortedSet here because "the ordering maintained by a sorted set (whether or not an - // explicit comparator is provided) must be <i>consistent with equals</i>." In particular, - // ImmutableSortedSet will ignore any objects that compare equally with an existing Set member. - // Obviously, dropping sstables whose max column timestamp happens to be equal to another's - // is not acceptable for us. So, we use a List instead. - public final List<SSTableReader> sstables; + public final Set<SSTableReader> sstables; public final SSTableIntervalTree intervalTree; - View(Memtable memtable, Set<Memtable> pendingFlush, List<SSTableReader> sstables, Set<SSTableReader> compacting, SSTableIntervalTree intervalTree) + View(Memtable memtable, Set<Memtable> pendingFlush, Set<SSTableReader> sstables, Set<SSTableReader> compacting, SSTableIntervalTree intervalTree) { this.memtable = memtable; this.memtablesPendingFlush = pendingFlush; @@ -492,18 +487,18 @@ public class DataTracker public View replaceFlushed(Memtable flushedMemtable, SSTableReader newSSTable) { Set<Memtable> newPending = ImmutableSet.copyOf(Sets.difference(memtablesPendingFlush, Collections.singleton(flushedMemtable))); - List<SSTableReader> newSSTables = newSSTable == null - ? Collections.<SSTableReader>emptyList() + Set<SSTableReader> newSSTables = newSSTable == null + ? 
Collections.<SSTableReader>emptySet() : newSSTables(newSSTable); SSTableIntervalTree intervalTree = buildIntervalTree(newSSTables); - return new View(memtable, newPending, Collections.unmodifiableList(newSSTables), compacting, intervalTree); + return new View(memtable, newPending, newSSTables, compacting, intervalTree); } public View replace(Collection<SSTableReader> oldSSTables, Iterable<SSTableReader> replacements) { - List<SSTableReader> newSSTables = newSSTables(oldSSTables, replacements); + Set<SSTableReader> newSSTables = newSSTables(oldSSTables, replacements); SSTableIntervalTree intervalTree = buildIntervalTree(newSSTables); - return new View(memtable, memtablesPendingFlush, Collections.unmodifiableList(newSSTables), compacting, intervalTree); + return new View(memtable, memtablesPendingFlush, newSSTables, compacting, intervalTree); } public View markCompacting(Collection<SSTableReader> tomark) @@ -518,19 +513,19 @@ public class DataTracker return new View(memtable, memtablesPendingFlush, sstables, compactingNew, intervalTree); } - private List<SSTableReader> newSSTables(SSTableReader newSSTable) + private Set<SSTableReader> newSSTables(SSTableReader newSSTable) { assert newSSTable != null; // not performance-sensitive, don't obsess over doing a selection merge here return newSSTables(Collections.<SSTableReader>emptyList(), Collections.singletonList(newSSTable)); } - private List<SSTableReader> newSSTables(Collection<SSTableReader> oldSSTables, Iterable<SSTableReader> replacements) + private Set<SSTableReader> newSSTables(Collection<SSTableReader> oldSSTables, Iterable<SSTableReader> replacements) { ImmutableSet<SSTableReader> oldSet = ImmutableSet.copyOf(oldSSTables); int newSSTablesSize = sstables.size() - oldSSTables.size() + Iterables.size(replacements); assert newSSTablesSize >= Iterables.size(replacements) : String.format("Incoherent new size %d replacing %s by %s in %s", newSSTablesSize, oldSSTables, replacements, this); - List<SSTableReader> 
newSSTables = new ArrayList<SSTableReader>(newSSTablesSize); + Set<SSTableReader> newSSTables = new HashSet<SSTableReader>(newSSTablesSize); for (SSTableReader sstable : sstables) { if (!oldSet.contains(sstable)) @@ -538,7 +533,7 @@ public class DataTracker } Iterables.addAll(newSSTables, replacements); assert newSSTables.size() == newSSTablesSize : String.format("Expecting new size of %d, got %d while replacing %s by %s in %s", newSSTablesSize, newSSTables.size(), oldSSTables, replacements, this); - return newSSTables; + return ImmutableSet.copyOf(newSSTables); } @Override http://git-wip-us.apache.org/repos/asf/cassandra/blob/53943180/test/unit/org/apache/cassandra/db/CollationControllerTest.java ---------------------------------------------------------------------- diff --git a/test/unit/org/apache/cassandra/db/CollationControllerTest.java b/test/unit/org/apache/cassandra/db/CollationControllerTest.java index f469639..4a8e426 100644 --- a/test/unit/org/apache/cassandra/db/CollationControllerTest.java +++ b/test/unit/org/apache/cassandra/db/CollationControllerTest.java @@ -30,6 +30,8 @@ import org.apache.cassandra.db.filter.QueryPath; import org.apache.cassandra.utils.ByteBufferUtil; import org.junit.Test; +import org.apache.cassandra.io.sstable.SSTableReader; + public class CollationControllerTest extends SchemaLoader { @Test @@ -61,20 +63,22 @@ public class CollationControllerTest extends SchemaLoader store.forceBlockingFlush(); + // add yet one more mutation + rm = new RowMutation("Keyspace1", dk.key); + rm.add(path, ByteBufferUtil.bytes("foobar"), 30); + rm.apply(); + store.forceBlockingFlush(); + // A NamesQueryFilter goes down one code path (through collectTimeOrderedData()) + // It should only iterate the last flushed sstable, since it probably contains the most recent value for Column1 QueryFilter filter = QueryFilter.getNamesFilter(dk, path, ByteBufferUtil.bytes("Column1")); CollationController controller = new CollationController(store, false, filter, 
Integer.MIN_VALUE); controller.getTopLevelColumns(); assertEquals(1, controller.getSstablesIterated()); - + // SliceQueryFilter goes down another path (through collectAllData()) - // Add another mutation, with a lower timestamp then force another flush - // so we can assert that we're not reading every sstable - rm = new RowMutation("Keyspace1", dk.key); - rm.add(path, ByteBufferUtil.bytes("asdf"), 5); - rm.apply(); - store.forceBlockingFlush(); - + // We will read "only" the last sstable in that case, but because the 2nd sstable has a tombstone that is more + // recent than the maxTimestamp of the very first sstable we flushed, we should only read the 2 first sstables. filter = QueryFilter.getIdentityFilter(dk, path); controller = new CollationController(store, false, filter, Integer.MIN_VALUE); controller.getTopLevelColumns();
