avoid promoting tiny sstables patch by jbellis; reviewed by yukim for CASSANDRA-4341
Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/a4fab90a Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/a4fab90a Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/a4fab90a Branch: refs/heads/trunk Commit: a4fab90af0063110e1118fb69b9c5923a415b9d3 Parents: 33f1bac Author: Jonathan Ellis <[email protected]> Authored: Thu Jun 14 14:30:36 2012 -0500 Committer: Jonathan Ellis <[email protected]> Committed: Thu Jun 14 17:40:15 2012 -0500 ---------------------------------------------------------------------- CHANGES.txt | 1 + .../cassandra/db/compaction/LeveledManifest.java | 71 +++++++++++---- 2 files changed, 55 insertions(+), 17 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/cassandra/blob/a4fab90a/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index fcfff37..b0e667d 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 1.1.2 + * LCS no longer promotes tiny sstables out of L0 (CASSANDRA-4341) * skip tombstones during hint replay (CASSANDRA-4320) * fix NPE in compactionstats (CASSANDRA-4318) * enforce 1m min keycache for auto (CASSANDRA-4306) http://git-wip-us.apache.org/repos/asf/cassandra/blob/a4fab90a/src/java/org/apache/cassandra/db/compaction/LeveledManifest.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/db/compaction/LeveledManifest.java b/src/java/org/apache/cassandra/db/compaction/LeveledManifest.java index b2ba857..4ed5fac 100644 --- a/src/java/org/apache/cassandra/db/compaction/LeveledManifest.java +++ b/src/java/org/apache/cassandra/db/compaction/LeveledManifest.java @@ -63,12 +63,12 @@ public class LeveledManifest private final List<SSTableReader>[] generations; private final Map<SSTableReader, Integer> sstableGenerations; private final RowPosition[] lastCompactedKeys; - private final int maxSSTableSizeInMB; + private final int maxSSTableSizeInBytes; private LeveledManifest(ColumnFamilyStore cfs, int maxSSTableSizeInMB) { this.cfs = cfs; - this.maxSSTableSizeInMB = maxSSTableSizeInMB; + this.maxSSTableSizeInBytes = maxSSTableSizeInMB * 1024 * 1024; // allocate enough generations for a PB of data int n = (int) Math.log10(1000 * 1000 * 1000 / maxSSTableSizeInMB); @@ -178,9 +178,18 @@ public class LeveledManifest if (!added.iterator().hasNext()) return; - int newLevel = minimumLevel == maximumLevel ? maximumLevel + 1 : maximumLevel; - newLevel = skipLevels(newLevel, added); - assert newLevel > 0; + int newLevel; + if (minimumLevel == 0 && maximumLevel == 0 && SSTable.getTotalBytes(removed) < maxSSTableSizeInBytes) + { + // special case for tiny L0 sstables; see CASSANDRA-4341 + newLevel = 0; + } + else + { + newLevel = minimumLevel == maximumLevel ? maximumLevel + 1 : maximumLevel; + newLevel = skipLevels(newLevel, added); + assert newLevel > 0; + } if (logger.isDebugEnabled()) logger.debug("Adding [{}] at L{}", toString(added), newLevel); @@ -225,8 +234,8 @@ public class LeveledManifest private long maxBytesForLevel(int level) { if (level == 0) - return 4L * maxSSTableSizeInMB * 1024 * 1024; - double bytes = Math.pow(10, level) * maxSSTableSizeInMB * 1024 * 1024; + return 4L * maxSSTableSizeInBytes; + double bytes = Math.pow(10, level) * maxSSTableSizeInBytes; if (bytes > Long.MAX_VALUE) throw new RuntimeException("At most " + Long.MAX_VALUE + " bytes may be in a compaction level; your maxSSTableSize must be absurdly high to compute " + bytes); return (long) bytes; @@ -265,7 +274,7 @@ public class LeveledManifest // L0 gets a special case that if we don't have anything more important to do, // we'll go ahead and compact even just one sstable - if (score > 1.001 || i == 0) + if (score > 1.001 || (i == 0 && sstables.size() > 1)) { Collection<SSTableReader> candidates = getCandidatesFor(i); @@ -368,17 +377,45 @@ public class LeveledManifest if (level == 0) { - // because L0 files may overlap each other, we treat compactions there specially: - // a L0 compaction also checks other L0 files for overlap. + // L0 is the dumping ground for new sstables which thus may overlap each other. + // + // We treat L0 compactions specially: + // 1a. add sstables to the candidate set until we have at least maxSSTableSizeInMB + // 1b. prefer choosing older sstables as candidates, to newer ones + // 1c. any L0 sstables that overlap a candidate, will also become candidates + // 2. At most MAX_COMPACTING_L0 sstables will be compacted at once + // 3. If total candidate size is less than maxSSTableSizeInMB, we won't bother compacting with L1, + // and the result of the compaction will stay in L0 instead of being promoted (see promote()) Set<SSTableReader> candidates = new HashSet<SSTableReader>(); - // pick the oldest sstable from L0, and any that overlap with it + Set<SSTableReader> remaining = new HashSet<SSTableReader>(generations[0]); List<SSTableReader> ageSortedSSTables = new ArrayList<SSTableReader>(generations[0]); Collections.sort(ageSortedSSTables, SSTable.maxTimestampComparator); - List<SSTableReader> L0 = overlapping(ageSortedSSTables.get(0), generations[0]); - L0 = L0.size() > MAX_COMPACTING_L0 ? L0.subList(0, MAX_COMPACTING_L0) : L0; - // add the overlapping ones from L1 - for (SSTableReader sstable : L0) - candidates.addAll(overlapping(sstable, generations[1])); + for (SSTableReader sstable : ageSortedSSTables) + { + if (candidates.contains(sstable)) + continue; + + List<SSTableReader> newCandidates = overlapping(sstable, remaining); + candidates.addAll(newCandidates); + remaining.removeAll(newCandidates); + + if (candidates.size() > MAX_COMPACTING_L0) + { + // limit to only the MAX_COMPACTING_L0 oldest candidates + List<SSTableReader> ageSortedCandidates = new ArrayList<SSTableReader>(candidates); + Collections.sort(ageSortedCandidates, SSTable.maxTimestampComparator); + return ageSortedCandidates.subList(0, MAX_COMPACTING_L0); + } + + if (SSTable.getTotalBytes(candidates) > maxSSTableSizeInBytes) + { + // add sstables from L1 that overlap candidates + for (SSTableReader candidate : new ArrayList<SSTableReader>(candidates)) + candidates.addAll(overlapping(candidate, generations[1])); + break; + } + } + return candidates; } @@ -470,7 +507,7 @@ public class LeveledManifest for (int i = generations.length - 1; i >= 0; i--) { List<SSTableReader> sstables = generations[i]; - estimated[i] = Math.max(0L, SSTableReader.getTotalBytes(sstables) - maxBytesForLevel(i)) / (maxSSTableSizeInMB * 1024L * 1024); + estimated[i] = Math.max(0L, SSTableReader.getTotalBytes(sstables) - maxBytesForLevel(i)) / maxSSTableSizeInBytes; tasks += estimated[i]; }
