This is an automated email from the ASF dual-hosted git repository.
marcuse pushed a commit to branch cassandra-4.0
in repository https://gitbox.apache.org/repos/asf/cassandra.git
The following commit(s) were added to refs/heads/cassandra-4.0 by this push:
new f2e3a0a4fc Handle sstable metadata stats file getting a new mtime
after compaction has finished
f2e3a0a4fc is described below
commit f2e3a0a4fc0c70d02a01cc2968c382d8f17697c6
Author: Josh McKenzie <[email protected]>
AuthorDate: Tue Dec 13 14:58:23 2022 +0100
Handle sstable metadata stats file getting a new mtime after compaction has
finished
Patch by Josh McKenzie and marcuse; reviewed by Josh McKenzie and marcuse
for CASSANDRA-18119
---
CHANGES.txt | 1 +
.../org/apache/cassandra/db/lifecycle/LogFile.java | 19 ++-
.../apache/cassandra/db/lifecycle/LogRecord.java | 69 +++++++-
.../cassandra/db/lifecycle/LogTransaction.java | 2 +
.../cassandra/db/lifecycle/LogTransactionTest.java | 175 ++++++++++++++++++++-
5 files changed, 256 insertions(+), 10 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 70761240f1..65e3cb4de0 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,5 @@
4.0.18
+ * Handle sstable metadata stats file getting a new mtime after compaction has
finished (CASSANDRA-18119)
* Honor MAX_PARALLEL_TRANSFERS correctly (CASSANDRA-20532)
* Updating a column with a new TTL but same expiration time is
non-deterministic and causes repair mismatches. (CASSANDRA-20561)
* Grant permission on keyspaces system_views and system_virtual_schema not
possible (CASSANDRA-20171)
diff --git a/src/java/org/apache/cassandra/db/lifecycle/LogFile.java
b/src/java/org/apache/cassandra/db/lifecycle/LogFile.java
index d67019008f..cd793d4831 100644
--- a/src/java/org/apache/cassandra/db/lifecycle/LogFile.java
+++ b/src/java/org/apache/cassandra/db/lifecycle/LogFile.java
@@ -163,6 +163,10 @@ final class LogFile implements AutoCloseable
this.id = id;
}
+ /**
+ * Check a variety of the internals of the LogRecord as well as the state
of the LogRecord vs. the files found on disk
+ * to ensure they remain correct and nothing was changed external to the
process.
+ */
boolean verify()
{
records.clear();
@@ -229,6 +233,9 @@ final class LogFile implements AutoCloseable
return record;
}
+ /**
+ * Sets the {@link LogRecord.Status#error} if something wrong is found
with the record.
+ */
static void verifyRecord(LogRecord record, List<File> existingFiles)
{
if (record.checksum != record.computeChecksum())
@@ -240,6 +247,7 @@ final class LogFile implements AutoCloseable
return;
}
+ // If it's not a removal we don't check it since we're not going to
take action on it
if (record.type != Type.REMOVE)
return;
@@ -253,6 +261,16 @@ final class LogFile implements AutoCloseable
// we can have transaction files with mismatching updateTime
resolutions due to switching between jdk8 and jdk11, truncate both to be
consistent:
if (truncateMillis(record.updateTime) !=
truncateMillis(record.status.onDiskRecord.updateTime) &&
record.status.onDiskRecord.updateTime > 0)
{
+ // handle the case where we have existing broken transaction file
on disk, where the update time is
+ // based on the stats file. This is just for the first upgrade,
patched versions never base the update
+ // time on the stats file.
+ LogRecord statsIncluded = LogRecord.make(record.type,
existingFiles, existingFiles.size(), record.absolutePath(), true);
+ if (truncateMillis(statsIncluded.updateTime) ==
truncateMillis(record.updateTime))
+ {
+ logger.warn("Found a legacy log record {} with updateTime
based on the stats file, ignoring to allow startup to continue", record);
+ return;
+ }
+
record.setError(String.format("Unexpected files detected for
sstable [%s]: " +
"last update time [%tc] (%d) should
have been [%tc] (%d)",
record.fileName(),
@@ -260,7 +278,6 @@ final class LogFile implements AutoCloseable
record.status.onDiskRecord.updateTime,
record.updateTime,
record.updateTime));
-
}
}
diff --git a/src/java/org/apache/cassandra/db/lifecycle/LogRecord.java
b/src/java/org/apache/cassandra/db/lifecycle/LogRecord.java
index 513ad87460..65ff470ac8 100644
--- a/src/java/org/apache/cassandra/db/lifecycle/LogRecord.java
+++ b/src/java/org/apache/cassandra/db/lifecycle/LogRecord.java
@@ -30,12 +30,19 @@ import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.zip.CRC32;
+import com.google.common.annotations.VisibleForTesting;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.Descriptor;
import org.apache.cassandra.io.sstable.SSTable;
import org.apache.cassandra.io.sstable.format.SSTableReader;
import org.apache.cassandra.io.util.FileUtils;
import org.apache.cassandra.utils.FBUtilities;
+import static org.apache.cassandra.io.sstable.Descriptor.TMP_EXT;
+
/**
* A decoded line in a transaction log file replica.
*
@@ -43,6 +50,10 @@ import org.apache.cassandra.utils.FBUtilities;
*/
final class LogRecord
{
+ private static final Logger logger =
LoggerFactory.getLogger(LogRecord.class);
+ @VisibleForTesting
+ static boolean INCLUDE_STATS_FOR_TESTS = false;
+
public enum Type
{
UNKNOWN, // a record that cannot be parsed
@@ -66,7 +77,10 @@ final class LogRecord
return this == record.type;
}
- public boolean isFinal() { return this == Type.COMMIT || this ==
Type.ABORT; }
+ public boolean isFinal()
+ {
+ return this == Type.COMMIT || this == Type.ABORT;
+ }
}
/**
@@ -182,17 +196,66 @@ final class LogRecord
public LogRecord withExistingFiles(List<File> existingFiles)
{
+ if (!absolutePath.isPresent())
+ throw new IllegalStateException(String.format("Cannot create
record from existing files for type %s - file is not present", type));
+
return make(type, existingFiles, 0, absolutePath.get());
}
+ /**
+ * We create a LogRecord based on the files on disk; there's some subtlety
around how we handle stats files as the
+ * timestamp can be mutated by the async completion of compaction if
things race with node shutdown. To work around this,
+ * we don't take the stats file timestamp into account when calculating
nor using the timestamps for all the components
+ * as we build the LogRecord.
+ */
public static LogRecord make(Type type, List<File> files, int minFiles,
String absolutePath)
{
+ return make(type, files, minFiles, absolutePath,
INCLUDE_STATS_FOR_TESTS);
+ }
+
+ /**
+ * In most cases we skip including the stats file timestamp entirely as it
can be mutated during anticompaction
+ * and thus "invalidate" the LogRecord. There is an edge case where we
have a LogRecord that was written w/the wrong
+ * timestamp (i.e. included a mutated stats file) and we need the node to
come up, so we need to expose the selective
+ * ability to either include the stats file timestamp or not.
+ *
+ * See {@link LogFile#verifyRecord}
+ */
+ static LogRecord make(Type type, List<File> files, int minFiles, String
absolutePath, boolean includeStatsFile)
+ {
+ List<File> toVerify;
+ File statsFile = null;
+ if (!includeStatsFile && !files.isEmpty())
+ {
+ toVerify = new ArrayList<>(files.size() - 1);
+ for (File f : files)
+ {
+ if (!f.getName().endsWith(TMP_EXT))
+ {
+ Component component =
Descriptor.fromFilenameWithComponent(f).right;
+ if (component == Component.STATS)
+ statsFile = f;
+ else
+ toVerify.add(f);
+ }
+ }
+ }
+ else
+ {
+ toVerify = files;
+ }
// CASSANDRA-11889: File.lastModified() returns a positive value only
if the file exists, therefore
// we filter by positive values to only consider the files that still
exists right now, in case things
// changed on disk since getExistingFiles() was called
- List<Long> positiveModifiedTimes =
files.stream().map(File::lastModified).filter(lm -> lm >
0).collect(Collectors.toList());
+ List<Long> positiveModifiedTimes =
toVerify.stream().map(File::lastModified).filter(lm -> lm >
0).collect(Collectors.toList());
long lastModified = positiveModifiedTimes.stream().reduce(0L,
Long::max);
- return new LogRecord(type, absolutePath, lastModified,
Math.max(minFiles, positiveModifiedTimes.size()));
+
+ // We need to preserve the file count for the number of existing files
found on disk even though we ignored the
+ // stats file during our timestamp calculation. If the stats file
still exists, we add in the count of it as
+ // a separate validation assumption that it's one of the files
considered valid in this LogRecord.
+ boolean addStatTS = statsFile != null && statsFile.exists();
+ int positiveTSCount = addStatTS ? positiveModifiedTimes.size() + 1 :
positiveModifiedTimes.size();
+ return new LogRecord(type, absolutePath, lastModified,
Math.max(minFiles, positiveTSCount));
}
private LogRecord(Type type, long updateTime)
diff --git a/src/java/org/apache/cassandra/db/lifecycle/LogTransaction.java
b/src/java/org/apache/cassandra/db/lifecycle/LogTransaction.java
index a3c3837dc6..43b9e1f771 100644
--- a/src/java/org/apache/cassandra/db/lifecycle/LogTransaction.java
+++ b/src/java/org/apache/cassandra/db/lifecycle/LogTransaction.java
@@ -545,6 +545,8 @@ class LogTransaction extends
Transactional.AbstractTransactional implements Tran
try(LogFile txn = LogFile.make(entry.getKey(), entry.getValue()))
{
logger.info("Verifying logfile transaction {}", txn);
+ // We don't check / include the stats file timestamp on
LogRecord creation / verification as that might
+ // be modified by a race in compaction notification and then
needlessly fail subsequent node starts.
if (txn.verify())
{
Throwable failure = txn.removeUnfinishedLeftovers(null);
diff --git
a/test/unit/org/apache/cassandra/db/lifecycle/LogTransactionTest.java
b/test/unit/org/apache/cassandra/db/lifecycle/LogTransactionTest.java
index 7d1cb39ae3..15578d14d5 100644
--- a/test/unit/org/apache/cassandra/db/lifecycle/LogTransactionTest.java
+++ b/test/unit/org/apache/cassandra/db/lifecycle/LogTransactionTest.java
@@ -18,7 +18,6 @@
package org.apache.cassandra.db.lifecycle;
import java.io.File;
-import java.io.IOError;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.io.UncheckedIOException;
@@ -39,6 +38,7 @@ import java.util.stream.Stream;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Sets;
+import org.apache.cassandra.db.streaming.ComponentContext;
import org.junit.BeforeClass;
import org.junit.Test;
@@ -71,6 +71,7 @@ import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
@@ -1263,10 +1264,172 @@ public class LogTransactionTest extends
AbstractTransactionalTest
logs.finish();
}
+ @Test
+ public void testStatsTSMatchOnStart() throws Throwable
+ {
+ ColumnFamilyStore cfs = MockSchema.newCFS(KEYSPACE);
+ File dataFolder = new
Directories(cfs.metadata()).getDirectoryForNewSSTables();
+ SSTableReader sstable = sstable(dataFolder, cfs, 0, 128);
+
+ try(LogTransaction log = new LogTransaction(OperationType.COMPACTION))
+ {
+ assertNotNull(log);
+ log.trackNew(sstable);
+
+ // Confirm we can remove leftovers when they match
+ LogTransaction.removeUnfinishedLeftovers(cfs.metadata());
+ }
+
+ File sFile = new File(sstable.descriptor.filenameFor(Component.STATS));
+ assertFalse("Found STATS file but expected it to be cleaned up.",
Files.exists(sFile.toPath()));
+ sstable.selfRef().release();
+ }
+
+ @Test
+ public void testStatsTSMatchDuringList() throws Throwable
+ {
+ ColumnFamilyStore cfs = MockSchema.newCFS(KEYSPACE);
+ File dataFolder = new
Directories(cfs.metadata()).getDirectoryForNewSSTables();
+ SSTableReader sstable = sstable(dataFolder, cfs, 0, 128);
+
+ try(LogTransaction log = new LogTransaction(OperationType.COMPACTION))
+ {
+ assertNotNull(log);
+ log.trackNew(sstable);
+
+ // Confirm we can successfully classify files when they match -
this triggers the LogAwareFileLister verify
+ listFiles(dataFolder, Directories.OnTxnErr.THROW,
Directories.FileType.FINAL);
+ }
+ sstable.selfRef().release();
+ }
+
+ @Test
+ public void testStatsTSMismatchDuringStart() throws Throwable
+ {
+ ColumnFamilyStore cfs = MockSchema.newCFS(KEYSPACE);
+ File dataFolder = new
Directories(cfs.metadata()).getDirectoryForNewSSTables();
+ SSTableReader sstable = sstable(dataFolder, cfs, 0, 128);
+
+ File sFile = new File(sstable.descriptor.filenameFor(Component.STATS));
+ assertTrue("STATS file not created successfully in test setup",
Files.exists(sFile.toPath()));
+
+ // Confirm we can remove leftovers even if the STATS file doesn't match
+ try(LogTransaction log = new LogTransaction(OperationType.COMPACTION))
+ {
+ assertNotNull(log);
+
+ // Need to flag the transaction as having a REMOVE entry so it'll
trigger the path to calculate stats on list
+ log.obsoleted(sstable);
+
+ // Need to sleep for long enough to bypass the millisecond
truncation logic due to jdk8 and jdk11 change
+ Thread.sleep(2000);
+ assertTrue("Failed to set mtime for STATS file to
currentTimeMillis()", sFile.setLastModified(System.currentTimeMillis()));
+
+ // Confirm we have an mtime mismatch
+ File dFile = new
File(sstable.descriptor.filenameFor(Component.DATA));
+ assertNotEquals(sFile.lastModified(), dFile.lastModified());
+
+ // We need to add another LogRecord as we allow partial or
incorrect entries as the last record...
+ log.trackNew(sstable(dataFolder, cfs, 2, 128));
+
+ assertTrue("STATS file gone before removeUnfinished...",
Files.exists(sFile.toPath()));
+ // Confirm we can remove leftovers when the STATS file mismatches
+ log.prepareToCommit(); // commit so that obsolete sstable
components will be removed.
+ log.commit();
+ ComponentContext.create(sstable.descriptor);
+
assertTrue(LogTransaction.removeUnfinishedLeftovers(cfs.metadata()));
+ }
+
+ sstable.selfRef().release();
+ }
+
+ @Test
+ public void testWrongTimestampInTxnFile() throws IOException,
InterruptedException
+ {
+ ColumnFamilyStore cfs = MockSchema.newCFS(KEYSPACE);
+ File dataFolder = new
Directories(cfs.metadata()).getDirectoryForNewSSTables();
+ SSTableReader sstable = sstable(dataFolder, cfs, 0, 128);
+
+ File sFile = new File(sstable.descriptor.filenameFor(Component.STATS));
+ assertTrue("STATS file not created successfully in test setup",
Files.exists(sFile.toPath()));
+
+ LogRecord.INCLUDE_STATS_FOR_TESTS = true;
+
+ Thread.sleep(2000);
+ assertTrue("Failed to set mtime for STATS file to
currentTimeMillis()", sFile.setLastModified(System.currentTimeMillis()));
+
+ // Confirm we can remove leftovers even if the STATS file doesn't match
+ try(LogTransaction log = new LogTransaction(OperationType.COMPACTION))
+ {
+ assertNotNull(log);
+ // Need to flag the transaction as having a REMOVE entry so it'll
trigger the path to calculate stats on list
+ log.obsoleted(sstable);
+ // Need to sleep for long enough to bypass the millisecond
truncation logic due to jdk8 and jdk11 change
+ // Confirm we have an mtime mismatch
+ File dFile = new
File(sstable.descriptor.filenameFor(Component.DATA));
+ assertNotEquals(sFile.lastModified(), dFile.lastModified());
+
+ // We need to add another LogRecord as we allow partial or
incorrect entries as the last record...
+ log.trackNew(sstable(dataFolder, cfs, 2, 128));
+
+ assertTrue("STATS file gone before removeUnfinished...",
Files.exists(sFile.toPath()));
+ // Confirm we can remove leftovers when the STATS file mismatches
+ LogRecord.INCLUDE_STATS_FOR_TESTS = false;
+
assertTrue(LogTransaction.removeUnfinishedLeftovers(cfs.metadata()));
+ }
+
+ sstable.selfRef().release();
+ }
+
+ /**
+ * We do not consider the stats file's ts for any cases at this point
+ */
+ @Test
+ public void testStatsTSMismatchDuringList() throws Throwable
+ {
+ SSTableReader sstable = null;
+ try
+ {
+ ColumnFamilyStore cfs = MockSchema.newCFS(KEYSPACE);
+ File dataFolder = new
Directories(cfs.metadata()).getDirectoryForNewSSTables();
+ sstable = sstable(dataFolder, cfs, 0, 128);
+
+ File sFile = new
File(sstable.descriptor.filenameFor(Component.STATS));
+ assertTrue("STATS file not created successfully in test setup",
Files.exists(sFile.toPath()));
+
+ try(LogTransaction log = new
LogTransaction(OperationType.COMPACTION))
+ {
+ assertNotNull(log);
+
+ // Need to flag the transaction as having a REMOVE entry so
it'll trigger the path to calculate stats on list
+ log.obsoleted(sstable);
+
+ // Need to sleep for long enough to bypass the millisecond
truncation logic due to jdk8 and jdk11 change
+ Thread.sleep(2000);
+ assertTrue("Failed to set mtime for STATS file to
currentTimeMillis()", sFile.setLastModified(System.currentTimeMillis()));
+
+ // Confirm we have an mtime mismatch
+ File dFile = new
File(sstable.descriptor.filenameFor(Component.DATA));
+ assertNotEquals(sFile.lastModified(), dFile.lastModified());
+
+ // We need to add another LogRecord as we allow partial or
incorrect entries as the last record...
+ log.trackNew(sstable(dataFolder, cfs, 2, 128));
+
+ // Confirm we don't get a mismatch LogRecord error when the
STATS file is different even on listFiles case
+ listFiles(dataFolder, Directories.OnTxnErr.THROW,
Directories.FileType.FINAL);
+ }
+ }
+ finally
+ {
+ if (sstable != null)
+ sstable.selfRef().release();
+ }
+ }
+
private static SSTableReader sstable(File dataFolder, ColumnFamilyStore
cfs, int generation, int size) throws IOException
{
Descriptor descriptor = new Descriptor(dataFolder,
cfs.keyspace.getName(), cfs.getTableName(), generation, SSTableFormat.Type.BIG);
- Set<Component> components = ImmutableSet.of(Component.DATA,
Component.PRIMARY_INDEX, Component.FILTER, Component.TOC);
+ Set<Component> components = ImmutableSet.of(Component.DATA,
Component.PRIMARY_INDEX, Component.FILTER, Component.TOC, Component.STATS);
for (Component component : components)
{
File file = new File(descriptor.filenameFor(component));
@@ -1360,12 +1523,12 @@ public class LogTransactionTest extends
AbstractTransactionalTest
static Set<File> getTemporaryFiles(File folder)
{
- return listFiles(folder, Directories.FileType.TEMPORARY);
+ return listFiles(folder, Directories.OnTxnErr.IGNORE,
Directories.FileType.TEMPORARY);
}
static Set<File> getFinalFiles(File folder)
{
- return listFiles(folder, Directories.FileType.FINAL);
+ return listFiles(folder, Directories.OnTxnErr.IGNORE,
Directories.FileType.FINAL);
}
// Used by listFiles - this test is deliberately racing with files being
@@ -1390,12 +1553,12 @@ public class LogTransactionTest extends
AbstractTransactionalTest
}
}
- static Set<File> listFiles(File folder, Directories.FileType... types)
+ static Set<File> listFiles(File folder, Directories.OnTxnErr err,
Directories.FileType... types)
{
Collection<Directories.FileType> match = Arrays.asList(types);
return new LogAwareFileLister(folder.toPath(),
(file, type) -> match.contains(type),
- Directories.OnTxnErr.IGNORE).list()
+ err).list()
.stream()
.flatMap(LogTransactionTest::toCanonicalIgnoringNotFound)
.collect(Collectors.toSet());
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]