This is an automated email from the ASF dual-hosted git repository.

mhubail pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/asterixdb.git


The following commit(s) were added to refs/heads/master by this push:
     new 7a46963be8 [ASTERIXDB-3556][STO] Ensure local recovery is run on resume
7a46963be8 is described below

commit 7a46963be8b1ffaf66f97c2b0794af5f9bddd98c
Author: Murtadha Hubail <[email protected]>
AuthorDate: Wed Jan 29 15:22:40 2025 +0300

    [ASTERIXDB-3556][STO] Ensure local recovery is run on resume
    
    - user model changes: no
    - storage format changes: no
    - interface changes: yes
    
    Details:
    
    - Set the system state to CORRUPTED on resume to ensure
      local recovery is run.
    - Ensure cleanup of cached files is always performed
      whenever any cached files exist.
    
    Ext-ref: MB-65062
    Change-Id: I6600823447f478fc70a22cf27a9967235570562c
    Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/19373
    Tested-by: Jenkins <[email protected]>
    Reviewed-by: Murtadha Hubail <[email protected]>
    Reviewed-by: Ritik Raj <[email protected]>
---
 .../org/apache/asterix/app/nc/RecoveryManager.java     |  8 ++++++--
 .../app/nc/task/CloudToLocalStorageCachingTask.java    |  8 +++-----
 .../apache/asterix/app/nc/task/LocalRecoveryTask.java  |  4 +++-
 .../app/replication/NcLifecycleCoordinator.java        |  8 ++++----
 .../apache/asterix/cloud/AbstractCloudIOManager.java   | 18 +++++++++---------
 .../asterix/cloud/LocalPartitionBootstrapper.java      |  3 +--
 .../asterix/common/cloud/IPartitionBootstrapper.java   |  3 +--
 7 files changed, 27 insertions(+), 25 deletions(-)

diff --git 
a/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/nc/RecoveryManager.java
 
b/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/nc/RecoveryManager.java
index 77a1e01f68..78f574b977 100644
--- 
a/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/nc/RecoveryManager.java
+++ 
b/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/nc/RecoveryManager.java
@@ -158,9 +158,13 @@ public class RecoveryManager implements IRecoveryManager, 
ILifeCycleComponent {
     public void startLocalRecovery(Set<Integer> partitions) throws 
IOException, ACIDException {
         state = SystemState.RECOVERING;
         LOGGER.info("starting recovery for partitions {}", partitions);
+        Checkpoint systemCheckpoint = checkpointManager.getLatest();
+        if (systemCheckpoint == null) {
+            LOGGER.warn("no system checkpoint found; skipping txn log 
recovery");
+            return;
+        }
         long readableSmallestLSN = logMgr.getReadableSmallestLSN();
-        Checkpoint checkpointObject = checkpointManager.getLatest();
-        long lowWaterMarkLSN = checkpointObject.getMinMCTFirstLsn();
+        long lowWaterMarkLSN = systemCheckpoint.getMinMCTFirstLsn();
         if (lowWaterMarkLSN < readableSmallestLSN) {
             lowWaterMarkLSN = readableSmallestLSN;
         }
diff --git 
a/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/nc/task/CloudToLocalStorageCachingTask.java
 
b/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/nc/task/CloudToLocalStorageCachingTask.java
index b47703a475..cc594884ba 100644
--- 
a/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/nc/task/CloudToLocalStorageCachingTask.java
+++ 
b/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/nc/task/CloudToLocalStorageCachingTask.java
@@ -41,14 +41,12 @@ public class CloudToLocalStorageCachingTask implements 
INCLifecycleTask {
     private final Set<Integer> storagePartitions;
     private final boolean metadataNode;
     private final int metadataPartitionId;
-    private final boolean cleanup;
 
-    public CloudToLocalStorageCachingTask(Set<Integer> storagePartitions, 
boolean metadataNode, int metadataPartitionId,
-            boolean cleanup) {
+    public CloudToLocalStorageCachingTask(Set<Integer> storagePartitions, 
boolean metadataNode,
+            int metadataPartitionId) {
         this.storagePartitions = storagePartitions;
         this.metadataNode = metadataNode;
         this.metadataPartitionId = metadataPartitionId;
-        this.cleanup = cleanup;
     }
 
     @Override
@@ -68,7 +66,7 @@ public class CloudToLocalStorageCachingTask implements 
INCLifecycleTask {
                     
applicationContext.getTransactionSubsystem().getCheckpointManager().getLatest();
             IPartitionBootstrapper bootstrapper = 
applicationContext.getPartitionBootstrapper();
             bootstrapper.bootstrap(storagePartitions, 
lrs.getOnDiskPartitions(), metadataNode, metadataPartitionId,
-                    cleanup, latestCheckpoint == null);
+                    latestCheckpoint == null);
 
             // Report all local resources
             diskService.reportLocalResources(lrs.loadAndGetAllResources());
diff --git 
a/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/nc/task/LocalRecoveryTask.java
 
b/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/nc/task/LocalRecoveryTask.java
index d0a8dcc65c..b8addb29bc 100644
--- 
a/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/nc/task/LocalRecoveryTask.java
+++ 
b/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/nc/task/LocalRecoveryTask.java
@@ -42,7 +42,9 @@ public class LocalRecoveryTask implements INCLifecycleTask {
     public void perform(CcId ccId, IControllerService cs) throws 
HyracksDataException {
         INcApplicationContext appContext = (INcApplicationContext) 
cs.getApplicationContext();
         try {
-            
appContext.getTransactionSubsystem().getRecoveryManager().startLocalRecovery(partitions);
+            if (!partitions.isEmpty()) {
+                
appContext.getTransactionSubsystem().getRecoveryManager().startLocalRecovery(partitions);
+            }
         } catch (IOException | ACIDException e) {
             throw HyracksDataException.create(e);
         }
diff --git 
a/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/replication/NcLifecycleCoordinator.java
 
b/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/replication/NcLifecycleCoordinator.java
index dde2371d31..fff1bc118c 100644
--- 
a/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/replication/NcLifecycleCoordinator.java
+++ 
b/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/replication/NcLifecycleCoordinator.java
@@ -223,9 +223,9 @@ public class NcLifecycleCoordinator implements 
INcLifecycleCoordinator {
         tasks.add(new UpdateNodeStatusTask(NodeStatus.BOOTING, 
nodeActivePartitions));
         int metadataPartitionId = 
clusterManager.getMetadataPartition().getPartitionId();
         // Add any cloud-related tasks
-        addCloudTasks(tasks, nodeActivePartitions, metadataNode, 
metadataPartitionId, state == SystemState.CORRUPTED);
+        addCloudTasks(tasks, nodeActivePartitions, metadataNode, 
metadataPartitionId);
         tasks.add(new LocalStorageCleanupTask(metadataPartitionId));
-        if (state == SystemState.CORRUPTED) {
+        if (state == SystemState.CORRUPTED && !nodeActivePartitions.isEmpty()) 
{
             // need to perform local recovery for node active partitions
             LocalRecoveryTask rt = new LocalRecoveryTask(nodeActivePartitions);
             tasks.add(rt);
@@ -257,7 +257,7 @@ public class NcLifecycleCoordinator implements 
INcLifecycleCoordinator {
     }
 
     protected void addCloudTasks(List<INCLifecycleTask> tasks, Set<Integer> 
computePartitions, boolean metadataNode,
-            int metadataPartitionId, boolean cleanup) {
+            int metadataPartitionId) {
         IApplicationContext appCtx = (IApplicationContext) 
serviceContext.getApplicationContext();
         if (!appCtx.isCloudDeployment()) {
             return;
@@ -266,7 +266,7 @@ public class NcLifecycleCoordinator implements 
INcLifecycleCoordinator {
         StorageComputePartitionsMap map = 
clusterManager.getStorageComputeMap();
         map = map == null ? 
StorageComputePartitionsMap.computePartitionsMap(clusterManager) : map;
         Set<Integer> storagePartitions = 
map.getStoragePartitions(computePartitions);
-        tasks.add(new CloudToLocalStorageCachingTask(storagePartitions, 
metadataNode, metadataPartitionId, cleanup));
+        tasks.add(new CloudToLocalStorageCachingTask(storagePartitions, 
metadataNode, metadataPartitionId));
     }
 
     private synchronized void process(MetadataNodeResponseMessage response) 
throws HyracksDataException {
diff --git 
a/asterixdb/asterix-cloud/src/main/java/org/apache/asterix/cloud/AbstractCloudIOManager.java
 
b/asterixdb/asterix-cloud/src/main/java/org/apache/asterix/cloud/AbstractCloudIOManager.java
index 4ce382e7c1..912ca47777 100644
--- 
a/asterixdb/asterix-cloud/src/main/java/org/apache/asterix/cloud/AbstractCloudIOManager.java
+++ 
b/asterixdb/asterix-cloud/src/main/java/org/apache/asterix/cloud/AbstractCloudIOManager.java
@@ -45,7 +45,7 @@ import org.apache.asterix.common.api.INamespacePathResolver;
 import org.apache.asterix.common.cloud.IPartitionBootstrapper;
 import org.apache.asterix.common.config.CloudProperties;
 import org.apache.asterix.common.metadata.MetadataConstants;
-import org.apache.asterix.common.transactions.IRecoveryManager;
+import org.apache.asterix.common.transactions.IRecoveryManager.SystemState;
 import org.apache.asterix.common.utils.StoragePathUtil;
 import org.apache.hyracks.api.exceptions.HyracksDataException;
 import org.apache.hyracks.api.io.FileReference;
@@ -106,22 +106,23 @@ public abstract class AbstractCloudIOManager extends 
IOManager implements IParti
      */
 
     @Override
-    public IRecoveryManager.SystemState getSystemStateOnMissingCheckpoint() {
+    public SystemState getSystemStateOnMissingCheckpoint() {
         Set<CloudFile> existingMetadataFiles = 
getCloudMetadataPartitionFiles();
         CloudFile bootstrapMarkerPath = 
CloudFile.of(StoragePathUtil.getBootstrapMarkerRelativePath(nsPathResolver));
         if (existingMetadataFiles.isEmpty() || 
existingMetadataFiles.contains(bootstrapMarkerPath)) {
             LOGGER.info("First time to initialize this cluster: systemState = 
PERMANENT_DATA_LOSS");
-            return IRecoveryManager.SystemState.PERMANENT_DATA_LOSS;
+            return SystemState.PERMANENT_DATA_LOSS;
         } else {
-            LOGGER.info("Resuming a previous initialized cluster: systemState 
= HEALTHY");
-            return IRecoveryManager.SystemState.HEALTHY;
+            LOGGER.info(
+                    "Resuming a previously initialized cluster; setting system 
state to {} to force local recovery if needed",
+                    SystemState.CORRUPTED);
+            return SystemState.CORRUPTED;
         }
     }
 
     @Override
     public final void bootstrap(Set<Integer> activePartitions, 
List<FileReference> currentOnDiskPartitions,
-            boolean metadataNode, int metadataPartition, boolean cleanup, 
boolean ensureCompleteBootstrap)
-            throws HyracksDataException {
+            boolean metadataNode, int metadataPartition, boolean 
ensureCompleteBootstrap) throws HyracksDataException {
         partitions.clear();
         partitions.addAll(activePartitions);
         if (metadataNode) {
@@ -138,8 +139,7 @@ public abstract class AbstractCloudIOManager extends 
IOManager implements IParti
         }
 
         LOGGER.info("Initializing cloud manager with ({}) storage partitions: 
{}", partitions.size(), partitions);
-
-        if (cleanup) {
+        if (!currentOnDiskPartitions.isEmpty()) {
             deleteUnkeptPartitionDirs(currentOnDiskPartitions);
             cleanupLocalFiles();
         }
diff --git 
a/asterixdb/asterix-cloud/src/main/java/org/apache/asterix/cloud/LocalPartitionBootstrapper.java
 
b/asterixdb/asterix-cloud/src/main/java/org/apache/asterix/cloud/LocalPartitionBootstrapper.java
index 54090bb299..f2451ee04f 100644
--- 
a/asterixdb/asterix-cloud/src/main/java/org/apache/asterix/cloud/LocalPartitionBootstrapper.java
+++ 
b/asterixdb/asterix-cloud/src/main/java/org/apache/asterix/cloud/LocalPartitionBootstrapper.java
@@ -49,8 +49,7 @@ public class LocalPartitionBootstrapper implements 
IPartitionBootstrapper {
 
     @Override
     public void bootstrap(Set<Integer> activePartitions, List<FileReference> 
currentOnDiskPartitions,
-            boolean metadataNode, int metadataPartition, boolean cleanup, 
boolean ensureCompleteBootstrap)
-            throws HyracksDataException {
+            boolean metadataNode, int metadataPartition, boolean 
ensureCompleteBootstrap) throws HyracksDataException {
         for (FileReference onDiskPartition : currentOnDiskPartitions) {
             int partitionNum = 
StoragePathUtil.getPartitionNumFromRelativePath(onDiskPartition.getAbsolutePath());
             if (!activePartitions.contains(partitionNum)) {
diff --git 
a/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/cloud/IPartitionBootstrapper.java
 
b/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/cloud/IPartitionBootstrapper.java
index 6bb41760e0..73a13920f1 100644
--- 
a/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/cloud/IPartitionBootstrapper.java
+++ 
b/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/cloud/IPartitionBootstrapper.java
@@ -50,9 +50,8 @@ public interface IPartitionBootstrapper {
      * @param currentOnDiskPartitions paths to the current local partitions
      * @param metadataNode            whether the node is a metadata node as 
well
      * @param metadataPartition       metadata partition number
-     * @param cleanup                 performs cleanup by deleting all unkept 
partitions
      * @param ensureCompleteBootstrap ensures the metadata catalog was fully 
bootstrapped
      */
     void bootstrap(Set<Integer> activePartitions, List<FileReference> 
currentOnDiskPartitions, boolean metadataNode,
-            int metadataPartition, boolean cleanup, boolean 
ensureCompleteBootstrap) throws HyracksDataException;
+            int metadataPartition, boolean ensureCompleteBootstrap) throws 
HyracksDataException;
 }

Reply via email to