This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 3dd71dc8f87 [opt](log) add warn log for saving and pushing image 
failure (#41216)
3dd71dc8f87 is described below

commit 3dd71dc8f8798e98b9b904c799165565d951a734
Author: Mingyu Chen <[email protected]>
AuthorDate: Wed Sep 25 12:29:39 2024 +0800

    [opt](log) add warn log for saving and pushing image failure (#41216)
    
    We already have some metrics on FE that record the number of image
    saving or pushing failure events.
    Such as:
    ```
    doris_fe_image_push{type="failed"} 0
    doris_fe_image_push{type="success"} 0
    ```
    But these are counters, which are hard to monitor if a user wants to
    send an alert when a failure occurs.
    
    Actually, this kind of event-driven alert is better done using logs.
    So I added warning logs for these failures,
    so that users can monitor the log and send an alert when a failure occurs.
    
    - Saving image failure:
        `Save image failed: xxx`
    
    - Pushing image failure:
        `Push image failed: xxx`
    
    - Deleting old edit log failure:
        `Delete old edit log failed: xxx`
    
    - Deleting old image failure:
        `Delete old image failed: xxx`
---
 .../java/org/apache/doris/master/Checkpoint.java   | 37 ++++++++++++----------
 1 file changed, 20 insertions(+), 17 deletions(-)

diff --git a/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java 
b/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java
index 27dc6aa7b87..4934f8fb0b3 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java
@@ -105,14 +105,17 @@ public class Checkpoint extends MasterDaemon {
                 return;
             }
         } catch (Throwable e) {
-            LOG.error("Does not get storage info", e);
+            LOG.warn("Save image failed: " + e.getMessage(), e);
             if (MetricRepo.isInit) {
                 MetricRepo.COUNTER_IMAGE_WRITE_FAILED.increase(1L);
             }
             return;
         }
 
-        if (!checkMemoryEnoughToDoCheckpoint()) {
+        try {
+            checkMemoryEnoughToDoCheckpoint();
+        } catch (Throwable t) {
+            LOG.warn("Save image failed: " + t.getMessage(), t);
             if (MetricRepo.isInit) {
                 MetricRepo.COUNTER_IMAGE_WRITE_FAILED.increase(1L);
             }
@@ -156,7 +159,7 @@ public class Checkpoint extends MasterDaemon {
             LOG.info("checkpoint finished save image.{}", replayedJournalId);
         } catch (Throwable e) {
             exceptionCaught = true;
-            LOG.error("Exception when generate new image file", e);
+            LOG.warn("Save image failed: " + e.getMessage(), e);
             if (MetricRepo.isInit) {
                 MetricRepo.COUNTER_IMAGE_WRITE_FAILED.increase(1L);
             }
@@ -175,8 +178,8 @@ public class Checkpoint extends MasterDaemon {
                     if (MetricRepo.isInit) {
                         MetricRepo.COUNTER_IMAGE_CLEAN_SUCCESS.increase(1L);
                     }
-                } catch (Throwable ex) {
-                    LOG.error("Master delete latest invalid image file 
failed.", ex);
+                } catch (Throwable t) {
+                    LOG.warn("Delete old image failed: " + t.getMessage(), t);
                     if (MetricRepo.isInit) {
                         MetricRepo.COUNTER_IMAGE_CLEAN_FAILED.increase(1L);
                     }
@@ -211,7 +214,7 @@ public class Checkpoint extends MasterDaemon {
                         LOG.warn("Failed when pushing image file. url = 
{},responseBody = {}", url, responseBody);
                     }
                 } catch (IOException e) {
-                    LOG.error("Exception when pushing image file. url = {}", 
url, e);
+                    LOG.warn("Exception when pushing image file. url = {}", 
url, e);
                 }
             }
 
@@ -223,6 +226,7 @@ public class Checkpoint extends MasterDaemon {
                 MetricRepo.COUNTER_IMAGE_PUSH_SUCCESS.increase(1L);
             }
         } else {
+            LOG.warn("Push image failed: totally {} nodes, push succeeded {} 
nodes", otherNodesCount, successPushed);
             if (MetricRepo.isInit) {
                 MetricRepo.COUNTER_IMAGE_PUSH_FAILED.increase(1L);
             }
@@ -282,8 +286,8 @@ public class Checkpoint extends MasterDaemon {
                 }
                 LOG.info("journals <= {} are deleted. image version {}, other 
nodes min version {}",
                         deleteVersion, checkPointVersion, 
minOtherNodesJournalId);
-            } catch (Throwable e) {
-                LOG.error("failed to delete old edit log", e);
+            } catch (Throwable t) {
+                LOG.warn("Delete old edit log failed: " + t.getMessage(), t);
                 if (MetricRepo.isInit) {
                     MetricRepo.COUNTER_EDIT_LOG_CLEAN_FAILED.increase(1L);
                 }
@@ -298,7 +302,7 @@ public class Checkpoint extends MasterDaemon {
                 MetricRepo.COUNTER_IMAGE_CLEAN_SUCCESS.increase(1L);
             }
         } catch (Throwable e) {
-            LOG.error("Master delete old image file fail.", e);
+            LOG.warn("Master delete old image file fail.", e);
             if (MetricRepo.isInit) {
                 MetricRepo.COUNTER_IMAGE_CLEAN_FAILED.increase(1L);
             }
@@ -320,27 +324,26 @@ public class Checkpoint extends MasterDaemon {
     /*
      * Check whether can we do the checkpoint due to the memory used percent.
      */
-    private boolean checkMemoryEnoughToDoCheckpoint() {
+    private void checkMemoryEnoughToDoCheckpoint() throws CheckpointException {
         long memUsedPercent = getMemoryUsedPercent();
         LOG.info("get jvm memory used percent: {} %", memUsedPercent);
 
         if (memUsedPercent <= Config.metadata_checkpoint_memory_threshold || 
Config.force_do_metadata_checkpoint) {
             memoryNotEnoughCount = 0;
-            return true;
+            return;
         }
 
-        LOG.warn("the memory used percent {} exceed the checkpoint memory 
threshold: {}, exceeded count: {}",
-                memUsedPercent, Config.metadata_checkpoint_memory_threshold, 
memoryNotEnoughCount);
-
         memoryNotEnoughCount += 1;
         if (memoryNotEnoughCount != Config.checkpoint_manual_gc_threshold) {
-            return false;
+            throw new CheckpointException(String.format(
+                    "the memory used percent %d exceed the checkpoint memory 
threshold: %d, exceeded count: %d",
+                    memUsedPercent, 
Config.metadata_checkpoint_memory_threshold, memoryNotEnoughCount));
         }
 
-        LOG.warn("the not enough memory count has reached the manual gc 
threshold {}",
+        LOG.warn("the 'not enough memory count' has reached the manual gc 
threshold {}",
                 Config.checkpoint_manual_gc_threshold);
         System.gc();
-        return checkMemoryEnoughToDoCheckpoint();
+        checkMemoryEnoughToDoCheckpoint();
     }
 
     /*


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to