This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 3dd71dc8f87 [opt](log) add warn log for saving and pushing image
failure (#41216)
3dd71dc8f87 is described below
commit 3dd71dc8f8798e98b9b904c799165565d951a734
Author: Mingyu Chen <[email protected]>
AuthorDate: Wed Sep 25 12:29:39 2024 +0800
[opt](log) add warn log for saving and pushing image failure (#41216)
We already have some metrics on FE that record the number of image
saving or pushing failure events.
Such as:
```
doris_fe_image_push{type="failed"} 0
doris_fe_image_push{type="success"} 0
```
But these are counters, and they are hard to monitor if a user wants to
send an alert on failure.
Actually, this kind of event-driven alert is better done using logs.
So I added some warning logs for these failures,
so that users can monitor the log and send an alert on failure.
- Saving image failure:
`Save image failed: xxx`
- Pushing image failure:
`Push image failed: xxx`
- Deleting old edit log failure:
`Delete old edit log failed: xxx`
- Deleting old image failure:
`Delete old image failed: xxx`
---
.../java/org/apache/doris/master/Checkpoint.java | 37 ++++++++++++----------
1 file changed, 20 insertions(+), 17 deletions(-)
diff --git a/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java
b/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java
index 27dc6aa7b87..4934f8fb0b3 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java
@@ -105,14 +105,17 @@ public class Checkpoint extends MasterDaemon {
return;
}
} catch (Throwable e) {
- LOG.error("Does not get storage info", e);
+ LOG.warn("Save image failed: " + e.getMessage(), e);
if (MetricRepo.isInit) {
MetricRepo.COUNTER_IMAGE_WRITE_FAILED.increase(1L);
}
return;
}
- if (!checkMemoryEnoughToDoCheckpoint()) {
+ try {
+ checkMemoryEnoughToDoCheckpoint();
+ } catch (Throwable t) {
+ LOG.warn("Save image failed: " + t.getMessage(), t);
if (MetricRepo.isInit) {
MetricRepo.COUNTER_IMAGE_WRITE_FAILED.increase(1L);
}
@@ -156,7 +159,7 @@ public class Checkpoint extends MasterDaemon {
LOG.info("checkpoint finished save image.{}", replayedJournalId);
} catch (Throwable e) {
exceptionCaught = true;
- LOG.error("Exception when generate new image file", e);
+ LOG.warn("Save image failed: " + e.getMessage(), e);
if (MetricRepo.isInit) {
MetricRepo.COUNTER_IMAGE_WRITE_FAILED.increase(1L);
}
@@ -175,8 +178,8 @@ public class Checkpoint extends MasterDaemon {
if (MetricRepo.isInit) {
MetricRepo.COUNTER_IMAGE_CLEAN_SUCCESS.increase(1L);
}
- } catch (Throwable ex) {
- LOG.error("Master delete latest invalid image file
failed.", ex);
+ } catch (Throwable t) {
+ LOG.warn("Delete old image failed: " + t.getMessage(), t);
if (MetricRepo.isInit) {
MetricRepo.COUNTER_IMAGE_CLEAN_FAILED.increase(1L);
}
@@ -211,7 +214,7 @@ public class Checkpoint extends MasterDaemon {
LOG.warn("Failed when pushing image file. url =
{},responseBody = {}", url, responseBody);
}
} catch (IOException e) {
- LOG.error("Exception when pushing image file. url = {}",
url, e);
+ LOG.warn("Exception when pushing image file. url = {}",
url, e);
}
}
@@ -223,6 +226,7 @@ public class Checkpoint extends MasterDaemon {
MetricRepo.COUNTER_IMAGE_PUSH_SUCCESS.increase(1L);
}
} else {
+ LOG.warn("Push image failed: totally {} nodes, push succeeded {}
nodes", otherNodesCount, successPushed);
if (MetricRepo.isInit) {
MetricRepo.COUNTER_IMAGE_PUSH_FAILED.increase(1L);
}
@@ -282,8 +286,8 @@ public class Checkpoint extends MasterDaemon {
}
LOG.info("journals <= {} are deleted. image version {}, other
nodes min version {}",
deleteVersion, checkPointVersion,
minOtherNodesJournalId);
- } catch (Throwable e) {
- LOG.error("failed to delete old edit log", e);
+ } catch (Throwable t) {
+ LOG.warn("Delete old edit log failed: " + t.getMessage(), t);
if (MetricRepo.isInit) {
MetricRepo.COUNTER_EDIT_LOG_CLEAN_FAILED.increase(1L);
}
@@ -298,7 +302,7 @@ public class Checkpoint extends MasterDaemon {
MetricRepo.COUNTER_IMAGE_CLEAN_SUCCESS.increase(1L);
}
} catch (Throwable e) {
- LOG.error("Master delete old image file fail.", e);
+ LOG.warn("Master delete old image file fail.", e);
if (MetricRepo.isInit) {
MetricRepo.COUNTER_IMAGE_CLEAN_FAILED.increase(1L);
}
@@ -320,27 +324,26 @@ public class Checkpoint extends MasterDaemon {
/*
* Check whether can we do the checkpoint due to the memory used percent.
*/
- private boolean checkMemoryEnoughToDoCheckpoint() {
+ private void checkMemoryEnoughToDoCheckpoint() throws CheckpointException {
long memUsedPercent = getMemoryUsedPercent();
LOG.info("get jvm memory used percent: {} %", memUsedPercent);
if (memUsedPercent <= Config.metadata_checkpoint_memory_threshold ||
Config.force_do_metadata_checkpoint) {
memoryNotEnoughCount = 0;
- return true;
+ return;
}
- LOG.warn("the memory used percent {} exceed the checkpoint memory
threshold: {}, exceeded count: {}",
- memUsedPercent, Config.metadata_checkpoint_memory_threshold,
memoryNotEnoughCount);
-
memoryNotEnoughCount += 1;
if (memoryNotEnoughCount != Config.checkpoint_manual_gc_threshold) {
- return false;
+ throw new CheckpointException(String.format(
+ "the memory used percent %d exceed the checkpoint memory
threshold: %d, exceeded count: %d",
+ memUsedPercent,
Config.metadata_checkpoint_memory_threshold, memoryNotEnoughCount));
}
- LOG.warn("the not enough memory count has reached the manual gc
threshold {}",
+ LOG.warn("the 'not enough memory count' has reached the manual gc
threshold {}",
Config.checkpoint_manual_gc_threshold);
System.gc();
- return checkMemoryEnoughToDoCheckpoint();
+ checkMemoryEnoughToDoCheckpoint();
}
/*
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]