This is an automated email from the ASF dual-hosted git repository.
gavinchou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 7a3ee3640ee [fix](cloud) checkpoint save cloud tablet stats to image
(#60705)
7a3ee3640ee is described below
commit 7a3ee3640ee421d6727e25daa41627b1d11d4c94
Author: meiyi <[email protected]>
AuthorDate: Sat Mar 21 02:47:46 2026 +0800
[fix](cloud) checkpoint save cloud tablet stats to image (#60705)
---
.../main/java/org/apache/doris/common/Config.java | 8 ++
.../apache/doris/catalog/CloudTabletStatMgr.java | 7 ++
.../apache/doris/cloud/catalog/CloudReplica.java | 4 +
.../java/org/apache/doris/master/Checkpoint.java | 124 ++++++++++++++++++++-
.../java/org/apache/doris/persist/Storage.java | 19 ++++
5 files changed, 159 insertions(+), 3 deletions(-)
diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
index 04dd9a2d610..2ad50b3a9bd 100644
--- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
+++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
@@ -542,6 +542,14 @@ public class Config extends ConfigBase {
"Randomly use V3 storage_format (ext_meta) for some tables in
fuzzy tests to increase coverage"})
public static boolean random_use_v3_storage_format = true;
+ @ConfField(mutable = true, masterOnly = true, description = {
+ "The stale threshold of checkpoint image file in cloud mode (in
seconds). "
+ + "If the image file is older than this threshold, a new
checkpoint will be triggered "
+ + "even if there are no new journals. This helps keep
table version, partition version, "
+ + "and tablet stats in the image up-to-date. If the value
is less than or equal to 0, "
+ + "this feature is disabled."})
+ public static long cloud_checkpoint_image_stale_threshold_seconds = 3600;
+
@ConfField(mutable = true, masterOnly = true, description = {
"Wait for the internal batch to be written before returning; "
+ "insert into and stream load use group commit by
default."})
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/CloudTabletStatMgr.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/CloudTabletStatMgr.java
index d6f0f9516a6..18da6784acf 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/CloudTabletStatMgr.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/CloudTabletStatMgr.java
@@ -108,6 +108,13 @@ public class CloudTabletStatMgr extends MasterDaemon {
@Override
protected void runAfterCatalogReady() {
+ if (cloudTableStatsList.isEmpty()) {
+ // use tablet stats loaded from image to update table stats when
fe start
+ // avoid that the table stats is empty for a long time since
getAllTabletStats may consume a long time
+ LOG.info("cloud tablet stat is empty, will update stat info of all
tables");
+ updateStatInfo(Env.getCurrentInternalCatalog().getDbIds());
+ }
+
int version = Config.cloud_get_tablet_stats_version;
LOG.info("cloud tablet stat begin with version: {}", version);
diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudReplica.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudReplica.java
index 3a9699e8f84..c50b5d80923 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudReplica.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudReplica.java
@@ -72,6 +72,7 @@ public class CloudReplica extends Replica implements
GsonPostProcessable {
// last time to get tablet stats
@Getter
@Setter
+ @SerializedName(value = "gst")
long lastGetTabletStatsTime = 0;
/**
* The index of {@link
org.apache.doris.catalog.CloudTabletStatMgr#DEFAULT_INTERVAL_LADDER_MS} array.
@@ -82,9 +83,12 @@ public class CloudReplica extends Replica implements
GsonPostProcessable {
*/
@Getter
@Setter
+ @SerializedName(value = "sii")
int statsIntervalIndex = 0;
+ @SerializedName(value = "sc")
private long segmentCount = 0L;
+ @SerializedName(value = "rsc")
private long rowsetCount = 1L; // [0-1] rowset
private static final Random rand = new Random();
diff --git a/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java b/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java
index ec4126aa86d..0d21fa8094c 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java
@@ -17,7 +17,16 @@
package org.apache.doris.master;
+import org.apache.doris.catalog.Database;
import org.apache.doris.catalog.Env;
+import org.apache.doris.catalog.MaterializedIndex;
+import org.apache.doris.catalog.MaterializedIndex.IndexExtState;
+import org.apache.doris.catalog.OlapTable;
+import org.apache.doris.catalog.Partition;
+import org.apache.doris.catalog.Replica;
+import org.apache.doris.catalog.Table;
+import org.apache.doris.catalog.Tablet;
+import org.apache.doris.cloud.catalog.CloudReplica;
import org.apache.doris.common.CheckpointException;
import org.apache.doris.common.Config;
import org.apache.doris.common.FeConstants;
@@ -104,11 +113,23 @@ public class Checkpoint extends MasterDaemon {
storage = new Storage(imageDir);
// get max image version
imageVersion = storage.getLatestImageSeq();
+ long latestImageCreateTime = storage.getLatestImageCreateTime();
// get max finalized journal id
checkPointVersion = editLog.getFinalizedJournalId();
- LOG.info("last checkpoint journal id: {}, current finalized
journal id: {}",
- imageVersion, checkPointVersion);
- if (imageVersion >= checkPointVersion) {
+ LOG.info("last checkpoint journal id: {}, create timestamp: {}.
current finalized journal id: {}",
+ imageVersion, latestImageCreateTime, checkPointVersion);
+ if (imageVersion < checkPointVersion) {
+ LOG.info("Trigger checkpoint since last checkpoint journal id:
{} is less than "
+ + "current finalized journal id: {}", imageVersion,
checkPointVersion);
+ } else if (Config.isCloudMode() &&
Config.cloud_checkpoint_image_stale_threshold_seconds > 0
+ && latestImageCreateTime > 0 &&
((System.currentTimeMillis() - latestImageCreateTime)
+ >= Config.cloud_checkpoint_image_stale_threshold_seconds *
1000L)) {
+ // No new finalized journals beyond the latest image.
+ // But in cloud mode, we may still want to force a checkpoint
if the latest image file is expired.
+ // This helps that image can keep the newer table version,
partition version, tablet stats.
+ LOG.info("Trigger checkpoint in cloud mode because latest
image is expired. "
+ + "latestImageSeq: {}, latestImageCreateTime: {}",
imageVersion, latestImageCreateTime);
+ } else {
return;
}
} catch (Throwable e) {
@@ -146,6 +167,7 @@ public class Checkpoint extends MasterDaemon {
checkPointVersion,
env.getReplayedJournalId()));
}
env.postProcessAfterMetadataReplayed(false);
+ postProcessCloudMetadata();
latestImageFilePath = env.saveImage();
replayedJournalId = env.getReplayedJournalId();
@@ -395,4 +417,100 @@ public class Checkpoint extends MasterDaemon {
public ReentrantReadWriteLock getLock() {
return lock;
}
+
+    /**
+     * Copy the in-memory cloud tablet stats from the serving env into the checkpoint env
+     * ({@code env}) so that they are written out when the image is saved.
+     *
+     * <p>Does nothing when not in cloud mode or when the serving env is unavailable.
+     * Databases, tables and partitions that cannot be found in the serving env are skipped
+     * (logged at debug level), and only {@link OlapTable}s are processed.
+     */
+    private void postProcessCloudMetadata() {
+        if (Config.isNotCloudMode()) {
+            return;
+        }
+        Env servingEnv = Env.getServingEnv();
+        if (servingEnv == null) {
+            LOG.warn("serving env is null, skip process cloud metadata for checkpoint");
+            return;
+        }
+        long start = System.currentTimeMillis();
+        for (Database db : env.getInternalCatalog().getDbs()) {
+            Database servingDb = servingEnv.getInternalCatalog().getDbNullable(db.getId());
+            if (servingDb == null) {
+                if (LOG.isDebugEnabled()) {
+                    LOG.debug("serving db is null. dbId: {}, dbName: {}", db.getId(), db.getFullName());
+                }
+                continue;
+            }
+
+            for (Table table : db.getTables()) {
+                Table servingTable = servingDb.getTableNullable(table.getId());
+                if (servingTable == null) {
+                    if (LOG.isDebugEnabled()) {
+                        LOG.debug("serving table is null. dbId: {}, table: {}", db.getId(), table);
+                    }
+                    continue;
+                }
+                // both sides must be olap tables before they are downcast below
+                if (!(table instanceof OlapTable) || !(servingTable instanceof OlapTable)) {
+                    continue;
+                }
+                OlapTable olapTable = (OlapTable) table;
+                OlapTable servingOlapTable = (OlapTable) servingTable;
+
+                List<Partition> partitions = olapTable.getAllPartitions();
+                for (Partition partition : partitions) {
+                    Partition servingPartition = servingOlapTable.getPartition(partition.getId());
+                    if (servingPartition == null) {
+                        if (LOG.isDebugEnabled()) {
+                            LOG.debug("serving partition is null. tableId: {}, partitionId: {}", table.getId(),
+                                    partition.getId());
+                        }
+                        continue;
+                    }
+                    // set tablet stats
+                    setTabletStats(table.getId(), partition, servingPartition);
+                }
+            }
+        }
+        LOG.info("post process cloud metadata for checkpoint finished. cost {} ms", System.currentTimeMillis() - start);
+    }
+
+    /**
+     * Copy per-replica stats of one partition from the serving env into the checkpoint env.
+     *
+     * <p>Walks the visible materialized indices of {@code partition}, and for every tablet
+     * replica that also exists in {@code servingPartition} copies data size, rowset count,
+     * segment count, row count, local inverted index size and local segment size, plus the
+     * cloud-only "last get tablet stats time" and stats interval index. Indices, tablets and
+     * replicas missing on the serving side are skipped (logged at debug level).
+     *
+     * @param tableId id of the owning table, used only in log messages
+     * @param partition partition in the checkpoint env whose replicas are updated
+     * @param servingPartition the same partition in the serving env, source of the stats
+     */
+    private void setTabletStats(long tableId, Partition partition, Partition servingPartition) {
+        for (MaterializedIndex index : partition.getMaterializedIndices(IndexExtState.VISIBLE)) {
+            MaterializedIndex servingIndex = servingPartition.getIndex(index.getId());
+            if (servingIndex == null) {
+                if (LOG.isDebugEnabled()) {
+                    LOG.debug("serving index is null. tableId: {}, partitionId: {}, indexId: {}", tableId,
+                            partition.getId(), index.getId());
+                }
+                continue;
+            }
+            for (Tablet tablet : index.getTablets()) {
+                Tablet servingTablet = servingIndex.getTablet(tablet.getId());
+                if (servingTablet == null) {
+                    if (LOG.isDebugEnabled()) {
+                        LOG.debug("serving tablet is null. tableId: {}, partitionId: {}, indexId: {}, tabletId: {}",
+                                tableId, partition.getId(), index.getId(), tablet.getId());
+                    }
+                    continue;
+                }
+                for (Replica replica : tablet.getReplicas()) {
+                    Replica servingReplica = servingTablet.getReplicaById(replica.getId());
+                    if (servingReplica == null) {
+                        if (LOG.isDebugEnabled()) {
+                            LOG.debug("serving replica is null. tableId: {}, partitionId: {}, indexId: {}, "
+                                            + "tabletId: {}, replicaId: {}", tableId, partition.getId(), index.getId(),
+                                    tablet.getId(), replica.getId());
+                        }
+                        continue;
+                    }
+                    replica.setDataSize(servingReplica.getDataSize());
+                    replica.setRowsetCount(servingReplica.getRowsetCount());
+                    replica.setSegmentCount(servingReplica.getSegmentCount());
+                    replica.setRowCount(servingReplica.getRowCount());
+                    replica.setLocalInvertedIndexSize(servingReplica.getLocalInvertedIndexSize());
+                    replica.setLocalSegmentSize(servingReplica.getLocalSegmentSize());
+                    // the remaining two fields are declared on CloudReplica; guard the downcast so an
+                    // unexpected replica type skips them instead of failing with ClassCastException
+                    if (!(replica instanceof CloudReplica) || !(servingReplica instanceof CloudReplica)) {
+                        if (LOG.isDebugEnabled()) {
+                            LOG.debug("replica is not cloud replica, skip cloud stats. tabletId: {}, replicaId: {}",
+                                    tablet.getId(), replica.getId());
+                        }
+                        continue;
+                    }
+                    // set last get stats time and stats interval index
+                    CloudReplica cloudReplica = (CloudReplica) replica;
+                    CloudReplica servingCloudReplica = (CloudReplica) servingReplica;
+                    cloudReplica.setStatsIntervalIndex(servingCloudReplica.getStatsIntervalIndex());
+                    cloudReplica.setLastGetTabletStatsTime(servingCloudReplica.getLastGetTabletStatsTime());
+                }
+            }
+        }
+    }
}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/persist/Storage.java b/fe/fe-core/src/main/java/org/apache/doris/persist/Storage.java
index 9f8cd558a57..e826b7e03c2 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/persist/Storage.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/persist/Storage.java
@@ -68,6 +68,7 @@ public class Storage {
private long editsSeq;
private long latestImageSeq = 0;
private long latestValidatedImageSeq = 0;
+ private long latestImageCreateTime = 0;
private String metaDir;
private List<Long> editsFileSequenceNumbers;
@@ -83,6 +84,15 @@ public class Storage {
this.editsSeq = editsSeq;
this.latestImageSeq = latestImageSeq;
this.metaDir = metaDir;
+ // try to set latestImageCreateTime from the image file if it exists
+ try {
+ File img = getImageFile(latestImageSeq);
+ if (img != null && img.exists()) {
+ latestImageCreateTime = img.lastModified();
+ }
+ } catch (Exception e) {
+ // ignore; best-effort only
+ }
}
public Storage(String metaDir) throws IOException {
@@ -146,6 +156,7 @@ public class Storage {
imageIds.add(fileSeq);
if (latestImageSeq < fileSeq) {
latestImageSeq = fileSeq;
+ latestImageCreateTime = child.lastModified();
}
} else if (name.startsWith(EDITS)) {
// Just record the sequence part of the file name
@@ -162,6 +173,14 @@ public class Storage {
latestValidatedImageSeq = imageIds.size() < 2 ? 0 :
imageIds.get(imageIds.size() - 2);
}
+ /**
+ * Return the timestamp of the latest image file in milliseconds since epoch.
+ * The value is read from the image file's last-modified time
+ * ({@code File.lastModified()}), which stands in for its creation time.
+ * 0 means unknown.
+ *
+ * @return last-modified time of the latest image file in epoch milliseconds, or 0 if unknown
+ */
+ public long getLatestImageCreateTime() {
+ return latestImageCreateTime;
+ }
+
public int getClusterID() {
return clusterID;
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]