This is an automated email from the ASF dual-hosted git repository.
lide pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new abefe26fc9a [refactor](log) Add some log for table and db transaction
manager's lock (#40746)
abefe26fc9a is described below
commit abefe26fc9a3c57f65d81c68971b95f568b5456e
Author: Lijia Liu <[email protected]>
AuthorDate: Fri Sep 20 21:58:06 2024 +0800
[refactor](log) Add some log for table and db transaction manager's lock
(#40746)
Use the fe config like #39015, print table lock usage duration.
In 2.0.6, I found `replayer` thread is blocked for a long time.
```
2024-09-12 11:00:58,797 INFO (replayer|89)
[DatabaseTransactionMgr.replayUpsertTransactionState():1977] replay a visible
transaction TransactionState. transaction id: 471728922, label:
flink_connector_20240912_103150_269f8d635ffb455bb4043eecd1a8406b, db id:
475180, table id list: 14473765, callback id: -1, coordinator: BE:
10.217.0.100, transaction status: VISIBLE, error replicas num: 0, replica ids:
, prepare time: 1726108310505, commit time: 1726108310579, finish time:
1726108310604 [...]
2024-09-12 11:00:58,798 WARN (replayer|89) [Env.replayJournal():2575]
entityCost:940256 loadJournalCost:940256 logId:1392353661
replayedJournalId:1392353661 code:100 size:562
```
And, there are many table lock fail log:
```
2024-09-12 10:56:53,167 WARN (mysql-nio-pool-32036|412611)
[Table.tryReadLock():155] Failed to try table atdpersonpaycode's read lock.
timeout 1 MINUTES. Current owner: null
```
But, I don't kown what thread hold the table lock.
So, add some logs to print lock holding info.
---
.../main/java/org/apache/doris/common/Config.java | 4 ++
.../main/java/org/apache/doris/catalog/Table.java | 46 ++++++++++++++++++++++
.../doris/transaction/DatabaseTransactionMgr.java | 15 +++++--
3 files changed, 61 insertions(+), 4 deletions(-)
diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
index 86547830282..ffc23900a0f 100644
--- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
+++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
@@ -2092,6 +2092,10 @@ public class Config extends ConfigBase {
@ConfField
public static long lock_reporting_threshold_ms = 500L;
+ @ConfField(mutable = true, description = {"表示最大锁持有时间,超过该时间会打印告警日志,单位秒",
+ "Maximum lock hold time; logs a warning if exceeded"})
+ public static long max_lock_hold_threshold_seconds = 10L;
+
/**
* If false, when select from tables in information_schema database,
* the result will not contain the information of the table in external
catalog.
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java
b/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java
index 6b5de7e581f..7cbef2a5d6c 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java
@@ -19,6 +19,7 @@ package org.apache.doris.catalog;
import org.apache.doris.alter.AlterCancelException;
import org.apache.doris.analysis.CreateTableStmt;
+import org.apache.doris.common.Config;
import org.apache.doris.common.DdlException;
import org.apache.doris.common.MetaNotFoundException;
import org.apache.doris.common.io.Text;
@@ -73,6 +74,8 @@ public abstract class Table extends MetaObject implements
Writable, TableIf {
protected TableType type;
protected long createTime;
protected QueryableReentrantReadWriteLock rwLock;
+ private final ThreadLocal<Long> lockReadStart = new ThreadLocal<>();
+ private long lockWriteStart;
/*
* fullSchema and nameToColumn should contains all columns, both visible
and shadow.
@@ -146,6 +149,7 @@ public abstract class Table extends MetaObject implements
Writable, TableIf {
public void readLock() {
this.rwLock.readLock().lock();
+ lockReadStart.set(System.currentTimeMillis());
}
public boolean tryReadLock(long timeout, TimeUnit unit) {
@@ -155,6 +159,9 @@ public abstract class Table extends MetaObject implements
Writable, TableIf {
LOG.warn("Failed to try table {}'s read lock. timeout {} {}.
Current owner: {}",
name, timeout, unit.name(), rwLock.getOwner());
}
+ if (res) {
+ lockReadStart.set(System.currentTimeMillis());
+ }
return res;
} catch (InterruptedException e) {
LOG.warn("failed to try read lock at table[" + name + "]", e);
@@ -163,11 +170,16 @@ public abstract class Table extends MetaObject implements
Writable, TableIf {
}
public void readUnlock() {
+ if (lockReadStart.get() != null) {
+ checkAndLogLockDuration("read", lockReadStart.get(),
System.currentTimeMillis());
+ lockReadStart.remove();
+ }
this.rwLock.readLock().unlock();
}
public void writeLock() {
this.rwLock.writeLock().lock();
+ lockWriteStart = System.currentTimeMillis();
}
public boolean writeLockIfExist() {
@@ -176,9 +188,39 @@ public abstract class Table extends MetaObject implements
Writable, TableIf {
this.rwLock.writeLock().unlock();
return false;
}
+ lockWriteStart = System.currentTimeMillis();
return true;
}
+ private void checkAndLogLockDuration(String type, long lockStart, long
lockEnd) {
+ long duration = lockEnd - lockStart;
+ if (duration > Config.max_lock_hold_threshold_seconds * 1000) {
+ StringBuilder msgBuilder = new StringBuilder();
+ msgBuilder.append(getId())
+ .append(" ")
+ .append(getName())
+ .append(" ")
+ .append(type)
+ .append(" lock is held at ")
+ .append(lockStart)
+ .append(".And release after ")
+ .append(duration)
+ .append(" ms.")
+ .append("Call stack is :\n")
+ .append(getStackTrace(Thread.currentThread()));
+ LOG.info(msgBuilder.toString());
+ }
+ }
+
+ private static String getStackTrace(Thread t) {
+ final StackTraceElement[] stackTrace = t.getStackTrace();
+ StringBuilder msgBuilder = new StringBuilder();
+ for (StackTraceElement e : stackTrace) {
+ msgBuilder.append(e.toString()).append("\n");
+ }
+ return msgBuilder.toString();
+ }
+
public boolean tryWriteLock(long timeout, TimeUnit unit) {
try {
boolean res = this.rwLock.writeLock().tryLock(timeout, unit);
@@ -186,6 +228,9 @@ public abstract class Table extends MetaObject implements
Writable, TableIf {
LOG.warn("Failed to try table {}'s write lock. timeout {} {}.
Current owner: {}",
name, timeout, unit.name(), rwLock.getOwner());
}
+ if (res) {
+ lockWriteStart = System.currentTimeMillis();
+ }
return res;
} catch (InterruptedException e) {
LOG.warn("failed to try write lock at table[" + name + "]", e);
@@ -194,6 +239,7 @@ public abstract class Table extends MetaObject implements
Writable, TableIf {
}
public void writeUnlock() {
+ checkAndLogLockDuration("write", lockWriteStart,
System.currentTimeMillis());
this.rwLock.writeLock().unlock();
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java
b/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java
index cf4bed28e5b..3bc676a1b42 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java
@@ -165,14 +165,20 @@ public class DatabaseTransactionMgr {
private volatile long usedQuotaDataBytes = -1;
private long lockWriteStart;
+ private final ThreadLocal<Long> lockReadStart = new ThreadLocal<>();
- private long lockReportingThresholdMs = Config.lock_reporting_threshold_ms;
+ private final long lockReportingThresholdMs =
Config.lock_reporting_threshold_ms;
protected void readLock() {
this.transactionLock.readLock().lock();
+ lockReadStart.set(System.currentTimeMillis());
}
protected void readUnlock() {
+ if (lockReadStart.get() != null) {
+ checkAndLogLockDuration("read", lockReadStart.get(),
System.currentTimeMillis());
+ lockReadStart.remove();
+ }
this.transactionLock.readLock().unlock();
}
@@ -182,7 +188,7 @@ public class DatabaseTransactionMgr {
}
protected void writeUnlock() {
- checkAndLogWriteLockDuration(lockWriteStart,
System.currentTimeMillis());
+ checkAndLogLockDuration("write", lockWriteStart,
System.currentTimeMillis());
this.transactionLock.writeLock().unlock();
}
@@ -2211,11 +2217,12 @@ public class DatabaseTransactionMgr {
* @param lockStart holing lock start time.
* @param lockEnd release lock time.
*/
- private void checkAndLogWriteLockDuration(long lockStart, long lockEnd) {
+ private void checkAndLogLockDuration(String type, long lockStart, long
lockEnd) {
long duration = lockEnd - lockStart;
if (duration > lockReportingThresholdMs) {
StringBuilder msgBuilder = new StringBuilder();
- msgBuilder.append("lock is held at ")
+ msgBuilder.append(type)
+ .append(" lock is held at ")
.append(lockStart)
.append(".And release after ")
.append(duration)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]