This is an automated email from the ASF dual-hosted git repository.

kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git

commit 84017b5f78dcb54fa3a22f7aeb04bd0b87db9b56
Author: Lei Zhang <[email protected]>
AuthorDate: Thu Sep 14 17:49:28 2023 +0800

    [fix](bdbje) handle `ReplicaWriteException` in `BDBJEJournal.write` (#24259)
    
    * When BDBJEJournal.write meets `ReplicaWriteException`, we should not
      retry, because at that moment the bdbje node state is `REPLICA` (not
      `MASTER`). If we keep retrying the write while an election is triggered
      at the same time, the original `REPLICA` node may transition to `MASTER`,
      which will cause an incorrect journalId.
    
    Co-authored-by: yiguolei <[email protected]>
---
 .../src/main/java/org/apache/doris/catalog/Env.java |  8 ++++++++
 .../apache/doris/journal/bdbje/BDBJEJournal.java    | 21 +++++++++++++++++++++
 .../apache/doris/service/FrontendServiceImpl.java   |  9 +++++++++
 3 files changed, 38 insertions(+)

diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java
index 1437660a2c..40b99025a6 100755
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java
@@ -2552,6 +2552,14 @@ public class Env {
             LOG.warn("replay journal cost too much time: {} replayedJournalId: 
{}", cost, replayedJournalId);
         }
 
+        if (replayedJournalId.get() != newToJournalId) {
+            String msg = "replayedJournalId:" + replayedJournalId.get() + " 
not equal with newToJournalId:"
+                    + newToJournalId + " , will exit";
+            LOG.error(msg);
+            Util.stdoutWithTime(msg);
+            System.exit(-1);
+        }
+
         return hasLog;
     }
 
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/journal/bdbje/BDBJEJournal.java 
b/fe/fe-core/src/main/java/org/apache/doris/journal/bdbje/BDBJEJournal.java
index f865bba4a2..dd574ab4d6 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/journal/bdbje/BDBJEJournal.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/journal/bdbje/BDBJEJournal.java
@@ -38,6 +38,7 @@ import com.sleepycat.je.OperationStatus;
 import com.sleepycat.je.rep.InsufficientLogException;
 import com.sleepycat.je.rep.NetworkRestore;
 import com.sleepycat.je.rep.NetworkRestoreConfig;
+import com.sleepycat.je.rep.ReplicaWriteException;
 import com.sleepycat.je.rep.RollbackException;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
@@ -155,6 +156,26 @@ public class BDBJEJournal implements Journal { // 
CHECKSTYLE IGNORE THIS LINE: B
                     }
                     break;
                 }
+            } catch (ReplicaWriteException e) {
+                /**
+                 * This exception indicates that an update operation or 
transaction commit
+                 * or abort was attempted while in the
+                 * {@link ReplicatedEnvironment.State#REPLICA} state. The 
transaction is marked
+                 * as being invalid.
+                 * <p>
+                 * The exception is the result of either an error in the 
application logic or
+                 * the result of a transition of the node from Master to 
Replica while a
+                 * transaction was in progress.
+                 * <p>
+                 * The application must abort the current transaction and 
redirect all
+                 * subsequent update operations to the Master.
+                 */
+                LOG.error("catch ReplicaWriteException when writing to 
database, will exit. journal id {}", id, e);
+                String msg = "write bdb failed. will exit. journalId: " + id + 
", bdb database Name: "
+                        + currentJournalDB.getDatabaseName();
+                LOG.error(msg);
+                Util.stdoutWithTime(msg);
+                System.exit(-1);
             } catch (DatabaseException e) {
                 LOG.error("catch an exception when writing to database. sleep 
and retry. journal id {}", id, e);
                 try {
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/service/FrontendServiceImpl.java 
b/fe/fe-core/src/main/java/org/apache/doris/service/FrontendServiceImpl.java
index a6bca2dc0a..3fee0e0330 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/service/FrontendServiceImpl.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/service/FrontendServiceImpl.java
@@ -1100,6 +1100,15 @@ public class FrontendServiceImpl implements 
FrontendService.Iface {
         TLoadTxnBeginResult result = new TLoadTxnBeginResult();
         TStatus status = new TStatus(TStatusCode.OK);
         result.setStatus(status);
+
+        if (!Env.getCurrentEnv().isMaster()) {
+            status.setStatusCode(TStatusCode.NOT_MASTER);
+            status.addToErrorMsgs(NOT_MASTER_ERR_MSG);
+            LOG.error("failed to loadTxnBegin:{}, request:{}, backend:{}",
+                    NOT_MASTER_ERR_MSG, request, clientAddr);
+            return result;
+        }
+
         try {
             TLoadTxnBeginResult tmpRes = loadTxnBeginImpl(request, clientAddr);
             result.setTxnId(tmpRes.getTxnId()).setDbId(tmpRes.getDbId());


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to