This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new 9b4a83a  KUDU-2612 p2 (c): small fix on TxnStatusManager's behavior
9b4a83a is described below

commit 9b4a83ac71621211582b9eb0c471d8ac4fd9b984
Author: Alexey Serbin <[email protected]>
AuthorDate: Thu Oct 1 23:22:52 2020 -0700

    KUDU-2612 p2 (c): small fix on TxnStatusManager's behavior
    
    This patch adds more consistency into the TxnStatusManager's behavior
    during the time interval between the following events:
    
      * The state of the leader replica of a transaction status tablet
        changes from BOOTSTRAPPING to RUNNING (see TabletReplica::Start()
        for details).
    
      * TxnStatusManager's runtime structures are populated with the
        information loaded from the tablet
        (i.e. TxnStatusManager::LoadFromTablet() successfully completes).
    
    Before this patch, relevant methods of the TxnStatusManager class could
    behave inconsistently within the mentioned interval.  With this patch,
    those methods return Status::ServiceUnavailable() and do not exhibit any
    inconsistent behavior.
    
    This patch also adds a new test scenario to cover the new behavior.
    
    This is a follow-up to efd8c4f165460b7fa337b8ebd1856b10bc274311.
    
    Change-Id: Ia95f821d87145aff6587c017a425e205bdcb8b2b
    Reviewed-on: http://gerrit.cloudera.org:8080/16537
    Tested-by: Kudu Jenkins
    Reviewed-by: Alexey Serbin <[email protected]>
---
 .../integration-tests/ts_tablet_manager-itest.cc   | 35 ++++++++++++--
 src/kudu/transactions/txn_status_manager-test.cc   | 46 ++++++++++++++++++-
 src/kudu/transactions/txn_status_manager.cc        | 53 +++++++++++++++++++++-
 src/kudu/transactions/txn_status_manager.h         | 17 ++++---
 4 files changed, 139 insertions(+), 12 deletions(-)

diff --git a/src/kudu/integration-tests/ts_tablet_manager-itest.cc 
b/src/kudu/integration-tests/ts_tablet_manager-itest.cc
index ae6102e..75bb41f 100644
--- a/src/kudu/integration-tests/ts_tablet_manager-itest.cc
+++ b/src/kudu/integration-tests/ts_tablet_manager-itest.cc
@@ -1052,7 +1052,8 @@ class TxnStatusTabletManagementTest : public 
TsTabletManagerITest {
   // Creates a tablet with a fixed schema, with the given tablet ID.
   Status CreateTablet(MiniTabletServer* ts,
                       const string& tablet_id,
-                      bool is_txn_status_tablet) {
+                      bool is_txn_status_tablet,
+                      scoped_refptr<TabletReplica>* replica = nullptr) {
     CreateTabletRequestPB req = CreateTxnTabletReq(
         tablet_id, ts->server()->fs_manager()->uuid(), ts->CreateLocalConfig(),
         is_txn_status_tablet);
@@ -1074,12 +1075,38 @@ class TxnStatusTabletManagementTest : public 
TsTabletManagerITest {
 
     // Wait for the tablet to be in RUNNING state and its consensus running 
too.
     RETURN_NOT_OK(r->WaitUntilConsensusRunning(kTimeout));
-    return r->consensus()->WaitUntilLeaderForTests(kTimeout);
+    auto s = r->consensus()->WaitUntilLeaderForTests(kTimeout);
+    if (replica) {
+      *replica = std::move(r);
+    }
+    return s;
   }
 
-  // Creates a transaction status tablet at the given tablet server.
+  // Creates a transaction status tablet at the given tablet server,
+  // waiting for the transaction status data to be loaded from the tablet.
   Status CreateTxnStatusTablet(MiniTabletServer* ts) {
-    return CreateTablet(ts, kTxnStatusTabletId, /*is_txn_status_tablet*/true);
+    scoped_refptr<TabletReplica> r;
+    RETURN_NOT_OK(CreateTablet(
+        ts, kTxnStatusTabletId, /*is_txn_status_tablet*/true, &r));
+    TxnCoordinator* c = CHECK_NOTNULL(CHECK_NOTNULL(r)->txn_coordinator());
+
+    // Wait for the transaction status data to be loaded. The verification 
below
+    // relies on the internals of the TxnStatusManager's implementation, but it
+    // seems OK for test purposes.
+    const auto deadline = MonoTime::Now() + kTimeout;
+    bool is_data_loaded = false;
+    do {
+      auto txn_id = c->highest_txn_id();
+      if (txn_id >= -1) {
+        is_data_loaded = true;
+        break;
+      }
+      SleepFor(MonoDelta::FromMilliseconds(10));
+    } while (MonoTime::Now() < deadline);
+
+    return is_data_loaded
+        ? Status::OK()
+        : Status::TimedOut("timed out waiting for txn status data to be 
loaded");
   }
 
   static Status StartTransactions(const ParticipantIdsByTxnId& txns, 
TxnCoordinator* coordinator) {
diff --git a/src/kudu/transactions/txn_status_manager-test.cc 
b/src/kudu/transactions/txn_status_manager-test.cc
index 9934d41..7ceb0fe 100644
--- a/src/kudu/transactions/txn_status_manager-test.cc
+++ b/src/kudu/transactions/txn_status_manager-test.cc
@@ -19,6 +19,7 @@
 
 #include <algorithm>
 #include <cstdint>
+#include <initializer_list>
 #include <map>
 #include <memory>
 #include <mutex>
@@ -40,9 +41,9 @@
 #include "kudu/tablet/tablet-test-util.h"
 #include "kudu/tablet/tablet_replica-test-base.h"
 #include "kudu/tablet/txn_coordinator.h"
-#include "kudu/tserver/tserver.pb.h"
 #include "kudu/transactions/transactions.pb.h"
 #include "kudu/transactions/txn_status_tablet.h"
+#include "kudu/tserver/tserver.pb.h"
 #include "kudu/util/barrier.h"
 #include "kudu/util/countdown_latch.h"
 #include "kudu/util/locks.h"
@@ -88,6 +89,7 @@ class TxnStatusManagerTest : public TabletReplicaTestBase {
     ConsensusBootstrapInfo info;
     ASSERT_OK(StartReplicaAndWaitUntilLeader(info));
     txn_manager_.reset(new TxnStatusManager(tablet_replica_.get()));
+    ASSERT_OK(txn_manager_->LoadFromTablet());
   }
  protected:
   unique_ptr<TxnStatusManager> txn_manager_;
@@ -152,6 +154,48 @@ TEST_F(TxnStatusManagerTest, TestStartTransactions) {
               txn_manager_reloaded.GetParticipantsByTxnIdForTests());
     ASSERT_EQ(3, txn_manager_reloaded.highest_txn_id());
   }
+
+  // Verify that TxnStatusManager methods return Status::ServiceUnavailable()
+  // if the transaction status tablet's data is not loaded yet.
+  ASSERT_OK(RestartReplica());
+  {
+    TxnStatusManager tsm(tablet_replica_.get());
+    // Check for the special value of the highest_txn_id when the data from
+    // the transaction status tablet isn't loaded yet.
+    ASSERT_EQ(-2, tsm.highest_txn_id());
+
+    // Regardless of transaction identifiers and the records stored in the
+    // transaction status tablet, all relevant methods should return
+    // Status::ServiceUnavailable().
+    const string kErrMsg = "transaction status data is not loaded";
+    TabletServerErrorPB ts_error;
+    for (int64_t txn_id : { 0, 1, 3, 4 }) {
+      auto s = tsm.BeginTransaction(txn_id, kOwner, &ts_error);
+      ASSERT_TRUE(s.IsServiceUnavailable());
+      ASSERT_STR_CONTAINS(s.ToString(), kErrMsg);
+
+      s = tsm.BeginCommitTransaction(txn_id, kOwner, &ts_error);
+      ASSERT_TRUE(s.IsServiceUnavailable());
+      ASSERT_STR_CONTAINS(s.ToString(), kErrMsg);
+
+      s = tsm.FinalizeCommitTransaction(txn_id);
+      ASSERT_TRUE(s.IsServiceUnavailable());
+      ASSERT_STR_CONTAINS(s.ToString(), kErrMsg);
+
+      s = tsm.AbortTransaction(txn_id, kOwner, &ts_error);
+      ASSERT_TRUE(s.IsServiceUnavailable());
+      ASSERT_STR_CONTAINS(s.ToString(), kErrMsg);
+
+      transactions::TxnStatusEntryPB txn_status;
+      s = tsm.GetTransactionStatus(txn_id, kOwner, &txn_status);
+      ASSERT_TRUE(s.IsServiceUnavailable());
+      ASSERT_STR_CONTAINS(s.ToString(), kErrMsg);
+
+      s = tsm.RegisterParticipant(txn_id, kParticipant1, kOwner, &ts_error);
+      ASSERT_TRUE(s.IsServiceUnavailable());
+      ASSERT_STR_CONTAINS(s.ToString(), kErrMsg);
+    }
+  }
 }
 
 TEST_F(TxnStatusManagerTest, TestStartTransactionsConcurrently) {
diff --git a/src/kudu/transactions/txn_status_manager.cc 
b/src/kudu/transactions/txn_status_manager.cc
index 84edfd7..c583b11 100644
--- a/src/kudu/transactions/txn_status_manager.cc
+++ b/src/kudu/transactions/txn_status_manager.cc
@@ -45,6 +45,24 @@ using strings::Substitute;
 namespace kudu {
 namespace transactions {
 
+namespace {
+
+// The following special values are used to determine whether the data from
+// the underlying transaction status tablet has already been loaded
+// (an alternative would be introducing a dedicated member field into the
+//  TxnStatusManager):
+//   * kIdStatusDataNotLoaded: value assigned at construction time
+//   * kIdStatusDataReady: value after loading data from the transaction status
+//     tablet (unless the tablet contains a record with higher txn_id)
+constexpr int64_t kIdStatusDataNotLoaded = -2;
+constexpr int64_t kIdStatusDataReady = -1;
+
+} // anonymous namespace
+
+TxnStatusManagerBuildingVisitor::TxnStatusManagerBuildingVisitor()
+    : highest_txn_id_(kIdStatusDataReady) {
+}
+
 void TxnStatusManagerBuildingVisitor::VisitTransactionEntries(
     int64_t txn_id, TxnStatusEntryPB status_entry_pb,
     vector<ParticipantIdAndPB> participants) {
@@ -80,6 +98,11 @@ void TxnStatusManagerBuildingVisitor::Release(
   *txns_by_id = std::move(txns_by_id_);
 }
 
+TxnStatusManager::TxnStatusManager(tablet::TabletReplica* tablet_replica)
+    : highest_txn_id_(kIdStatusDataNotLoaded),
+      status_tablet_(tablet_replica) {
+}
+
 Status TxnStatusManager::LoadFromTablet() {
   TxnStatusManagerBuildingVisitor v;
   RETURN_NOT_OK(status_tablet_.VisitTransactions(&v));
@@ -93,10 +116,32 @@ Status TxnStatusManager::LoadFromTablet() {
   return Status::OK();
 }
 
+Status TxnStatusManager::CheckTxnStatusDataLoadedUnlocked() const {
+  DCHECK(lock_.is_locked());
+  // TODO(aserbin): this is just to handle requests which come in a short time
+  //                interval when the leader replica of the transaction status
+  //                tablet is already in RUNNING state, but the records from
+  //                the tablet hasn't yet been loaded into the runtime
+  //                structures of this TxnStatusManager instance. However,
+  //                the case when a former leader replica is queried about the
+  //                status of transactions which it is no longer aware of 
should
+  //                be handled separately.
+  if (PREDICT_FALSE(highest_txn_id_ <= kIdStatusDataNotLoaded)) {
+    return Status::ServiceUnavailable("transaction status data is not loaded");
+  }
+  return Status::OK();
+}
+
 Status TxnStatusManager::GetTransaction(int64_t txn_id,
                                         const boost::optional<string>& user,
                                         scoped_refptr<TransactionEntry>* txn) 
const {
   std::lock_guard<simple_spinlock> l(lock_);
+
+  // First, make sure the transaction status data has been loaded. If not, then
+  // the caller might get an unexpected error response and bail instead of
+  // retrying a bit later and getting proper response.
+  RETURN_NOT_OK(CheckTxnStatusDataLoadedUnlocked());
+
   scoped_refptr<TransactionEntry> ret = FindPtrOrNull(txns_by_id_, txn_id);
   if (PREDICT_FALSE(!ret)) {
     return Status::NotFound(
@@ -114,8 +159,14 @@ Status TxnStatusManager::GetTransaction(int64_t txn_id,
 Status TxnStatusManager::BeginTransaction(int64_t txn_id, const string& user,
                                           TabletServerErrorPB* ts_error) {
   {
-    // First, make sure the requested ID is viable.
     std::lock_guard<simple_spinlock> l(lock_);
+    // First, make sure the transaction status data has been loaded.
+    // If not, then there is chance that, being a leader, this replica might
+    // register a transaction with the identifier which is lower than the
+    // identifiers of already registered transactions.
+    RETURN_NOT_OK(CheckTxnStatusDataLoadedUnlocked());
+
+    // Second, make sure the requested ID is viable.
     if (PREDICT_FALSE(txn_id <= highest_txn_id_)) {
       return Status::InvalidArgument(
           Substitute("transaction ID $0 is not higher than the highest ID so 
far: $1",
diff --git a/src/kudu/transactions/txn_status_manager.h 
b/src/kudu/transactions/txn_status_manager.h
index b6ee2ae..ba22023 100644
--- a/src/kudu/transactions/txn_status_manager.h
+++ b/src/kudu/transactions/txn_status_manager.h
@@ -53,6 +53,8 @@ typedef std::unordered_map<int64_t, 
scoped_refptr<TransactionEntry>> Transaction
 // status tablet.
 class TxnStatusManagerBuildingVisitor : public TransactionsVisitor {
  public:
+  TxnStatusManagerBuildingVisitor();
+  ~TxnStatusManagerBuildingVisitor() = default;
   // Builds a TransactionEntry for the given metadata and keeps track of it in
   // txns_by_id_. This is not thread-safe -- callers should ensure only a
   // single thread calls it at once.
@@ -63,18 +65,16 @@ class TxnStatusManagerBuildingVisitor : public 
TransactionsVisitor {
   // per call to VisitTransactionEntries().
   void Release(int64_t* highest_txn_id, TransactionsMap* txns_by_id);
  private:
-  int64_t highest_txn_id_ = -1;
+  int64_t highest_txn_id_;
   TransactionsMap txns_by_id_;
 };
 
 // Manages ongoing transactions and participants therein, backed by an
 // underlying tablet.
-class TxnStatusManager : public tablet::TxnCoordinator {
+class TxnStatusManager final : public tablet::TxnCoordinator {
  public:
-  explicit TxnStatusManager(tablet::TabletReplica* tablet_replica)
-      : highest_txn_id_(-1),
-        status_tablet_(tablet_replica) {}
-  virtual ~TxnStatusManager() {}
+  explicit TxnStatusManager(tablet::TabletReplica* tablet_replica);
+  ~TxnStatusManager() = default;
 
   // Loads the contents of the status tablet into memory.
   Status LoadFromTablet() override;
@@ -135,6 +135,11 @@ class TxnStatusManager : public tablet::TxnCoordinator {
   }
 
  private:
+  // Verifies that the transaction status data has already been loaded from the
+  // underlying tablet. Returns Status::OK() if the data is loaded, otherwise
+  // returns Status::ServiceUnavailable().
+  Status CheckTxnStatusDataLoadedUnlocked() const;
+
   // Returns the transaction entry, returning an error if the transaction ID
   // doesn't exist or if 'user' is specified but isn't the owner of the
   // transaction.

Reply via email to