This is an automated email from the ASF dual-hosted git repository.
alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new 9b4a83a KUDU-2612 p2 (c): small fix on TxnStatusManager's behavior
9b4a83a is described below
commit 9b4a83ac71621211582b9eb0c471d8ac4fd9b984
Author: Alexey Serbin <[email protected]>
AuthorDate: Thu Oct 1 23:22:52 2020 -0700
KUDU-2612 p2 (c): small fix on TxnStatusManager's behavior
This patch adds more consistency into the TxnStatusManager's behavior
during the time interval between the following events:
* The state of the leader replica of a transaction status tablet
changes from BOOTSTRAPPING to RUNNING (see TabletReplica::Start()
for details).
* TxnStatusManager's runtime structures are populated with the
information loaded from the tablet
(i.e. TxnStatusManager::LoadFromTablet() successfully completes).
Before this patch, relevant methods of the TxnStatusManager class could
behave inconsistently within the mentioned interval. With this patch,
those methods return Status::ServiceUnavailable() and do not exhibit any
inconsistent behavior.
This patch also adds a new test scenario to cover the new behavior.
This is a follow-up to efd8c4f165460b7fa337b8ebd1856b10bc274311.
Change-Id: Ia95f821d87145aff6587c017a425e205bdcb8b2b
Reviewed-on: http://gerrit.cloudera.org:8080/16537
Tested-by: Kudu Jenkins
Reviewed-by: Alexey Serbin <[email protected]>
---
.../integration-tests/ts_tablet_manager-itest.cc | 35 ++++++++++++--
src/kudu/transactions/txn_status_manager-test.cc | 46 ++++++++++++++++++-
src/kudu/transactions/txn_status_manager.cc | 53 +++++++++++++++++++++-
src/kudu/transactions/txn_status_manager.h | 17 ++++---
4 files changed, 139 insertions(+), 12 deletions(-)
diff --git a/src/kudu/integration-tests/ts_tablet_manager-itest.cc
b/src/kudu/integration-tests/ts_tablet_manager-itest.cc
index ae6102e..75bb41f 100644
--- a/src/kudu/integration-tests/ts_tablet_manager-itest.cc
+++ b/src/kudu/integration-tests/ts_tablet_manager-itest.cc
@@ -1052,7 +1052,8 @@ class TxnStatusTabletManagementTest : public
TsTabletManagerITest {
// Creates a tablet with a fixed schema, with the given tablet ID.
Status CreateTablet(MiniTabletServer* ts,
const string& tablet_id,
- bool is_txn_status_tablet) {
+ bool is_txn_status_tablet,
+ scoped_refptr<TabletReplica>* replica = nullptr) {
CreateTabletRequestPB req = CreateTxnTabletReq(
tablet_id, ts->server()->fs_manager()->uuid(), ts->CreateLocalConfig(),
is_txn_status_tablet);
@@ -1074,12 +1075,38 @@ class TxnStatusTabletManagementTest : public
TsTabletManagerITest {
// Wait for the tablet to be in RUNNING state and its consensus running
too.
RETURN_NOT_OK(r->WaitUntilConsensusRunning(kTimeout));
- return r->consensus()->WaitUntilLeaderForTests(kTimeout);
+ auto s = r->consensus()->WaitUntilLeaderForTests(kTimeout);
+ if (replica) {
+ *replica = std::move(r);
+ }
+ return s;
}
- // Creates a transaction status tablet at the given tablet server.
+ // Creates a transaction status tablet at the given tablet server,
+ // waiting for the transaction status data to be loaded from the tablet.
Status CreateTxnStatusTablet(MiniTabletServer* ts) {
- return CreateTablet(ts, kTxnStatusTabletId, /*is_txn_status_tablet*/true);
+ scoped_refptr<TabletReplica> r;
+ RETURN_NOT_OK(CreateTablet(
+ ts, kTxnStatusTabletId, /*is_txn_status_tablet*/true, &r));
+ TxnCoordinator* c = CHECK_NOTNULL(CHECK_NOTNULL(r)->txn_coordinator());
+
+ // Wait for the transaction status data to be loaded. The verification
below
+ // relies on the internals of the TxnStatusManager's implementation, but it
+ // seems OK for test purposes.
+ const auto deadline = MonoTime::Now() + kTimeout;
+ bool is_data_loaded = false;
+ do {
+ auto txn_id = c->highest_txn_id();
+ if (txn_id >= -1) {
+ is_data_loaded = true;
+ break;
+ }
+ SleepFor(MonoDelta::FromMilliseconds(10));
+ } while (MonoTime::Now() < deadline);
+
+ return is_data_loaded
+ ? Status::OK()
+ : Status::TimedOut("timed out waiting for txn status data to be
loaded");
}
static Status StartTransactions(const ParticipantIdsByTxnId& txns,
TxnCoordinator* coordinator) {
diff --git a/src/kudu/transactions/txn_status_manager-test.cc
b/src/kudu/transactions/txn_status_manager-test.cc
index 9934d41..7ceb0fe 100644
--- a/src/kudu/transactions/txn_status_manager-test.cc
+++ b/src/kudu/transactions/txn_status_manager-test.cc
@@ -19,6 +19,7 @@
#include <algorithm>
#include <cstdint>
+#include <initializer_list>
#include <map>
#include <memory>
#include <mutex>
@@ -40,9 +41,9 @@
#include "kudu/tablet/tablet-test-util.h"
#include "kudu/tablet/tablet_replica-test-base.h"
#include "kudu/tablet/txn_coordinator.h"
-#include "kudu/tserver/tserver.pb.h"
#include "kudu/transactions/transactions.pb.h"
#include "kudu/transactions/txn_status_tablet.h"
+#include "kudu/tserver/tserver.pb.h"
#include "kudu/util/barrier.h"
#include "kudu/util/countdown_latch.h"
#include "kudu/util/locks.h"
@@ -88,6 +89,7 @@ class TxnStatusManagerTest : public TabletReplicaTestBase {
ConsensusBootstrapInfo info;
ASSERT_OK(StartReplicaAndWaitUntilLeader(info));
txn_manager_.reset(new TxnStatusManager(tablet_replica_.get()));
+ ASSERT_OK(txn_manager_->LoadFromTablet());
}
protected:
unique_ptr<TxnStatusManager> txn_manager_;
@@ -152,6 +154,48 @@ TEST_F(TxnStatusManagerTest, TestStartTransactions) {
txn_manager_reloaded.GetParticipantsByTxnIdForTests());
ASSERT_EQ(3, txn_manager_reloaded.highest_txn_id());
}
+
+ // Verify that TxnStatusManager methods return Status::ServiceUnavailable()
+ // if the transaction status tablet's data is not loaded yet.
+ ASSERT_OK(RestartReplica());
+ {
+ TxnStatusManager tsm(tablet_replica_.get());
+ // Check for the special value of the highest_txn_id when the data from
+ // the transaction status tablet isn't loaded yet.
+ ASSERT_EQ(-2, tsm.highest_txn_id());
+
+ // Regardless of transaction identifiers and the records stored in the
+ // transaction status tablet, all relevant methods should return
+ // Status::ServiceUnavailable().
+ const string kErrMsg = "transaction status data is not loaded";
+ TabletServerErrorPB ts_error;
+ for (int64_t txn_id : { 0, 1, 3, 4 }) {
+ auto s = tsm.BeginTransaction(txn_id, kOwner, &ts_error);
+ ASSERT_TRUE(s.IsServiceUnavailable());
+ ASSERT_STR_CONTAINS(s.ToString(), kErrMsg);
+
+ s = tsm.BeginCommitTransaction(txn_id, kOwner, &ts_error);
+ ASSERT_TRUE(s.IsServiceUnavailable());
+ ASSERT_STR_CONTAINS(s.ToString(), kErrMsg);
+
+ s = tsm.FinalizeCommitTransaction(txn_id);
+ ASSERT_TRUE(s.IsServiceUnavailable());
+ ASSERT_STR_CONTAINS(s.ToString(), kErrMsg);
+
+ s = tsm.AbortTransaction(txn_id, kOwner, &ts_error);
+ ASSERT_TRUE(s.IsServiceUnavailable());
+ ASSERT_STR_CONTAINS(s.ToString(), kErrMsg);
+
+ transactions::TxnStatusEntryPB txn_status;
+ s = tsm.GetTransactionStatus(txn_id, kOwner, &txn_status);
+ ASSERT_TRUE(s.IsServiceUnavailable());
+ ASSERT_STR_CONTAINS(s.ToString(), kErrMsg);
+
+ s = tsm.RegisterParticipant(txn_id, kParticipant1, kOwner, &ts_error);
+ ASSERT_TRUE(s.IsServiceUnavailable());
+ ASSERT_STR_CONTAINS(s.ToString(), kErrMsg);
+ }
+ }
}
TEST_F(TxnStatusManagerTest, TestStartTransactionsConcurrently) {
diff --git a/src/kudu/transactions/txn_status_manager.cc
b/src/kudu/transactions/txn_status_manager.cc
index 84edfd7..c583b11 100644
--- a/src/kudu/transactions/txn_status_manager.cc
+++ b/src/kudu/transactions/txn_status_manager.cc
@@ -45,6 +45,24 @@ using strings::Substitute;
namespace kudu {
namespace transactions {
+namespace {
+
+// The following special values are used to determine whether the data from
+// the underlying transaction status tablet has already been loaded
+// (an alternative would be introducing a dedicated member field into the
+// TxnStatusManager):
+// * kIdStatusDataNotLoaded: value assigned at construction time
+// * kIdStatusDataReady: value after loading data from the transaction status
+// tablet (unless the tablet contains a record with higher txn_id)
+constexpr int64_t kIdStatusDataNotLoaded = -2;
+constexpr int64_t kIdStatusDataReady = -1;
+
+} // anonymous namespace
+
+TxnStatusManagerBuildingVisitor::TxnStatusManagerBuildingVisitor()
+ : highest_txn_id_(kIdStatusDataReady) {
+}
+
void TxnStatusManagerBuildingVisitor::VisitTransactionEntries(
int64_t txn_id, TxnStatusEntryPB status_entry_pb,
vector<ParticipantIdAndPB> participants) {
@@ -80,6 +98,11 @@ void TxnStatusManagerBuildingVisitor::Release(
*txns_by_id = std::move(txns_by_id_);
}
+TxnStatusManager::TxnStatusManager(tablet::TabletReplica* tablet_replica)
+ : highest_txn_id_(kIdStatusDataNotLoaded),
+ status_tablet_(tablet_replica) {
+}
+
Status TxnStatusManager::LoadFromTablet() {
TxnStatusManagerBuildingVisitor v;
RETURN_NOT_OK(status_tablet_.VisitTransactions(&v));
@@ -93,10 +116,32 @@ Status TxnStatusManager::LoadFromTablet() {
return Status::OK();
}
+Status TxnStatusManager::CheckTxnStatusDataLoadedUnlocked() const {
+ DCHECK(lock_.is_locked());
+ // TODO(aserbin): this is just to handle requests which come in a short time
+ // interval when the leader replica of the transaction status
+ // tablet is already in RUNNING state, but the records from
+ // the tablet hasn't yet been loaded into the runtime
+ // structures of this TxnStatusManager instance. However,
+ // the case when a former leader replica is queried about the
+ // status of transactions which it is no longer aware of
should
+ // be handled separately.
+ if (PREDICT_FALSE(highest_txn_id_ <= kIdStatusDataNotLoaded)) {
+ return Status::ServiceUnavailable("transaction status data is not loaded");
+ }
+ return Status::OK();
+}
+
Status TxnStatusManager::GetTransaction(int64_t txn_id,
const boost::optional<string>& user,
scoped_refptr<TransactionEntry>* txn)
const {
std::lock_guard<simple_spinlock> l(lock_);
+
+ // First, make sure the transaction status data has been loaded. If not, then
+ // the caller might get an unexpected error response and bail instead of
+ // retrying a bit later and getting proper response.
+ RETURN_NOT_OK(CheckTxnStatusDataLoadedUnlocked());
+
scoped_refptr<TransactionEntry> ret = FindPtrOrNull(txns_by_id_, txn_id);
if (PREDICT_FALSE(!ret)) {
return Status::NotFound(
@@ -114,8 +159,14 @@ Status TxnStatusManager::GetTransaction(int64_t txn_id,
Status TxnStatusManager::BeginTransaction(int64_t txn_id, const string& user,
TabletServerErrorPB* ts_error) {
{
- // First, make sure the requested ID is viable.
std::lock_guard<simple_spinlock> l(lock_);
+ // First, make sure the transaction status data has been loaded.
+ // If not, then there is chance that, being a leader, this replica might
+ // register a transaction with the identifier which is lower than the
+ // identifiers of already registered transactions.
+ RETURN_NOT_OK(CheckTxnStatusDataLoadedUnlocked());
+
+ // Second, make sure the requested ID is viable.
if (PREDICT_FALSE(txn_id <= highest_txn_id_)) {
return Status::InvalidArgument(
Substitute("transaction ID $0 is not higher than the highest ID so
far: $1",
diff --git a/src/kudu/transactions/txn_status_manager.h
b/src/kudu/transactions/txn_status_manager.h
index b6ee2ae..ba22023 100644
--- a/src/kudu/transactions/txn_status_manager.h
+++ b/src/kudu/transactions/txn_status_manager.h
@@ -53,6 +53,8 @@ typedef std::unordered_map<int64_t,
scoped_refptr<TransactionEntry>> Transaction
// status tablet.
class TxnStatusManagerBuildingVisitor : public TransactionsVisitor {
public:
+ TxnStatusManagerBuildingVisitor();
+ ~TxnStatusManagerBuildingVisitor() = default;
// Builds a TransactionEntry for the given metadata and keeps track of it in
// txns_by_id_. This is not thread-safe -- callers should ensure only a
// single thread calls it at once.
@@ -63,18 +65,16 @@ class TxnStatusManagerBuildingVisitor : public
TransactionsVisitor {
// per call to VisitTransactionEntries().
void Release(int64_t* highest_txn_id, TransactionsMap* txns_by_id);
private:
- int64_t highest_txn_id_ = -1;
+ int64_t highest_txn_id_;
TransactionsMap txns_by_id_;
};
// Manages ongoing transactions and participants therein, backed by an
// underlying tablet.
-class TxnStatusManager : public tablet::TxnCoordinator {
+class TxnStatusManager final : public tablet::TxnCoordinator {
public:
- explicit TxnStatusManager(tablet::TabletReplica* tablet_replica)
- : highest_txn_id_(-1),
- status_tablet_(tablet_replica) {}
- virtual ~TxnStatusManager() {}
+ explicit TxnStatusManager(tablet::TabletReplica* tablet_replica);
+ ~TxnStatusManager() = default;
// Loads the contents of the status tablet into memory.
Status LoadFromTablet() override;
@@ -135,6 +135,11 @@ class TxnStatusManager : public tablet::TxnCoordinator {
}
private:
+ // Verifies that the transaction status data has already been loaded from the
+ // underlying tablet. Returns Status::OK() if the data is loaded, otherwise
+ // returns Status::ServiceUnavailable().
+ Status CheckTxnStatusDataLoadedUnlocked() const;
+
// Returns the transaction entry, returning an error if the transaction ID
// doesn't exist or if 'user' is specified but isn't the owner of the
// transaction.