KUDU-2322: don't spew logs about behind-log-GC follower This patch addresses the issue of a leader spewing a lot of INFO log messages about missing logs segments for a follower that falls behind log GC. In the 3-4-3 replica management scheme, such followers linger around until corresponding non-voter replica catches up with the leader. That might take long time.
With this patch, the leader throttles messages about missing log segments for a behind-log-GC follower, outputting 1 message per minute. Change-Id: I4438e63aef0be26c149c57f9f58cb54327e41a33 Reviewed-on: http://gerrit.cloudera.org:8080/9573 Reviewed-by: Todd Lipcon <t...@apache.org> Reviewed-by: Alexey Serbin <aser...@cloudera.com> Tested-by: Kudu Jenkins (cherry picked from commit d856107e0b58067f0bbebbbda52f4f67c674897a) Reviewed-on: http://gerrit.cloudera.org:8080/9609 Tested-by: Grant Henke <granthe...@gmail.com> Project: http://git-wip-us.apache.org/repos/asf/kudu/repo Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/62cb06e8 Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/62cb06e8 Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/62cb06e8 Branch: refs/heads/branch-1.7.x Commit: 62cb06e82fa064cc0e5d1c7b795f70310ddd2868 Parents: 9e20ca6 Author: Alexey Serbin <aser...@cloudera.com> Authored: Fri Mar 9 20:31:24 2018 -0800 Committer: Grant Henke <granthe...@gmail.com> Committed: Tue Mar 13 21:20:00 2018 +0000 ---------------------------------------------------------------------- src/kudu/consensus/consensus-test-util.h | 3 ++- src/kudu/consensus/consensus_peers.cc | 3 +-- src/kudu/consensus/consensus_queue.cc | 34 ++++++++++++++++++++------- src/kudu/consensus/consensus_queue.h | 20 +++++----------- src/kudu/util/logging.h | 2 +- 5 files changed, 36 insertions(+), 26 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kudu/blob/62cb06e8/src/kudu/consensus/consensus-test-util.h ---------------------------------------------------------------------- diff --git a/src/kudu/consensus/consensus-test-util.h b/src/kudu/consensus/consensus-test-util.h index 9cb0473..36c2354 100644 --- a/src/kudu/consensus/consensus-test-util.h +++ b/src/kudu/consensus/consensus-test-util.h @@ -27,16 +27,17 @@ #include <boost/bind.hpp> #include <gmock/gmock.h> +#include "kudu/clock/clock.h" #include "kudu/common/timestamp.h" #include "kudu/common/wire_protocol.h" #include "kudu/consensus/consensus_peers.h" #include "kudu/consensus/consensus_queue.h" #include "kudu/consensus/log.h" +#include "kudu/consensus/opid_util.h" #include "kudu/consensus/raft_consensus.h" #include "kudu/gutil/map-util.h" #include "kudu/gutil/strings/substitute.h" #include "kudu/rpc/messenger.h" -#include "kudu/clock/clock.h" #include "kudu/util/countdown_latch.h" #include "kudu/util/locks.h" #include "kudu/util/pb_util.h" http://git-wip-us.apache.org/repos/asf/kudu/blob/62cb06e8/src/kudu/consensus/consensus_peers.cc ---------------------------------------------------------------------- diff --git a/src/kudu/consensus/consensus_peers.cc b/src/kudu/consensus/consensus_peers.cc index 4724ad9..8f3dadd 100644 --- a/src/kudu/consensus/consensus_peers.cc +++ b/src/kudu/consensus/consensus_peers.cc @@ -227,8 +227,7 @@ void Peer::SendNextRequest(bool even_if_queue_empty) { request_.committed_index() : kMinimumOpIdIndex; if (PREDICT_FALSE(!s.ok())) { - LOG_WITH_PREFIX_UNLOCKED(INFO) << "Could not obtain request from queue for peer: " - << peer_pb_.permanent_uuid() << ". Status: " << s.ToString(); + VLOG_WITH_PREFIX_UNLOCKED(1) << s.ToString(); return; } http://git-wip-us.apache.org/repos/asf/kudu/blob/62cb06e8/src/kudu/consensus/consensus_queue.cc ---------------------------------------------------------------------- diff --git a/src/kudu/consensus/consensus_queue.cc b/src/kudu/consensus/consensus_queue.cc index 5b6206d..28110aa 100644 --- a/src/kudu/consensus/consensus_queue.cc +++ b/src/kudu/consensus/consensus_queue.cc @@ -20,6 +20,7 @@ #include <cstdint> #include <functional> #include <iostream> +#include <memory> #include <mutex> #include <string> #include <unordered_set> @@ -122,6 +123,19 @@ const char* PeerStatusToString(PeerStatus p) { return "<unknown>"; } +PeerMessageQueue::TrackedPeer::TrackedPeer(RaftPeerPB peer_pb) + : peer_pb(std::move(peer_pb)), + next_index(kInvalidOpIdIndex), + last_received(MinimumOpId()), + last_known_committed_index(MinimumOpId().index()), + last_exchange_status(PeerStatus::NEW), + last_communication_time(MonoTime::Now()), + wal_catchup_possible(true), + last_overall_health_status(HealthReportPB::UNKNOWN), + status_log_throttler(std::make_shared<logging::LogThrottler>()), + last_seen_term_(0) { +} + std::string PeerMessageQueue::TrackedPeer::ToString() const { return Substitute("Peer: $0, Status: $1, Last received: $2, Next index: $3, " "Last known committed idx: $4, Time since last communication: $5", @@ -587,7 +601,8 @@ Status PeerMessageQueue::RequestForPeer(const string& uuid, TrackedPeer* peer = FindPtrOrNull(peers_map_, uuid); if (PREDICT_FALSE(peer == nullptr || queue_state_.mode == NON_LEADER)) { - return Status::NotFound("Peer not tracked or queue not in leader mode."); + return Status::NotFound(Substitute("peer $0 is no longer tracked or " + "queue is not in leader mode", uuid)); } peer_copy = *peer; @@ -645,17 +660,20 @@ Status PeerMessageQueue::RequestForPeer(const string& uuid, &preceding_id); if (PREDICT_FALSE(!s.ok())) { // It's normal to have a NotFound() here if a follower falls behind where - // the leader has GCed its logs. + // the leader has GCed its logs. The follower replica will hang around + // for a while until it's evicted. if (PREDICT_TRUE(s.IsNotFound())) { - string msg = Substitute("The logs necessary to catch up peer $0 have been " - "garbage collected. The follower will never be able " - "to catch up ($1)", uuid, s.ToString()); + KLOG_EVERY_N_SECS_THROTTLER(INFO, 60, *peer_copy.status_log_throttler, "logs_gced") + << LogPrefixUnlocked() + << Substitute("The logs necessary to catch up peer $0 have been " + "garbage collected. The follower will never be able " + "to catch up ($1)", uuid, s.ToString()); wal_catchup_failure = true; return s; - // IsIncomplete() means that we tried to read beyond the head of the log - // (in the future). See KUDU-1078. } if (s.IsIncomplete()) { + // IsIncomplete() means that we tried to read beyond the head of the log + // (in the future). See KUDU-1078. LOG_WITH_PREFIX_UNLOCKED(ERROR) << "Error trying to read ahead of the log " << "while preparing peer request: " << s.ToString() << ". Destination peer: " @@ -690,7 +708,7 @@ Status PeerMessageQueue::RequestForPeer(const string& uuid, if (request->ops_size() > 0) { int64_t last_op_sent = request->ops(request->ops_size() - 1).id().index(); if (last_op_sent < request->committed_index()) { - KLOG_EVERY_N_SECS_THROTTLER(INFO, 3, peer_copy.status_log_throttler, "lagging") + KLOG_EVERY_N_SECS_THROTTLER(INFO, 3, *peer_copy.status_log_throttler, "lagging") << LogPrefixUnlocked() << "Peer " << uuid << " is lagging by at least " << (request->committed_index() - last_op_sent) << " ops behind the committed index " << THROTTLE_MSG; http://git-wip-us.apache.org/repos/asf/kudu/blob/62cb06e8/src/kudu/consensus/consensus_queue.h ---------------------------------------------------------------------- diff --git a/src/kudu/consensus/consensus_queue.h b/src/kudu/consensus/consensus_queue.h index e3a148c..626f8e3 100644 --- a/src/kudu/consensus/consensus_queue.h +++ b/src/kudu/consensus/consensus_queue.h @@ -24,7 +24,6 @@ #include <memory> #include <string> #include <unordered_map> -#include <utility> #include <vector> #include <boost/optional/optional.hpp> @@ -34,13 +33,11 @@ #include "kudu/consensus/log_cache.h" #include "kudu/consensus/metadata.pb.h" #include "kudu/consensus/opid.pb.h" -#include "kudu/consensus/opid_util.h" #include "kudu/consensus/ref_counted_replicate.h" #include "kudu/gutil/gscoped_ptr.h" #include "kudu/gutil/ref_counted.h" #include "kudu/gutil/threading/thread_collision_warner.h" #include "kudu/util/locks.h" -#include "kudu/util/logging.h" #include "kudu/util/metrics.h" #include "kudu/util/monotime.h" #include "kudu/util/status.h" @@ -53,6 +50,10 @@ namespace log { class Log; } +namespace logging { +class LogThrottler; +} + namespace consensus { class ConsensusRequestPB; class ConsensusResponsePB; @@ -114,16 +115,7 @@ const char* PeerStatusToString(PeerStatus p); class PeerMessageQueue { public: struct TrackedPeer { - explicit TrackedPeer(RaftPeerPB peer_pb) - : peer_pb(std::move(peer_pb)), - next_index(kInvalidOpIdIndex), - last_received(MinimumOpId()), - last_known_committed_index(MinimumOpId().index()), - last_exchange_status(PeerStatus::NEW), - last_communication_time(MonoTime::Now()), - wal_catchup_possible(true), - last_overall_health_status(HealthReportPB::UNKNOWN), - last_seen_term_(0) {} + explicit TrackedPeer(RaftPeerPB peer_pb); TrackedPeer() = default; @@ -180,7 +172,7 @@ class PeerMessageQueue { // Throttler for how often we will log status messages pertaining to this // peer (eg when it is lagging, etc). - logging::LogThrottler status_log_throttler; + std::shared_ptr<logging::LogThrottler> status_log_throttler; private: // The last term we saw from a given peer. http://git-wip-us.apache.org/repos/asf/kudu/blob/62cb06e8/src/kudu/util/logging.h ---------------------------------------------------------------------- diff --git a/src/kudu/util/logging.h b/src/kudu/util/logging.h index 9aa06ac..fbd614f 100644 --- a/src/kudu/util/logging.h +++ b/src/kudu/util/logging.h @@ -155,7 +155,7 @@ class ScopedDisableRedaction { #define KLOG_EVERY_N_SECS_THROTTLER(severity, n_secs, throttler, tag) \ int VARNAME_LINENUM(num_suppressed) = 0; \ - if (throttler.ShouldLog(n_secs, tag, &VARNAME_LINENUM(num_suppressed))) \ + if ((throttler).ShouldLog(n_secs, tag, &VARNAME_LINENUM(num_suppressed))) \ google::LogMessage( \ __FILE__, __LINE__, google::GLOG_ ## severity, VARNAME_LINENUM(num_suppressed), \ &google::LogMessage::SendToLog).stream()