KUDU-2322: don't spew logs about behind-log-GC follower

This patch addresses the issue of a leader spewing a lot of INFO
log messages about missing logs segments for a follower that falls
behind log GC.  In the 3-4-3 replica management scheme, such followers
linger around until corresponding non-voter replica catches up with
the leader.  That might take long time.

With this patch, the leader throttles messages about missing log
segments for a behind-log-GC follower, outputting 1 message per minute.

Change-Id: I4438e63aef0be26c149c57f9f58cb54327e41a33
Reviewed-on: http://gerrit.cloudera.org:8080/9573
Reviewed-by: Todd Lipcon <t...@apache.org>
Reviewed-by: Alexey Serbin <aser...@cloudera.com>
Tested-by: Kudu Jenkins
(cherry picked from commit d856107e0b58067f0bbebbbda52f4f67c674897a)
Reviewed-on: http://gerrit.cloudera.org:8080/9609
Tested-by: Grant Henke <granthe...@gmail.com>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/62cb06e8
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/62cb06e8
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/62cb06e8

Branch: refs/heads/branch-1.7.x
Commit: 62cb06e82fa064cc0e5d1c7b795f70310ddd2868
Parents: 9e20ca6
Author: Alexey Serbin <aser...@cloudera.com>
Authored: Fri Mar 9 20:31:24 2018 -0800
Committer: Grant Henke <granthe...@gmail.com>
Committed: Tue Mar 13 21:20:00 2018 +0000

----------------------------------------------------------------------
 src/kudu/consensus/consensus-test-util.h |  3 ++-
 src/kudu/consensus/consensus_peers.cc    |  3 +--
 src/kudu/consensus/consensus_queue.cc    | 34 ++++++++++++++++++++-------
 src/kudu/consensus/consensus_queue.h     | 20 +++++-----------
 src/kudu/util/logging.h                  |  2 +-
 5 files changed, 36 insertions(+), 26 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/62cb06e8/src/kudu/consensus/consensus-test-util.h
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/consensus-test-util.h 
b/src/kudu/consensus/consensus-test-util.h
index 9cb0473..36c2354 100644
--- a/src/kudu/consensus/consensus-test-util.h
+++ b/src/kudu/consensus/consensus-test-util.h
@@ -27,16 +27,17 @@
 #include <boost/bind.hpp>
 #include <gmock/gmock.h>
 
+#include "kudu/clock/clock.h"
 #include "kudu/common/timestamp.h"
 #include "kudu/common/wire_protocol.h"
 #include "kudu/consensus/consensus_peers.h"
 #include "kudu/consensus/consensus_queue.h"
 #include "kudu/consensus/log.h"
+#include "kudu/consensus/opid_util.h"
 #include "kudu/consensus/raft_consensus.h"
 #include "kudu/gutil/map-util.h"
 #include "kudu/gutil/strings/substitute.h"
 #include "kudu/rpc/messenger.h"
-#include "kudu/clock/clock.h"
 #include "kudu/util/countdown_latch.h"
 #include "kudu/util/locks.h"
 #include "kudu/util/pb_util.h"

http://git-wip-us.apache.org/repos/asf/kudu/blob/62cb06e8/src/kudu/consensus/consensus_peers.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/consensus_peers.cc 
b/src/kudu/consensus/consensus_peers.cc
index 4724ad9..8f3dadd 100644
--- a/src/kudu/consensus/consensus_peers.cc
+++ b/src/kudu/consensus/consensus_peers.cc
@@ -227,8 +227,7 @@ void Peer::SendNextRequest(bool even_if_queue_empty) {
       request_.committed_index() : kMinimumOpIdIndex;
 
   if (PREDICT_FALSE(!s.ok())) {
-    LOG_WITH_PREFIX_UNLOCKED(INFO) << "Could not obtain request from queue for 
peer: "
-        << peer_pb_.permanent_uuid() << ". Status: " << s.ToString();
+    VLOG_WITH_PREFIX_UNLOCKED(1) << s.ToString();
     return;
   }
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/62cb06e8/src/kudu/consensus/consensus_queue.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/consensus_queue.cc 
b/src/kudu/consensus/consensus_queue.cc
index 5b6206d..28110aa 100644
--- a/src/kudu/consensus/consensus_queue.cc
+++ b/src/kudu/consensus/consensus_queue.cc
@@ -20,6 +20,7 @@
 #include <cstdint>
 #include <functional>
 #include <iostream>
+#include <memory>
 #include <mutex>
 #include <string>
 #include <unordered_set>
@@ -122,6 +123,19 @@ const char* PeerStatusToString(PeerStatus p) {
   return "<unknown>";
 }
 
+PeerMessageQueue::TrackedPeer::TrackedPeer(RaftPeerPB peer_pb)
+    : peer_pb(std::move(peer_pb)),
+      next_index(kInvalidOpIdIndex),
+      last_received(MinimumOpId()),
+      last_known_committed_index(MinimumOpId().index()),
+      last_exchange_status(PeerStatus::NEW),
+      last_communication_time(MonoTime::Now()),
+      wal_catchup_possible(true),
+      last_overall_health_status(HealthReportPB::UNKNOWN),
+      status_log_throttler(std::make_shared<logging::LogThrottler>()),
+      last_seen_term_(0) {
+}
+
 std::string PeerMessageQueue::TrackedPeer::ToString() const {
   return Substitute("Peer: $0, Status: $1, Last received: $2, Next index: $3, "
                     "Last known committed idx: $4, Time since last 
communication: $5",
@@ -587,7 +601,8 @@ Status PeerMessageQueue::RequestForPeer(const string& uuid,
 
     TrackedPeer* peer = FindPtrOrNull(peers_map_, uuid);
     if (PREDICT_FALSE(peer == nullptr || queue_state_.mode == NON_LEADER)) {
-      return Status::NotFound("Peer not tracked or queue not in leader mode.");
+      return Status::NotFound(Substitute("peer $0 is no longer tracked or "
+                                         "queue is not in leader mode", uuid));
     }
     peer_copy = *peer;
 
@@ -645,17 +660,20 @@ Status PeerMessageQueue::RequestForPeer(const string& 
uuid,
                                   &preceding_id);
     if (PREDICT_FALSE(!s.ok())) {
       // It's normal to have a NotFound() here if a follower falls behind where
-      // the leader has GCed its logs.
+      // the leader has GCed its logs. The follower replica will hang around
+      // for a while until it's evicted.
       if (PREDICT_TRUE(s.IsNotFound())) {
-        string msg = Substitute("The logs necessary to catch up peer $0 have 
been "
-                                "garbage collected. The follower will never be 
able "
-                                "to catch up ($1)", uuid, s.ToString());
+        KLOG_EVERY_N_SECS_THROTTLER(INFO, 60, *peer_copy.status_log_throttler, 
"logs_gced")
+            << LogPrefixUnlocked()
+            << Substitute("The logs necessary to catch up peer $0 have been "
+                          "garbage collected. The follower will never be able "
+                          "to catch up ($1)", uuid, s.ToString());
         wal_catchup_failure = true;
         return s;
-      // IsIncomplete() means that we tried to read beyond the head of the log
-      // (in the future). See KUDU-1078.
       }
       if (s.IsIncomplete()) {
+        // IsIncomplete() means that we tried to read beyond the head of the 
log
+        // (in the future). See KUDU-1078.
         LOG_WITH_PREFIX_UNLOCKED(ERROR) << "Error trying to read ahead of the 
log "
                                         << "while preparing peer request: "
                                         << s.ToString() << ". Destination 
peer: "
@@ -690,7 +708,7 @@ Status PeerMessageQueue::RequestForPeer(const string& uuid,
   if (request->ops_size() > 0) {
     int64_t last_op_sent = request->ops(request->ops_size() - 1).id().index();
     if (last_op_sent < request->committed_index()) {
-      KLOG_EVERY_N_SECS_THROTTLER(INFO, 3, peer_copy.status_log_throttler, 
"lagging")
+      KLOG_EVERY_N_SECS_THROTTLER(INFO, 3, *peer_copy.status_log_throttler, 
"lagging")
           << LogPrefixUnlocked() << "Peer " << uuid << " is lagging by at 
least "
           << (request->committed_index() - last_op_sent)
           << " ops behind the committed index " << THROTTLE_MSG;

http://git-wip-us.apache.org/repos/asf/kudu/blob/62cb06e8/src/kudu/consensus/consensus_queue.h
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/consensus_queue.h 
b/src/kudu/consensus/consensus_queue.h
index e3a148c..626f8e3 100644
--- a/src/kudu/consensus/consensus_queue.h
+++ b/src/kudu/consensus/consensus_queue.h
@@ -24,7 +24,6 @@
 #include <memory>
 #include <string>
 #include <unordered_map>
-#include <utility>
 #include <vector>
 
 #include <boost/optional/optional.hpp>
@@ -34,13 +33,11 @@
 #include "kudu/consensus/log_cache.h"
 #include "kudu/consensus/metadata.pb.h"
 #include "kudu/consensus/opid.pb.h"
-#include "kudu/consensus/opid_util.h"
 #include "kudu/consensus/ref_counted_replicate.h"
 #include "kudu/gutil/gscoped_ptr.h"
 #include "kudu/gutil/ref_counted.h"
 #include "kudu/gutil/threading/thread_collision_warner.h"
 #include "kudu/util/locks.h"
-#include "kudu/util/logging.h"
 #include "kudu/util/metrics.h"
 #include "kudu/util/monotime.h"
 #include "kudu/util/status.h"
@@ -53,6 +50,10 @@ namespace log {
 class Log;
 }
 
+namespace logging {
+class LogThrottler;
+}
+
 namespace consensus {
 class ConsensusRequestPB;
 class ConsensusResponsePB;
@@ -114,16 +115,7 @@ const char* PeerStatusToString(PeerStatus p);
 class PeerMessageQueue {
  public:
   struct TrackedPeer {
-    explicit TrackedPeer(RaftPeerPB peer_pb)
-        : peer_pb(std::move(peer_pb)),
-          next_index(kInvalidOpIdIndex),
-          last_received(MinimumOpId()),
-          last_known_committed_index(MinimumOpId().index()),
-          last_exchange_status(PeerStatus::NEW),
-          last_communication_time(MonoTime::Now()),
-          wal_catchup_possible(true),
-          last_overall_health_status(HealthReportPB::UNKNOWN),
-          last_seen_term_(0) {}
+    explicit TrackedPeer(RaftPeerPB peer_pb);
 
     TrackedPeer() = default;
 
@@ -180,7 +172,7 @@ class PeerMessageQueue {
 
     // Throttler for how often we will log status messages pertaining to this
     // peer (eg when it is lagging, etc).
-    logging::LogThrottler status_log_throttler;
+    std::shared_ptr<logging::LogThrottler> status_log_throttler;
 
    private:
     // The last term we saw from a given peer.

http://git-wip-us.apache.org/repos/asf/kudu/blob/62cb06e8/src/kudu/util/logging.h
----------------------------------------------------------------------
diff --git a/src/kudu/util/logging.h b/src/kudu/util/logging.h
index 9aa06ac..fbd614f 100644
--- a/src/kudu/util/logging.h
+++ b/src/kudu/util/logging.h
@@ -155,7 +155,7 @@ class ScopedDisableRedaction {
 
 #define KLOG_EVERY_N_SECS_THROTTLER(severity, n_secs, throttler, tag) \
   int VARNAME_LINENUM(num_suppressed) = 0;                            \
-  if (throttler.ShouldLog(n_secs, tag, &VARNAME_LINENUM(num_suppressed)))  \
+  if ((throttler).ShouldLog(n_secs, tag, &VARNAME_LINENUM(num_suppressed)))  \
     google::LogMessage( \
       __FILE__, __LINE__, google::GLOG_ ## severity, 
VARNAME_LINENUM(num_suppressed), \
       &google::LogMessage::SendToLog).stream()

Reply via email to