This is an automated email from the ASF dual-hosted git repository.

karan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/druid.git


The following commit(s) were added to refs/heads/master by this push:
     new b82ad59dfe5 Better logging in ServiceClientImpl (#15269)
b82ad59dfe5 is described below

commit b82ad59dfe5acbcef8ae68fe0005b8c446d718ad
Author: Laksh Singla <[email protected]>
AuthorDate: Thu Nov 2 11:32:49 2023 +0530

    Better logging in ServiceClientImpl (#15269)
    
    ServiceClientImpl logs the cause of every retry, even though we are 
retrying the connection attempt. This leads to slight pollution in the logs 
because a lot of the time, the reason for retrying is the same. This is seen 
primarily in MSQ, when the worker task hasn't launched yet however controller 
attempts to connect to the worker task, which can lead to scary-looking 
messages (with INFO log level), even though they are normal.
    This PR changes the logging logic to log every 10 (arbitrary number) 
retries instead of every retry, to reduce the pollution of the logs.
    Note: If there are no retries left, the client returns an exception, which 
would get thrown up by the caller, and therefore this change doesn't hide any 
important information.
---
 .../msq/indexing/client/IndexerWorkerClient.java   |  2 +-
 .../org/apache/druid/rpc/ServiceClientImpl.java    | 25 ++++++++++++--
 .../org/apache/druid/rpc/ServiceRetryPolicy.java   |  5 +++
 .../org/apache/druid/rpc/StandardRetryPolicy.java  | 38 ++++++++++++++++++++--
 .../rpc/indexing/SpecificTaskRetryPolicy.java      |  6 ++++
 5 files changed, 70 insertions(+), 6 deletions(-)

diff --git 
a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/client/IndexerWorkerClient.java
 
b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/client/IndexerWorkerClient.java
index 33f030bc3be..af089a29600 100644
--- 
a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/client/IndexerWorkerClient.java
+++ 
b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/client/IndexerWorkerClient.java
@@ -301,7 +301,7 @@ public class IndexerWorkerClient implements WorkerClient
             final ServiceClient client = clientFactory.makeClient(
                 id,
                 locator,
-                new SpecificTaskRetryPolicy(workerTaskId, 
StandardRetryPolicy.unlimited())
+                new SpecificTaskRetryPolicy(workerTaskId, 
StandardRetryPolicy.unlimitedWithoutRetryLogging())
             );
             return Pair.of(client, locator);
           }
diff --git a/server/src/main/java/org/apache/druid/rpc/ServiceClientImpl.java 
b/server/src/main/java/org/apache/druid/rpc/ServiceClientImpl.java
index 2146ed5d5be..3178360016a 100644
--- a/server/src/main/java/org/apache/druid/rpc/ServiceClientImpl.java
+++ b/server/src/main/java/org/apache/druid/rpc/ServiceClientImpl.java
@@ -220,7 +220,20 @@ public class ServiceClientImpl implements ServiceClient
                     if (shouldTry(nextAttemptNumber) && 
retryPolicy.retryThrowable(t)) {
                       final long backoffMs = computeBackoffMs(retryPolicy, 
attemptNumber);
 
-                      log.noStackTrace().info(t, buildErrorMessage(request, 
null, backoffMs, nextAttemptNumber));
+                      if (retryPolicy.retryLoggable()) {
+                        // log as INFO level if the retry is loggable
+                        log.noStackTrace().info(t, buildErrorMessage(request, 
null, backoffMs, nextAttemptNumber));
+                      } else if (log.isDebugEnabled()) {
+                        // log as DEBUG level if the debug log is enabled
+                        log.noStackTrace().debug(t, buildErrorMessage(request, 
null, backoffMs, nextAttemptNumber));
+                      } else {
+                        // If none of the above is valid, we log the error 
message every tenth time we retry. It seems like
+                        // a good balance between making the logs not too 
verbose when the retry is due to the same cause
+                        // and enriching logs with useful information, if we 
keep retrying due to the same reason
+                        if (nextAttemptNumber > 0 && nextAttemptNumber % 10 == 
0) {
+                          log.noStackTrace().info(t, 
buildErrorMessage(request, null, backoffMs, nextAttemptNumber));
+                        }
+                      }
 
                       connectExec.schedule(
                           () -> tryRequest(requestBuilder, handler, retVal, 
nextAttemptNumber, ImmutableSet.of()),
@@ -272,7 +285,15 @@ public class ServiceClientImpl implements ServiceClient
                   // Retryable server response (or null errorHolder, which 
means null result, which can happen
                   // if the HttpClient encounters an exception in the midst of 
response processing).
                   final long backoffMs = computeBackoffMs(retryPolicy, 
attemptNumber);
-                  log.info(buildErrorMessage(request, errorHolder, backoffMs, 
nextAttemptNumber));
+                  if (retryPolicy.retryLoggable()) {
+                    log.noStackTrace().info(buildErrorMessage(request, 
errorHolder, backoffMs, nextAttemptNumber));
+                  } else if (log.isDebugEnabled()) {
+                    log.noStackTrace().debug(buildErrorMessage(request, 
errorHolder, backoffMs, nextAttemptNumber));
+                  } else {
+                    if (nextAttemptNumber > 0 && nextAttemptNumber % 10 == 0) {
+                      log.noStackTrace().info(buildErrorMessage(request, 
errorHolder, backoffMs, nextAttemptNumber));
+                    }
+                  }
                   connectExec.schedule(
                       () -> tryRequest(requestBuilder, handler, retVal, 
nextAttemptNumber, ImmutableSet.of()),
                       backoffMs,
diff --git a/server/src/main/java/org/apache/druid/rpc/ServiceRetryPolicy.java 
b/server/src/main/java/org/apache/druid/rpc/ServiceRetryPolicy.java
index 297f369688a..91689523a20 100644
--- a/server/src/main/java/org/apache/druid/rpc/ServiceRetryPolicy.java
+++ b/server/src/main/java/org/apache/druid/rpc/ServiceRetryPolicy.java
@@ -54,6 +54,11 @@ public interface ServiceRetryPolicy
    */
   boolean retryThrowable(Throwable t);
 
+  /**
+   * Returns whether to log the cause of failure before retrying
+   */
+  boolean retryLoggable();
+
   /**
    * Returns whether service-not-available, i.e. empty {@link 
ServiceLocations#getLocations()}, can be retried.
    */
diff --git a/server/src/main/java/org/apache/druid/rpc/StandardRetryPolicy.java 
b/server/src/main/java/org/apache/druid/rpc/StandardRetryPolicy.java
index ceb2a41c807..71097ef6d89 100644
--- a/server/src/main/java/org/apache/druid/rpc/StandardRetryPolicy.java
+++ b/server/src/main/java/org/apache/druid/rpc/StandardRetryPolicy.java
@@ -44,6 +44,9 @@ public class StandardRetryPolicy implements ServiceRetryPolicy
   private static final int MAX_ATTEMPTS_ABOUT_AN_HOUR = 125;
 
   private static final StandardRetryPolicy DEFAULT_UNLIMITED_POLICY = new 
Builder().maxAttempts(UNLIMITED).build();
+  private static final StandardRetryPolicy 
DEFAULT_UNLIMITED_POLICY_NO_RETRY_LOG = new Builder().maxAttempts(UNLIMITED)
+                                                                               
                 .retryLoggable(false)
+                                                                               
                 .build();
   private static final StandardRetryPolicy DEFAULT_ABOUT_AN_HOUR_POLICY =
       new Builder().maxAttempts(MAX_ATTEMPTS_ABOUT_AN_HOUR).build();
   private static final StandardRetryPolicy DEFAULT_NO_RETRIES_POLICY = new 
Builder().maxAttempts(1).build();
@@ -52,13 +55,21 @@ public class StandardRetryPolicy implements 
ServiceRetryPolicy
   private final long minWaitMillis;
   private final long maxWaitMillis;
   private final boolean retryNotAvailable;
-
-  private StandardRetryPolicy(long maxAttempts, long minWaitMillis, long 
maxWaitMillis, boolean retryNotAvailable)
+  private final boolean retryLoggable;
+
+  private StandardRetryPolicy(
+      long maxAttempts,
+      long minWaitMillis,
+      long maxWaitMillis,
+      boolean retryNotAvailable,
+      boolean retryLoggable
+  )
   {
     this.maxAttempts = maxAttempts;
     this.minWaitMillis = minWaitMillis;
     this.maxWaitMillis = maxWaitMillis;
     this.retryNotAvailable = retryNotAvailable;
+    this.retryLoggable = retryLoggable;
 
     if (maxAttempts == 0) {
       throw new IAE("maxAttempts must be positive (limited) or negative 
(unlimited); cannot be zero.");
@@ -79,6 +90,14 @@ public class StandardRetryPolicy implements 
ServiceRetryPolicy
     return DEFAULT_UNLIMITED_POLICY;
   }
 
+  /**
+   * Standard unlimited retry policy along with muted the logging for the 
retries.
+   */
+  public static StandardRetryPolicy unlimitedWithoutRetryLogging()
+  {
+    return DEFAULT_UNLIMITED_POLICY_NO_RETRY_LOG;
+  }
+
   /**
    * Retry policy that uses up to about an hour of total wait time. Note that 
this is just the total waiting time
    * between attempts. It does not include the time that each attempt takes to 
execute.
@@ -135,6 +154,12 @@ public class StandardRetryPolicy implements 
ServiceRetryPolicy
            || (t.getCause() != null && retryThrowable(t.getCause()));
   }
 
+  @Override
+  public boolean retryLoggable()
+  {
+    return retryLoggable;
+  }
+
   @Override
   public boolean retryNotAvailable()
   {
@@ -147,6 +172,7 @@ public class StandardRetryPolicy implements 
ServiceRetryPolicy
     private long minWaitMillis = DEFAULT_MIN_WAIT_MS;
     private long maxWaitMillis = DEFAULT_MAX_WAIT_MS;
     private boolean retryNotAvailable = true;
+    private boolean retryLoggable = true;
 
     public Builder maxAttempts(final long maxAttempts)
     {
@@ -172,9 +198,15 @@ public class StandardRetryPolicy implements 
ServiceRetryPolicy
       return this;
     }
 
+    public Builder retryLoggable(final boolean retryLoggable)
+    {
+      this.retryLoggable = retryLoggable;
+      return this;
+    }
+
     public StandardRetryPolicy build()
     {
-      return new StandardRetryPolicy(maxAttempts, minWaitMillis, 
maxWaitMillis, retryNotAvailable);
+      return new StandardRetryPolicy(maxAttempts, minWaitMillis, 
maxWaitMillis, retryNotAvailable, retryLoggable);
     }
   }
 }
diff --git 
a/server/src/main/java/org/apache/druid/rpc/indexing/SpecificTaskRetryPolicy.java
 
b/server/src/main/java/org/apache/druid/rpc/indexing/SpecificTaskRetryPolicy.java
index 2482786c2a7..36d84f2dc4e 100644
--- 
a/server/src/main/java/org/apache/druid/rpc/indexing/SpecificTaskRetryPolicy.java
+++ 
b/server/src/main/java/org/apache/druid/rpc/indexing/SpecificTaskRetryPolicy.java
@@ -77,6 +77,12 @@ public class SpecificTaskRetryPolicy implements 
ServiceRetryPolicy
     return StandardRetryPolicy.unlimited().retryThrowable(t);
   }
 
+  @Override
+  public boolean retryLoggable()
+  {
+    return baseRetryPolicy.retryLoggable();
+  }
+
   @Override
   public boolean retryNotAvailable()
   {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to