This is an automated email from the ASF dual-hosted git repository.
karan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/druid.git
The following commit(s) were added to refs/heads/master by this push:
new b82ad59dfe5 Better logging in ServiceClientImpl (#15269)
b82ad59dfe5 is described below
commit b82ad59dfe5acbcef8ae68fe0005b8c446d718ad
Author: Laksh Singla <[email protected]>
AuthorDate: Thu Nov 2 11:32:49 2023 +0530
Better logging in ServiceClientImpl (#15269)
ServiceClientImpl logs the cause of every retry, even though we are
retrying the connection attempt. This leads to slight pollution in the logs
because a lot of the time, the reason for retrying is the same. This is seen
primarily in MSQ, when the worker task hasn't launched yet however controller
attempts to connect to the worker task, which can lead to scary-looking
messages (with INFO log level), even though they are normal.
This PR changes the logging logic to log every 10 (arbitrary number)
retries instead of every retry, to reduce the pollution of the logs.
Note: If there are no retries left, the client returns an exception, which
would get thrown up by the caller, and therefore this change doesn't hide any
important information.
---
.../msq/indexing/client/IndexerWorkerClient.java | 2 +-
.../org/apache/druid/rpc/ServiceClientImpl.java | 25 ++++++++++++--
.../org/apache/druid/rpc/ServiceRetryPolicy.java | 5 +++
.../org/apache/druid/rpc/StandardRetryPolicy.java | 38 ++++++++++++++++++++--
.../rpc/indexing/SpecificTaskRetryPolicy.java | 6 ++++
5 files changed, 70 insertions(+), 6 deletions(-)
diff --git
a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/client/IndexerWorkerClient.java
b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/client/IndexerWorkerClient.java
index 33f030bc3be..af089a29600 100644
---
a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/client/IndexerWorkerClient.java
+++
b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/client/IndexerWorkerClient.java
@@ -301,7 +301,7 @@ public class IndexerWorkerClient implements WorkerClient
final ServiceClient client = clientFactory.makeClient(
id,
locator,
- new SpecificTaskRetryPolicy(workerTaskId,
StandardRetryPolicy.unlimited())
+ new SpecificTaskRetryPolicy(workerTaskId,
StandardRetryPolicy.unlimitedWithoutRetryLogging())
);
return Pair.of(client, locator);
}
diff --git a/server/src/main/java/org/apache/druid/rpc/ServiceClientImpl.java
b/server/src/main/java/org/apache/druid/rpc/ServiceClientImpl.java
index 2146ed5d5be..3178360016a 100644
--- a/server/src/main/java/org/apache/druid/rpc/ServiceClientImpl.java
+++ b/server/src/main/java/org/apache/druid/rpc/ServiceClientImpl.java
@@ -220,7 +220,20 @@ public class ServiceClientImpl implements ServiceClient
if (shouldTry(nextAttemptNumber) &&
retryPolicy.retryThrowable(t)) {
final long backoffMs = computeBackoffMs(retryPolicy,
attemptNumber);
- log.noStackTrace().info(t, buildErrorMessage(request,
null, backoffMs, nextAttemptNumber));
+ if (retryPolicy.retryLoggable()) {
+ // log as INFO level if the retry is loggable
+ log.noStackTrace().info(t, buildErrorMessage(request,
null, backoffMs, nextAttemptNumber));
+ } else if (log.isDebugEnabled()) {
+ // log as DEBUG level if the debug log is enabled
+ log.noStackTrace().debug(t, buildErrorMessage(request,
null, backoffMs, nextAttemptNumber));
+ } else {
+ // If none of the above is valid, we log the error
message every tenth time we retry. It seems like
+ // a good balance between making the logs not too
verbose when the retry is due to the same cause
+ // and enriching logs with useful information, if we
keep retrying due to the same reason
+ if (nextAttemptNumber > 0 && nextAttemptNumber % 10 ==
0) {
+ log.noStackTrace().info(t,
buildErrorMessage(request, null, backoffMs, nextAttemptNumber));
+ }
+ }
connectExec.schedule(
() -> tryRequest(requestBuilder, handler, retVal,
nextAttemptNumber, ImmutableSet.of()),
@@ -272,7 +285,15 @@ public class ServiceClientImpl implements ServiceClient
// Retryable server response (or null errorHolder, which
means null result, which can happen
// if the HttpClient encounters an exception in the midst of
response processing).
final long backoffMs = computeBackoffMs(retryPolicy,
attemptNumber);
- log.info(buildErrorMessage(request, errorHolder, backoffMs,
nextAttemptNumber));
+ if (retryPolicy.retryLoggable()) {
+ log.noStackTrace().info(buildErrorMessage(request,
errorHolder, backoffMs, nextAttemptNumber));
+ } else if (log.isDebugEnabled()) {
+ log.noStackTrace().debug(buildErrorMessage(request,
errorHolder, backoffMs, nextAttemptNumber));
+ } else {
+ if (nextAttemptNumber > 0 && nextAttemptNumber % 10 == 0) {
+ log.noStackTrace().info(buildErrorMessage(request,
errorHolder, backoffMs, nextAttemptNumber));
+ }
+ }
connectExec.schedule(
() -> tryRequest(requestBuilder, handler, retVal,
nextAttemptNumber, ImmutableSet.of()),
backoffMs,
diff --git a/server/src/main/java/org/apache/druid/rpc/ServiceRetryPolicy.java
b/server/src/main/java/org/apache/druid/rpc/ServiceRetryPolicy.java
index 297f369688a..91689523a20 100644
--- a/server/src/main/java/org/apache/druid/rpc/ServiceRetryPolicy.java
+++ b/server/src/main/java/org/apache/druid/rpc/ServiceRetryPolicy.java
@@ -54,6 +54,11 @@ public interface ServiceRetryPolicy
*/
boolean retryThrowable(Throwable t);
+ /**
+ * Returns whether to log the cause of failure before retrying
+ */
+ boolean retryLoggable();
+
/**
* Returns whether service-not-available, i.e. empty {@link
ServiceLocations#getLocations()}, can be retried.
*/
diff --git a/server/src/main/java/org/apache/druid/rpc/StandardRetryPolicy.java
b/server/src/main/java/org/apache/druid/rpc/StandardRetryPolicy.java
index ceb2a41c807..71097ef6d89 100644
--- a/server/src/main/java/org/apache/druid/rpc/StandardRetryPolicy.java
+++ b/server/src/main/java/org/apache/druid/rpc/StandardRetryPolicy.java
@@ -44,6 +44,9 @@ public class StandardRetryPolicy implements ServiceRetryPolicy
private static final int MAX_ATTEMPTS_ABOUT_AN_HOUR = 125;
private static final StandardRetryPolicy DEFAULT_UNLIMITED_POLICY = new
Builder().maxAttempts(UNLIMITED).build();
+ private static final StandardRetryPolicy
DEFAULT_UNLIMITED_POLICY_NO_RETRY_LOG = new Builder().maxAttempts(UNLIMITED)
+
.retryLoggable(false)
+
.build();
private static final StandardRetryPolicy DEFAULT_ABOUT_AN_HOUR_POLICY =
new Builder().maxAttempts(MAX_ATTEMPTS_ABOUT_AN_HOUR).build();
private static final StandardRetryPolicy DEFAULT_NO_RETRIES_POLICY = new
Builder().maxAttempts(1).build();
@@ -52,13 +55,21 @@ public class StandardRetryPolicy implements
ServiceRetryPolicy
private final long minWaitMillis;
private final long maxWaitMillis;
private final boolean retryNotAvailable;
-
- private StandardRetryPolicy(long maxAttempts, long minWaitMillis, long
maxWaitMillis, boolean retryNotAvailable)
+ private final boolean retryLoggable;
+
+ private StandardRetryPolicy(
+ long maxAttempts,
+ long minWaitMillis,
+ long maxWaitMillis,
+ boolean retryNotAvailable,
+ boolean retryLoggable
+ )
{
this.maxAttempts = maxAttempts;
this.minWaitMillis = minWaitMillis;
this.maxWaitMillis = maxWaitMillis;
this.retryNotAvailable = retryNotAvailable;
+ this.retryLoggable = retryLoggable;
if (maxAttempts == 0) {
throw new IAE("maxAttempts must be positive (limited) or negative
(unlimited); cannot be zero.");
@@ -79,6 +90,14 @@ public class StandardRetryPolicy implements
ServiceRetryPolicy
return DEFAULT_UNLIMITED_POLICY;
}
+ /**
+ * Standard unlimited retry policy along with muted the logging for the
retries.
+ */
+ public static StandardRetryPolicy unlimitedWithoutRetryLogging()
+ {
+ return DEFAULT_UNLIMITED_POLICY_NO_RETRY_LOG;
+ }
+
/**
* Retry policy that uses up to about an hour of total wait time. Note that
this is just the total waiting time
* between attempts. It does not include the time that each attempt takes to
execute.
@@ -135,6 +154,12 @@ public class StandardRetryPolicy implements
ServiceRetryPolicy
|| (t.getCause() != null && retryThrowable(t.getCause()));
}
+ @Override
+ public boolean retryLoggable()
+ {
+ return retryLoggable;
+ }
+
@Override
public boolean retryNotAvailable()
{
@@ -147,6 +172,7 @@ public class StandardRetryPolicy implements
ServiceRetryPolicy
private long minWaitMillis = DEFAULT_MIN_WAIT_MS;
private long maxWaitMillis = DEFAULT_MAX_WAIT_MS;
private boolean retryNotAvailable = true;
+ private boolean retryLoggable = true;
public Builder maxAttempts(final long maxAttempts)
{
@@ -172,9 +198,15 @@ public class StandardRetryPolicy implements
ServiceRetryPolicy
return this;
}
+ public Builder retryLoggable(final boolean retryLoggable)
+ {
+ this.retryLoggable = retryLoggable;
+ return this;
+ }
+
public StandardRetryPolicy build()
{
- return new StandardRetryPolicy(maxAttempts, minWaitMillis,
maxWaitMillis, retryNotAvailable);
+ return new StandardRetryPolicy(maxAttempts, minWaitMillis,
maxWaitMillis, retryNotAvailable, retryLoggable);
}
}
}
diff --git
a/server/src/main/java/org/apache/druid/rpc/indexing/SpecificTaskRetryPolicy.java
b/server/src/main/java/org/apache/druid/rpc/indexing/SpecificTaskRetryPolicy.java
index 2482786c2a7..36d84f2dc4e 100644
---
a/server/src/main/java/org/apache/druid/rpc/indexing/SpecificTaskRetryPolicy.java
+++
b/server/src/main/java/org/apache/druid/rpc/indexing/SpecificTaskRetryPolicy.java
@@ -77,6 +77,12 @@ public class SpecificTaskRetryPolicy implements
ServiceRetryPolicy
return StandardRetryPolicy.unlimited().retryThrowable(t);
}
+ @Override
+ public boolean retryLoggable()
+ {
+ return baseRetryPolicy.retryLoggable();
+ }
+
@Override
public boolean retryNotAvailable()
{
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]