DRILL-6143: Made FragmentsRunner's RPC timeout larger to reduce random failures and made it configurable as a SystemOption.
closes #1119 Project: http://git-wip-us.apache.org/repos/asf/drill/repo Commit: http://git-wip-us.apache.org/repos/asf/drill/commit/33149059 Tree: http://git-wip-us.apache.org/repos/asf/drill/tree/33149059 Diff: http://git-wip-us.apache.org/repos/asf/drill/diff/33149059 Branch: refs/heads/master Commit: 3314905987ee6d28c89cc5cfd8c7dbd6f36da41c Parents: 5d4fbd1 Author: Timothy Farkas <timothyfar...@apache.org> Authored: Thu Feb 8 15:25:59 2018 -0800 Committer: Vitalii Diravka <vitalii.dira...@gmail.com> Committed: Fri Feb 16 20:15:57 2018 +0000 ---------------------------------------------------------------------- .../src/main/java/org/apache/drill/exec/ExecConstants.java | 2 ++ .../apache/drill/exec/server/options/SystemOptionManager.java | 3 ++- .../org/apache/drill/exec/work/foreman/FragmentsRunner.java | 5 ++--- exec/java-exec/src/main/resources/drill-module.conf | 1 + 4 files changed, 7 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/drill/blob/33149059/exec/java-exec/src/main/java/org/apache/drill/exec/ExecConstants.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/ExecConstants.java b/exec/java-exec/src/main/java/org/apache/drill/exec/ExecConstants.java index c949e51..f3572d8 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/ExecConstants.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/ExecConstants.java @@ -59,6 +59,8 @@ public final class ExecConstants { public static final String CLIENT_RPC_THREADS = "drill.exec.rpc.user.client.threads"; public static final String BIT_SERVER_RPC_THREADS = "drill.exec.rpc.bit.server.threads"; public static final String USER_SERVER_RPC_THREADS = "drill.exec.rpc.user.server.threads"; + public static final String FRAG_RUNNER_RPC_TIMEOUT = "drill.exec.rpc.fragrunner.timeout"; + public static final PositiveLongValidator 
FRAG_RUNNER_RPC_TIMEOUT_VALIDATOR = new PositiveLongValidator(FRAG_RUNNER_RPC_TIMEOUT, Long.MAX_VALUE); public static final String TRACE_DUMP_DIRECTORY = "drill.exec.trace.directory"; public static final String TRACE_DUMP_FILESYSTEM = "drill.exec.trace.filesystem"; public static final String TEMP_DIRECTORIES = "drill.exec.tmp.directories"; http://git-wip-us.apache.org/repos/asf/drill/blob/33149059/exec/java-exec/src/main/java/org/apache/drill/exec/server/options/SystemOptionManager.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/server/options/SystemOptionManager.java b/exec/java-exec/src/main/java/org/apache/drill/exec/server/options/SystemOptionManager.java index 4dba96d..2b170e7 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/server/options/SystemOptionManager.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/server/options/SystemOptionManager.java @@ -212,7 +212,8 @@ public class SystemOptionManager extends BaseOptionManager implements AutoClosea new OptionDefinition(ExecConstants.CPU_LOAD_AVERAGE), new OptionDefinition(ExecConstants.ENABLE_VECTOR_VALIDATOR), new OptionDefinition(ExecConstants.ENABLE_ITERATOR_VALIDATOR), - new OptionDefinition(ExecConstants.OUTPUT_BATCH_SIZE_VALIDATOR, new OptionMetaData(OptionValue.AccessibleScopes.SYSTEM, true, false)) + new OptionDefinition(ExecConstants.OUTPUT_BATCH_SIZE_VALIDATOR, new OptionMetaData(OptionValue.AccessibleScopes.SYSTEM, true, false)), + new OptionDefinition(ExecConstants.FRAG_RUNNER_RPC_TIMEOUT_VALIDATOR, new OptionMetaData(OptionValue.AccessibleScopes.SYSTEM, true, true)), }; final CaseInsensitiveMap<OptionDefinition> map = CaseInsensitiveMap.newHashMap(); http://git-wip-us.apache.org/repos/asf/drill/blob/33149059/exec/java-exec/src/main/java/org/apache/drill/exec/work/foreman/FragmentsRunner.java ---------------------------------------------------------------------- diff --git 
a/exec/java-exec/src/main/java/org/apache/drill/exec/work/foreman/FragmentsRunner.java b/exec/java-exec/src/main/java/org/apache/drill/exec/work/foreman/FragmentsRunner.java index 2e5f2dd..b677576 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/work/foreman/FragmentsRunner.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/work/foreman/FragmentsRunner.java @@ -24,6 +24,7 @@ import io.netty.buffer.ByteBuf; import org.apache.drill.common.concurrent.ExtendedLatch; import org.apache.drill.common.exceptions.ExecutionSetupException; import org.apache.drill.common.exceptions.UserException; +import org.apache.drill.exec.ExecConstants; import org.apache.drill.exec.ops.FragmentContextImpl; import org.apache.drill.exec.physical.base.FragmentRoot; import org.apache.drill.exec.proto.BitControl.InitializeFragments; @@ -61,8 +62,6 @@ public class FragmentsRunner { private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(FragmentsRunner.class); private static final ControlsInjector injector = ControlsInjectorFactory.getInjector(FragmentsRunner.class); - private static final long RPC_WAIT_IN_MSECS_PER_FRAGMENT = 5000; - private final WorkerBee bee; private final UserClientConnection initiatingClient; private final DrillbitContext drillbitContext; @@ -278,7 +277,7 @@ public class FragmentsRunner { sendRemoteFragments(ep, remoteFragmentMap.get(ep), endpointLatch, fragmentSubmitFailures); } - final long timeout = RPC_WAIT_IN_MSECS_PER_FRAGMENT * numIntFragments; + final long timeout = drillbitContext.getOptionManager().getLong(ExecConstants.FRAG_RUNNER_RPC_TIMEOUT) * numIntFragments; if (numIntFragments > 0 && !endpointLatch.awaitUninterruptibly(timeout)) { long numberRemaining = endpointLatch.getCount(); throw UserException.connectionError() http://git-wip-us.apache.org/repos/asf/drill/blob/33149059/exec/java-exec/src/main/resources/drill-module.conf ---------------------------------------------------------------------- diff --git 
a/exec/java-exec/src/main/resources/drill-module.conf b/exec/java-exec/src/main/resources/drill-module.conf index 5659c82..39320c2 100644 --- a/exec/java-exec/src/main/resources/drill-module.conf +++ b/exec/java-exec/src/main/resources/drill-module.conf @@ -408,6 +408,7 @@ drill.exec.options: { debug.validate_iterators: false, debug.validate_vectors: false, drill.exec.functions.cast_empty_string_to_null: false, + drill.exec.rpc.fragrunner.timeout: 10000, # Setting to control if HashAgg should fallback to older behavior of consuming # unbounded memory. In case of 2 phase Agg when available memory is not enough # to start at least 2 partitions then HashAgg fallbacks to this case. It can be