This is an automated email from the ASF dual-hosted git repository. alexey pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/kudu.git
commit c167c1dc39d7089c4b1216bc62e423f3e2638479 Author: Andrew Wong <[email protected]> AuthorDate: Wed Feb 24 15:59:46 2021 -0800 [java] KUDU-3213: try at different server on TABLET_NOT_RUNNING Prior to this patch, if a tablet server was quiescing for a prolonged period, scan requests could time out with an error complaining that the tablet server is quiescing, without the scan ever being retried at another tablet server. This is because a quiescing tablet server returns TABLET_NOT_RUNNING to clients that attempt a scan on it. The behavior in the C++ client is that the location is then blacklisted and the request is retried elsewhere. The behavior in the Java client, though, is that the same location is retried until failure. This patch addresses this by treating TABLET_NOT_RUNNING errors in the Java client the same way as TABLET_NOT_FOUND errors, which is actually quite similar to the handling for TABLET_NOT_RUNNING in the C++ client: the location is invalidated for further attempts, and the request is retried elsewhere. Why not just have quiescing tablet servers return TABLET_NOT_FOUND, then? TABLET_NOT_FOUND errors in the C++ client actually have some behavior not present in the Java client: a tablet whose location is invalidated with TABLET_NOT_FOUND in the C++ client must be looked up again, requiring a round trip to the master. This behavior doesn't exist in the Java client, so I thought it easiest to piggyback on TABLET_NOT_FOUND handling for now. 
Change-Id: I38ac84a52676ff361fa1ba996665b338d1bbfba1 Reviewed-on: http://gerrit.cloudera.org:8080/17124 Tested-by: Kudu Jenkins Reviewed-by: Alexey Serbin <[email protected]> --- .../main/java/org/apache/kudu/client/RpcProxy.java | 9 ++-- .../org/apache/kudu/client/TestKuduScanner.java | 52 ++++++++++++++++++++++ .../java/org/apache/kudu/test/KuduTestHarness.java | 2 +- 3 files changed, 59 insertions(+), 4 deletions(-) diff --git a/java/kudu-client/src/main/java/org/apache/kudu/client/RpcProxy.java b/java/kudu-client/src/main/java/org/apache/kudu/client/RpcProxy.java index cf564b9..28784ab 100644 --- a/java/kudu-client/src/main/java/org/apache/kudu/client/RpcProxy.java +++ b/java/kudu-client/src/main/java/org/apache/kudu/client/RpcProxy.java @@ -357,12 +357,15 @@ class RpcProxy { Tserver.TabletServerErrorPB.Code errCode = error.getCode(); WireProtocol.AppStatusPB.ErrorCode errStatusCode = error.getStatus().getCode(); Status status = Status.fromTabletServerErrorPB(error); - if (errCode == Tserver.TabletServerErrorPB.Code.TABLET_NOT_FOUND) { + if (errCode == Tserver.TabletServerErrorPB.Code.TABLET_NOT_FOUND || + errCode == Tserver.TabletServerErrorPB.Code.TABLET_NOT_RUNNING) { + // TODO(awong): for TABLET_NOT_FOUND, we may want to force a location + // lookup for the tablet. For now, this just invalidates the location + // and tries somewhere else. client.handleTabletNotFound( rpc, new RecoverableException(status), connection.getServerInfo()); // we're not calling rpc.callback() so we rely on the client to retry that RPC - } else if (errCode == Tserver.TabletServerErrorPB.Code.TABLET_NOT_RUNNING || - errStatusCode == WireProtocol.AppStatusPB.ErrorCode.SERVICE_UNAVAILABLE) { + } else if (errStatusCode == WireProtocol.AppStatusPB.ErrorCode.SERVICE_UNAVAILABLE) { client.handleRetryableError(rpc, new RecoverableException(status)); // The following two error codes are an indication that the tablet isn't a leader. 
} else if (errStatusCode == WireProtocol.AppStatusPB.ErrorCode.ILLEGAL_STATE || diff --git a/java/kudu-client/src/test/java/org/apache/kudu/client/TestKuduScanner.java b/java/kudu-client/src/test/java/org/apache/kudu/client/TestKuduScanner.java index c0fbac3..54e7d65 100644 --- a/java/kudu-client/src/test/java/org/apache/kudu/client/TestKuduScanner.java +++ b/java/kudu-client/src/test/java/org/apache/kudu/client/TestKuduScanner.java @@ -38,6 +38,7 @@ import java.util.Map; import java.util.Random; import java.util.Set; +import com.google.common.collect.Lists; import org.junit.Before; import org.junit.Rule; import org.junit.Test; @@ -52,6 +53,7 @@ import org.apache.kudu.client.Operation.ChangeType; import org.apache.kudu.test.CapturingLogAppender; import org.apache.kudu.test.KuduTestHarness; import org.apache.kudu.test.RandomUtils; +import org.apache.kudu.test.cluster.KuduBinaryLocator; import org.apache.kudu.util.DataGenerator; import org.apache.kudu.util.Pair; @@ -79,6 +81,56 @@ public class TestKuduScanner { .build(); } + /** + * Test that scans get retried at other tablet servers when they're quiescing. + */ + @Test(timeout = 100000) + public void testScanQuiescingTabletServer() throws Exception { + int rowCount = 500; + Schema tableSchema = new Schema(Collections.singletonList( + new ColumnSchema.ColumnSchemaBuilder("key", Type.INT32).key(true).build() + )); + + // Create a table with some rows in it. For simplicity, use a + // single-partition table with replicas on each server (we're required + // to set some partitioning though). 
+ CreateTableOptions tableOptions = new CreateTableOptions() + .setRangePartitionColumns(Collections.singletonList("key")) + .setNumReplicas(3); + KuduTable table = client.createTable(tableName, tableSchema, tableOptions); + KuduSession session = client.newSession(); + for (int i = 0; i < rowCount; i++) { + Insert insert = table.newInsert(); + PartialRow row = insert.getRow(); + row.addInt(0, i); + session.apply(insert); + } + + // Quiesce a single tablet server. + List<HostAndPort> tservers = harness.getTabletServers(); + KuduBinaryLocator.ExecutableInfo exeInfo = KuduBinaryLocator.findBinary("kudu"); + List<String> commandLine = Lists.newArrayList(exeInfo.exePath(), + "tserver", + "quiesce", + "start", + tservers.get(0).toString()); + ProcessBuilder processBuilder = new ProcessBuilder(commandLine); + processBuilder.environment().putAll(exeInfo.environment()); + Process quiesceTserver = processBuilder.start(); + assertEquals(0, quiesceTserver.waitFor()); + + // Now start a scan. Even if the scan goes to the quiescing server, the + // scan request should eventually be routed to a non-quiescing server + // and complete. We aren't guaranteed to hit the quiescing server, but this + // test would frequently fail if we didn't handle quiescing servers properly. 
+ KuduScanner scanner = client.newScannerBuilder(table).build(); + KuduScannerIterator iterator = scanner.iterator(); + assertTrue(iterator.hasNext()); + while (iterator.hasNext()) { + iterator.next(); + } + } + @Test(timeout = 100000) public void testIterable() throws Exception { KuduTable table = client.createTable(tableName, getBasicSchema(), getBasicCreateTableOptions()); diff --git a/java/kudu-test-utils/src/main/java/org/apache/kudu/test/KuduTestHarness.java b/java/kudu-test-utils/src/main/java/org/apache/kudu/test/KuduTestHarness.java index 2b65844..805704f 100644 --- a/java/kudu-test-utils/src/main/java/org/apache/kudu/test/KuduTestHarness.java +++ b/java/kudu-test-utils/src/main/java/org/apache/kudu/test/KuduTestHarness.java @@ -334,7 +334,7 @@ public class KuduTestHarness extends ExternalResource { * @return the list of tablet servers */ public List<HostAndPort> getTabletServers() { - return miniCluster.getMasterServers(); + return miniCluster.getTabletServers(); } /**
