This is an automated email from the ASF dual-hosted git repository.
ndimiduk pushed a commit to branch branch-2.5
in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/branch-2.5 by this push:
new ac7622c3982 HBASE-26366 Provide meaningful parent spans to ZK
interactions
ac7622c3982 is described below
commit ac7622c39822ef6be9c81c6762eda13a5f2a4eef
Author: Nick Dimiduk <[email protected]>
AuthorDate: Mon Jun 13 10:33:49 2022 +0200
HBASE-26366 Provide meaningful parent spans to ZK interactions
Signed-off-by: Andrew Purtell <[email protected]>
---
.../client/trace/hamcrest/SpanDataMatchers.java | 22 +-
.../java/org/apache/hadoop/hbase/ChoreService.java | 6 +-
.../org/apache/hadoop/hbase/trace/TraceUtil.java | 31 ++
.../hadoop/hbase/MetaRegionLocationCache.java | 67 ++---
.../org/apache/hadoop/hbase/master/HMaster.java | 99 ++++---
.../hadoop/hbase/master/HMasterCommandLine.java | 15 +-
.../hadoop/hbase/master/RegionServerTracker.java | 40 ++-
.../hadoop/hbase/regionserver/HRegionServer.java | 312 +++++++++++----------
.../regionserver/HRegionServerCommandLine.java | 11 +-
.../hadoop/hbase/TestServerInternalsTracing.java | 297 ++++++++++++++++++++
.../hbase/zookeeper/MasterAddressTracker.java | 13 +-
.../hbase/zookeeper/RecoverableZooKeeper.java | 94 +++++--
.../apache/hadoop/hbase/zookeeper/ZKWatcher.java | 66 +++--
13 files changed, 763 insertions(+), 310 deletions(-)
diff --git
a/hbase-client/src/test/java/org/apache/hadoop/hbase/client/trace/hamcrest/SpanDataMatchers.java
b/hbase-client/src/test/java/org/apache/hadoop/hbase/client/trace/hamcrest/SpanDataMatchers.java
index 6d0468c32ed..d021f4d3aaf 100644
---
a/hbase-client/src/test/java/org/apache/hadoop/hbase/client/trace/hamcrest/SpanDataMatchers.java
+++
b/hbase-client/src/test/java/org/apache/hadoop/hbase/client/trace/hamcrest/SpanDataMatchers.java
@@ -26,7 +26,6 @@ import io.opentelemetry.api.trace.SpanKind;
import io.opentelemetry.api.trace.StatusCode;
import io.opentelemetry.sdk.trace.data.EventData;
import io.opentelemetry.sdk.trace.data.SpanData;
-import io.opentelemetry.sdk.trace.data.StatusData;
import io.opentelemetry.semconv.trace.attributes.SemanticAttributes;
import java.time.Duration;
import java.util.Objects;
@@ -144,23 +143,20 @@ public final class SpanDataMatchers {
};
}
- public static Matcher<SpanData> hasStatusWithCode(StatusCode statusCode) {
- final Matcher<StatusCode> matcher = is(equalTo(statusCode));
- return new TypeSafeMatcher<SpanData>() {
- @Override
- protected boolean matchesSafely(SpanData item) {
- final StatusData statusData = item.getStatus();
- return statusData != null && statusData.getStatusCode() != null
- && matcher.matches(statusData.getStatusCode());
- }
-
+ public static Matcher<SpanData> hasStatusWithCode(Matcher<StatusCode>
matcher) {
+ return new FeatureMatcher<SpanData, StatusCode>(matcher, "SpanData with
StatusCode that",
+ "statusWithCode") {
@Override
- public void describeTo(Description description) {
- description.appendText("SpanData with StatusCode that
").appendDescriptionOf(matcher);
+ protected StatusCode featureValueOf(SpanData actual) {
+ return actual.getStatus().getStatusCode();
}
};
}
+ public static Matcher<SpanData> hasStatusWithCode(StatusCode statusCode) {
+ return hasStatusWithCode(is(equalTo(statusCode)));
+ }
+
public static Matcher<SpanData> hasTraceId(String traceId) {
return hasTraceId(is(equalTo(traceId)));
}
diff --git
a/hbase-common/src/main/java/org/apache/hadoop/hbase/ChoreService.java
b/hbase-common/src/main/java/org/apache/hadoop/hbase/ChoreService.java
index 939d75fd729..1bba8d49120 100644
--- a/hbase-common/src/main/java/org/apache/hadoop/hbase/ChoreService.java
+++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/ChoreService.java
@@ -26,6 +26,7 @@ import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.atomic.AtomicInteger;
+import org.apache.hadoop.hbase.trace.TraceUtil;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -163,8 +164,9 @@ public class ChoreService {
chore.getChoreService().cancelChore(chore);
}
chore.setChoreService(this);
- ScheduledFuture<?> future = scheduler.scheduleAtFixedRate(chore,
chore.getInitialDelay(),
- chore.getPeriod(), chore.getTimeUnit());
+ ScheduledFuture<?> future =
+ scheduler.scheduleAtFixedRate(TraceUtil.tracedRunnable(chore,
chore.getName()),
+ chore.getInitialDelay(), chore.getPeriod(), chore.getTimeUnit());
scheduledChores.put(chore, future);
return true;
} catch (Exception e) {
diff --git
a/hbase-common/src/main/java/org/apache/hadoop/hbase/trace/TraceUtil.java
b/hbase-common/src/main/java/org/apache/hadoop/hbase/trace/TraceUtil.java
index 7dc24a54ab6..5b1fb86a351 100644
--- a/hbase-common/src/main/java/org/apache/hadoop/hbase/trace/TraceUtil.java
+++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/trace/TraceUtil.java
@@ -135,6 +135,31 @@ public final class TraceUtil {
});
}
+ /**
+ * Wrap the provided {@code runnable} in a {@link Runnable} that is traced.
+ */
+ public static Runnable tracedRunnable(final Runnable runnable, final String
spanName) {
+ return tracedRunnable(runnable, () -> createSpan(spanName));
+ }
+
+ /**
+ * Wrap the provided {@code runnable} in a {@link Runnable} that is traced.
+ */
+ public static Runnable tracedRunnable(final Runnable runnable,
+ final Supplier<Span> spanSupplier) {
+ // N.B. This method name follows the convention of this class, i.e.,
tracedFuture, rather than
+ // the convention of the OpenTelemetry classes, i.e., Context#wrap.
+ return () -> {
+ final Span span = spanSupplier.get();
+ try (final Scope ignored = span.makeCurrent()) {
+ runnable.run();
+ span.setStatus(StatusCode.OK);
+ } finally {
+ span.end();
+ }
+ };
+ }
+
/**
* A {@link Runnable} that may also throw.
* @param <T> the type of {@link Throwable} that can be produced.
@@ -144,11 +169,17 @@ public final class TraceUtil {
void run() throws T;
}
+ /**
+ * Trace the execution of {@code runnable}.
+ */
public static <T extends Throwable> void trace(final ThrowingRunnable<T>
runnable,
final String spanName) throws T {
trace(runnable, () -> createSpan(spanName));
}
+ /**
+ * Trace the execution of {@code runnable}.
+ */
public static <T extends Throwable> void trace(final ThrowingRunnable<T>
runnable,
final Supplier<Span> spanSupplier) throws T {
Span span = spanSupplier.get();
diff --git
a/hbase-server/src/main/java/org/apache/hadoop/hbase/MetaRegionLocationCache.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/MetaRegionLocationCache.java
index 22edcbfdfa9..c1ebb6fb9b4 100644
---
a/hbase-server/src/main/java/org/apache/hadoop/hbase/MetaRegionLocationCache.java
+++
b/hbase-server/src/main/java/org/apache/hadoop/hbase/MetaRegionLocationCache.java
@@ -24,6 +24,7 @@ import java.util.concurrent.ConcurrentNavigableMap;
import java.util.concurrent.ThreadFactory;
import org.apache.hadoop.hbase.exceptions.DeserializationException;
import org.apache.hadoop.hbase.master.RegionState;
+import org.apache.hadoop.hbase.trace.TraceUtil;
import org.apache.hadoop.hbase.types.CopyOnWriteArrayMap;
import org.apache.hadoop.hbase.util.RetryCounter;
import org.apache.hadoop.hbase.util.RetryCounterFactory;
@@ -101,41 +102,43 @@ public class MetaRegionLocationCache extends ZKListener {
* @param retryCounter controls the number of retries and sleep between
retries.
*/
private void loadMetaLocationsFromZk(RetryCounter retryCounter, ZNodeOpType
opType) {
- List<String> znodes = null;
- while (retryCounter.shouldRetry()) {
- try {
- znodes = watcher.getMetaReplicaNodesAndWatchChildren();
- break;
- } catch (KeeperException ke) {
- LOG.debug("Error populating initial meta locations", ke);
- if (!retryCounter.shouldRetry()) {
- // Retries exhausted and watchers not set. This is not a desirable
state since the cache
- // could remain stale forever. Propagate the exception.
- watcher.abort("Error populating meta locations", ke);
- return;
- }
+ TraceUtil.trace(() -> {
+ List<String> znodes = null;
+ while (retryCounter.shouldRetry()) {
try {
- retryCounter.sleepUntilNextRetry();
- } catch (InterruptedException ie) {
- LOG.error("Interrupted while loading meta locations from ZK", ie);
- Thread.currentThread().interrupt();
- return;
+ znodes = watcher.getMetaReplicaNodesAndWatchChildren();
+ break;
+ } catch (KeeperException ke) {
+ LOG.debug("Error populating initial meta locations", ke);
+ if (!retryCounter.shouldRetry()) {
+ // Retries exhausted and watchers not set. This is not a desirable
state since the cache
+ // could remain stale forever. Propagate the exception.
+ watcher.abort("Error populating meta locations", ke);
+ return;
+ }
+ try {
+ retryCounter.sleepUntilNextRetry();
+ } catch (InterruptedException ie) {
+ LOG.error("Interrupted while loading meta locations from ZK", ie);
+ Thread.currentThread().interrupt();
+ return;
+ }
}
}
- }
- if (znodes == null || znodes.isEmpty()) {
- // No meta znodes exist at this point but we registered a watcher on the
base znode to listen
- // for updates. They will be handled via nodeChildrenChanged().
- return;
- }
- if (znodes.size() == cachedMetaLocations.size()) {
- // No new meta znodes got added.
- return;
- }
- for (String znode : znodes) {
- String path = ZNodePaths.joinZNode(watcher.getZNodePaths().baseZNode,
znode);
- updateMetaLocation(path, opType);
- }
+ if (znodes == null || znodes.isEmpty()) {
+ // No meta znodes exist at this point but we registered a watcher on
the base znode to
+ // listen for updates. They will be handled via nodeChildrenChanged().
+ return;
+ }
+ if (znodes.size() == cachedMetaLocations.size()) {
+ // No new meta znodes got added.
+ return;
+ }
+ for (String znode : znodes) {
+ String path = ZNodePaths.joinZNode(watcher.getZNodePaths().baseZNode,
znode);
+ updateMetaLocation(path, opType);
+ }
+ }, "MetaRegionLocationCache.loadMetaLocationsFromZk");
}
/**
diff --git
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
index dae8fdad0e2..e793fe844fd 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
@@ -26,6 +26,9 @@ import static
org.apache.hadoop.hbase.util.DNS.MASTER_HOSTNAME_KEY;
import com.google.errorprone.annotations.RestrictedApi;
import com.google.protobuf.Descriptors;
import com.google.protobuf.Service;
+import io.opentelemetry.api.trace.Span;
+import io.opentelemetry.api.trace.StatusCode;
+import io.opentelemetry.context.Scope;
import java.io.IOException;
import java.io.InterruptedIOException;
import java.lang.reflect.Constructor;
@@ -213,6 +216,7 @@ import
org.apache.hadoop.hbase.replication.regionserver.ReplicationStatus;
import org.apache.hadoop.hbase.security.AccessDeniedException;
import org.apache.hadoop.hbase.security.SecurityConstants;
import org.apache.hadoop.hbase.security.UserProvider;
+import org.apache.hadoop.hbase.trace.TraceUtil;
import org.apache.hadoop.hbase.util.Addressing;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
@@ -452,7 +456,8 @@ public class HMaster extends HRegionServer implements
MasterServices {
*/
public HMaster(final Configuration conf) throws IOException {
super(conf);
- try {
+ final Span span = TraceUtil.createSpan("HMaster.cxtor");
+ try (Scope ignored = span.makeCurrent()) {
if (conf.getBoolean(MAINTENANCE_MODE, false)) {
LOG.info("Detected {}=true via configuration.", MAINTENANCE_MODE);
maintenanceMode = true;
@@ -513,11 +518,15 @@ public class HMaster extends HRegionServer implements
MasterServices {
cachedClusterId = new CachedClusterId(this, conf);
this.regionServerTracker = new RegionServerTracker(zooKeeper, this);
+ span.setStatus(StatusCode.OK);
} catch (Throwable t) {
// Make sure we log the exception. HMaster is often started via
reflection and the
// cause of failed startup is lost.
+ TraceUtil.setError(span, t);
LOG.error("Failed construction of Master", t);
throw t;
+ } finally {
+ span.end();
}
}
@@ -540,7 +549,7 @@ public class HMaster extends HRegionServer implements
MasterServices {
@Override
public void run() {
try {
- Threads.setDaemonThreadRunning(new Thread(() -> {
+ Threads.setDaemonThreadRunning(new Thread(() -> TraceUtil.trace(() -> {
try {
int infoPort = putUpJettyServer();
startActiveMasterManager(infoPort);
@@ -553,23 +562,29 @@ public class HMaster extends HRegionServer implements
MasterServices {
abort(error, t);
}
}
- }), getName() + ":becomeActiveMaster");
+ }, "HMaster.becomeActiveMaster")), getName() + ":becomeActiveMaster");
// Fall in here even if we have been aborted. Need to run the shutdown
services and
// the super run call will do this for us.
super.run();
} finally {
- if (this.clusterSchemaService != null) {
- // If on way out, then we are no longer active master.
- this.clusterSchemaService.stopAsync();
- try {
- this.clusterSchemaService
-
.awaitTerminated(getConfiguration().getInt(HBASE_MASTER_WAIT_ON_SERVICE_IN_SECONDS,
- DEFAULT_HBASE_MASTER_WAIT_ON_SERVICE_IN_SECONDS),
TimeUnit.SECONDS);
- } catch (TimeoutException te) {
- LOG.warn("Failed shutdown of clusterSchemaService", te);
+ final Span span = TraceUtil.createSpan("HMaster exiting main loop");
+ try (Scope ignored = span.makeCurrent()) {
+ if (this.clusterSchemaService != null) {
+ // If on way out, then we are no longer active master.
+ this.clusterSchemaService.stopAsync();
+ try {
+ this.clusterSchemaService
+
.awaitTerminated(getConfiguration().getInt(HBASE_MASTER_WAIT_ON_SERVICE_IN_SECONDS,
+ DEFAULT_HBASE_MASTER_WAIT_ON_SERVICE_IN_SECONDS),
TimeUnit.SECONDS);
+ } catch (TimeoutException te) {
+ LOG.warn("Failed shutdown of clusterSchemaService", te);
+ }
}
+ this.activeMaster = false;
+ span.setStatus(StatusCode.OK);
+ } finally {
+ span.end();
}
- this.activeMaster = false;
}
}
@@ -3094,36 +3109,38 @@ public class HMaster extends HRegionServer implements
MasterServices {
* Shutdown the cluster. Master runs a coordinated stop of all RegionServers
and then itself.
*/
public void shutdown() throws IOException {
- if (cpHost != null) {
- cpHost.preShutdown();
- }
+ TraceUtil.trace(() -> {
+ if (cpHost != null) {
+ cpHost.preShutdown();
+ }
- // Tell the servermanager cluster shutdown has been called. This makes it
so when Master is
- // last running server, it'll stop itself. Next, we broadcast the cluster
shutdown by setting
- // the cluster status as down. RegionServers will notice this change in
state and will start
- // shutting themselves down. When last has exited, Master can go down.
- if (this.serverManager != null) {
- this.serverManager.shutdownCluster();
- }
- if (this.clusterStatusTracker != null) {
- try {
- this.clusterStatusTracker.setClusterDown();
- } catch (KeeperException e) {
- LOG.error("ZooKeeper exception trying to set cluster as down in ZK",
e);
+ // Tell the servermanager cluster shutdown has been called. This makes
it so when Master is
+ // last running server, it'll stop itself. Next, we broadcast the
cluster shutdown by setting
+ // the cluster status as down. RegionServers will notice this change in
state and will start
+ // shutting themselves down. When last has exited, Master can go down.
+ if (this.serverManager != null) {
+ this.serverManager.shutdownCluster();
}
- }
- // Stop the procedure executor. Will stop any ongoing assign, unassign,
server crash etc.,
- // processing so we can go down.
- if (this.procedureExecutor != null) {
- this.procedureExecutor.stop();
- }
- // Shutdown our cluster connection. This will kill any hosted RPCs that
might be going on;
- // this is what we want especially if the Master is in startup phase doing
call outs to
- // hbase:meta, etc. when cluster is down. Without ths connection close,
we'd have to wait on
- // the rpc to timeout.
- if (this.clusterConnection != null) {
- this.clusterConnection.close();
- }
+ if (this.clusterStatusTracker != null) {
+ try {
+ this.clusterStatusTracker.setClusterDown();
+ } catch (KeeperException e) {
+ LOG.error("ZooKeeper exception trying to set cluster as down in ZK",
e);
+ }
+ }
+ // Stop the procedure executor. Will stop any ongoing assign, unassign,
server crash etc.,
+ // processing so we can go down.
+ if (this.procedureExecutor != null) {
+ this.procedureExecutor.stop();
+ }
+ // Shutdown our cluster connection. This will kill any hosted RPCs that
might be going on;
+ // this is what we want especially if the Master is in startup phase
doing call outs to
+ // hbase:meta, etc. when cluster is down. Without this connection close,
we'd have to wait on
+ // the rpc to timeout.
+ if (this.clusterConnection != null) {
+ this.clusterConnection.close();
+ }
+ }, "HMaster.shutdown");
}
public void stopMaster() throws IOException {
diff --git
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMasterCommandLine.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMasterCommandLine.java
index efded4841d7..8ac3a1ebba9 100644
---
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMasterCommandLine.java
+++
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMasterCommandLine.java
@@ -17,6 +17,9 @@
*/
package org.apache.hadoop.hbase.master;
+import io.opentelemetry.api.trace.Span;
+import io.opentelemetry.api.trace.StatusCode;
+import io.opentelemetry.context.Scope;
import java.io.File;
import java.io.IOException;
import java.util.List;
@@ -30,13 +33,13 @@ import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
+import org.apache.hadoop.hbase.trace.TraceUtil;
import org.apache.hadoop.hbase.util.JVMClusterUtil;
import org.apache.hadoop.hbase.util.ServerCommandLine;
import org.apache.hadoop.hbase.zookeeper.MiniZooKeeperCluster;
import org.apache.hadoop.hbase.zookeeper.ZKAuthentication;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
import org.apache.yetus.audience.InterfaceAudience;
-import org.apache.zookeeper.KeeperException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -161,7 +164,8 @@ public class HMasterCommandLine extends ServerCommandLine {
private int startMaster() {
Configuration conf = getConf();
- try {
+ final Span span = TraceUtil.createSpan("HMasterCommandLine.startMaster");
+ try (Scope ignored = span.makeCurrent()) {
// If 'local', defer to LocalHBaseCluster instance. Starts master
// and regionserver both in the one JVM.
if (LocalHBaseCluster.isLocal(conf)) {
@@ -250,9 +254,13 @@ public class HMasterCommandLine extends ServerCommandLine {
master.join();
if (master.isAborted()) throw new RuntimeException("HMaster Aborted");
}
+ span.setStatus(StatusCode.OK);
} catch (Throwable t) {
+ TraceUtil.setError(span, t);
LOG.error("Master exiting", t);
return 1;
+ } finally {
+ span.end();
}
return 0;
}
@@ -310,8 +318,7 @@ public class HMasterCommandLine extends ServerCommandLine {
public static class LocalHMaster extends HMaster {
private MiniZooKeeperCluster zkcluster = null;
- public LocalHMaster(Configuration conf)
- throws IOException, KeeperException, InterruptedException {
+ public LocalHMaster(Configuration conf) throws IOException {
super(conf);
}
diff --git
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java
index 283da5fc0c4..5ecf6a2f6e6 100644
---
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java
+++
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java
@@ -17,6 +17,9 @@
*/
package org.apache.hadoop.hbase.master;
+import io.opentelemetry.api.trace.Span;
+import io.opentelemetry.api.trace.StatusCode;
+import io.opentelemetry.context.Scope;
import java.io.IOException;
import java.io.InterruptedIOException;
import java.util.Collections;
@@ -29,6 +32,7 @@ import org.apache.hadoop.hbase.ServerMetrics;
import org.apache.hadoop.hbase.ServerMetricsBuilder;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.client.VersionInfoUtil;
+import org.apache.hadoop.hbase.trace.TraceUtil;
import org.apache.hadoop.hbase.zookeeper.ZKListener;
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.hadoop.hbase.zookeeper.ZKWatcher;
@@ -184,22 +188,28 @@ public class RegionServerTracker extends ZKListener {
private synchronized void refresh() {
List<String> names;
- try {
- names = ZKUtil.listChildrenAndWatchForNewChildren(watcher,
watcher.getZNodePaths().rsZNode);
- } catch (KeeperException e) {
- // here we need to abort as we failed to set watcher on the rs node
which means that we can
- // not track the node deleted event any more.
- server.abort("Unexpected zk exception getting RS nodes", e);
- return;
- }
- Set<ServerName> newServers = CollectionUtils.isEmpty(names)
- ? Collections.emptySet()
- : names.stream().map(ServerName::parseServerName)
- .collect(Collectors.collectingAndThen(Collectors.toSet(),
Collections::unmodifiableSet));
- if (active) {
- processAsActiveMaster(newServers);
+ final Span span = TraceUtil.createSpan("RegionServerTracker.refresh");
+ try (final Scope ignored = span.makeCurrent()) {
+ try {
+ names = ZKUtil.listChildrenAndWatchForNewChildren(watcher,
watcher.getZNodePaths().rsZNode);
+ } catch (KeeperException e) {
+ // here we need to abort as we failed to set watcher on the rs node
which means that we can
+ // not track the node deleted event any more.
+ server.abort("Unexpected zk exception getting RS nodes", e);
+ return;
+ }
+ Set<ServerName> newServers = CollectionUtils.isEmpty(names)
+ ? Collections.emptySet()
+ : names.stream().map(ServerName::parseServerName)
+ .collect(Collectors.collectingAndThen(Collectors.toSet(),
Collections::unmodifiableSet));
+ if (active) {
+ processAsActiveMaster(newServers);
+ }
+ this.regionServers = newServers;
+ span.setStatus(StatusCode.OK);
+ } finally {
+ span.end();
}
- this.regionServers = newServers;
}
@Override
diff --git
a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
index 97095a8abe4..b5a9aa48e09 100644
---
a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
+++
b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
@@ -23,6 +23,9 @@ import static
org.apache.hadoop.hbase.HConstants.HBASE_SPLIT_WAL_COORDINATED_BY_
import static org.apache.hadoop.hbase.HConstants.HBASE_SPLIT_WAL_MAX_SPLITTER;
import static org.apache.hadoop.hbase.util.DNS.UNSAFE_RS_HOSTNAME_KEY;
+import io.opentelemetry.api.trace.Span;
+import io.opentelemetry.api.trace.StatusCode;
+import io.opentelemetry.context.Scope;
import java.io.IOException;
import java.io.PrintWriter;
import java.lang.management.MemoryType;
@@ -161,6 +164,7 @@ import org.apache.hadoop.hbase.security.User;
import org.apache.hadoop.hbase.security.UserProvider;
import org.apache.hadoop.hbase.security.access.AccessChecker;
import org.apache.hadoop.hbase.security.access.ZKPermissionWatcher;
+import org.apache.hadoop.hbase.trace.TraceUtil;
import org.apache.hadoop.hbase.unsafe.HBasePlatformDependent;
import org.apache.hadoop.hbase.util.Addressing;
import org.apache.hadoop.hbase.util.Bytes;
@@ -589,7 +593,8 @@ public class HRegionServer extends Thread
*/
public HRegionServer(final Configuration conf) throws IOException {
super("RegionServer"); // thread name
- try {
+ final Span span = TraceUtil.createSpan("HRegionServer.cxtor");
+ try (Scope ignored = span.makeCurrent()) {
this.startcode = EnvironmentEdgeManager.currentTime();
this.conf = conf;
this.dataFsOk = true;
@@ -701,11 +706,15 @@ public class HRegionServer extends Thread
this.choreService = new ChoreService(getName(), true);
this.executorService = new ExecutorService(getName());
putUpWebUI();
+ span.setStatus(StatusCode.OK);
} catch (Throwable t) {
// Make sure we log the exception. HRegionServer is often started via
reflection and the
// cause of failed startup is lost.
+ TraceUtil.setError(span, t);
LOG.error("Failed construction RegionServer", t);
throw t;
+ } finally {
+ span.end();
}
}
@@ -920,18 +929,23 @@ public class HRegionServer extends Thread
* In here we just put up the RpcServer, setup Connection, and ZooKeeper.
*/
private void preRegistrationInitialization() {
- try {
+ final Span span =
TraceUtil.createSpan("HRegionServer.preRegistrationInitialization");
+ try (Scope ignored = span.makeCurrent()) {
initializeZooKeeper();
setupClusterConnection();
// Setup RPC client for master communication
this.rpcClient = RpcClientFactory.createClient(conf, clusterId,
new InetSocketAddress(this.rpcServices.isa.getAddress(), 0),
clusterConnection.getConnectionMetrics());
+ span.setStatus(StatusCode.OK);
} catch (Throwable t) {
// Call stop if error or process will stick around for ever since server
// puts up non-daemon threads.
+ TraceUtil.setError(span, t);
this.rpcServices.stop();
abort("Initialization of RS failed. Hence aborting RS.", t);
+ } finally {
+ span.end();
}
}
@@ -1042,35 +1056,39 @@ public class HRegionServer extends Thread
// start up all Services. Use RetryCounter to get backoff in case
Master is struggling to
// come up.
LOG.debug("About to register with Master.");
- RetryCounterFactory rcf =
- new RetryCounterFactory(Integer.MAX_VALUE, this.sleeper.getPeriod(),
1000 * 60 * 5);
- RetryCounter rc = rcf.create();
- while (keepLooping()) {
- RegionServerStartupResponse w = reportForDuty();
- if (w == null) {
- long sleepTime = rc.getBackoffTimeAndIncrementAttempts();
- LOG.warn("reportForDuty failed; sleeping {} ms and then
retrying.", sleepTime);
- this.sleeper.sleep(sleepTime);
- } else {
- handleReportForDutyResponse(w);
- break;
+ TraceUtil.trace(() -> {
+ RetryCounterFactory rcf =
+ new RetryCounterFactory(Integer.MAX_VALUE,
this.sleeper.getPeriod(), 1000 * 60 * 5);
+ RetryCounter rc = rcf.create();
+ while (keepLooping()) {
+ RegionServerStartupResponse w = reportForDuty();
+ if (w == null) {
+ long sleepTime = rc.getBackoffTimeAndIncrementAttempts();
+ LOG.warn("reportForDuty failed; sleeping {} ms and then
retrying.", sleepTime);
+ this.sleeper.sleep(sleepTime);
+ } else {
+ handleReportForDutyResponse(w);
+ break;
+ }
}
- }
+ }, "HRegionServer.registerWithMaster");
}
if (!isStopped() && isHealthy()) {
- // start the snapshot handler and other procedure handlers,
- // since the server is ready to run
- if (this.rspmHost != null) {
- this.rspmHost.start();
- }
- // Start the Quota Manager
- if (this.rsQuotaManager != null) {
- rsQuotaManager.start(getRpcServer().getScheduler());
- }
- if (this.rsSpaceQuotaManager != null) {
- this.rsSpaceQuotaManager.start();
- }
+ TraceUtil.trace(() -> {
+ // start the snapshot handler and other procedure handlers,
+ // since the server is ready to run
+ if (this.rspmHost != null) {
+ this.rspmHost.start();
+ }
+ // Start the Quota Manager
+ if (this.rsQuotaManager != null) {
+ rsQuotaManager.start(getRpcServer().getScheduler());
+ }
+ if (this.rsSpaceQuotaManager != null) {
+ this.rsSpaceQuotaManager.start();
+ }
+ }, "HRegionServer.startup");
}
// We registered with the Master. Go into run mode.
@@ -1121,138 +1139,144 @@ public class HRegionServer extends Thread
}
}
- if (this.leaseManager != null) {
- this.leaseManager.closeAfterLeasesExpire();
- }
- if (this.splitLogWorker != null) {
- splitLogWorker.stop();
- }
- if (this.infoServer != null) {
- LOG.info("Stopping infoServer");
- try {
- this.infoServer.stop();
- } catch (Exception e) {
- LOG.error("Failed to stop infoServer", e);
+ final Span span = TraceUtil.createSpan("HRegionServer exiting main loop");
+ try (Scope ignored = span.makeCurrent()) {
+ if (this.leaseManager != null) {
+ this.leaseManager.closeAfterLeasesExpire();
+ }
+ if (this.splitLogWorker != null) {
+ splitLogWorker.stop();
+ }
+ if (this.infoServer != null) {
+ LOG.info("Stopping infoServer");
+ try {
+ this.infoServer.stop();
+ } catch (Exception e) {
+ LOG.error("Failed to stop infoServer", e);
+ }
+ }
+ // Send cache a shutdown.
+ if (blockCache != null) {
+ blockCache.shutdown();
+ }
+ if (mobFileCache != null) {
+ mobFileCache.shutdown();
}
- }
- // Send cache a shutdown.
- if (blockCache != null) {
- blockCache.shutdown();
- }
- if (mobFileCache != null) {
- mobFileCache.shutdown();
- }
- // Send interrupts to wake up threads if sleeping so they notice shutdown.
- // TODO: Should we check they are alive? If OOME could have exited already
- if (this.hMemManager != null) {
- this.hMemManager.stop();
- }
- if (this.cacheFlusher != null) {
- this.cacheFlusher.interruptIfNecessary();
- }
- if (this.compactSplitThread != null) {
- this.compactSplitThread.interruptIfNecessary();
- }
+ // Send interrupts to wake up threads if sleeping so they notice
shutdown.
+ // TODO: Should we check they are alive? If OOME could have exited
already
+ if (this.hMemManager != null) {
+ this.hMemManager.stop();
+ }
+ if (this.cacheFlusher != null) {
+ this.cacheFlusher.interruptIfNecessary();
+ }
+ if (this.compactSplitThread != null) {
+ this.compactSplitThread.interruptIfNecessary();
+ }
- // Stop the snapshot and other procedure handlers, forcefully killing all
running tasks
- if (rspmHost != null) {
- rspmHost.stop(this.abortRequested.get() || this.killed);
- }
+ // Stop the snapshot and other procedure handlers, forcefully killing
all running tasks
+ if (rspmHost != null) {
+ rspmHost.stop(this.abortRequested.get() || this.killed);
+ }
- if (this.killed) {
- // Just skip out w/o closing regions. Used when testing.
- } else if (abortRequested.get()) {
- if (this.dataFsOk) {
- closeUserRegions(abortRequested.get()); // Don't leave any open file
handles
+ if (this.killed) {
+ // Just skip out w/o closing regions. Used when testing.
+ } else if (abortRequested.get()) {
+ if (this.dataFsOk) {
+ closeUserRegions(abortRequested.get()); // Don't leave any open file
handles
+ }
+ LOG.info("aborting server " + this.serverName);
+ } else {
+ closeUserRegions(abortRequested.get());
+ LOG.info("stopping server " + this.serverName);
}
- LOG.info("aborting server " + this.serverName);
- } else {
- closeUserRegions(abortRequested.get());
- LOG.info("stopping server " + this.serverName);
- }
- if (this.clusterConnection != null && !clusterConnection.isClosed()) {
- try {
- this.clusterConnection.close();
- } catch (IOException e) {
- // Although the {@link Closeable} interface throws an {@link
- // IOException}, in reality, the implementation would never do that.
- LOG.warn("Attempt to close server's short circuit ClusterConnection
failed.", e);
+ if (this.clusterConnection != null && !clusterConnection.isClosed()) {
+ try {
+ this.clusterConnection.close();
+ } catch (IOException e) {
+ // Although the {@link Closeable} interface throws an {@link
+ // IOException}, in reality, the implementation would never do that.
+ LOG.warn("Attempt to close server's short circuit ClusterConnection
failed.", e);
+ }
}
- }
- // Closing the compactSplit thread before closing meta regions
- if (!this.killed && containsMetaTableRegions()) {
- if (!abortRequested.get() || this.dataFsOk) {
- if (this.compactSplitThread != null) {
- this.compactSplitThread.join();
- this.compactSplitThread = null;
+ // Closing the compactSplit thread before closing meta regions
+ if (!this.killed && containsMetaTableRegions()) {
+ if (!abortRequested.get() || this.dataFsOk) {
+ if (this.compactSplitThread != null) {
+ this.compactSplitThread.join();
+ this.compactSplitThread = null;
+ }
+ closeMetaTableRegions(abortRequested.get());
}
- closeMetaTableRegions(abortRequested.get());
}
- }
- if (!this.killed && this.dataFsOk) {
- waitOnAllRegionsToClose(abortRequested.get());
- LOG.info("stopping server " + this.serverName + "; all regions closed.");
- }
+ if (!this.killed && this.dataFsOk) {
+ waitOnAllRegionsToClose(abortRequested.get());
+ LOG.info("stopping server " + this.serverName + "; all regions
closed.");
+ }
- // Stop the quota manager
- if (rsQuotaManager != null) {
- rsQuotaManager.stop();
- }
- if (rsSpaceQuotaManager != null) {
- rsSpaceQuotaManager.stop();
- rsSpaceQuotaManager = null;
- }
+ // Stop the quota manager
+ if (rsQuotaManager != null) {
+ rsQuotaManager.stop();
+ }
+ if (rsSpaceQuotaManager != null) {
+ rsSpaceQuotaManager.stop();
+ rsSpaceQuotaManager = null;
+ }
- // flag may be changed when closing regions throws exception.
- if (this.dataFsOk) {
- shutdownWAL(!abortRequested.get());
- }
+ // flag may be changed when closing regions throws exception.
+ if (this.dataFsOk) {
+ shutdownWAL(!abortRequested.get());
+ }
- // Make sure the proxy is down.
- if (this.rssStub != null) {
- this.rssStub = null;
- }
- if (this.lockStub != null) {
- this.lockStub = null;
- }
- if (this.rpcClient != null) {
- this.rpcClient.close();
- }
- if (this.leaseManager != null) {
- this.leaseManager.close();
- }
- if (this.pauseMonitor != null) {
- this.pauseMonitor.stop();
- }
+ // Make sure the proxy is down.
+ if (this.rssStub != null) {
+ this.rssStub = null;
+ }
+ if (this.lockStub != null) {
+ this.lockStub = null;
+ }
+ if (this.rpcClient != null) {
+ this.rpcClient.close();
+ }
+ if (this.leaseManager != null) {
+ this.leaseManager.close();
+ }
+ if (this.pauseMonitor != null) {
+ this.pauseMonitor.stop();
+ }
- if (!killed) {
- stopServiceThreads();
- }
+ if (!killed) {
+ stopServiceThreads();
+ }
- if (this.rpcServices != null) {
- this.rpcServices.stop();
- }
+ if (this.rpcServices != null) {
+ this.rpcServices.stop();
+ }
- try {
- deleteMyEphemeralNode();
- } catch (KeeperException.NoNodeException nn) {
- // pass
- } catch (KeeperException e) {
- LOG.warn("Failed deleting my ephemeral node", e);
- }
- // We may have failed to delete the znode at the previous step, but
- // we delete the file anyway: a second attempt to delete the znode is
likely to fail again.
- ZNodeClearer.deleteMyEphemeralNodeOnDisk();
+ try {
+ deleteMyEphemeralNode();
+ } catch (KeeperException.NoNodeException nn) {
+ // pass
+ } catch (KeeperException e) {
+ LOG.warn("Failed deleting my ephemeral node", e);
+ }
+ // We may have failed to delete the znode at the previous step, but
+ // we delete the file anyway: a second attempt to delete the znode is
likely to fail again.
+ ZNodeClearer.deleteMyEphemeralNodeOnDisk();
- if (this.zooKeeper != null) {
- this.zooKeeper.close();
+ if (this.zooKeeper != null) {
+ this.zooKeeper.close();
+ }
+ this.shutDown = true;
+ LOG.info("Exiting; stopping=" + this.serverName + "; zookeeper
connection closed.");
+ span.setStatus(StatusCode.OK);
+ } finally {
+ span.end();
}
- this.shutDown = true;
- LOG.info("Exiting; stopping=" + this.serverName + "; zookeeper connection
closed.");
}
private boolean containsMetaTableRegions() {
@@ -1293,23 +1317,29 @@ public class HRegionServer extends Thread
return;
}
ClusterStatusProtos.ServerLoad sl = buildServerLoad(reportStartTime,
reportEndTime);
- try {
+ final Span span =
TraceUtil.createSpan("HRegionServer.tryRegionServerReport");
+ try (Scope ignored = span.makeCurrent()) {
RegionServerReportRequest.Builder request =
RegionServerReportRequest.newBuilder();
request.setServer(ProtobufUtil.toServerName(this.serverName));
request.setLoad(sl);
rss.regionServerReport(null, request.build());
+ span.setStatus(StatusCode.OK);
} catch (ServiceException se) {
IOException ioe = ProtobufUtil.getRemoteException(se);
if (ioe instanceof YouAreDeadException) {
// This will be caught and handled as a fatal error in run()
+ TraceUtil.setError(span, ioe);
throw ioe;
}
if (rssStub == rss) {
rssStub = null;
}
+ TraceUtil.setError(span, se);
// Couldn't connect to the master, get location from zk and reconnect
// Method blocks until new master is found or we are stopped
createRegionServerStatusStub(true);
+ } finally {
+ span.end();
}
}
diff --git
a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServerCommandLine.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServerCommandLine.java
index 3b21171c28e..db74380d2ed 100644
---
a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServerCommandLine.java
+++
b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServerCommandLine.java
@@ -17,9 +17,13 @@
*/
package org.apache.hadoop.hbase.regionserver;
+import io.opentelemetry.api.trace.Span;
+import io.opentelemetry.api.trace.StatusCode;
+import io.opentelemetry.context.Scope;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.LocalHBaseCluster;
+import org.apache.hadoop.hbase.trace.TraceUtil;
import org.apache.hadoop.hbase.util.ServerCommandLine;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
@@ -47,7 +51,8 @@ public class HRegionServerCommandLine extends
ServerCommandLine {
private int start() throws Exception {
Configuration conf = getConf();
- try {
+ final Span span = TraceUtil.createSpan("HRegionServerCommandLine.start");
+ try (Scope ignored = span.makeCurrent()) {
// If 'local', don't start a region server here. Defer to
// LocalHBaseCluster. It manages 'local' clusters.
if (LocalHBaseCluster.isLocal(conf)) {
@@ -62,9 +67,13 @@ public class HRegionServerCommandLine extends
ServerCommandLine {
throw new RuntimeException("HRegionServer Aborted");
}
}
+ span.setStatus(StatusCode.OK);
} catch (Throwable t) {
+ TraceUtil.setError(span, t);
LOG.error("Region server exiting", t);
return 1;
+ } finally {
+ span.end();
}
return 0;
}
diff --git
a/hbase-server/src/test/java/org/apache/hadoop/hbase/TestServerInternalsTracing.java
b/hbase-server/src/test/java/org/apache/hadoop/hbase/TestServerInternalsTracing.java
new file mode 100644
index 00000000000..ff010141f81
--- /dev/null
+++
b/hbase-server/src/test/java/org/apache/hadoop/hbase/TestServerInternalsTracing.java
@@ -0,0 +1,297 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase;
+
+import static
org.apache.hadoop.hbase.client.trace.hamcrest.SpanDataMatchers.hasName;
+import static
org.apache.hadoop.hbase.client.trace.hamcrest.SpanDataMatchers.hasParentSpanId;
+import static
org.apache.hadoop.hbase.client.trace.hamcrest.SpanDataMatchers.hasStatusWithCode;
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.allOf;
+import static org.hamcrest.Matchers.containsString;
+import static org.hamcrest.Matchers.endsWith;
+import static org.hamcrest.Matchers.hasItem;
+import static org.hamcrest.Matchers.isOneOf;
+import static org.hamcrest.Matchers.startsWith;
+
+import io.opentelemetry.api.trace.StatusCode;
+import io.opentelemetry.sdk.trace.data.SpanData;
+import java.util.List;
+import java.util.function.Supplier;
+import org.apache.hadoop.hbase.client.trace.StringTraceRenderer;
+import org.apache.hadoop.hbase.testclassification.MediumTests;
+import org.apache.hadoop.hbase.testclassification.RegionServerTests;
+import org.apache.hadoop.hbase.trace.OpenTelemetryClassRule;
+import org.hamcrest.Matcher;
+import org.junit.ClassRule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.junit.rules.ExternalResource;
+import org.junit.rules.RuleChain;
+import org.junit.rules.TestRule;
+import org.junit.runners.model.Statement;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Test that sundry operations internal to the region server are traced as
expected.
+ */
+@Category({ MediumTests.class, RegionServerTests.class })
+public class TestServerInternalsTracing {
+ private static final Logger LOG =
LoggerFactory.getLogger(TestServerInternalsTracing.class);
+
+ @ClassRule
+ public static final HBaseClassTestRule CLASS_RULE =
+ HBaseClassTestRule.forClass(TestServerInternalsTracing.class);
+
+ private static final String NO_PARENT_ID = "0000000000000000";
+ private static List<SpanData> spans;
+
+ /**
+ * Wait for the underlying cluster to come up -- defined by meta being
available.
+ */
+ private static class Setup extends ExternalResource {
+ private final Supplier<HBaseTestingUtility> testingUtilSupplier;
+
+ public Setup(final Supplier<HBaseTestingUtility> testingUtilSupplier) {
+ this.testingUtilSupplier = testingUtilSupplier;
+ }
+
+ @Override
+ protected void before() throws Throwable {
+ final HBaseTestingUtility testingUtil = testingUtilSupplier.get();
+ testingUtil.waitTableAvailable(TableName.META_TABLE_NAME);
+ }
+ }
+
+ private static class Noop extends Statement {
+ @Override
+ public void evaluate() throws Throwable {
+ }
+ }
+
+ @ClassRule
+ public static TestRule classRule = (base, description) -> new Statement() {
+ @Override
+ public void evaluate() throws Throwable {
+ // setup and tear down the cluster, collecting all the spans produced in
the process.
+ final OpenTelemetryClassRule otelClassRule =
OpenTelemetryClassRule.create();
+ final MiniClusterRule miniClusterRule =
MiniClusterRule.newBuilder().build();
+ final Setup setup = new Setup(miniClusterRule::getTestingUtility);
+ final TestRule clusterRule =
+
RuleChain.outerRule(otelClassRule).around(miniClusterRule).around(setup);
+ clusterRule.apply(new Noop(), description).evaluate();
+ spans = otelClassRule.getSpans();
+ if (LOG.isDebugEnabled()) {
+ StringTraceRenderer renderer = new StringTraceRenderer(spans);
+ renderer.render(LOG::debug);
+ }
+ base.evaluate();
+ }
+ };
+
+ @Test
+ public void testHMasterConstructor() {
+ final Matcher<SpanData> masterConstructorMatcher =
allOf(hasName("HMaster.cxtor"),
+ hasParentSpanId(NO_PARENT_ID), hasStatusWithCode(isOneOf(StatusCode.OK,
StatusCode.ERROR)));
+ assertThat("there should be a span from the HMaster constructor.", spans,
+ hasItem(masterConstructorMatcher));
+ final SpanData masterConstructorSpan =
spans.stream().filter(masterConstructorMatcher::matches)
+ .findAny().orElseThrow(AssertionError::new);
+ assertThat("the HMaster constructor span should show zookeeper
interaction.", spans, hasItem(
+ allOf(hasName(startsWith("RecoverableZookeeper.")),
hasParentSpanId(masterConstructorSpan))));
+ }
+
+ @Test
+ public void testHMasterBecomeActiveMaster() {
+ final Matcher<SpanData> masterBecomeActiveMasterMatcher =
+ allOf(hasName("HMaster.becomeActiveMaster"),
hasParentSpanId(NO_PARENT_ID),
+ hasStatusWithCode(isOneOf(StatusCode.OK, StatusCode.ERROR)));
+ assertThat("there should be a span from the HMaster.becomeActiveMaster.",
spans,
+ hasItem(masterBecomeActiveMasterMatcher));
+ final SpanData masterBecomeActiveMasterSpan = spans.stream()
+
.filter(masterBecomeActiveMasterMatcher::matches).findAny().orElseThrow(AssertionError::new);
+ assertThat("the HMaster.becomeActiveMaster span should show zookeeper
interaction.", spans,
+ hasItem(allOf(hasName(startsWith("RecoverableZookeeper.")),
+ hasParentSpanId(masterBecomeActiveMasterSpan))));
+ assertThat("the HMaster.becomeActiveMaster span should show Region
interaction.", spans,
+ hasItem(
+ allOf(hasName(startsWith("Region.")),
hasParentSpanId(masterBecomeActiveMasterSpan))));
+ assertThat("the HMaster.becomeActiveMaster span should show RegionScanner
interaction.", spans,
+ hasItem(allOf(hasName(startsWith("RegionScanner.")),
+ hasParentSpanId(masterBecomeActiveMasterSpan))));
+ assertThat("the HMaster.becomeActiveMaster span should show hbase:meta
interaction.", spans,
+ hasItem(allOf(hasName(containsString("hbase:meta")),
+ hasParentSpanId(masterBecomeActiveMasterSpan))));
+ assertThat("the HMaster.becomeActiveMaster span should show WAL
interaction.", spans,
+ hasItem(allOf(hasName(startsWith("WAL.")),
hasParentSpanId(masterBecomeActiveMasterSpan))));
+ }
+
+ @Test
+ public void testZKWatcherHMaster() {
+ final Matcher<SpanData> mZKWatcherMatcher =
allOf(hasName(startsWith("ZKWatcher-master")),
+ hasParentSpanId(NO_PARENT_ID), hasStatusWithCode(isOneOf(StatusCode.OK,
StatusCode.ERROR)));
+ assertThat("there should be a span from the ZKWatcher running in the
HMaster.", spans,
+ hasItem(mZKWatcherMatcher));
+ final SpanData mZKWatcherSpan =
+
spans.stream().filter(mZKWatcherMatcher::matches).findAny().orElseThrow(AssertionError::new);
+ assertThat("the ZKWatcher running in the HMaster span should invoke
processEvent.", spans,
+ hasItem(allOf(hasName(containsString("processEvent")),
hasParentSpanId(mZKWatcherSpan))));
+ }
+
+ @Test
+ public void testHMasterShutdown() {
+ final Matcher<SpanData> masterShutdownMatcher =
allOf(hasName("HMaster.shutdown"),
+ hasParentSpanId(NO_PARENT_ID), hasStatusWithCode(isOneOf(StatusCode.OK,
StatusCode.ERROR)));
+ assertThat("there should be a span from the HMaster.shutdown.", spans,
+ hasItem(masterShutdownMatcher));
+ final SpanData masterShutdownSpan =
spans.stream().filter(masterShutdownMatcher::matches)
+ .findAny().orElseThrow(AssertionError::new);
+ assertThat("the HMaster.shutdown span should show zookeeper interaction.",
spans, hasItem(
+ allOf(hasName(startsWith("RecoverableZookeeper.")),
hasParentSpanId(masterShutdownSpan))));
+ assertThat(
+ "the HMaster.shutdown span should show ShortCircuitingClusterConnection
interaction.", spans,
+ hasItem(allOf(hasName(startsWith("ShortCircuitingClusterConnection.")),
+ hasParentSpanId(masterShutdownSpan))));
+ }
+
+ @Test
+ public void testHMasterExitingMainLoop() {
+ final Matcher<SpanData> masterExitingMainLoopMatcher =
+ allOf(hasName("HMaster exiting main loop"),
hasParentSpanId(NO_PARENT_ID),
+ hasStatusWithCode(isOneOf(StatusCode.OK, StatusCode.ERROR)));
+ assertThat("there should be a span from the HMaster exiting main loop.",
spans,
+ hasItem(masterExitingMainLoopMatcher));
+ final SpanData masterExitingMainLoopSpan = spans.stream()
+
.filter(masterExitingMainLoopMatcher::matches).findAny().orElseThrow(AssertionError::new);
+ assertThat("the HMaster exiting main loop span should show HTable
interaction.", spans,
+ hasItem(allOf(hasName(startsWith("HTable.")),
hasParentSpanId(masterExitingMainLoopSpan))));
+ }
+
+ @Test
+ public void testTryRegionServerReport() {
+ final Matcher<SpanData> tryRegionServerReportMatcher =
+ allOf(hasName("HRegionServer.tryRegionServerReport"),
hasParentSpanId(NO_PARENT_ID),
+ hasStatusWithCode(isOneOf(StatusCode.OK, StatusCode.ERROR)));
+ assertThat("there should be a span for the region server sending a
report.", spans,
+ hasItem(tryRegionServerReportMatcher));
+ final SpanData tryRegionServerReportSpan = spans.stream()
+
.filter(tryRegionServerReportMatcher::matches).findAny().orElseThrow(AssertionError::new);
+ assertThat(
+ "the region server report span should have an invocation of the
RegionServerReport RPC.",
+ spans,
hasItem(allOf(hasName(endsWith("RegionServerStatusService/RegionServerReport")),
+ hasParentSpanId(tryRegionServerReportSpan))));
+ }
+
+ @Test
+ public void testHRegionServerStartup() {
+ final Matcher<SpanData> regionServerStartupMatcher =
allOf(hasName("HRegionServer.startup"),
+ hasParentSpanId(NO_PARENT_ID), hasStatusWithCode(isOneOf(StatusCode.OK,
StatusCode.ERROR)));
+ assertThat("there should be a span from the HRegionServer startup
procedure.", spans,
+ hasItem(regionServerStartupMatcher));
+ final SpanData regionServerStartupSpan = spans.stream()
+
.filter(regionServerStartupMatcher::matches).findAny().orElseThrow(AssertionError::new);
+ assertThat("the HRegionServer startup procedure span should show zookeeper
interaction.", spans,
+ hasItem(allOf(hasName(startsWith("RecoverableZookeeper.")),
+ hasParentSpanId(regionServerStartupSpan))));
+ }
+
+ @Test
+ public void testHRegionServerConstructor() {
+ final Matcher<SpanData> rsConstructorMatcher =
allOf(hasName("HRegionServer.cxtor"),
+ hasParentSpanId(NO_PARENT_ID), hasStatusWithCode(isOneOf(StatusCode.OK,
StatusCode.ERROR)));
+ assertThat("there should be a span from the HRegionServer constructor.",
spans,
+ hasItem(rsConstructorMatcher));
+ final SpanData rsConstructorSpan =
spans.stream().filter(rsConstructorMatcher::matches)
+ .findAny().orElseThrow(AssertionError::new);
+ assertThat("the HRegionServer constructor span should show zookeeper
interaction.", spans,
+ hasItem(
+ allOf(hasName(startsWith("RecoverableZookeeper.")),
hasParentSpanId(rsConstructorSpan))));
+ assertThat("the HRegionServer constructor span should invoke the
MasterAddressTracker.", spans,
+ hasItem(
+ allOf(hasName(startsWith("MasterAddressTracker.")),
hasParentSpanId(rsConstructorSpan))));
+ }
+
+ @Test
+ public void testHRegionServerPreRegistrationInitialization() {
+ final Matcher<SpanData> rsPreRegistrationInitializationMatcher =
+ allOf(hasName("HRegionServer.preRegistrationInitialization"),
hasParentSpanId(NO_PARENT_ID),
+ hasStatusWithCode(isOneOf(StatusCode.OK, StatusCode.ERROR)));
+ assertThat("there should be a span from the HRegionServer
preRegistrationInitialization.",
+ spans, hasItem(rsPreRegistrationInitializationMatcher));
+ final SpanData rsPreRegistrationInitializationSpan =
+
spans.stream().filter(rsPreRegistrationInitializationMatcher::matches).findAny()
+ .orElseThrow(AssertionError::new);
+ assertThat(
+ "the HRegionServer preRegistrationInitialization span should show
zookeeper interaction.",
+ spans, hasItem(allOf(hasName(startsWith("RecoverableZookeeper.")),
+ hasParentSpanId(rsPreRegistrationInitializationSpan))));
+ }
+
+ @Test
+ public void testHRegionServerRegisterWithMaster() {
+ final Matcher<SpanData> rsRegisterWithMasterMatcher =
+ allOf(hasName("HRegionServer.registerWithMaster"),
hasParentSpanId(NO_PARENT_ID),
+ hasStatusWithCode(isOneOf(StatusCode.OK, StatusCode.ERROR)));
+ assertThat("there should be a span from the HRegionServer
registerWithMaster.", spans,
+ hasItem(rsRegisterWithMasterMatcher));
+ final SpanData rsRegisterWithMasterSpan = spans.stream()
+
.filter(rsRegisterWithMasterMatcher::matches).findAny().orElseThrow(AssertionError::new);
+ assertThat("the HRegionServer registerWithMaster span should show
zookeeper interaction.",
+ spans, hasItem(allOf(hasName(startsWith("RecoverableZookeeper.")),
+ hasParentSpanId(rsRegisterWithMasterSpan))));
+ assertThat(
+ "the HRegionServer registerWithMaster span should have an invocation of
the"
+ + " RegionServerStartup RPC.",
+ spans,
hasItem(allOf(hasName(endsWith("RegionServerStatusService/RegionServerStartup")),
+ hasParentSpanId(rsRegisterWithMasterSpan))));
+ }
+
+ @Test
+ public void testZKWatcherRegionServer() {
+ final Matcher<SpanData> rsZKWatcherMatcher =
+ allOf(hasName(startsWith("ZKWatcher-regionserver")),
hasParentSpanId(NO_PARENT_ID),
+ hasStatusWithCode(isOneOf(StatusCode.OK, StatusCode.ERROR)));
+ assertThat("there should be a span from the ZKWatcher running in the
HRegionServer.", spans,
+ hasItem(rsZKWatcherMatcher));
+ final SpanData rsZKWatcherSpan =
+
spans.stream().filter(rsZKWatcherMatcher::matches).findAny().orElseThrow(AssertionError::new);
+ assertThat("the ZKWatcher running in the HRegionServer span should invoke
processEvent.", spans,
+ hasItem(allOf(hasName(containsString("processEvent")),
hasParentSpanId(rsZKWatcherSpan))));
+ }
+
+ @Test
+ public void testHRegionServerExitingMainLoop() {
+ final Matcher<SpanData> rsExitingMainLoopMatcher =
+ allOf(hasName("HRegionServer exiting main loop"),
hasParentSpanId(NO_PARENT_ID),
+ hasStatusWithCode(isOneOf(StatusCode.OK, StatusCode.ERROR)));
+ assertThat("there should be a span from the HRegionServer exiting main
loop.", spans,
+ hasItem(rsExitingMainLoopMatcher));
+ final SpanData rsExitingMainLoopSpan =
spans.stream().filter(rsExitingMainLoopMatcher::matches)
+ .findAny().orElseThrow(AssertionError::new);
+ assertThat("the HRegionServer exiting main loop span should show zookeeper
interaction.", spans,
+ hasItem(allOf(hasName(startsWith("RecoverableZookeeper.")),
+ hasParentSpanId(rsExitingMainLoopSpan))));
+ assertThat(
+ "the HRegionServer exiting main loop span should show "
+ + "ShortCircuitingClusterConnection interaction.",
+ spans,
hasItem(allOf(hasName(startsWith("ShortCircuitingClusterConnection.")),
+ hasParentSpanId(rsExitingMainLoopSpan))));
+ assertThat("the HRegionServer exiting main loop span should invoke
CloseMetaHandler.", spans,
+ hasItem(allOf(hasName("CloseMetaHandler"),
hasParentSpanId(rsExitingMainLoopSpan))));
+ }
+}
diff --git
a/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/MasterAddressTracker.java
b/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/MasterAddressTracker.java
index 649178f7a01..840ee2d215f 100644
---
a/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/MasterAddressTracker.java
+++
b/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/MasterAddressTracker.java
@@ -27,6 +27,7 @@ import org.apache.hadoop.hbase.Abortable;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.exceptions.DeserializationException;
+import org.apache.hadoop.hbase.trace.TraceUtil;
import org.apache.yetus.audience.InterfaceAudience;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.data.Stat;
@@ -71,11 +72,13 @@ public class MasterAddressTracker extends ZKNodeTracker {
}
private void loadBackupMasters() {
- try {
- backupMasters =
Collections.unmodifiableList(getBackupMastersAndRenewWatch(watcher));
- } catch (InterruptedIOException e) {
- abortable.abort("Unexpected exception handling nodeChildrenChanged
event", e);
- }
+ TraceUtil.trace(() -> {
+ try {
+ backupMasters =
Collections.unmodifiableList(getBackupMastersAndRenewWatch(watcher));
+ } catch (InterruptedIOException e) {
+ abortable.abort("Unexpected exception handling nodeChildrenChanged
event", e);
+ }
+ }, "MasterAddressTracker.loadBackupMasters");
}
@Override
diff --git
a/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java
b/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java
index 017679e08bd..622a4f49939 100644
---
a/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java
+++
b/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java
@@ -18,6 +18,7 @@
package org.apache.hadoop.hbase.zookeeper;
import io.opentelemetry.api.trace.Span;
+import io.opentelemetry.api.trace.StatusCode;
import io.opentelemetry.context.Scope;
import java.io.IOException;
import java.lang.management.ManagementFactory;
@@ -201,14 +202,15 @@ public class RecoverableZooKeeper {
* throw NoNodeException if the path does not exist.
*/
public void delete(String path, int version) throws InterruptedException,
KeeperException {
- Span span =
TraceUtil.getGlobalTracer().spanBuilder("RecoverableZookeeper.delete").startSpan();
- try (Scope scope = span.makeCurrent()) {
+ final Span span = TraceUtil.createSpan("RecoverableZookeeper.delete");
+ try (Scope ignored = span.makeCurrent()) {
RetryCounter retryCounter = retryCounterFactory.create();
boolean isRetry = false; // False for first attempt, true for all
retries.
while (true) {
try {
long startTime = EnvironmentEdgeManager.currentTime();
checkZk().delete(path, version);
+ span.setStatus(StatusCode.OK);
return;
} catch (KeeperException e) {
switch (e.code()) {
@@ -216,18 +218,22 @@ public class RecoverableZooKeeper {
if (isRetry) {
LOG.debug(
"Node " + path + " already deleted. Assuming a " + "previous
attempt succeeded.");
+ span.setStatus(StatusCode.OK);
return;
}
LOG.debug("Node {} already deleted, retry={}", path, isRetry);
+ TraceUtil.setError(span, e);
throw e;
case CONNECTIONLOSS:
case OPERATIONTIMEOUT:
case REQUESTTIMEOUT:
+ TraceUtil.setError(span, e);
retryOrThrow(retryCounter, e, "delete");
break;
default:
+ TraceUtil.setError(span, e);
throw e;
}
}
@@ -244,23 +250,26 @@ public class RecoverableZooKeeper {
* @return A Stat instance
*/
public Stat exists(String path, Watcher watcher) throws KeeperException,
InterruptedException {
- Span span =
TraceUtil.getGlobalTracer().spanBuilder("RecoverableZookeeper.exists").startSpan();
- try (Scope scope = span.makeCurrent()) {
+ final Span span = TraceUtil.createSpan("RecoverableZookeeper.exists");
+ try (Scope ignored = span.makeCurrent()) {
RetryCounter retryCounter = retryCounterFactory.create();
while (true) {
try {
long startTime = EnvironmentEdgeManager.currentTime();
Stat nodeStat = checkZk().exists(path, watcher);
+ span.setStatus(StatusCode.OK);
return nodeStat;
} catch (KeeperException e) {
switch (e.code()) {
case CONNECTIONLOSS:
case OPERATIONTIMEOUT:
case REQUESTTIMEOUT:
+ TraceUtil.setError(span, e);
retryOrThrow(retryCounter, e, "exists");
break;
default:
+ TraceUtil.setError(span, e);
throw e;
}
}
@@ -276,24 +285,28 @@ public class RecoverableZooKeeper {
* @return A Stat instance
*/
public Stat exists(String path, boolean watch) throws KeeperException,
InterruptedException {
- Span span =
TraceUtil.getGlobalTracer().spanBuilder("RecoverableZookeeper.exists").startSpan();
- try (Scope scope = span.makeCurrent()) {
+ Span span = TraceUtil.createSpan("RecoverableZookeeper.exists");
+ try (Scope ignored = span.makeCurrent()) {
RetryCounter retryCounter = retryCounterFactory.create();
while (true) {
try {
long startTime = EnvironmentEdgeManager.currentTime();
Stat nodeStat = checkZk().exists(path, watch);
+ span.setStatus(StatusCode.OK);
return nodeStat;
} catch (KeeperException e) {
switch (e.code()) {
case CONNECTIONLOSS:
+ TraceUtil.setError(span, e);
retryOrThrow(retryCounter, e, "exists");
break;
case OPERATIONTIMEOUT:
+ TraceUtil.setError(span, e);
retryOrThrow(retryCounter, e, "exists");
break;
default:
+ TraceUtil.setError(span, e);
throw e;
}
}
@@ -319,24 +332,26 @@ public class RecoverableZooKeeper {
*/
public List<String> getChildren(String path, Watcher watcher)
throws KeeperException, InterruptedException {
- Span span =
-
TraceUtil.getGlobalTracer().spanBuilder("RecoverableZookeeper.getChildren").startSpan();
- try (Scope scope = span.makeCurrent()) {
+ final Span span = TraceUtil.createSpan("RecoverableZookeeper.getChildren");
+ try (Scope ignored = span.makeCurrent()) {
RetryCounter retryCounter = retryCounterFactory.create();
while (true) {
try {
long startTime = EnvironmentEdgeManager.currentTime();
List<String> children = checkZk().getChildren(path, watcher);
+ span.setStatus(StatusCode.OK);
return children;
} catch (KeeperException e) {
switch (e.code()) {
case CONNECTIONLOSS:
case OPERATIONTIMEOUT:
case REQUESTTIMEOUT:
+ TraceUtil.setError(span, e);
retryOrThrow(retryCounter, e, "getChildren");
break;
default:
+ TraceUtil.setError(span, e);
throw e;
}
}
@@ -353,25 +368,28 @@ public class RecoverableZooKeeper {
*/
public List<String> getChildren(String path, boolean watch)
throws KeeperException, InterruptedException {
- Span span =
-
TraceUtil.getGlobalTracer().spanBuilder("RecoverableZookeeper.getChildren").startSpan();
- try (Scope scope = span.makeCurrent()) {
+ Span span = TraceUtil.createSpan("RecoverableZookeeper.getChildren");
+ try (Scope ignored = span.makeCurrent()) {
RetryCounter retryCounter = retryCounterFactory.create();
while (true) {
try {
long startTime = EnvironmentEdgeManager.currentTime();
List<String> children = checkZk().getChildren(path, watch);
+ span.setStatus(StatusCode.OK);
return children;
} catch (KeeperException e) {
switch (e.code()) {
case CONNECTIONLOSS:
+ TraceUtil.setError(span, e);
retryOrThrow(retryCounter, e, "getChildren");
break;
case OPERATIONTIMEOUT:
+ TraceUtil.setError(span, e);
retryOrThrow(retryCounter, e, "getChildren");
break;
default:
+ TraceUtil.setError(span, e);
throw e;
}
}
@@ -387,23 +405,26 @@ public class RecoverableZooKeeper {
*/
public byte[] getData(String path, Watcher watcher, Stat stat)
throws KeeperException, InterruptedException {
- Span span =
TraceUtil.getGlobalTracer().spanBuilder("RecoverableZookeeper.getData").startSpan();
- try (Scope scope = span.makeCurrent()) {
+ final Span span = TraceUtil.createSpan("RecoverableZookeeper.getData");
+ try (Scope ignored = span.makeCurrent()) {
RetryCounter retryCounter = retryCounterFactory.create();
while (true) {
try {
long startTime = EnvironmentEdgeManager.currentTime();
byte[] revData = checkZk().getData(path, watcher, stat);
+ span.setStatus(StatusCode.OK);
return ZKMetadata.removeMetaData(revData);
} catch (KeeperException e) {
switch (e.code()) {
case CONNECTIONLOSS:
case OPERATIONTIMEOUT:
case REQUESTTIMEOUT:
+ TraceUtil.setError(span, e);
retryOrThrow(retryCounter, e, "getData");
break;
default:
+ TraceUtil.setError(span, e);
throw e;
}
}
@@ -426,17 +447,21 @@ public class RecoverableZooKeeper {
try {
long startTime = EnvironmentEdgeManager.currentTime();
byte[] revData = checkZk().getData(path, watch, stat);
+ span.setStatus(StatusCode.OK);
return ZKMetadata.removeMetaData(revData);
} catch (KeeperException e) {
switch (e.code()) {
case CONNECTIONLOSS:
+ TraceUtil.setError(span, e);
retryOrThrow(retryCounter, e, "getData");
break;
case OPERATIONTIMEOUT:
+ TraceUtil.setError(span, e);
retryOrThrow(retryCounter, e, "getData");
break;
default:
+ TraceUtil.setError(span, e);
throw e;
}
}
@@ -455,8 +480,8 @@ public class RecoverableZooKeeper {
*/
public Stat setData(String path, byte[] data, int version)
throws KeeperException, InterruptedException {
- Span span =
TraceUtil.getGlobalTracer().spanBuilder("RecoverableZookeeper.setData").startSpan();
- try (Scope scope = span.makeCurrent()) {
+ final Span span = TraceUtil.createSpan("RecoverableZookeeper.setData");
+ try (Scope ignored = span.makeCurrent()) {
RetryCounter retryCounter = retryCounterFactory.create();
byte[] newData = ZKMetadata.appendMetaData(id, data);
boolean isRetry = false;
@@ -465,12 +490,14 @@ public class RecoverableZooKeeper {
try {
startTime = EnvironmentEdgeManager.currentTime();
Stat nodeStat = checkZk().setData(path, newData, version);
+ span.setStatus(StatusCode.OK);
return nodeStat;
} catch (KeeperException e) {
switch (e.code()) {
case CONNECTIONLOSS:
case OPERATIONTIMEOUT:
case REQUESTTIMEOUT:
+ TraceUtil.setError(span, e);
retryOrThrow(retryCounter, e, "setData");
break;
case BADVERSION:
@@ -481,15 +508,18 @@ public class RecoverableZooKeeper {
byte[] revData = checkZk().getData(path, false, stat);
if (Bytes.compareTo(revData, newData) == 0) {
// the bad version is caused by previous successful setData
+ span.setStatus(StatusCode.OK);
return stat;
}
} catch (KeeperException keeperException) {
// the ZK is not reliable at this moment. just throwing
exception
+ TraceUtil.setError(span, keeperException);
throw keeperException;
}
}
// throw other exceptions and verified bad version exceptions
default:
+ TraceUtil.setError(span, e);
throw e;
}
}
@@ -506,23 +536,26 @@ public class RecoverableZooKeeper {
* @return list of ACLs
*/
public List<ACL> getAcl(String path, Stat stat) throws KeeperException,
InterruptedException {
- Span span =
TraceUtil.getGlobalTracer().spanBuilder("RecoverableZookeeper.getAcl").startSpan();
- try (Scope scope = span.makeCurrent()) {
+ final Span span = TraceUtil.createSpan("RecoverableZookeeper.getAcl");
+ try (Scope ignored = span.makeCurrent()) {
RetryCounter retryCounter = retryCounterFactory.create();
while (true) {
try {
long startTime = EnvironmentEdgeManager.currentTime();
List<ACL> nodeACL = checkZk().getACL(path, stat);
+ span.setStatus(StatusCode.OK);
return nodeACL;
} catch (KeeperException e) {
switch (e.code()) {
case CONNECTIONLOSS:
case OPERATIONTIMEOUT:
case REQUESTTIMEOUT:
+ TraceUtil.setError(span, e);
retryOrThrow(retryCounter, e, "getAcl");
break;
default:
+ TraceUtil.setError(span, e);
throw e;
}
}
@@ -539,22 +572,25 @@ public class RecoverableZooKeeper {
*/
public Stat setAcl(String path, List<ACL> acls, int version)
throws KeeperException, InterruptedException {
- Span span =
TraceUtil.getGlobalTracer().spanBuilder("RecoverableZookeeper.setAcl").startSpan();
- try (Scope scope = span.makeCurrent()) {
+ final Span span = TraceUtil.createSpan("RecoverableZookeeper.setAcl");
+ try (Scope ignored = span.makeCurrent()) {
RetryCounter retryCounter = retryCounterFactory.create();
while (true) {
try {
long startTime = EnvironmentEdgeManager.currentTime();
Stat nodeStat = checkZk().setACL(path, acls, version);
+ span.setStatus(StatusCode.OK);
return nodeStat;
} catch (KeeperException e) {
switch (e.code()) {
case CONNECTIONLOSS:
case OPERATIONTIMEOUT:
+ TraceUtil.setError(span, e);
retryOrThrow(retryCounter, e, "setAcl");
break;
default:
+ TraceUtil.setError(span, e);
throw e;
}
}
@@ -578,20 +614,25 @@ public class RecoverableZooKeeper {
*/
public String create(String path, byte[] data, List<ACL> acl, CreateMode
createMode)
throws KeeperException, InterruptedException {
- Span span =
TraceUtil.getGlobalTracer().spanBuilder("RecoverableZookeeper.create").startSpan();
- try (Scope scope = span.makeCurrent()) {
+ final Span span = TraceUtil.createSpan("RecoverableZookeeper.create");
+ try (Scope ignored = span.makeCurrent()) {
byte[] newData = ZKMetadata.appendMetaData(id, data);
switch (createMode) {
case EPHEMERAL:
case PERSISTENT:
+ span.setStatus(StatusCode.OK);
return createNonSequential(path, newData, acl, createMode);
case EPHEMERAL_SEQUENTIAL:
case PERSISTENT_SEQUENTIAL:
+ span.setStatus(StatusCode.OK);
return createSequential(path, newData, acl, createMode);
default:
- throw new IllegalArgumentException("Unrecognized CreateMode: " +
createMode);
+ final IllegalArgumentException e =
+ new IllegalArgumentException("Unrecognized CreateMode: " +
createMode);
+ TraceUtil.setError(span, e);
+ throw e;
}
} finally {
span.end();
@@ -709,24 +750,27 @@ public class RecoverableZooKeeper {
* Run multiple operations in a transactional manner. Retry before throwing
exception
*/
public List<OpResult> multi(Iterable<Op> ops) throws KeeperException,
InterruptedException {
- Span span =
TraceUtil.getGlobalTracer().spanBuilder("RecoverableZookeeper.multi").startSpan();
- try (Scope scope = span.makeCurrent()) {
+ final Span span = TraceUtil.createSpan("RecoverableZookeeper.multi");
+ try (Scope ignored = span.makeCurrent()) {
RetryCounter retryCounter = retryCounterFactory.create();
Iterable<Op> multiOps = prepareZKMulti(ops);
while (true) {
try {
long startTime = EnvironmentEdgeManager.currentTime();
List<OpResult> opResults = checkZk().multi(multiOps);
+ span.setStatus(StatusCode.OK);
return opResults;
} catch (KeeperException e) {
switch (e.code()) {
case CONNECTIONLOSS:
case OPERATIONTIMEOUT:
case REQUESTTIMEOUT:
+ TraceUtil.setError(span, e);
retryOrThrow(retryCounter, e, "multi");
break;
default:
+ TraceUtil.setError(span, e);
throw e;
}
}
diff --git
a/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKWatcher.java
b/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKWatcher.java
index b80dd699cca..feaa62fd77b 100644
---
a/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKWatcher.java
+++
b/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKWatcher.java
@@ -34,6 +34,7 @@ import org.apache.hadoop.hbase.AuthUtil;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.security.Superusers;
+import org.apache.hadoop.hbase.trace.TraceUtil;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.security.UserGroupInformation;
@@ -552,45 +553,47 @@ public class ZKWatcher implements Watcher, Abortable,
Closeable {
}
private void processEvent(WatchedEvent event) {
- switch (event.getType()) {
- // If event type is NONE, this is a connection status change
- case None: {
- connectionEvent(event);
- break;
- }
+ TraceUtil.trace(() -> {
+ switch (event.getType()) {
+ // If event type is NONE, this is a connection status change
+ case None: {
+ connectionEvent(event);
+ break;
+ }
- // Otherwise pass along to the listeners
- case NodeCreated: {
- for (ZKListener listener : listeners) {
- listener.nodeCreated(event.getPath());
+ // Otherwise pass along to the listeners
+ case NodeCreated: {
+ for (ZKListener listener : listeners) {
+ listener.nodeCreated(event.getPath());
+ }
+ break;
}
- break;
- }
- case NodeDeleted: {
- for (ZKListener listener : listeners) {
- listener.nodeDeleted(event.getPath());
+ case NodeDeleted: {
+ for (ZKListener listener : listeners) {
+ listener.nodeDeleted(event.getPath());
+ }
+ break;
}
- break;
- }
- case NodeDataChanged: {
- for (ZKListener listener : listeners) {
- listener.nodeDataChanged(event.getPath());
+ case NodeDataChanged: {
+ for (ZKListener listener : listeners) {
+ listener.nodeDataChanged(event.getPath());
+ }
+ break;
}
- break;
- }
- case NodeChildrenChanged: {
- for (ZKListener listener : listeners) {
- listener.nodeChildrenChanged(event.getPath());
+ case NodeChildrenChanged: {
+ for (ZKListener listener : listeners) {
+ listener.nodeChildrenChanged(event.getPath());
+ }
+ break;
}
- break;
+ default:
+ LOG.error("Invalid event of type {} received for path {}.
Ignoring.", event.getState(),
+ event.getPath());
}
- default:
- LOG.error("Invalid event of type {} received for path {}. Ignoring.",
event.getState(),
- event.getPath());
- }
+ }, "ZKWatcher.processEvent: " + event.getType() + " " + event.getPath());
}
/**
@@ -602,7 +605,8 @@ public class ZKWatcher implements Watcher, Abortable,
Closeable {
public void process(WatchedEvent event) {
LOG.debug(prefix("Received ZooKeeper Event, " + "type=" + event.getType()
+ ", " + "state="
+ event.getState() + ", " + "path=" + event.getPath()));
- zkEventProcessor.submit(() -> processEvent(event));
+ final String spanName = ZKWatcher.class.getSimpleName() + "-" + identifier;
+ zkEventProcessor.submit(TraceUtil.tracedRunnable(() ->
processEvent(event), spanName));
}
// Connection management