[ https://issues.apache.org/jira/browse/HBASE-3617?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13005273#comment-13005273 ]
stack commented on HBASE-3617:
------------------------------
This is all the exception handling we currently do around the region CLOSE RPC that is sent when unassigning a region during a balance:
{code}
try {
// TODO: We should consider making this look more like it does for the
// region open where we catch all throwables and never abort
if (serverManager.sendRegionClose(server, state.getRegion())) {
LOG.debug("Sent CLOSE to " + server + " for region " +
region.getRegionNameAsString());
return;
}
// This never happens. Currently regionserver close always return true.
LOG.debug("Server " + server + " region CLOSE RPC returned false for " +
region.getEncodedName());
} catch (NotServingRegionException nsre) {
LOG.info("Server " + server + " returned " + nsre + " for " +
region.getEncodedName());
// Presume that master has stale data. Presume remote side just split.
// Presume that the split message when it comes in will fix up the
// master's in memory cluster state.
return;
} catch (ConnectException e) {
LOG.info("Failed connect to " + server + ", message=" + e.getMessage() +
", region=" + region.getEncodedName());
// Presume that regionserver just failed and we haven't got expired
// server from zk yet. Let expired server deal with clean up.
} catch (java.net.SocketTimeoutException e) {
LOG.info("Server " + server + " returned " + e.getMessage() + " for " +
region.getEncodedName());
// Presume retry or server will expire.
} catch (EOFException e) {
LOG.info("Server " + server + " returned " + e.getMessage() + " for " +
region.getEncodedName());
// Presume retry or server will expire.
} catch (RemoteException re) {
IOException ioe = re.unwrapRemoteException();
if (ioe instanceof NotServingRegionException) {
// Failed to close, so pass through and reassign
LOG.debug("Server " + server + " returned " + ioe + " for " +
region.getEncodedName());
} else if (ioe instanceof EOFException) {
// Failed to close, so pass through and reassign
LOG.debug("Server " + server + " returned " + ioe + " for " +
region.getEncodedName());
} else {
this.master.abort("Remote unexpected exception", ioe);
}
} catch (Throwable t) {
// For now call abort if unexpected exception -- radical, but will get
// fellas attention. St.Ack 20101012
this.master.abort("Remote unexpected exception", t);
}
}
{code}
> NoRouteToHostException during balancing will cause Master abort
> ---------------------------------------------------------------
>
> Key: HBASE-3617
> URL: https://issues.apache.org/jira/browse/HBASE-3617
> Project: HBase
> Issue Type: Bug
> Reporter: stack
> Priority: Critical
> Fix For: 0.90.2
>
>
> Via Tatsuya up on the list:
> {code}
> 2011-03-10 07:48:39,192 FATAL org.apache.hadoop.hbase.master.HMaster:
> Remote unexpected exception
> java.net.NoRouteToHostException: No route to host
> at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)
> at
> sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:567)
> at
> org.apache.hadoop.net.SocketIOWithTimeout.connect(SocketIOWithTimeout.java:
> 206)
> at org.apache.hadoop.net.NetUtils.connect(NetUtils.java:408)
> at org.apache.hadoop.hbase.ipc.HBaseClient
> $Connection.setupIOstreams(HBaseClient.java:328)
> at
> org.apache.hadoop.hbase.ipc.HBaseClient.getConnection(HBaseClient.java:
> 883)
> at
> org.apache.hadoop.hbase.ipc.HBaseClient.call(HBaseClient.java:750)
> at org.apache.hadoop.hbase.ipc.HBaseRPC
> $Invoker.invoke(HBaseRPC.java:257)
> at $Proxy6.closeRegion(Unknown Source)
> at
> org.apache.hadoop.hbase.master.ServerManager.sendRegionClose(ServerManager.java:
> 589)
> at
> org.apache.hadoop.hbase.master.AssignmentManager.unassign(AssignmentManager.java:
> 1093)
> at
> org.apache.hadoop.hbase.master.AssignmentManager.unassign(AssignmentManager.java:
> 1040)
> at
> org.apache.hadoop.hbase.master.AssignmentManager.balance(AssignmentManager.java:
> 1831)
> at org.apache.hadoop.hbase.master.HMaster.balance(HMaster.java:
> 692)
> at org.apache.hadoop.hbase.master.HMaster$1.chore(HMaster.java:
> 583)
> at org.apache.hadoop.hbase.Chore.run(Chore.java:66)
> 2011-03-10 07:48:39,192 INFO org.apache.hadoop.hbase.master.HMaster:
> Aborting
> 2011-03-10 07:48:39,192 INFO org.apache.hadoop.hbase.master.HMaster:
> balance hri=SpecialObject_Speed_Test,,
> 1299710751983.f0e5544339870a510c338b3029979d3e.,
> src=ap13.secur2,60020,1299710609447,
> dest=ap12.secur2,60020,1299710609148
> 2011-03-10 07:48:39,192 DEBUG
> org.apache.hadoop.hbase.master.AssignmentManager: Starting
> unassignment of region SpecialObject_Speed_Test,,
> 1299710751983.f0e5544339870a510c338b3029979d3e. (offlining)
> 2011-03-10 07:48:39,852 DEBUG org.apache.hadoop.hbase.master.HMaster:
> Stopping service threads
> 2011-03-10 07:48:39,852 INFO org.apache.hadoop.ipc.HBaseServer:
> Stopping server on 60000
> 2011-03-10 07:48:39,852 FATAL org.apache.hadoop.hbase.master.HMaster:
> Remote unexpected exception
> java.io.InterruptedIOException: Interruped while waiting for IO on
> channel java.nio.channels.SocketChannel[connection-pending remote=/
> 10.X.X.18:60020]. 19340 millis timeout left.
> at org.apache.hadoop.net.SocketIOWithTimeout
> $SelectorPool.select(SocketIOWithTimeout.java:349)
> at
> org.apache.hadoop.net.SocketIOWithTimeout.connect(SocketIOWithTimeout.java:
> 203)
> at org.apache.hadoop.net.NetUtils.connect(NetUtils.java:408)
> at org.apache.hadoop.hbase.ipc.HBaseClient
> $Connection.setupIOstreams(HBaseClient.java:328)
> at
> org.apache.hadoop.hbase.ipc.HBaseClient.getConnection(HBaseClient.java:
> 883)
> at
> org.apache.hadoop.hbase.ipc.HBaseClient.call(HBaseClient.java:750)
> at org.apache.hadoop.hbase.ipc.HBaseRPC
> $Invoker.invoke(HBaseRPC.java:257)
> at $Proxy6.closeRegion(Unknown Source)
> at
> org.apache.hadoop.hbase.master.ServerManager.sendRegionClose(ServerManager.java:
> 589)
> at
> org.apache.hadoop.hbase.master.AssignmentManager.unassign(AssignmentManager.java:
> 1093)
> at
> org.apache.hadoop.hbase.master.AssignmentManager.unassign(AssignmentManager.java:
> 1040)
> at
> org.apache.hadoop.hbase.master.AssignmentManager.balance(AssignmentManager.java:
> 1831)
> at org.apache.hadoop.hbase.master.HMaster.balance(HMaster.java:
> 692)
> at org.apache.hadoop.hbase.master.HMaster$1.chore(HMaster.java:
> 583)
> at org.apache.hadoop.hbase.Chore.run(Chore.java:66)
> 2011-03-10 07:48:39,852 INFO org.apache.hadoop.hbase.master.HMaster:
> Aborting
> {code}
--
This message is automatically generated by JIRA.
For more information on JIRA, see: http://www.atlassian.com/software/jira