Author: stack
Date: Mon Sep 27 21:02:04 2010
New Revision: 1001911
URL: http://svn.apache.org/viewvc?rev=1001911&view=rev
Log:
HBASE-3039 Stuck in regionsInTransition because rebalance came in at same time
as a split
M src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
Add region name to warning log message (w/o it message is no good).
M src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
Add src of split message else need to deduce where it came from by looking
elsewhere.
M src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
Updated log messages to include region and where appropritate source
server name; debug hard w/o
Changed regionOnline and regionOffline to check for unexpected
states and log warnings rather than proceed regardless.
Added in fix for concurrent balance+split; split message now
updates regionsintransition where previous it did not.
Remove checkRegion method. Its a reimplementation of
what regionOnline and regionOffline do only less comprehensive
regards what gets updated (this.regions + this.servers rather
than this.regions, this.servers and regionsInTransition)
That they were less comprehensive is root of this bug.
M src/main/java/org/apache/hadoop/hbase/master/HMaster.java
Make the message about why we are not running balancer richer
(print out how many reigons in transition and more of the
regionsintrnasition list).
M src/main/java/org/apache/hadoop/hbase/executor/RegionTransitionData.java
Javadoc and minor formatting.
Modified:
hbase/trunk/CHANGES.txt
hbase/trunk/src/main/java/org/apache/hadoop/hbase/executor/RegionTransitionData.java
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
Modified: hbase/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/hbase/trunk/CHANGES.txt?rev=1001911&r1=1001910&r2=1001911&view=diff
==============================================================================
--- hbase/trunk/CHANGES.txt (original)
+++ hbase/trunk/CHANGES.txt Mon Sep 27 21:02:04 2010
@@ -542,6 +542,8 @@ Release 0.21.0 - Unreleased
HBASE-2992 [replication] MalformedObjectNameException in ReplicationMetrics
HBASE-3037 When new master joins running cluster does "Received report from
unknown server -- telling it to STOP_REGIONSERVER.
+ HBASE-3039 Stuck in regionsInTransition because rebalance came in at same
+ time as a split
IMPROVEMENTS
HBASE-1760 Cleanup TODOs in HTable
Modified:
hbase/trunk/src/main/java/org/apache/hadoop/hbase/executor/RegionTransitionData.java
URL:
http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/executor/RegionTransitionData.java?rev=1001911&r1=1001910&r2=1001911&view=diff
==============================================================================
---
hbase/trunk/src/main/java/org/apache/hadoop/hbase/executor/RegionTransitionData.java
(original)
+++
hbase/trunk/src/main/java/org/apache/hadoop/hbase/executor/RegionTransitionData.java
Mon Sep 27 21:02:04 2010
@@ -68,7 +68,7 @@ public class RegionTransitionData implem
* {...@link EventType#M2ZK_REGION_OFFLINE}.
*
* @param eventType type of event
- * @param regionName name of region
+ * @param regionName name of region as per {...@link
HRegionInfo#getRegionName()}
*/
public RegionTransitionData(EventType eventType, byte [] regionName) {
this(eventType, regionName, null);
@@ -85,7 +85,7 @@ public class RegionTransitionData implem
* and {...@link EventType#RS2ZK_REGION_OPENED}.
*
* @param eventType type of event
- * @param regionName name of region
+ * @param regionName name of region as per {...@link
HRegionInfo#getRegionName()}
* @param serverName name of server setting data
*/
public RegionTransitionData(EventType eventType, byte [] regionName,
@@ -117,7 +117,7 @@ public class RegionTransitionData implem
* Gets the name of the region being transitioned.
*
* <p>Region name is required so this never returns null.
- * @return region name
+ * @return region name, the result of a call to {...@link
HRegionInfo#getRegionName()}
*/
public byte [] getRegionName() {
return regionName;
Modified:
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
URL:
http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java?rev=1001911&r1=1001910&r2=1001911&view=diff
==============================================================================
---
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
(original)
+++
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
Mon Sep 27 21:02:04 2010
@@ -60,8 +60,8 @@ import org.apache.hadoop.hbase.util.Thre
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
import org.apache.hadoop.hbase.zookeeper.ZKTableDisable;
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
-import org.apache.hadoop.hbase.zookeeper.ZKUtil.NodeAndData;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperListener;
+import org.apache.hadoop.hbase.zookeeper.ZKUtil.NodeAndData;
import org.apache.hadoop.io.Writable;
import org.apache.zookeeper.KeeperException;
@@ -234,7 +234,7 @@ public class AssignmentManager extends Z
* yet).
* @param data
*/
- private void handleRegion(RegionTransitionData data) {
+ private void handleRegion(final RegionTransitionData data) {
synchronized(regionsInTransition) {
// Verify this is a known server
if (!serverManager.isServerOnline(data.getServerName()) &&
@@ -248,7 +248,7 @@ public class AssignmentManager extends Z
LOG.debug("Handling transition=" + data.getEventType() +
", server=" + data.getServerName() + ", region=" +
prettyPrintedRegionName);
RegionState regionState = regionsInTransition.get(encodedName);
- switch(data.getEventType()) {
+ switch (data.getEventType()) {
case M_ZK_REGION_OFFLINE:
// Nothing to do.
break;
@@ -259,9 +259,9 @@ public class AssignmentManager extends Z
if (regionState == null ||
(!regionState.isPendingClose() && !regionState.isClosing())) {
LOG.warn("Received CLOSING for region " + prettyPrintedRegionName +
- " from server " + data.getServerName() + " but region was in "
+
- " the state " + regionState + " and not " +
- "in expected PENDING_CLOSE or CLOSING states");
+ " from server " + data.getServerName() + " but region was in " +
+ " the state " + regionState + " and not " +
+ "in expected PENDING_CLOSE or CLOSING states");
return;
}
// Transition to CLOSING (or update stamp if already CLOSING)
@@ -422,12 +422,21 @@ public class AssignmentManager extends Z
* @param serverInfo
*/
public void regionOnline(HRegionInfo regionInfo, HServerInfo serverInfo) {
- synchronized(regionsInTransition) {
- regionsInTransition.remove(regionInfo.getEncodedName());
- regionsInTransition.notifyAll();
+ synchronized (this.regionsInTransition) {
+ RegionState rs =
+ this.regionsInTransition.remove(regionInfo.getEncodedName());
+ if (rs != null) {
+ this.regionsInTransition.notifyAll();
+ LOG.warn("Asked online a region that was already in " +
+ "regionsInTransition: " + rs);
+ }
}
- synchronized(regions) {
- regions.put(regionInfo, serverInfo);
+ synchronized (this.regions) {
+ // Add check
+ HServerInfo hsi = this.regions.get(regionInfo);
+ if (hsi != null) LOG.warn("Overwriting " + regionInfo.getEncodedName() +
+ " on " + hsi);
+ this.regions.put(regionInfo, serverInfo);
addToServers(serverInfo, regionInfo);
}
}
@@ -440,15 +449,20 @@ public class AssignmentManager extends Z
* @param regionInfo
* @param serverInfo
*/
- public void regionOffline(HRegionInfo regionInfo) {
- synchronized(regionsInTransition) {
- regionsInTransition.remove(regionInfo.getEncodedName());
- regionsInTransition.notifyAll();
+ public void regionOffline(final HRegionInfo regionInfo) {
+ synchronized(this.regionsInTransition) {
+ if (this.regionsInTransition.remove(regionInfo.getEncodedName()) !=
null) {
+ this.regionsInTransition.notifyAll();
+ }
}
- synchronized(regions) {
- HServerInfo serverInfo = regions.remove(regionInfo);
- List<HRegionInfo> serverRegions = servers.get(serverInfo);
- serverRegions.remove(regionInfo);
+ synchronized(this.regions) {
+ HServerInfo serverInfo = this.regions.remove(regionInfo);
+ if (serverInfo != null) {
+ List<HRegionInfo> serverRegions = this.servers.get(serverInfo);
+ serverRegions.remove(regionInfo);
+ } else {
+ LOG.warn("Asked offline a region that was not online: " + regionInfo);
+ }
}
}
@@ -580,17 +594,17 @@ public class AssignmentManager extends Z
region.getRegionNameAsString() + " (offlining)");
// Check if this region is currently assigned
if (!regions.containsKey(region)) {
- LOG.debug("Attempted to unassign region " +
region.getRegionNameAsString() +
- " but it is not " +
+ LOG.debug("Attempted to unassign region " +
+ region.getRegionNameAsString() + " but it is not " +
"currently assigned anywhere");
return;
}
String encodedName = region.getEncodedName();
// Grab the state of this region and synchronize on it
RegionState state;
- synchronized(regionsInTransition) {
+ synchronized (regionsInTransition) {
state = regionsInTransition.get(encodedName);
- if(state == null) {
+ if (state == null) {
state = new RegionState(region, RegionState.State.PENDING_CLOSE);
regionsInTransition.put(encodedName, state);
} else {
@@ -923,27 +937,29 @@ public class AssignmentManager extends Z
HRegionInfo regionInfo = regionState.getRegion();
LOG.info("Regions in transition timed out: " + regionState);
// Expired! Do a retry.
- switch(regionState.getState()) {
+ switch (regionState.getState()) {
case OFFLINE:
case CLOSED:
LOG.info("Region has been OFFLINE or CLOSED for too long, " +
- "reassigning " + regionInfo.getRegionNameAsString());
+ "reassigning " + regionInfo.getRegionNameAsString());
assign(regionState.getRegion());
break;
case PENDING_OPEN:
case OPENING:
LOG.info("Region has been PENDING_OPEN or OPENING for too " +
- "long, reassigning " + regionInfo.getRegionNameAsString());
+ "long, reassigning region=" +
+ regionInfo.getRegionNameAsString());
assign(regionState.getRegion());
break;
case OPEN:
LOG.warn("Long-running region in OPEN state? This should " +
- "not happen");
+ "not happen; region=" + regionInfo.getRegionNameAsString());
break;
case PENDING_CLOSE:
case CLOSING:
LOG.info("Region has been PENDING_CLOSE or CLOSING for too " +
- "long, resending close rpc");
+ "long, running unassign again on region=" +
+ regionInfo.getRegionNameAsString());
unassign(regionInfo);
break;
}
@@ -1017,61 +1033,9 @@ public class AssignmentManager extends Z
*/
public void handleSplitReport(final HServerInfo hsi, final HRegionInfo
parent,
final HRegionInfo a, final HRegionInfo b) {
- synchronized (this.regions) {
- checkRegion(hsi, parent, true);
- checkRegion(hsi, a, false);
- checkRegion(hsi, b, false);
- this.regions.put(a, hsi);
- this.regions.put(b, hsi);
- removeFromServers(hsi, parent, true);
- removeFromServers(hsi, a, false);
- removeFromServers(hsi, b, false);
- addToServers(hsi, a);
- addToServers(hsi, b);
- }
- }
-
- /*
- * Caller must hold locks on this.regions Map.
- * @param hsi
- * @param hri
- * @param expected True if we expect <code>hri</code> to be in this.regions.
- */
- private void checkRegion(final HServerInfo hsi, final HRegionInfo hri,
- final boolean expected) {
- HServerInfo serverInfo = regions.remove(hri);
- if (expected) {
- if (serverInfo == null) {
- LOG.info("Region not on a server: " + hri.getRegionNameAsString());
- }
- } else {
- if (serverInfo != null) {
- LOG.warn("Region present on " + hsi + "; unexpected");
- }
- }
- }
-
- /*
- * Caller must hold locks on servers Map.
- * @param hsi
- * @param hri
- * @param expected
- */
- private void removeFromServers(final HServerInfo hsi, final HRegionInfo hri,
- final boolean expected) {
- List<HRegionInfo> serverRegions = this.servers.get(hsi);
- boolean removed = serverRegions.remove(hri);
- if (expected) {
- if (!removed) {
- LOG.warn(hri.getRegionNameAsString() + " not found on " + hsi +
- "; unexpected");
- }
- } else {
- if (removed) {
- LOG.warn(hri.getRegionNameAsString() + " found on " + hsi +
- "; unexpected");
- }
- }
+ regionOffline(parent);
+ regionOnline(a, hsi);
+ regionOnline(b, hsi);
}
/**
Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL:
http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1001911&r1=1001910&r2=1001911&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
(original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Mon
Sep 27 21:02:04 2010
@@ -572,9 +572,11 @@ implements HMasterInterface, HMasterRegi
synchronized (this.balancer) {
// Only allow one balance run at at time.
if (this.assignmentManager.isRegionsInTransition()) {
- LOG.debug("Not running balancer because regions in transition: " +
+ LOG.debug("Not running balancer because " +
+ this.assignmentManager.getRegionsInTransition().size() +
+ " region(s) in transition: " +
org.apache.commons.lang.StringUtils.
-
abbreviate(this.assignmentManager.getRegionsInTransition().toString(), 64));
+
abbreviate(this.assignmentManager.getRegionsInTransition().toString(), 256));
return false;
}
if (!this.serverManager.getDeadServers().isEmpty()) {
Modified:
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
URL:
http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java?rev=1001911&r1=1001910&r2=1001911&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
(original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
Mon Sep 27 21:02:04 2010
@@ -285,7 +285,7 @@ public class ServerManager {
}
for (HMsg msg: msgs) {
- LOG.info("Received " + msg);
+ LOG.info("Received " + msg + " from " + serverInfo.getServerName());
switch (msg.getType()) {
case REGION_SPLIT:
this.services.getAssignmentManager().handleSplitReport(serverInfo,
@@ -527,8 +527,9 @@ public class ServerManager {
throws NotServingRegionException {
HRegionInterface hri = getServerConnection(server);
if(hri == null) {
- LOG.warn("Attempting to send CLOSE RPC to server " +
server.getServerName()
- + " failed because no RPC connection found to this server");
+ LOG.warn("Attempting to send CLOSE RPC to server " +
+ server.getServerName() + " failed because no RPC connection found " +
+ "to this server");
return;
}
hri.closeRegion(region);
Modified:
hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
URL:
http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java?rev=1001911&r1=1001910&r2=1001911&view=diff
==============================================================================
---
hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
(original)
+++
hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
Mon Sep 27 21:02:04 2010
@@ -1924,7 +1924,8 @@ public class HRegionServer implements HR
// TODO: Need to check if this is being served here but currently
undergoing
// a split (so master needs to retry close after split is complete)
if (!onlineRegions.containsKey(region.getEncodedName())) {
- LOG.warn("Received close for region we are not serving");
+ LOG.warn("Received close for region we are not serving; " +
+ region.getEncodedName());
throw new NotServingRegionException("Received close for "
+ region.getRegionNameAsString() + " but we are not serving it");
}