Author: stack
Date: Sat Sep 25 06:24:42 2010
New Revision: 1001140
URL: http://svn.apache.org/viewvc?rev=1001140&view=rev
Log:
HBASE-3037 When new master joins running cluster does "Received report from
unknown server -- telling it to STOP_REGIONSERVER..."
M src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
On regionServerReport, if we get a report from an 'unknown' regionserver,
we used to tell it to stop itself. Now, if the server is 'unknown' AND
this master did not start the cluster (it's joining a running cluster), then
treat the report as a regionServerStart and register the incoming
server rather than tell it to shut down.
M src/main/java/org/apache/hadoop/hbase/master/HMaster.java
Pass the freshClusterStartup flag to ServerManager.
Add more executors for opening and closing. On cluster startup the
master shouldn't be a bottleneck clearing the server opens.
Expose the run-balancer method so we can make it available in
HBaseAdmin.
M src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
Minor formatting and javadoc
M src/main/java/org/apache/hadoop/hbase/ipc/HBaseRPCProtocolVersion.java
Upped rpc version number because of new balancer addition (and because
we didn't do it when we put in new master).
M src/main/java/org/apache/hadoop/hbase/ipc/HMasterInterface.java
Added balance method.
M src/main/resources/hbase-default.xml
Change how often we check in from every 3 seconds to every 5 seconds.
Modified:
hbase/trunk/CHANGES.txt
hbase/trunk/src/main/java/org/apache/hadoop/hbase/client/HBaseAdmin.java
hbase/trunk/src/main/java/org/apache/hadoop/hbase/ipc/HBaseRPCProtocolVersion.java
hbase/trunk/src/main/java/org/apache/hadoop/hbase/ipc/HMasterInterface.java
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
hbase/trunk/src/main/resources/hbase-default.xml
Modified: hbase/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/hbase/trunk/CHANGES.txt?rev=1001140&r1=1001139&r2=1001140&view=diff
==============================================================================
--- hbase/trunk/CHANGES.txt (original)
+++ hbase/trunk/CHANGES.txt Sat Sep 25 06:24:42 2010
@@ -540,6 +540,8 @@ Release 0.21.0 - Unreleased
HBASE-3028 No basescanner means no GC'ing of split, offlined parent regions
HBASE-2989 [replication] RSM won't cleanup after locking if 0 peers
HBASE-2992 [replication] MalformedObjectNameException in ReplicationMetrics
+ HBASE-3037 When new master joins running cluster does "Received report from
+ unknown server -- telling it to STOP_REGIONSERVER.
IMPROVEMENTS
HBASE-1760 Cleanup TODOs in HTable
Modified:
hbase/trunk/src/main/java/org/apache/hadoop/hbase/client/HBaseAdmin.java
URL:
http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/client/HBaseAdmin.java?rev=1001140&r1=1001139&r2=1001140&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/client/HBaseAdmin.java
(original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/client/HBaseAdmin.java
Sat Sep 25 06:24:42 2010
@@ -879,14 +879,24 @@ public class HBaseAdmin implements Abort
}
/**
+ * Turn the load balancer on or off.
* @param b If true, enable balancer. If false, disable balancer.
* @return Previous balancer value
- * @throws ZooKeeperConnectionException
- * @throws MasterNotRunningException
*/
- public boolean balance(final boolean b)
+ public boolean balanceSwitch(final boolean b)
throws MasterNotRunningException, ZooKeeperConnectionException {
- return getMaster().balance(b);
+ return getMaster().balanceSwitch(b);
+ }
+
+ /**
+ * Invoke the balancer. Will run the balancer and if regions to move, it
will
+ * go ahead and do the reassignments. Can NOT run for various reasons.
Check
+ * logs.
+ * @return True if balancer ran, false otherwise.
+ */
+ public boolean balancer()
+ throws MasterNotRunningException, ZooKeeperConnectionException {
+ return getMaster().balance();
}
/**
Modified:
hbase/trunk/src/main/java/org/apache/hadoop/hbase/ipc/HBaseRPCProtocolVersion.java
URL:
http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/ipc/HBaseRPCProtocolVersion.java?rev=1001140&r1=1001139&r2=1001140&view=diff
==============================================================================
---
hbase/trunk/src/main/java/org/apache/hadoop/hbase/ipc/HBaseRPCProtocolVersion.java
(original)
+++
hbase/trunk/src/main/java/org/apache/hadoop/hbase/ipc/HBaseRPCProtocolVersion.java
Sat Sep 25 06:24:42 2010
@@ -77,7 +77,8 @@ public interface HBaseRPCProtocolVersion
* <li>Version 23: HBASE-2066, multi-put.</li>
* <li>Version 24: HBASE-2473, create table with regions.</li>
* <li>Version 25: Added openRegion and Stoppable/Abortable to API.</li>
+ * <li>Version 26: New master.</li>
* </ul>
*/
- public static final long versionID = 25L;
+ public static final long versionID = 26L;
}
Modified:
hbase/trunk/src/main/java/org/apache/hadoop/hbase/ipc/HMasterInterface.java
URL:
http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/ipc/HMasterInterface.java?rev=1001140&r1=1001139&r2=1001140&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/ipc/HMasterInterface.java
(original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/ipc/HMasterInterface.java
Sat Sep 25 06:24:42 2010
@@ -144,8 +144,17 @@ public interface HMasterInterface extend
throws UnknownRegionException;
/**
+ * Run the balancer. Will run the balancer and if regions to move, it will
+ * go ahead and do the reassignments. Can NOT run for various reasons.
Check
+ * logs.
+ * @return True if balancer ran, false otherwise.
+ */
+ public boolean balance();
+
+ /**
+ * Turn the load balancer on or off.
* @param b If true, enable balancer. If false, disable balancer.
* @return Previous balancer value
*/
- public boolean balance(final boolean b);
+ public boolean balanceSwitch(final boolean b);
}
\ No newline at end of file
Modified:
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
URL:
http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java?rev=1001140&r1=1001139&r2=1001140&view=diff
==============================================================================
---
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
(original)
+++
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
Sat Sep 25 06:24:42 2010
@@ -177,13 +177,13 @@ public class AssignmentManager extends Z
// Check existing regions in transition
List<String> nodes = ZKUtil.listChildrenAndWatchForNewChildren(watcher,
watcher.assignmentZNode);
- if(nodes.isEmpty()) {
+ if (nodes.isEmpty()) {
LOG.info("No regions in transition in ZK to process on failover");
return;
}
LOG.info("Failed-over master needs to process " + nodes.size() +
" regions in transition");
- for(String regionName : nodes) {
+ for (String regionName: nodes) {
RegionTransitionData data = ZKAssign.getData(watcher, regionName);
HRegionInfo regionInfo =
MetaReader.getRegion(catalogTracker, data.getRegionName()).getFirst();
@@ -738,10 +738,10 @@ public class AssignmentManager extends Z
private void rebuildUserRegions() throws IOException {
Map<HRegionInfo,HServerAddress> allRegions =
MetaReader.fullScan(catalogTracker);
- for(Map.Entry<HRegionInfo,HServerAddress> region : allRegions.entrySet()) {
+ for (Map.Entry<HRegionInfo,HServerAddress> region : allRegions.entrySet())
{
HServerAddress regionLocation = region.getValue();
HRegionInfo regionInfo = region.getKey();
- if(regionLocation == null) {
+ if (regionLocation == null) {
regions.put(regionInfo, null);
continue;
}
@@ -1020,6 +1020,7 @@ public class AssignmentManager extends Z
synchronized (this.regions) {
checkRegion(hsi, parent, true);
checkRegion(hsi, a, false);
+ checkRegion(hsi, b, false);
this.regions.put(a, hsi);
this.regions.put(b, hsi);
removeFromServers(hsi, parent, true);
@@ -1031,10 +1032,10 @@ public class AssignmentManager extends Z
}
/*
- * Caller must hold locks on regions Map.
+ * Caller must hold locks on this.regions Map.
* @param hsi
* @param hri
- * @param expected
+ * @param expected True if we expect <code>hri</code> to be in this.regions.
*/
private void checkRegion(final HServerInfo hsi, final HRegionInfo hri,
final boolean expected) {
Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL:
http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1001140&r1=1001139&r2=1001140&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
(original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Sat
Sep 25 06:24:42 2010
@@ -160,7 +160,8 @@ implements HMasterInterface, HMasterRegi
private LoadBalancer balancer = new LoadBalancer();
private Thread balancerChore;
- private volatile boolean balance = true;
+ // If 'true', the balancer is 'on'. If 'false', the balancer will not run.
+ private volatile boolean balanceSwitch = true;
private Thread catalogJanitorChore;
@@ -241,7 +242,7 @@ implements HMasterInterface, HMasterRegi
this.connection = HConnectionManager.getConnection(conf);
this.executorService = new ExecutorService(getServerName());
- this.serverManager = new ServerManager(this, this);
+ this.serverManager = new ServerManager(this, this,
this.freshClusterStartup);
this.catalogTracker = new CatalogTracker(this.zooKeeper, this.connection,
this, conf.getInt("hbase.master.catalog.timeout", Integer.MAX_VALUE));
@@ -304,15 +305,13 @@ implements HMasterInterface, HMasterRegi
this.serverManager.waitForRegionServers();
// Start assignment of user regions, startup or failure
- if (!this.stopped) {
- if (this.freshClusterStartup) {
- clusterStarterInitializations(this.fileSystemManager,
- this.serverManager, this.catalogTracker, this.assignmentManager);
- } else {
- // Process existing unassigned nodes in ZK, read all regions from
META,
- // rebuild in-memory state.
- this.assignmentManager.processFailover();
- }
+ if (this.freshClusterStartup) {
+ clusterStarterInitializations(this.fileSystemManager,
+ this.serverManager, this.catalogTracker, this.assignmentManager);
+ } else {
+ // Process existing unassigned nodes in ZK, read all regions from META,
+ // rebuild in-memory state.
+ this.assignmentManager.processFailover();
}
// Start balancer and meta catalog janitor after meta and regions have
@@ -320,6 +319,7 @@ implements HMasterInterface, HMasterRegi
this.balancerChore = getAndStartBalancerChore(this);
this.catalogJanitorChore =
Threads.setDaemonThreadRunning(new CatalogJanitor(this, this));
+
// Check if we should stop every second.
Sleeper sleeper = new Sleeper(1000, this);
while (!this.stopped) sleeper.sleep();
@@ -442,9 +442,9 @@ implements HMasterInterface, HMasterRegi
try {
// Start the executor service pools
this.executorService.startExecutorService(ExecutorType.MASTER_OPEN_REGION,
- conf.getInt("hbase.master.executor.openregion.threads", 5));
+ conf.getInt("hbase.master.executor.openregion.threads", 10));
this.executorService.startExecutorService(ExecutorType.MASTER_CLOSE_REGION,
- conf.getInt("hbase.master.executor.closeregion.threads", 5));
+ conf.getInt("hbase.master.executor.closeregion.threads", 10));
this.executorService.startExecutorService(ExecutorType.MASTER_SERVER_OPERATIONS,
conf.getInt("hbase.master.executor.serverops.threads", 5));
this.executorService.startExecutorService(ExecutorType.MASTER_TABLE_OPERATIONS,
@@ -496,9 +496,8 @@ implements HMasterInterface, HMasterRegi
}
private static Thread getAndStartBalancerChore(final HMaster master) {
- String name = master.getServerName() + "-balancerChore";
- int period = master.getConfiguration().
- getInt("hbase.balancer.period", 3000000);
+ String name = master.getServerName() + "-BalancerChore";
+ int period = master.getConfiguration().getInt("hbase.balancer.period",
300000);
// Start up the load balancer chore
Chore chore = new Chore(name, period, master) {
@Override
@@ -566,13 +565,10 @@ implements HMasterInterface, HMasterRegi
return !isStopped();
}
- /**
- * Run the balancer.
- * @return True if balancer ran, false otherwise.
- */
+ @Override
public boolean balance() {
// If balance not true, don't run balancer.
- if (!this.balance) return false;
+ if (!this.balanceSwitch) return false;
synchronized (this.balancer) {
// Only allow one balance run at at time.
if (this.assignmentManager.isRegionsInTransition()) {
@@ -606,9 +602,9 @@ implements HMasterInterface, HMasterRegi
}
@Override
- public boolean balance(final boolean b) {
- boolean oldValue = this.balance;
- this.balance = b;
+ public boolean balanceSwitch(final boolean b) {
+ boolean oldValue = this.balanceSwitch;
+ this.balanceSwitch = b;
LOG.info("Balance=" + b);
return oldValue;
}
Modified:
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
URL:
http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java?rev=1001140&r1=1001139&r2=1001140&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
(original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
Sat Sep 25 06:24:42 2010
@@ -84,6 +84,7 @@ public class ServerManager {
private final Server master;
private final MasterServices services;
+ private final boolean freshClusterStartup;
private final ServerMonitor serverMonitorThread;
@@ -119,10 +120,14 @@ public class ServerManager {
* Constructor.
* @param master
* @param services
+ * @param freshClusterStartup True if we are original master on a fresh
+ * cluster startup else if false, we are joining an already running cluster.
*/
- public ServerManager(final Server master, final MasterServices services) {
+ public ServerManager(final Server master, final MasterServices services,
+ final boolean freshClusterStartup) {
this.master = master;
this.services = services;
+ this.freshClusterStartup = freshClusterStartup;
Configuration c = master.getConfiguration();
int monitorInterval = c.getInt("hbase.master.monitor.interval", 60 * 1000);
this.metrics = new MasterMetrics(master.getServerName());
@@ -249,9 +254,29 @@ public class ServerManager {
// If we don't know this server, tell it shutdown.
HServerInfo storedInfo = this.onlineServers.get(info.getServerName());
if (storedInfo == null) {
- LOG.warn("Received report from unknown server -- telling it " +
- "to " + HMsg.Type.STOP_REGIONSERVER + ": " + info.getServerName());
- return HMsg.STOP_REGIONSERVER_ARRAY;
+ if (!this.freshClusterStartup) {
+ // If we are joining an existing cluster, then soon as we come up we'll
+ // be getting reports from already running regionservers.
+ LOG.info("Registering new server: " + info.getServerName());
+ // recordNewServer is what happens at the end of reportServerStartup.
+ // The only thing we are skipping is passing back to the regionserver
+ // the HServerInfo to use. Here we presume a master has already done
+ // that so we'll press on with whatever it gave us for HSI.
+ recordNewServer(info);
+ // If msgs, put off their processing but this is not enough because
+ // its possible that the next time the server reports in, we'll still
+ // not be up and serving. For example, if a split, we'll need the
+ // regions and servers setup in the master before the below
+ // handleSplitReport will work. TODO: FIx!!
+ if (msgs.length > 0) throw new PleaseHoldException("FIX! Putting off "
+
+ "message processing because not yet rwady but possible we won't be "
+
+ "ready next on next report");
+ } else {
+ LOG.warn("Received report from unknown server, a server calling " +
+ " regionServerReport w/o having first called regionServerStartup; " +
+ "telling it " + HMsg.Type.STOP_REGIONSERVER + ": " +
info.getServerName());
+ return HMsg.STOP_REGIONSERVER_ARRAY;
+ }
}
// Check startcodes
Modified: hbase/trunk/src/main/resources/hbase-default.xml
URL:
http://svn.apache.org/viewvc/hbase/trunk/src/main/resources/hbase-default.xml?rev=1001140&r1=1001139&r2=1001140&view=diff
==============================================================================
--- hbase/trunk/src/main/resources/hbase-default.xml (original)
+++ hbase/trunk/src/main/resources/hbase-default.xml Sat Sep 25 06:24:42 2010
@@ -157,10 +157,9 @@
</property>
<property>
<name>hbase.regionserver.msginterval</name>
- <value>3000</value>
+ <value>5000</value>
<description>Interval between messages from the RegionServer to HMaster
- in milliseconds. Use a high value for clusters with more than 100
- nodes. Default is 3 seconds.
+ in milliseconds.
</description>
</property>
<property>