Author: liyin Date: Thu May 16 19:20:14 2013 New Revision: 1483521 URL: http://svn.apache.org/r1483521 Log: [HBASE-8500] Minor Changes in RollingRestart + a basic wrapper around the utility
Author: rshroff Summary: Made small changes in RollingRestart class to be able to remotely restart a region server. The change also adds a very basic wrapper(groovy script) around the RollingRestart utility to perform the RR for all/selected region servers in the cluster. Test Plan: tested it multiple times on TSH025 Reviewers: aaiyer, liyintang, paultuckfield Reviewed By: aaiyer CC: hbase-eng@ Differential Revision: https://phabricator.fb.com/D807373 Task ID: 2229110 Added: hbase/branches/0.89-fb/bin/rolling_restart_v2 Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/RollingRestart.java Added: hbase/branches/0.89-fb/bin/rolling_restart_v2 URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/bin/rolling_restart_v2?rev=1483521&view=auto ============================================================================== --- hbase/branches/0.89-fb/bin/rolling_restart_v2 (added) +++ hbase/branches/0.89-fb/bin/rolling_restart_v2 Thu May 16 19:20:14 2013 @@ -0,0 +1,155 @@ +#!/bin/env /opt/hbase/bin/hbase-groovy +import org.apache.hadoop.hbase.HBaseConfiguration; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.client.MetaScanner; +import org.apache.hadoop.hbase.client.HBaseAdmin; +import org.apache.hadoop.hbase.client.*; +import org.apache.hadoop.hbase.master.*; +import org.apache.hadoop.hbase.*; +import org.apache.hadoop.hbase.util.Bytes; +import java.io.ByteArrayInputStream; +import org.apache.log4j.* +import org.apache.commons.cli.* + +Logger.getLogger("org.apache.zookeeper").setLevel(Level.ERROR); +Logger.getLogger("org.apache.hadoop.hbase").setLevel(Level.INFO); + +def restartRegionServer(HBaseAdmin admin, HServerInfo regionserver, + Configuration conf, String rollingRestartArgs, String logsFolder) { + long startCode = regionserver.getStartCode()/1000; + String host = regionserver.getHostname(); + + // Start the rolling restart + + String cmd ="/usr/local/hadoop/" + conf.get("titan.cell.name") + + "-HBASE/bin/hbase org.apache.hadoop.hbase.util.RollingRestart" + + rollingRestartArgs + " -s " + host; + + println "Performing rolling restart for host " + host; + println cmd; + + File file = new File(logsFolder + "/rolling_restart_" + host + ".txt"); + + file.createNewFile(); + FileOutputStream ostream = new FileOutputStream(file); + println "Check the output at " + logsFolder + "/rolling_restart_" + host + ".txt"; + + def proc = cmd.execute() + proc.consumeProcessOutput(ostream, ostream); + proc.waitForOrKill(30 * 60 * 1000) + + ostream.close(); + long newStartCode = admin.getConnection(). + getHRegionConnection(regionserver.getServerAddress()).getHServerInfo().getStartCode()/1000; + + if (newStartCode <= startCode) { + throw new Exception("Rolling Restart failed for Regionserver " + host); + } +} + +// START + +Options options = new Options(); + +options.addOption("n", "nodes", true, + "Name of the region servers to restart"); +options.addOption("a", "all", false, + "Name of the region servers to restart"); +options.addOption("r", "sleep_after_restart", true, + "time interval after which the region server should be started assigning regions. Default : 10000ms"); +options.addOption("b", "sleep_before_restart", true, + "time interval after which the region server should be restarted after draining. Default : 10000ms"); +options.addOption("d", "region_drain_interval", true, + "time interval between region movements while draining. Default : 1000ms"); +options.addOption("u", "region_undrain_interval", true, + "time interval between region movements while undraining. Default : 10000ms"); +options.addOption("g", "get_request_frequency", true, + "frequency at which region checker will check for region availability. Default : 1000ms"); +options.addOption("c", "clear", false, + "Clear all the regionserver from blacklist. Default : false"); +options.addOption("h", "dont_use_hadoopctl", false, + "Don't hadoopctl to restart the regionserver. Default : true"); +options.addOption("f", "log_file_folder", true, + "Default location where logs for rolling restart should be stored. Default : /tmp"); + +if (args.length == 0) { + HelpFormatter formatter = new HelpFormatter(); + formatter.printHelp("rolling_restart", options, true); + return; +} + +CommandLineParser parser = new PosixParser(); +CommandLine cmd = parser.parse(options, args); + +String rollingRestartArgs = ""; +String logsFolder = "/tmp"; +hosts = null; + +if (cmd.hasOption('r')) { + rollingRestartArgs += " -r " + cmd.getOptionValue('r'); +} +if (cmd.hasOption('b')) { + rollingRestartArgs += " -b " + cmd.getOptionValue('b'); +} +if (cmd.hasOption('d')) { + rollingRestartArgs += " -d " + cmd.getOptionValue('d'); +} +if (cmd.hasOption('u')) { + rollingRestartArgs += " -u " + cmd.getOptionValue('u'); +} +if (cmd.hasOption('g')) { + rollingRestartArgs += " -g " + cmd.getOptionValue('g'); +} +if (cmd.hasOption('c')) { + rollingRestartArgs += " -c " + cmd.getOptionValue('c'); +} +if (cmd.hasOption('h')) { + rollingRestartArgs += " -h " + cmd.getOptionValue('h'); +} +if (cmd.hasOption('f')) { + logsFolder = cmd.getOptionValue('f'); +} + +if (cmd.hasOption("n")) { + hosts = cmd.getOptionValue("n").split(','); + println hosts; +} else if (cmd.hasOption('a')) { + println "Performing restart on all regionservers"; +} else { + HelpFormatter formatter = new HelpFormatter(); + formatter.printHelp("rolling_restart", options, true); + return; +} + +conf = HBaseConfiguration.create(); +admin = new HBaseAdmin(conf); + +PrintWriter errFileWriter = new PrintWriter(logsFolder + "/rolling_restart_failed_nodes"); + +if (hosts != null) { + for (String server in hosts) { + HServerAddress serverAddr = new HServerAddress(server, 60020); + try { + HServerInfo host = + admin.getConnection().getHRegionConnection(serverAddr).getHServerInfo(); + restartRegionServer(admin, host, conf, rollingRestartArgs, logsFolder); + } catch (Exception e) { + e.printStackTrace(); + println "\nERROR: "+ server + " Failed."; + errFileWriter.println(host.getHostname()); + } + } +} else { + + for (host in admin.getClusterStatus().getServerInfo()) { + try { + restartRegionServer(admin, host, conf, rollingRestartArgs, logsFolder); + } catch (Exception e) { + e.printStackTrace(); + println "\nERROR: "+ host + " Failed."; + errFileWriter.println(host.getHostname()); + } + } +} + +errFileWriter.close(); Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/RollingRestart.java URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/RollingRestart.java?rev=1483521&r1=1483520&r2=1483521&view=diff ============================================================================== --- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/RollingRestart.java (original) +++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/RollingRestart.java Thu May 16 19:20:14 2013 @@ -85,7 +85,6 @@ public class RollingRestart { currentState = STAGE.FAIL; return; } - this.serverAddr = new HServerAddress(serverName, 60020); currentState = STAGE.SETUP; @@ -164,22 +163,50 @@ public class RollingRestart { * @throws InterruptedException */ void restart() throws IOException, InterruptedException { - System.out.println("Shutting down the region server"); + System.out.println("Shutting down the region server after sleep of " + + this.sleepIntervalBeforeRestart); Thread.sleep(this.sleepIntervalBeforeRestart); String cellName = conf.get("titan.cell.name"); - try { + String sshCmd = "ssh hadoop@" + serverAddr.getHostname(); + try { if (this.useHadoopCtl) { - Process p = Runtime.getRuntime().exec("/usr/local/bin/hadoopctl restart regionserver"); + sshCmd += " hadoopctl restart regionserver"; + LOG.info("Executing " + sshCmd); + Process p = Runtime.getRuntime().exec(sshCmd); + p.waitFor(); + + LOG.info("Exit value for the region server restart " + p.exitValue()); + + if (p.exitValue() != 0) { + LOG.error("Failed to restart. regionserver. Aborting.."); + throw new IOException("Failed to restart regionserver. Aborting.."); + } } else { - Process p = Runtime.getRuntime().exec("/usr/local/hadoop/" + - cellName + "-HBASE/bin/hbase-daemon.sh stop regionserver"); + String sshCmdToStopRS = sshCmd + " /usr/local/hadoop/" + + cellName + "-HBASE/bin/hbase-daemon.sh stop regionserver"; + LOG.info("Executing " + sshCmd); + Process p = Runtime.getRuntime().exec(sshCmdToStopRS); p.waitFor(); - p = Runtime.getRuntime().exec("/usr/local/hadoop/" + - cellName + "-HBASE/bin/hbase-daemon.sh start regionserver"); + + LOG.info("Exit value for the region server stop " + p.exitValue()); + + if (p.exitValue() != 0) { + LOG.error("Failed to stop regionserver. Aborting.."); + throw new IOException("Failed to stop regionserver. Aborting.."); + } + String sshCmdToStartRS = sshCmd + " /usr/local/hadoop/" + + cellName + "-HBASE/bin/hbase-daemon.sh start regionserver "; + p = Runtime.getRuntime().exec(sshCmdToStartRS); p.waitFor(); - LOG.info("Exit value for the restarter " + p.exitValue()); + + LOG.info("Exit value for the region server start " + p.exitValue()); + + if (p.exitValue() != 0) { + LOG.error("Failed to start regionserver. Aborting.."); + throw new IOException("Failed to start regionserver. Aborting.."); + } } } catch (IOException e1) { @@ -210,6 +237,9 @@ public class RollingRestart { List<HServerAddress> serversForRegion = plan.getAssignment(region); + if (serversForRegion == null) { + return null; + } // Get the preferred region server from the Assignment Plan for (HServerAddress server : serversForRegion) { if (!server.equals(serverAddr)) { @@ -402,7 +432,7 @@ public class RollingRestart { "Name of the region server to restart"); options.addOption("r", "sleep_after_restart", true, "time interval after which the region server should be started assigning regions. Default : 10000ms"); - options.addOption("r", "sleep_before_restart", true, + options.addOption("b", "sleep_before_restart", true, "time interval after which the region server should be restarted after draining. Default : 10000ms"); options.addOption("d", "region_drain_interval", true, "time interval between region movements while draining. Default : 1000ms"); @@ -434,6 +464,7 @@ public class RollingRestart { if (cmd.hasOption("c")) { RollingRestart.clearAll(); + return; } if (!cmd.hasOption("s")) {
