HBASE-14678 Experiment: Temporarily disable balancer and a few others to see if root of crashed/timedout JVMs; ADD TestDistributedLogSplitting to the mix
Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/129c4843 Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/129c4843 Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/129c4843 Branch: refs/heads/hbase-12439 Commit: 129c48430e2102bb6f71b56047a8b15b31105fd2 Parents: 36d4a5a Author: stack <st...@apache.org> Authored: Thu Oct 22 17:01:58 2015 -0700 Committer: stack <st...@apache.org> Committed: Thu Oct 22 17:01:58 2015 -0700 ---------------------------------------------------------------------- .../master/TestDistributedLogSplitting.java | 1799 ------------------ 1 file changed, 1799 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hbase/blob/129c4843/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestDistributedLogSplitting.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestDistributedLogSplitting.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestDistributedLogSplitting.java deleted file mode 100644 index 4a43bbd..0000000 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestDistributedLogSplitting.java +++ /dev/null @@ -1,1799 +0,0 @@ -/** - * - - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hbase.master; - -import static org.apache.hadoop.hbase.SplitLogCounters.tot_mgr_wait_for_zk_delete; -import static org.apache.hadoop.hbase.SplitLogCounters.tot_wkr_final_transition_failed; -import static org.apache.hadoop.hbase.SplitLogCounters.tot_wkr_preempt_task; -import static org.apache.hadoop.hbase.SplitLogCounters.tot_wkr_task_acquired; -import static org.apache.hadoop.hbase.SplitLogCounters.tot_wkr_task_done; -import static org.apache.hadoop.hbase.SplitLogCounters.tot_wkr_task_err; -import static org.apache.hadoop.hbase.SplitLogCounters.tot_wkr_task_resigned; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; -import static org.junit.Assert.fail; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashSet; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; -import java.util.NavigableSet; -import java.util.Set; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.Future; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.TimeoutException; -import java.util.concurrent.atomic.AtomicLong; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.PathFilter; -import org.apache.hadoop.hbase.HBaseConfiguration; -import org.apache.hadoop.hbase.HBaseTestingUtility; -import org.apache.hadoop.hbase.HColumnDescriptor; -import org.apache.hadoop.hbase.HConstants; -import org.apache.hadoop.hbase.HRegionInfo; -import org.apache.hadoop.hbase.HTableDescriptor; -import org.apache.hadoop.hbase.KeyValue; -import org.apache.hadoop.hbase.MiniHBaseCluster; -import org.apache.hadoop.hbase.NamespaceDescriptor; -import org.apache.hadoop.hbase.ServerName; -import org.apache.hadoop.hbase.SplitLogCounters; -import org.apache.hadoop.hbase.TableName; -import org.apache.hadoop.hbase.Waiter; -import org.apache.hadoop.hbase.client.ClusterConnection; -import org.apache.hadoop.hbase.client.ConnectionUtils; -import org.apache.hadoop.hbase.client.Delete; -import org.apache.hadoop.hbase.client.Get; -import org.apache.hadoop.hbase.client.Increment; -import org.apache.hadoop.hbase.client.NonceGenerator; -import org.apache.hadoop.hbase.client.PerClientRandomNonceGenerator; -import org.apache.hadoop.hbase.client.Put; -import org.apache.hadoop.hbase.client.RegionLocator; -import org.apache.hadoop.hbase.client.Result; -import org.apache.hadoop.hbase.client.RetriesExhaustedWithDetailsException; -import org.apache.hadoop.hbase.client.Table; -import org.apache.hadoop.hbase.coordination.BaseCoordinatedStateManager; -import org.apache.hadoop.hbase.coordination.ZKSplitLogManagerCoordination; -import org.apache.hadoop.hbase.exceptions.OperationConflictException; -import org.apache.hadoop.hbase.exceptions.RegionInRecoveryException; -import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException; -import org.apache.hadoop.hbase.master.SplitLogManager.TaskBatch; -import org.apache.hadoop.hbase.protobuf.ProtobufUtil; -import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.GetRegionInfoResponse.CompactionState; -import org.apache.hadoop.hbase.regionserver.HRegion; -import org.apache.hadoop.hbase.regionserver.HRegionServer; -import org.apache.hadoop.hbase.regionserver.Region; -import org.apache.hadoop.hbase.regionserver.wal.HLogKey; -import org.apache.hadoop.hbase.regionserver.wal.WALEdit; -import org.apache.hadoop.hbase.testclassification.LargeTests; -import org.apache.hadoop.hbase.testclassification.MasterTests; -import org.apache.hadoop.hbase.util.Bytes; -import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; -import org.apache.hadoop.hbase.util.FSUtils; -import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread; -import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread; -import org.apache.hadoop.hbase.util.Threads; -import org.apache.hadoop.hbase.wal.DefaultWALProvider; -import org.apache.hadoop.hbase.wal.WAL; -import org.apache.hadoop.hbase.wal.WALFactory; -import org.apache.hadoop.hbase.wal.WALSplitter; -import org.apache.hadoop.hbase.zookeeper.MiniZooKeeperCluster; -import org.apache.hadoop.hbase.zookeeper.ZKUtil; -import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher; -import org.apache.hadoop.hdfs.MiniDFSCluster; -import org.junit.After; -import org.junit.AfterClass; -import org.junit.Assert; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.Ignore; -import org.junit.Test; -import org.junit.experimental.categories.Category; - -@Category({MasterTests.class, LargeTests.class}) -@SuppressWarnings("deprecation") -public class TestDistributedLogSplitting { - private static final Log LOG = LogFactory.getLog(TestSplitLogManager.class); - static { - // Uncomment the following line if more verbosity is needed for - // debugging (see HBASE-12285 for details). - //Logger.getLogger("org.apache.hadoop.hbase").setLevel(Level.DEBUG); - - // test ThreeRSAbort fails under hadoop2 (2.0.2-alpha) if shortcircuit-read (scr) is on. this - // turns it off for this test. TODO: Figure out why scr breaks recovery. - System.setProperty("hbase.tests.use.shortcircuit.reads", "false"); - - } - - // Start a cluster with 2 masters and 6 regionservers - static final int NUM_MASTERS = 2; - static final int NUM_RS = 5; - - MiniHBaseCluster cluster; - HMaster master; - Configuration conf; - static Configuration originalConf; - static HBaseTestingUtility TEST_UTIL; - static MiniDFSCluster dfsCluster; - static MiniZooKeeperCluster zkCluster; - - @BeforeClass - public static void setup() throws Exception { - TEST_UTIL = new HBaseTestingUtility(HBaseConfiguration.create()); - dfsCluster = TEST_UTIL.startMiniDFSCluster(1); - zkCluster = TEST_UTIL.startMiniZKCluster(); - originalConf = TEST_UTIL.getConfiguration(); - } - - @AfterClass - public static void tearDown() throws IOException { - TEST_UTIL.shutdownMiniZKCluster(); - TEST_UTIL.shutdownMiniDFSCluster(); - TEST_UTIL.shutdownMiniHBaseCluster(); - } - - private void startCluster(int num_rs) throws Exception { - SplitLogCounters.resetCounters(); - LOG.info("Starting cluster"); - conf.getLong("hbase.splitlog.max.resubmit", 0); - // Make the failure test faster - conf.setInt("zookeeper.recovery.retry", 0); - conf.setInt(HConstants.REGIONSERVER_INFO_PORT, -1); - conf.setFloat(HConstants.LOAD_BALANCER_SLOP_KEY, (float) 100.0); // no load balancing - conf.setInt("hbase.regionserver.wal.max.splitters", 3); - conf.setInt(HConstants.REGION_SERVER_HIGH_PRIORITY_HANDLER_COUNT, 10); - TEST_UTIL.shutdownMiniHBaseCluster(); - TEST_UTIL = new HBaseTestingUtility(conf); - TEST_UTIL.setDFSCluster(dfsCluster); - TEST_UTIL.setZkCluster(zkCluster); - TEST_UTIL.startMiniHBaseCluster(NUM_MASTERS, num_rs); - cluster = TEST_UTIL.getHBaseCluster(); - LOG.info("Waiting for active/ready master"); - cluster.waitForActiveAndReadyMaster(); - master = cluster.getMaster(); - while (cluster.getLiveRegionServerThreads().size() < num_rs) { - Threads.sleep(10); - } - } - - @Before - public void before() throws Exception { - // refresh configuration - conf = HBaseConfiguration.create(originalConf); - } - - @After - public void after() throws Exception { - try { - if (TEST_UTIL.getHBaseCluster() != null) { - for (MasterThread mt : TEST_UTIL.getHBaseCluster().getLiveMasterThreads()) { - mt.getMaster().abort("closing...", null); - } - } - TEST_UTIL.shutdownMiniHBaseCluster(); - } finally { - TEST_UTIL.getTestFileSystem().delete(FSUtils.getRootDir(TEST_UTIL.getConfiguration()), true); - ZKUtil.deleteNodeRecursively(TEST_UTIL.getZooKeeperWatcher(), "/hbase"); - } - } - - @Ignore("DLR is broken by HBASE-12751") @Test (timeout=300000) - public void testRecoveredEdits() throws Exception { - LOG.info("testRecoveredEdits"); - conf.setLong("hbase.regionserver.hlog.blocksize", 30 * 1024); // create more than one wal - conf.setBoolean(HConstants.DISTRIBUTED_LOG_REPLAY_KEY, false); - startCluster(NUM_RS); - - final int NUM_LOG_LINES = 1000; - final SplitLogManager slm = master.getMasterFileSystem().splitLogManager; - // turn off load balancing to prevent regions from moving around otherwise - // they will consume recovered.edits - master.balanceSwitch(false); - FileSystem fs = master.getMasterFileSystem().getFileSystem(); - - List<RegionServerThread> rsts = cluster.getLiveRegionServerThreads(); - - Path rootdir = FSUtils.getRootDir(conf); - - Table t = installTable(new ZooKeeperWatcher(conf, "table-creation", null), - "table", "family", 40); - try { - TableName table = t.getName(); - List<HRegionInfo> regions = null; - HRegionServer hrs = null; - for (int i = 0; i < NUM_RS; i++) { - boolean foundRs = false; - hrs = rsts.get(i).getRegionServer(); - regions = ProtobufUtil.getOnlineRegions(hrs.getRSRpcServices()); - for (HRegionInfo region : regions) { - if (region.getTable().getNameAsString().equalsIgnoreCase("table")) { - foundRs = true; - break; - } - } - if (foundRs) break; - } - final Path logDir = new Path(rootdir, DefaultWALProvider.getWALDirectoryName(hrs - .getServerName().toString())); - - LOG.info("#regions = " + regions.size()); - Iterator<HRegionInfo> it = regions.iterator(); - while (it.hasNext()) { - HRegionInfo region = it.next(); - if (region.getTable().getNamespaceAsString() - .equals(NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR)) { - it.remove(); - } - } - - makeWAL(hrs, regions, "table", "family", NUM_LOG_LINES, 100); - - slm.splitLogDistributed(logDir); - - int count = 0; - for (HRegionInfo hri : regions) { - - Path tdir = FSUtils.getTableDir(rootdir, table); - Path editsdir = - WALSplitter.getRegionDirRecoveredEditsDir( - HRegion.getRegionDir(tdir, hri.getEncodedName())); - LOG.debug("checking edits dir " + editsdir); - FileStatus[] files = fs.listStatus(editsdir, new PathFilter() { - @Override - public boolean accept(Path p) { - if (WALSplitter.isSequenceIdFile(p)) { - return false; - } - return true; - } - }); - assertTrue( - "edits dir should have more than a single file in it. instead has " + files.length, - files.length > 1); - for (int i = 0; i < files.length; i++) { - int c = countWAL(files[i].getPath(), fs, conf); - count += c; - } - LOG.info(count + " edits in " + files.length + " recovered edits files."); - } - - // check that the log file is moved - assertFalse(fs.exists(logDir)); - - assertEquals(NUM_LOG_LINES, count); - } finally { - if (t != null) t.close(); - } - } - - @Ignore("DLR is broken by HBASE-12751") @Test(timeout = 300000) - public void testLogReplayWithNonMetaRSDown() throws Exception { - LOG.info("testLogReplayWithNonMetaRSDown"); - conf.setLong("hbase.regionserver.hlog.blocksize", 30 * 1024); // create more than one wal - conf.setBoolean(HConstants.DISTRIBUTED_LOG_REPLAY_KEY, true); - startCluster(NUM_RS); - final int NUM_REGIONS_TO_CREATE = 40; - final int NUM_LOG_LINES = 1000; - // turn off load balancing to prevent regions from moving around otherwise - // they will consume recovered.edits - master.balanceSwitch(false); - - final ZooKeeperWatcher zkw = new ZooKeeperWatcher(conf, "table-creation", null); - Table ht = installTable(zkw, "table", "family", NUM_REGIONS_TO_CREATE); - try { - HRegionServer hrs = findRSToKill(false, "table"); - List<HRegionInfo> regions = ProtobufUtil.getOnlineRegions(hrs.getRSRpcServices()); - makeWAL(hrs, regions, "table", "family", NUM_LOG_LINES, 100); - - // wait for abort completes - this.abortRSAndVerifyRecovery(hrs, ht, zkw, NUM_REGIONS_TO_CREATE, NUM_LOG_LINES); - } finally { - if (ht != null) ht.close(); - if (zkw != null) zkw.close(); - } - } - - private static class NonceGeneratorWithDups extends PerClientRandomNonceGenerator { - private boolean isDups = false; - private LinkedList<Long> nonces = new LinkedList<Long>(); - - public void startDups() { - isDups = true; - } - - @Override - public long newNonce() { - long nonce = isDups ? nonces.removeFirst() : super.newNonce(); - if (!isDups) { - nonces.add(nonce); - } - return nonce; - } - } - - @Ignore("DLR is broken by HBASE-12751") @Test(timeout = 300000) - public void testNonceRecovery() throws Exception { - LOG.info("testNonceRecovery"); - final String TABLE_NAME = "table"; - final String FAMILY_NAME = "family"; - final int NUM_REGIONS_TO_CREATE = 40; - - conf.setLong("hbase.regionserver.hlog.blocksize", 100*1024); - conf.setBoolean(HConstants.DISTRIBUTED_LOG_REPLAY_KEY, true); - startCluster(NUM_RS); - master.balanceSwitch(false); - - final ZooKeeperWatcher zkw = new ZooKeeperWatcher(conf, "table-creation", null); - Table ht = installTable(zkw, TABLE_NAME, FAMILY_NAME, NUM_REGIONS_TO_CREATE); - NonceGeneratorWithDups ng = new NonceGeneratorWithDups(); - NonceGenerator oldNg = - ConnectionUtils.injectNonceGeneratorForTesting( - (ClusterConnection)TEST_UTIL.getConnection(), ng); - - try { - List<Increment> reqs = new ArrayList<Increment>(); - for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) { - HRegionServer hrs = rst.getRegionServer(); - List<HRegionInfo> hris = ProtobufUtil.getOnlineRegions(hrs.getRSRpcServices()); - for (HRegionInfo hri : hris) { - if (TABLE_NAME.equalsIgnoreCase(hri.getTable().getNameAsString())) { - byte[] key = hri.getStartKey(); - if (key == null || key.length == 0) { - key = Bytes.copy(hri.getEndKey()); - --(key[key.length - 1]); - } - Increment incr = new Increment(key); - incr.addColumn(Bytes.toBytes(FAMILY_NAME), Bytes.toBytes("q"), 1); - ht.increment(incr); - reqs.add(incr); - } - } - } - - HRegionServer hrs = findRSToKill(false, "table"); - abortRSAndWaitForRecovery(hrs, zkw, NUM_REGIONS_TO_CREATE); - ng.startDups(); - for (Increment incr : reqs) { - try { - ht.increment(incr); - fail("should have thrown"); - } catch (OperationConflictException ope) { - LOG.debug("Caught as expected: " + ope.getMessage()); - } - } - } finally { - ConnectionUtils.injectNonceGeneratorForTesting((ClusterConnection) - TEST_UTIL.getConnection(), oldNg); - if (ht != null) ht.close(); - if (zkw != null) zkw.close(); - } - } - - @Ignore("DLR is broken by HBASE-12751") @Test(timeout = 300000) - public void testLogReplayWithMetaRSDown() throws Exception { - LOG.info("testRecoveredEditsReplayWithMetaRSDown"); - conf.setBoolean(HConstants.DISTRIBUTED_LOG_REPLAY_KEY, true); - startCluster(NUM_RS); - final int NUM_REGIONS_TO_CREATE = 40; - final int NUM_LOG_LINES = 1000; - // turn off load balancing to prevent regions from moving around otherwise - // they will consume recovered.edits - master.balanceSwitch(false); - - final ZooKeeperWatcher zkw = new ZooKeeperWatcher(conf, "table-creation", null); - Table ht = installTable(zkw, "table", "family", NUM_REGIONS_TO_CREATE); - try { - HRegionServer hrs = findRSToKill(true, "table"); - List<HRegionInfo> regions = ProtobufUtil.getOnlineRegions(hrs.getRSRpcServices()); - makeWAL(hrs, regions, "table", "family", NUM_LOG_LINES, 100); - - this.abortRSAndVerifyRecovery(hrs, ht, zkw, NUM_REGIONS_TO_CREATE, NUM_LOG_LINES); - } finally { - if (ht != null) ht.close(); - if (zkw != null) zkw.close(); - } - } - - private void abortRSAndVerifyRecovery(HRegionServer hrs, Table ht, final ZooKeeperWatcher zkw, - final int numRegions, final int numofLines) throws Exception { - - abortRSAndWaitForRecovery(hrs, zkw, numRegions); - assertEquals(numofLines, TEST_UTIL.countRows(ht)); - } - - private void abortRSAndWaitForRecovery(HRegionServer hrs, final ZooKeeperWatcher zkw, - final int numRegions) throws Exception { - final MiniHBaseCluster tmpCluster = this.cluster; - - // abort RS - LOG.info("Aborting region server: " + hrs.getServerName()); - hrs.abort("testing"); - - // wait for abort completes - TEST_UTIL.waitFor(120000, 200, new Waiter.Predicate<Exception>() { - @Override - public boolean evaluate() throws Exception { - return (tmpCluster.getLiveRegionServerThreads().size() <= (NUM_RS - 1)); - } - }); - - // wait for regions come online - TEST_UTIL.waitFor(180000, 200, new Waiter.Predicate<Exception>() { - @Override - public boolean evaluate() throws Exception { - return (HBaseTestingUtility.getAllOnlineRegions(tmpCluster).size() - >= (numRegions + 1)); - } - }); - - // wait for all regions are fully recovered - TEST_UTIL.waitFor(180000, 200, new Waiter.Predicate<Exception>() { - @Override - public boolean evaluate() throws Exception { - List<String> recoveringRegions = zkw.getRecoverableZooKeeper().getChildren( - zkw.recoveringRegionsZNode, false); - return (recoveringRegions != null && recoveringRegions.size() == 0); - } - }); - } - - @Ignore("DLR is broken by HBASE-12751") @Test(timeout = 300000) - public void testMasterStartsUpWithLogSplittingWork() throws Exception { - LOG.info("testMasterStartsUpWithLogSplittingWork"); - conf.setBoolean(HConstants.DISTRIBUTED_LOG_REPLAY_KEY, false); - conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, NUM_RS - 1); - startCluster(NUM_RS); - - final int NUM_REGIONS_TO_CREATE = 40; - final int NUM_LOG_LINES = 1000; - // turn off load balancing to prevent regions from moving around otherwise - // they will consume recovered.edits - master.balanceSwitch(false); - - final ZooKeeperWatcher zkw = new ZooKeeperWatcher(conf, "table-creation", null); - Table ht = installTable(zkw, "table", "family", NUM_REGIONS_TO_CREATE); - try { - HRegionServer hrs = findRSToKill(false, "table"); - List<HRegionInfo> regions = ProtobufUtil.getOnlineRegions(hrs.getRSRpcServices()); - makeWAL(hrs, regions, "table", "family", NUM_LOG_LINES, 100); - - // abort master - abortMaster(cluster); - - // abort RS - LOG.info("Aborting region server: " + hrs.getServerName()); - hrs.abort("testing"); - - // wait for abort completes - TEST_UTIL.waitFor(120000, 200, new Waiter.Predicate<Exception>() { - @Override - public boolean evaluate() throws Exception { - return (cluster.getLiveRegionServerThreads().size() <= (NUM_RS - 1)); - } - }); - - Thread.sleep(2000); - LOG.info("Current Open Regions:" - + HBaseTestingUtility.getAllOnlineRegions(cluster).size()); - - // wait for abort completes - TEST_UTIL.waitFor(120000, 200, new Waiter.Predicate<Exception>() { - @Override - public boolean evaluate() throws Exception { - return (HBaseTestingUtility.getAllOnlineRegions(cluster).size() - >= (NUM_REGIONS_TO_CREATE + 1)); - } - }); - - LOG.info("Current Open Regions After Master Node Starts Up:" - + HBaseTestingUtility.getAllOnlineRegions(cluster).size()); - - assertEquals(NUM_LOG_LINES, TEST_UTIL.countRows(ht)); - } finally { - if (ht != null) ht.close(); - if (zkw != null) zkw.close(); - } - } - - @Ignore("DLR is broken by HBASE-12751") @Test(timeout = 300000) - public void testMasterStartsUpWithLogReplayWork() throws Exception { - LOG.info("testMasterStartsUpWithLogReplayWork"); - conf.setBoolean(HConstants.DISTRIBUTED_LOG_REPLAY_KEY, true); - conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, NUM_RS - 1); - startCluster(NUM_RS); - - final int NUM_REGIONS_TO_CREATE = 40; - final int NUM_LOG_LINES = 1000; - // turn off load balancing to prevent regions from moving around otherwise - // they will consume recovered.edits - master.balanceSwitch(false); - - final ZooKeeperWatcher zkw = new ZooKeeperWatcher(conf, "table-creation", null); - Table ht = installTable(zkw, "table", "family", NUM_REGIONS_TO_CREATE); - try { - HRegionServer hrs = findRSToKill(false, "table"); - List<HRegionInfo> regions = ProtobufUtil.getOnlineRegions(hrs.getRSRpcServices()); - makeWAL(hrs, regions, "table", "family", NUM_LOG_LINES, 100); - - // abort master - abortMaster(cluster); - - // abort RS - LOG.info("Aborting region server: " + hrs.getServerName()); - hrs.abort("testing"); - - // wait for the RS dies - TEST_UTIL.waitFor(120000, 200, new Waiter.Predicate<Exception>() { - @Override - public boolean evaluate() throws Exception { - return (cluster.getLiveRegionServerThreads().size() <= (NUM_RS - 1)); - } - }); - - Thread.sleep(2000); - LOG.info("Current Open Regions:" + HBaseTestingUtility.getAllOnlineRegions(cluster).size()); - - // wait for all regions are fully recovered - TEST_UTIL.waitFor(180000, 200, new Waiter.Predicate<Exception>() { - @Override - public boolean evaluate() throws Exception { - List<String> recoveringRegions = zkw.getRecoverableZooKeeper().getChildren( - zkw.recoveringRegionsZNode, false); - boolean done = recoveringRegions != null && recoveringRegions.size() == 0; - if (!done) { - LOG.info("Recovering regions: " + recoveringRegions); - } - return done; - } - }); - - LOG.info("Current Open Regions After Master Node Starts Up:" - + HBaseTestingUtility.getAllOnlineRegions(cluster).size()); - - assertEquals(NUM_LOG_LINES, TEST_UTIL.countRows(ht)); - } finally { - if (ht != null) ht.close(); - if (zkw != null) zkw.close(); - } - } - - - @Ignore("DLR is broken by HBASE-12751") @Test(timeout = 300000) - public void testLogReplayTwoSequentialRSDown() throws Exception { - LOG.info("testRecoveredEditsReplayTwoSequentialRSDown"); - conf.setBoolean(HConstants.DISTRIBUTED_LOG_REPLAY_KEY, true); - startCluster(NUM_RS); - final int NUM_REGIONS_TO_CREATE = 40; - final int NUM_LOG_LINES = 1000; - // turn off load balancing to prevent regions from moving around otherwise - // they will consume recovered.edits - master.balanceSwitch(false); - - List<RegionServerThread> rsts = cluster.getLiveRegionServerThreads(); - final ZooKeeperWatcher zkw = new ZooKeeperWatcher(conf, "table-creation", null); - Table ht = installTable(zkw, "table", "family", NUM_REGIONS_TO_CREATE); - try { - List<HRegionInfo> regions = null; - HRegionServer hrs1 = findRSToKill(false, "table"); - regions = ProtobufUtil.getOnlineRegions(hrs1.getRSRpcServices()); - - makeWAL(hrs1, regions, "table", "family", NUM_LOG_LINES, 100); - - // abort RS1 - LOG.info("Aborting region server: " + hrs1.getServerName()); - hrs1.abort("testing"); - - // wait for abort completes - TEST_UTIL.waitFor(120000, 200, new Waiter.Predicate<Exception>() { - @Override - public boolean evaluate() throws Exception { - return (cluster.getLiveRegionServerThreads().size() <= (NUM_RS - 1)); - } - }); - - // wait for regions come online - TEST_UTIL.waitFor(180000, 200, new Waiter.Predicate<Exception>() { - @Override - public boolean evaluate() throws Exception { - return (HBaseTestingUtility.getAllOnlineRegions(cluster).size() - >= (NUM_REGIONS_TO_CREATE + 1)); - } - }); - - // sleep a little bit in order to interrupt recovering in the middle - Thread.sleep(300); - // abort second region server - rsts = cluster.getLiveRegionServerThreads(); - HRegionServer hrs2 = rsts.get(0).getRegionServer(); - LOG.info("Aborting one more region server: " + hrs2.getServerName()); - hrs2.abort("testing"); - - // wait for abort completes - TEST_UTIL.waitFor(120000, 200, new Waiter.Predicate<Exception>() { - @Override - public boolean evaluate() throws Exception { - return (cluster.getLiveRegionServerThreads().size() <= (NUM_RS - 2)); - } - }); - - // wait for regions come online - TEST_UTIL.waitFor(180000, 200, new Waiter.Predicate<Exception>() { - @Override - public boolean evaluate() throws Exception { - return (HBaseTestingUtility.getAllOnlineRegions(cluster).size() - >= (NUM_REGIONS_TO_CREATE + 1)); - } - }); - - // wait for all regions are fully recovered - TEST_UTIL.waitFor(180000, 200, new Waiter.Predicate<Exception>() { - @Override - public boolean evaluate() throws Exception { - List<String> recoveringRegions = zkw.getRecoverableZooKeeper().getChildren( - zkw.recoveringRegionsZNode, false); - return (recoveringRegions != null && recoveringRegions.size() == 0); - } - }); - - assertEquals(NUM_LOG_LINES, TEST_UTIL.countRows(ht)); - } finally { - if (ht != null) ht.close(); - if (zkw != null) zkw.close(); - } - } - - @Ignore("DLR is broken by HBASE-12751") @Test(timeout = 300000) - public void testMarkRegionsRecoveringInZK() throws Exception { - LOG.info("testMarkRegionsRecoveringInZK"); - conf.setBoolean(HConstants.DISTRIBUTED_LOG_REPLAY_KEY, true); - startCluster(NUM_RS); - master.balanceSwitch(false); - List<RegionServerThread> rsts = cluster.getLiveRegionServerThreads(); - final ZooKeeperWatcher zkw = master.getZooKeeper(); - Table ht = installTable(zkw, "table", "family", 40); - try { - final SplitLogManager slm = master.getMasterFileSystem().splitLogManager; - - Set<HRegionInfo> regionSet = new HashSet<HRegionInfo>(); - HRegionInfo region = null; - HRegionServer hrs = null; - ServerName firstFailedServer = null; - ServerName secondFailedServer = null; - for (int i = 0; i < NUM_RS; i++) { - hrs = rsts.get(i).getRegionServer(); - List<HRegionInfo> regions = ProtobufUtil.getOnlineRegions(hrs.getRSRpcServices()); - if (regions.isEmpty()) continue; - region = regions.get(0); - regionSet.add(region); - firstFailedServer = hrs.getServerName(); - secondFailedServer = rsts.get((i + 1) % NUM_RS).getRegionServer().getServerName(); - break; - } - - slm.markRegionsRecovering(firstFailedServer, regionSet); - slm.markRegionsRecovering(secondFailedServer, regionSet); - - List<String> recoveringRegions = ZKUtil.listChildrenNoWatch(zkw, - ZKUtil.joinZNode(zkw.recoveringRegionsZNode, region.getEncodedName())); - - assertEquals(recoveringRegions.size(), 2); - - // wait for splitLogWorker to mark them up because there is no WAL files recorded in ZK - final HRegionServer tmphrs = hrs; - TEST_UTIL.waitFor(60000, 1000, new Waiter.Predicate<Exception>() { - @Override - public boolean evaluate() throws Exception { - return (tmphrs.getRecoveringRegions().size() == 0); - } - }); - } finally { - if (ht != null) ht.close(); - if (zkw != null) zkw.close(); - } - } - - @Ignore("DLR is broken by HBASE-12751") @Test(timeout = 300000) - public void testReplayCmd() throws Exception { - LOG.info("testReplayCmd"); - conf.setBoolean(HConstants.DISTRIBUTED_LOG_REPLAY_KEY, true); - startCluster(NUM_RS); - final int NUM_REGIONS_TO_CREATE = 40; - // turn off load balancing to prevent regions from moving around otherwise - // they will consume recovered.edits - master.balanceSwitch(false); - - List<RegionServerThread> rsts = cluster.getLiveRegionServerThreads(); - final ZooKeeperWatcher zkw = new ZooKeeperWatcher(conf, "table-creation", null); - Table ht = installTable(zkw, "table", "family", NUM_REGIONS_TO_CREATE); - try { - List<HRegionInfo> regions = null; - HRegionServer hrs = null; - for (int i = 0; i < NUM_RS; i++) { - boolean isCarryingMeta = false; - hrs = rsts.get(i).getRegionServer(); - regions = ProtobufUtil.getOnlineRegions(hrs.getRSRpcServices()); - for (HRegionInfo region : regions) { - if (region.isMetaRegion()) { - isCarryingMeta = true; - break; - } - } - if (isCarryingMeta) { - continue; - } - if (regions.size() > 0) break; - } - - this.prepareData(ht, Bytes.toBytes("family"), Bytes.toBytes("c1")); - String originalCheckSum = TEST_UTIL.checksumRows(ht); - - // abort RA and trigger replay - abortRSAndWaitForRecovery(hrs, zkw, NUM_REGIONS_TO_CREATE); - - assertEquals("Data should remain after reopening of regions", originalCheckSum, - TEST_UTIL.checksumRows(ht)); - } finally { - if (ht != null) ht.close(); - if (zkw != null) zkw.close(); - } - } - - @Ignore("DLR is broken by HBASE-12751") @Test(timeout = 300000) - public void testLogReplayForDisablingTable() throws Exception { - LOG.info("testLogReplayForDisablingTable"); - conf.setBoolean(HConstants.DISTRIBUTED_LOG_REPLAY_KEY, true); - startCluster(NUM_RS); - final int NUM_REGIONS_TO_CREATE = 40; - final int NUM_LOG_LINES = 1000; - - List<RegionServerThread> rsts = cluster.getLiveRegionServerThreads(); - final ZooKeeperWatcher zkw = new ZooKeeperWatcher(conf, "table-creation", null); - Table disablingHT = installTable(zkw, "disableTable", "family", NUM_REGIONS_TO_CREATE); - Table ht = installTable(zkw, "table", "family", NUM_REGIONS_TO_CREATE, NUM_REGIONS_TO_CREATE); - try { - // turn off load balancing to prevent regions from moving around otherwise - // they will consume recovered.edits - master.balanceSwitch(false); - - List<HRegionInfo> regions = null; - HRegionServer hrs = null; - boolean hasRegionsForBothTables = false; - String tableName = null; - for (int i = 0; i < NUM_RS; i++) { - tableName = null; - hasRegionsForBothTables = false; - boolean isCarryingSystem = false; - hrs = rsts.get(i).getRegionServer(); - regions = ProtobufUtil.getOnlineRegions(hrs.getRSRpcServices()); - for (HRegionInfo region : regions) { - if (region.getTable().isSystemTable()) { - isCarryingSystem = true; - break; - } - if (tableName != null && - !tableName.equalsIgnoreCase(region.getTable().getNameAsString())) { - // make sure that we find a RS has online regions for both "table" and "disableTable" - hasRegionsForBothTables = true; - break; - } else if (tableName == null) { - tableName = region.getTable().getNameAsString(); - } - } - if (isCarryingSystem) { - continue; - } - if (hasRegionsForBothTables) { - break; - } - } - - // make sure we found a good RS - Assert.assertTrue(hasRegionsForBothTables); - - LOG.info("#regions = " + regions.size()); - Iterator<HRegionInfo> it = regions.iterator(); - while (it.hasNext()) { - HRegionInfo region = it.next(); - if (region.isMetaTable()) { - it.remove(); - } - } - makeWAL(hrs, regions, "disableTable", "family", NUM_LOG_LINES, 100, false); - makeWAL(hrs, regions, "table", "family", NUM_LOG_LINES, 100); - - LOG.info("Disabling table\n"); - TEST_UTIL.getHBaseAdmin().disableTable(TableName.valueOf("disableTable")); - TEST_UTIL.waitTableDisabled(TableName.valueOf("disableTable").getName()); - - // abort RS - LOG.info("Aborting region server: " + hrs.getServerName()); - hrs.abort("testing"); - - // wait for abort completes - TEST_UTIL.waitFor(120000, 200, new Waiter.Predicate<Exception>() { - @Override - public boolean evaluate() throws Exception { - return (cluster.getLiveRegionServerThreads().size() <= (NUM_RS - 1)); - } - }); - - // wait for regions come online - TEST_UTIL.waitFor(180000, 200, new Waiter.Predicate<Exception>() { - @Override - public boolean evaluate() throws Exception { - return (HBaseTestingUtility.getAllOnlineRegions(cluster).size() - >= (NUM_REGIONS_TO_CREATE + 1)); - } - }); - - // wait for all regions are fully recovered - TEST_UTIL.waitFor(180000, 200, new Waiter.Predicate<Exception>() { - @Override - public boolean evaluate() throws Exception { - List<String> recoveringRegions = zkw.getRecoverableZooKeeper().getChildren( - zkw.recoveringRegionsZNode, false); - ServerManager serverManager = master.getServerManager(); - return (!serverManager.areDeadServersInProgress() && - recoveringRegions != null && recoveringRegions.size() == 0); - } - }); - - int count = 0; - FileSystem fs = master.getMasterFileSystem().getFileSystem(); - Path rootdir = FSUtils.getRootDir(conf); - Path tdir = FSUtils.getTableDir(rootdir, TableName.valueOf("disableTable")); - for (HRegionInfo hri : regions) { - Path editsdir = - WALSplitter.getRegionDirRecoveredEditsDir( - HRegion.getRegionDir(tdir, hri.getEncodedName())); - LOG.debug("checking edits dir " + editsdir); - if(!fs.exists(editsdir)) continue; - FileStatus[] files = fs.listStatus(editsdir, new PathFilter() { - @Override - public boolean accept(Path p) { - if (WALSplitter.isSequenceIdFile(p)) { - return false; - } - return true; - } - }); - if(files != null) { - for(FileStatus file : files) { - int c = countWAL(file.getPath(), fs, conf); - count += c; - LOG.info(c + " edits in " + file.getPath()); - } - } - } - - LOG.info("Verify edits in recovered.edits files"); - assertEquals(NUM_LOG_LINES, count); - LOG.info("Verify replayed edits"); - assertEquals(NUM_LOG_LINES, TEST_UTIL.countRows(ht)); - - // clean up - for (HRegionInfo hri : regions) { - Path editsdir = - WALSplitter.getRegionDirRecoveredEditsDir( - HRegion.getRegionDir(tdir, hri.getEncodedName())); - fs.delete(editsdir, true); - } - disablingHT.close(); - } finally { - if (ht != null) ht.close(); - if (zkw != null) zkw.close(); - } - } - - @Ignore("DLR is broken by HBASE-12751") @Test(timeout = 300000) - public void testDisallowWritesInRecovering() throws Exception { - LOG.info("testDisallowWritesInRecovering"); - conf.setBoolean(HConstants.DISTRIBUTED_LOG_REPLAY_KEY, true); - conf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 3); - conf.setBoolean(HConstants.DISALLOW_WRITES_IN_RECOVERING, true); - startCluster(NUM_RS); - final int NUM_REGIONS_TO_CREATE = 40; - // turn off load balancing to prevent regions from moving around otherwise - // they will consume recovered.edits - master.balanceSwitch(false); - - List<RegionServerThread> rsts = cluster.getLiveRegionServerThreads(); - final ZooKeeperWatcher zkw = new ZooKeeperWatcher(conf, "table-creation", null); - Table ht = installTable(zkw, "table", "family", NUM_REGIONS_TO_CREATE); - try { - final SplitLogManager slm = master.getMasterFileSystem().splitLogManager; - - Set<HRegionInfo> regionSet = new HashSet<HRegionInfo>(); - HRegionInfo region = null; - HRegionServer hrs = null; - HRegionServer dstRS = null; - for (int i = 0; i < NUM_RS; i++) { - hrs = rsts.get(i).getRegionServer(); - List<HRegionInfo> regions = ProtobufUtil.getOnlineRegions(hrs.getRSRpcServices()); - if (regions.isEmpty()) continue; - region = regions.get(0); - regionSet.add(region); - dstRS = rsts.get((i+1) % NUM_RS).getRegionServer(); - break; - } - - slm.markRegionsRecovering(hrs.getServerName(), regionSet); - // move region in order for the region opened in recovering state - final HRegionInfo hri = region; - final HRegionServer tmpRS = dstRS; - TEST_UTIL.getHBaseAdmin().move(region.getEncodedNameAsBytes(), - Bytes.toBytes(dstRS.getServerName().getServerName())); - // wait for region move completes - final RegionStates regionStates = - TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager().getRegionStates(); - TEST_UTIL.waitFor(45000, 200, new Waiter.Predicate<Exception>() { - @Override - public boolean evaluate() throws Exception { - ServerName sn = regionStates.getRegionServerOfRegion(hri); - return (sn != null && sn.equals(tmpRS.getServerName())); - } - }); - - try { - byte[] key = region.getStartKey(); - if (key == null || key.length == 0) { - key = new byte[] { 0, 0, 0, 0, 1 }; - } - Put put = new Put(key); - put.add(Bytes.toBytes("family"), Bytes.toBytes("c1"), new byte[]{'b'}); - ht.put(put); - } catch (IOException ioe) { - Assert.assertTrue(ioe instanceof RetriesExhaustedWithDetailsException); - RetriesExhaustedWithDetailsException re = (RetriesExhaustedWithDetailsException) ioe; - boolean foundRegionInRecoveryException = false; - for (Throwable t : re.getCauses()) { - if (t instanceof RegionInRecoveryException) { - foundRegionInRecoveryException = true; - break; - } - } - Assert.assertTrue( - "No RegionInRecoveryException. Following exceptions returned=" + re.getCauses(), - foundRegionInRecoveryException); - } - } finally { - if (ht != null) ht.close(); - if (ht != null) zkw.close(); - } - } - - /** - * The original intention of this test was to force an abort of a region - * server and to make sure that the failure path in the region servers is - * properly evaluated. But it is difficult to ensure that the region server - * doesn't finish the log splitting before it aborts. Also now, there is - * this code path where the master will preempt the region server when master - * detects that the region server has aborted. - * @throws Exception - */ - @Ignore ("Disabled because flakey") @Test (timeout=300000) - public void testWorkerAbort() throws Exception { - LOG.info("testWorkerAbort"); - startCluster(3); - final int NUM_LOG_LINES = 10000; - final SplitLogManager slm = master.getMasterFileSystem().splitLogManager; - FileSystem fs = master.getMasterFileSystem().getFileSystem(); - - final List<RegionServerThread> rsts = cluster.getLiveRegionServerThreads(); - HRegionServer hrs = findRSToKill(false, "table"); - Path rootdir = FSUtils.getRootDir(conf); - final Path logDir = new Path(rootdir, - DefaultWALProvider.getWALDirectoryName(hrs.getServerName().toString())); - - Table t = installTable(new ZooKeeperWatcher(conf, "table-creation", null), - "table", "family", 40); - try { - makeWAL(hrs, ProtobufUtil.getOnlineRegions(hrs.getRSRpcServices()), - "table", "family", NUM_LOG_LINES, 100); - - new Thread() { - @Override - public void run() { - waitForCounter(tot_wkr_task_acquired, 0, 1, 1000); - for (RegionServerThread rst : rsts) { - rst.getRegionServer().abort("testing"); - break; - } - } - }.start(); - // slm.splitLogDistributed(logDir); - FileStatus[] logfiles = fs.listStatus(logDir); - TaskBatch batch = new TaskBatch(); - slm.enqueueSplitTask(logfiles[0].getPath().toString(), batch); - //waitForCounter but for one of the 2 counters - long curt = System.currentTimeMillis(); - long waitTime = 80000; - long endt = curt + waitTime; - while (curt < endt) { - if ((tot_wkr_task_resigned.get() + tot_wkr_task_err.get() + - tot_wkr_final_transition_failed.get() + tot_wkr_task_done.get() + - tot_wkr_preempt_task.get()) == 0) { - Thread.yield(); - curt = System.currentTimeMillis(); - } else { - assertTrue(1 <= (tot_wkr_task_resigned.get() + tot_wkr_task_err.get() + - tot_wkr_final_transition_failed.get() + tot_wkr_task_done.get() + - tot_wkr_preempt_task.get())); - return; - } - } - fail("none of the following counters went up in " + waitTime + - " milliseconds - " + - "tot_wkr_task_resigned, tot_wkr_task_err, " + - "tot_wkr_final_transition_failed, tot_wkr_task_done, " + - "tot_wkr_preempt_task"); - } finally { - if (t != null) t.close(); - } - } - - @Test (timeout=300000) - public void testThreeRSAbort() throws Exception { - LOG.info("testThreeRSAbort"); - final int NUM_REGIONS_TO_CREATE = 40; - final int NUM_ROWS_PER_REGION = 100; - - startCluster(NUM_RS); // NUM_RS=6. - - final ZooKeeperWatcher zkw = new ZooKeeperWatcher(conf, - "distributed log splitting test", null); - - Table ht = installTable(zkw, "table", "family", NUM_REGIONS_TO_CREATE); - try { - populateDataInTable(NUM_ROWS_PER_REGION, "family"); - - - List<RegionServerThread> rsts = cluster.getLiveRegionServerThreads(); - assertEquals(NUM_RS, rsts.size()); - rsts.get(0).getRegionServer().abort("testing"); - rsts.get(1).getRegionServer().abort("testing"); - rsts.get(2).getRegionServer().abort("testing"); - - long start = EnvironmentEdgeManager.currentTime(); - while (cluster.getLiveRegionServerThreads().size() > (NUM_RS - 3)) { - if (EnvironmentEdgeManager.currentTime() - start > 60000) { - assertTrue(false); - } - Thread.sleep(200); - } - - start = EnvironmentEdgeManager.currentTime(); - while (HBaseTestingUtility.getAllOnlineRegions(cluster).size() - < (NUM_REGIONS_TO_CREATE + 1)) { - if (EnvironmentEdgeManager.currentTime() - start > 60000) { - assertTrue("Timedout", false); - } - Thread.sleep(200); - } - - // wait for all regions are fully recovered - TEST_UTIL.waitFor(180000, 200, new Waiter.Predicate<Exception>() { - @Override - public boolean evaluate() throws Exception { - List<String> recoveringRegions = zkw.getRecoverableZooKeeper().getChildren( - zkw.recoveringRegionsZNode, false); - return (recoveringRegions != null && recoveringRegions.size() == 0); - } - }); - - assertEquals(NUM_REGIONS_TO_CREATE * NUM_ROWS_PER_REGION, - TEST_UTIL.countRows(ht)); - } finally { - if (ht != null) ht.close(); - if (zkw != null) zkw.close(); - } - } - - - - @Test(timeout=30000) - public void testDelayedDeleteOnFailure() throws Exception { - LOG.info("testDelayedDeleteOnFailure"); - startCluster(1); - final SplitLogManager slm = master.getMasterFileSystem().splitLogManager; - final FileSystem fs = master.getMasterFileSystem().getFileSystem(); - final Path logDir = new Path(FSUtils.getRootDir(conf), "x"); - fs.mkdirs(logDir); - ExecutorService executor = null; - try { - final Path corruptedLogFile = new Path(logDir, "x"); - FSDataOutputStream out; - out = fs.create(corruptedLogFile); - out.write(0); - out.write(Bytes.toBytes("corrupted bytes")); - out.close(); - ZKSplitLogManagerCoordination coordination = - (ZKSplitLogManagerCoordination) ((BaseCoordinatedStateManager) master - .getCoordinatedStateManager()).getSplitLogManagerCoordination(); - coordination.setIgnoreDeleteForTesting(true); - executor = Executors.newSingleThreadExecutor(); - Runnable runnable = new Runnable() { - @Override - public void run() { - try { - // since the logDir is a fake, corrupted one, so the split log worker - // will finish it quickly with error, and this call will fail and throw - // an IOException. - slm.splitLogDistributed(logDir); - } catch (IOException ioe) { - try { - assertTrue(fs.exists(corruptedLogFile)); - // this call will block waiting for the task to be removed from the - // tasks map which is not going to happen since ignoreZKDeleteForTesting - // is set to true, until it is interrupted. - slm.splitLogDistributed(logDir); - } catch (IOException e) { - assertTrue(Thread.currentThread().isInterrupted()); - return; - } - fail("did not get the expected IOException from the 2nd call"); - } - fail("did not get the expected IOException from the 1st call"); - } - }; - Future<?> result = executor.submit(runnable); - try { - result.get(2000, TimeUnit.MILLISECONDS); - } catch (TimeoutException te) { - // it is ok, expected. - } - waitForCounter(tot_mgr_wait_for_zk_delete, 0, 1, 10000); - executor.shutdownNow(); - executor = null; - - // make sure the runnable is finished with no exception thrown. - result.get(); - } finally { - if (executor != null) { - // interrupt the thread in case the test fails in the middle. - // it has no effect if the thread is already terminated. - executor.shutdownNow(); - } - fs.delete(logDir, true); - } - } - - @Ignore("DLR is broken by HBASE-12751") @Test(timeout = 300000) - public void testMetaRecoveryInZK() throws Exception { - LOG.info("testMetaRecoveryInZK"); - conf.setBoolean(HConstants.DISTRIBUTED_LOG_REPLAY_KEY, true); - startCluster(NUM_RS); - - // turn off load balancing to prevent regions from moving around otherwise - // they will consume recovered.edits - master.balanceSwitch(false); - final ZooKeeperWatcher zkw = new ZooKeeperWatcher(conf, "table-creation", null); - - // only testing meta recovery in ZK operation - HRegionServer hrs = findRSToKill(true, null); - List<HRegionInfo> regions = ProtobufUtil.getOnlineRegions(hrs.getRSRpcServices()); - - LOG.info("#regions = " + regions.size()); - Set<HRegionInfo> tmpRegions = new HashSet<HRegionInfo>(); - tmpRegions.add(HRegionInfo.FIRST_META_REGIONINFO); - master.getMasterFileSystem().prepareLogReplay(hrs.getServerName(), tmpRegions); - Set<HRegionInfo> userRegionSet = new HashSet<HRegionInfo>(); - userRegionSet.addAll(regions); - master.getMasterFileSystem().prepareLogReplay(hrs.getServerName(), userRegionSet); - boolean isMetaRegionInRecovery = false; - List<String> recoveringRegions = - zkw.getRecoverableZooKeeper().getChildren(zkw.recoveringRegionsZNode, false); - for (String curEncodedRegionName : recoveringRegions) { - if (curEncodedRegionName.equals(HRegionInfo.FIRST_META_REGIONINFO.getEncodedName())) { - isMetaRegionInRecovery = true; - break; - } - } - assertTrue(isMetaRegionInRecovery); - - master.getMasterFileSystem().splitMetaLog(hrs.getServerName()); - - isMetaRegionInRecovery = false; - recoveringRegions = - zkw.getRecoverableZooKeeper().getChildren(zkw.recoveringRegionsZNode, false); - for (String curEncodedRegionName : recoveringRegions) { - if (curEncodedRegionName.equals(HRegionInfo.FIRST_META_REGIONINFO.getEncodedName())) { - isMetaRegionInRecovery = true; - break; - } - } - // meta region should be recovered - assertFalse(isMetaRegionInRecovery); - zkw.close(); - } - - @Ignore("DLR is broken by HBASE-12751") @Test(timeout = 300000) - public void testSameVersionUpdatesRecovery() throws Exception { - LOG.info("testSameVersionUpdatesRecovery"); - conf.setLong("hbase.regionserver.hlog.blocksize", 15 * 1024); - conf.setBoolean(HConstants.DISTRIBUTED_LOG_REPLAY_KEY, true); - startCluster(NUM_RS); - final AtomicLong sequenceId = new AtomicLong(100); - final int NUM_REGIONS_TO_CREATE = 40; - final int NUM_LOG_LINES = 1000; - // turn off load balancing to prevent regions from moving around otherwise - // they will consume recovered.edits - master.balanceSwitch(false); - - List<RegionServerThread> rsts = cluster.getLiveRegionServerThreads(); - final ZooKeeperWatcher zkw = new ZooKeeperWatcher(conf, "table-creation", null); - Table ht = installTable(zkw, "table", "family", NUM_REGIONS_TO_CREATE); - try { - List<HRegionInfo> regions = null; - HRegionServer hrs = null; - for (int i = 0; i < NUM_RS; i++) { - boolean isCarryingMeta = false; - hrs = rsts.get(i).getRegionServer(); - regions = ProtobufUtil.getOnlineRegions(hrs.getRSRpcServices()); - for (HRegionInfo region : regions) { - if (region.isMetaRegion()) { - isCarryingMeta = true; - break; - } - } - if (isCarryingMeta) { - continue; - } - break; - } - - LOG.info("#regions = " + regions.size()); - Iterator<HRegionInfo> it = regions.iterator(); - while (it.hasNext()) { - HRegionInfo region = it.next(); - if (region.isMetaTable() - || region.getEncodedName().equals( - HRegionInfo.FIRST_META_REGIONINFO.getEncodedName())) { - it.remove(); - } - } - if (regions.size() == 0) return; - HRegionInfo curRegionInfo = regions.get(0); - byte[] startRow = curRegionInfo.getStartKey(); - if (startRow == null || startRow.length == 0) { - startRow = new byte[] { 0, 0, 0, 0, 1 }; - } - byte[] row = Bytes.incrementBytes(startRow, 1); - // use last 5 bytes because HBaseTestingUtility.createMultiRegions use 5 bytes key - row = Arrays.copyOfRange(row, 3, 8); - long value = 0; - TableName tableName = TableName.valueOf("table"); - byte[] family = Bytes.toBytes("family"); - byte[] qualifier = Bytes.toBytes("c1"); - long timeStamp = System.currentTimeMillis(); - HTableDescriptor htd = new HTableDescriptor(tableName); - htd.addFamily(new HColumnDescriptor(family)); - final WAL wal = hrs.getWAL(curRegionInfo); - for (int i = 0; i < NUM_LOG_LINES; i += 1) { - WALEdit e = new WALEdit(); - value++; - e.add(new KeyValue(row, family, qualifier, timeStamp, Bytes.toBytes(value))); - wal.append(htd, curRegionInfo, - new HLogKey(curRegionInfo.getEncodedNameAsBytes(), tableName, System.currentTimeMillis()), - e, true); - } - wal.sync(); - wal.shutdown(); - - // wait for abort completes - this.abortRSAndWaitForRecovery(hrs, zkw, NUM_REGIONS_TO_CREATE); - - // verify we got the last value - LOG.info("Verification Starts..."); - Get g = new Get(row); - Result r = ht.get(g); - long theStoredVal = Bytes.toLong(r.getValue(family, qualifier)); - assertEquals(value, theStoredVal); - - // after flush - LOG.info("Verification after flush..."); - TEST_UTIL.getHBaseAdmin().flush(tableName); - r = ht.get(g); - theStoredVal = Bytes.toLong(r.getValue(family, qualifier)); - assertEquals(value, theStoredVal); - } finally { - if (ht != null) ht.close(); - if (zkw != null) zkw.close(); - } - } - - @Ignore("DLR is broken by HBASE-12751") @Test(timeout = 300000) - public void testSameVersionUpdatesRecoveryWithCompaction() throws Exception { - LOG.info("testSameVersionUpdatesRecoveryWithWrites"); - conf.setLong("hbase.regionserver.hlog.blocksize", 15 * 1024); - conf.setBoolean(HConstants.DISTRIBUTED_LOG_REPLAY_KEY, true); - conf.setInt(HConstants.HREGION_MEMSTORE_FLUSH_SIZE, 30 * 1024); - conf.setInt("hbase.hstore.compactionThreshold", 3); - startCluster(NUM_RS); - final AtomicLong sequenceId = new AtomicLong(100); - final int NUM_REGIONS_TO_CREATE = 40; - final int NUM_LOG_LINES = 2000; - // turn off load balancing to prevent regions from moving around otherwise - // they will consume recovered.edits - master.balanceSwitch(false); - - List<RegionServerThread> rsts = cluster.getLiveRegionServerThreads(); - final ZooKeeperWatcher zkw = new ZooKeeperWatcher(conf, "table-creation", null); - Table ht = installTable(zkw, "table", "family", NUM_REGIONS_TO_CREATE); - try { - List<HRegionInfo> regions = null; - HRegionServer hrs = null; - for (int i = 0; i < NUM_RS; i++) { - boolean isCarryingMeta = false; - hrs = rsts.get(i).getRegionServer(); - regions = ProtobufUtil.getOnlineRegions(hrs.getRSRpcServices()); - for (HRegionInfo region : regions) { - if (region.isMetaRegion()) { - isCarryingMeta = true; - break; - } - } - if (isCarryingMeta) { - continue; - } - break; - } - - LOG.info("#regions = " + regions.size()); - Iterator<HRegionInfo> it = regions.iterator(); - while (it.hasNext()) { - HRegionInfo region = it.next(); - if (region.isMetaTable() - || region.getEncodedName().equals(HRegionInfo.FIRST_META_REGIONINFO.getEncodedName())) { - it.remove(); - } - } - if (regions.size() == 0) return; - HRegionInfo curRegionInfo = regions.get(0); - byte[] startRow = curRegionInfo.getStartKey(); - if (startRow == null || startRow.length == 0) { - startRow = new byte[] { 0, 0, 0, 0, 1 }; - } - byte[] row = Bytes.incrementBytes(startRow, 1); - // use last 5 bytes because HBaseTestingUtility.createMultiRegions use 5 bytes key - row = Arrays.copyOfRange(row, 3, 8); - long value = 0; - final TableName tableName = TableName.valueOf("table"); - byte[] family = Bytes.toBytes("family"); - byte[] qualifier = Bytes.toBytes("c1"); - long timeStamp = System.currentTimeMillis(); - HTableDescriptor htd = new HTableDescriptor(tableName); - htd.addFamily(new HColumnDescriptor(family)); - final WAL wal = hrs.getWAL(curRegionInfo); - for (int i = 0; i < NUM_LOG_LINES; i += 1) { - WALEdit e = new WALEdit(); - value++; - e.add(new KeyValue(row, family, qualifier, timeStamp, Bytes.toBytes(value))); - wal.append(htd, curRegionInfo, new HLogKey(curRegionInfo.getEncodedNameAsBytes(), - tableName, System.currentTimeMillis()), e, true); - } - wal.sync(); - wal.shutdown(); - - // wait for abort completes - this.abortRSAndWaitForRecovery(hrs, zkw, NUM_REGIONS_TO_CREATE); - - // verify we got the last value - LOG.info("Verification Starts..."); - Get g = new Get(row); - Result r = ht.get(g); - long theStoredVal = Bytes.toLong(r.getValue(family, qualifier)); - assertEquals(value, theStoredVal); - - // after flush & compaction - LOG.info("Verification after flush..."); - TEST_UTIL.getHBaseAdmin().flush(tableName); - TEST_UTIL.getHBaseAdmin().compact(tableName); - - // wait for compaction completes - TEST_UTIL.waitFor(30000, 200, new Waiter.Predicate<Exception>() { - @Override - public boolean evaluate() throws Exception { - return (TEST_UTIL.getHBaseAdmin().getCompactionState(tableName) == CompactionState.NONE); - } - }); - - r = ht.get(g); - theStoredVal = Bytes.toLong(r.getValue(family, qualifier)); - assertEquals(value, theStoredVal); - } finally { - if (ht != null) ht.close(); - if (zkw != null) zkw.close(); - } - } - - @Test(timeout = 300000) - public void testReadWriteSeqIdFiles() throws Exception { - LOG.info("testReadWriteSeqIdFiles"); - startCluster(2); - final ZooKeeperWatcher zkw = new ZooKeeperWatcher(conf, "table-creation", null); - Table ht = installTable(zkw, "table", "family", 10); - try { - FileSystem fs = master.getMasterFileSystem().getFileSystem(); - Path tableDir = FSUtils.getTableDir(FSUtils.getRootDir(conf), TableName.valueOf("table")); - List<Path> regionDirs = FSUtils.getRegionDirs(fs, tableDir); - long newSeqId = WALSplitter.writeRegionSequenceIdFile(fs, regionDirs.get(0), 1L, 1000L); - WALSplitter.writeRegionSequenceIdFile(fs, regionDirs.get(0) , 1L, 1000L); - assertEquals(newSeqId + 2000, - WALSplitter.writeRegionSequenceIdFile(fs, regionDirs.get(0), 3L, 1000L)); - - Path editsdir = WALSplitter.getRegionDirRecoveredEditsDir(regionDirs.get(0)); - FileStatus[] files = FSUtils.listStatus(fs, editsdir, new PathFilter() { - @Override - public boolean accept(Path p) { - return WALSplitter.isSequenceIdFile(p); - } - }); - // only one seqid file should exist - assertEquals(1, files.length); - - // verify all seqId files aren't treated as recovered.edits files - NavigableSet<Path> recoveredEdits = - WALSplitter.getSplitEditFilesSorted(fs, regionDirs.get(0)); - assertEquals(0, recoveredEdits.size()); - } finally { - if (ht != null) ht.close(); - if (zkw != null) zkw.close(); - } - } - - Table installTable(ZooKeeperWatcher zkw, String tname, String fname, int nrs) throws Exception { - return installTable(zkw, tname, fname, nrs, 0); - } - - Table installTable(ZooKeeperWatcher zkw, String tname, String fname, int nrs, - int existingRegions) throws Exception { - // Create a table with regions - TableName table = TableName.valueOf(tname); - byte [] family = Bytes.toBytes(fname); - LOG.info("Creating table with " + nrs + " regions"); - Table ht = TEST_UTIL.createMultiRegionTable(table, family, nrs); - int numRegions = -1; - try (RegionLocator r = TEST_UTIL.getConnection().getRegionLocator(table)) { - numRegions = r.getStartKeys().length; - } - assertEquals(nrs, numRegions); - LOG.info("Waiting for no more RIT\n"); - blockUntilNoRIT(zkw, master); - // disable-enable cycle to get rid of table's dead regions left behind - // by createMultiRegions - LOG.debug("Disabling table\n"); - TEST_UTIL.getHBaseAdmin().disableTable(table); - LOG.debug("Waiting for no more RIT\n"); - blockUntilNoRIT(zkw, master); - NavigableSet<String> regions = HBaseTestingUtility.getAllOnlineRegions(cluster); - LOG.debug("Verifying only catalog and namespace regions are assigned\n"); - if (regions.size() != 2) { - for (String oregion : regions) - LOG.debug("Region still online: " + oregion); - } - assertEquals(2 + existingRegions, regions.size()); - LOG.debug("Enabling table\n"); - TEST_UTIL.getHBaseAdmin().enableTable(table); - LOG.debug("Waiting for no more RIT\n"); - blockUntilNoRIT(zkw, master); - LOG.debug("Verifying there are " + numRegions + " assigned on cluster\n"); - regions = HBaseTestingUtility.getAllOnlineRegions(cluster); - assertEquals(numRegions + 2 + existingRegions, regions.size()); - return ht; - } - - void populateDataInTable(int nrows, String fname) throws Exception { - byte [] family = Bytes.toBytes(fname); - - List<RegionServerThread> rsts = cluster.getLiveRegionServerThreads(); - assertEquals(NUM_RS, rsts.size()); - - for (RegionServerThread rst : rsts) { - HRegionServer hrs = rst.getRegionServer(); - List<HRegionInfo> hris = ProtobufUtil.getOnlineRegions(hrs.getRSRpcServices()); - for (HRegionInfo hri : hris) { - if (hri.getTable().isSystemTable()) { - continue; - } - LOG.debug("adding data to rs = " + rst.getName() + - " region = "+ hri.getRegionNameAsString()); - Region region = hrs.getOnlineRegion(hri.getRegionName()); - assertTrue(region != null); - putData(region, hri.getStartKey(), nrows, Bytes.toBytes("q"), family); - } - } - - for (MasterThread mt : cluster.getLiveMasterThreads()) { - HRegionServer hrs = mt.getMaster(); - List<HRegionInfo> hris; - try { - hris = ProtobufUtil.getOnlineRegions(hrs.getRSRpcServices()); - } catch (ServerNotRunningYetException e) { - // It's ok: this master may be a backup. Ignored. - continue; - } - for (HRegionInfo hri : hris) { - if (hri.getTable().isSystemTable()) { - continue; - } - LOG.debug("adding data to rs = " + mt.getName() + - " region = "+ hri.getRegionNameAsString()); - Region region = hrs.getOnlineRegion(hri.getRegionName()); - assertTrue(region != null); - putData(region, hri.getStartKey(), nrows, Bytes.toBytes("q"), family); - } - } - } - - public void makeWAL(HRegionServer hrs, List<HRegionInfo> regions, String tname, String fname, - int num_edits, int edit_size) throws IOException { - makeWAL(hrs, regions, tname, fname, num_edits, edit_size, true); - } - - public void makeWAL(HRegionServer hrs, List<HRegionInfo> regions, String tname, String fname, - int num_edits, int edit_size, boolean cleanShutdown) throws IOException { - TableName fullTName = TableName.valueOf(tname); - // remove root and meta region - regions.remove(HRegionInfo.FIRST_META_REGIONINFO); - // using one sequenceId for edits across all regions is ok. - final AtomicLong sequenceId = new AtomicLong(10); - - - for(Iterator<HRegionInfo> iter = regions.iterator(); iter.hasNext(); ) { - HRegionInfo regionInfo = iter.next(); - if(regionInfo.getTable().isSystemTable()) { - iter.remove(); - } - } - HTableDescriptor htd = new HTableDescriptor(fullTName); - byte[] family = Bytes.toBytes(fname); - htd.addFamily(new HColumnDescriptor(family)); - byte[] value = new byte[edit_size]; - - List<HRegionInfo> hris = new ArrayList<HRegionInfo>(); - for (HRegionInfo region : regions) { - if (!region.getTable().getNameAsString().equalsIgnoreCase(tname)) { - continue; - } - hris.add(region); - } - LOG.info("Creating wal edits across " + hris.size() + " regions."); - for (int i = 0; i < edit_size; i++) { - value[i] = (byte) ('a' + (i % 26)); - } - int n = hris.size(); - int[] counts = new int[n]; - // sync every ~30k to line up with desired wal rolls - final int syncEvery = 30 * 1024 / edit_size; - if (n > 0) { - for (int i = 0; i < num_edits; i += 1) { - WALEdit e = new WALEdit(); - HRegionInfo curRegionInfo = hris.get(i % n); - final WAL log = hrs.getWAL(curRegionInfo); - byte[] startRow = curRegionInfo.getStartKey(); - if (startRow == null || startRow.length == 0) { - startRow = new byte[] { 0, 0, 0, 0, 1 }; - } - byte[] row = Bytes.incrementBytes(startRow, counts[i % n]); - row = Arrays.copyOfRange(row, 3, 8); // use last 5 bytes because - // HBaseTestingUtility.createMultiRegions use 5 bytes - // key - byte[] qualifier = Bytes.toBytes("c" + Integer.toString(i)); - e.add(new KeyValue(row, family, qualifier, System.currentTimeMillis(), value)); - log.append(htd, curRegionInfo, - new HLogKey(curRegionInfo.getEncodedNameAsBytes(), fullTName, - System.currentTimeMillis()), e, true); - if (0 == i % syncEvery) { - log.sync(); - } - counts[i % n] += 1; - } - } - // done as two passes because the regions might share logs. shutdown is idempotent, but sync - // will cause errors if done after. - for (HRegionInfo info : hris) { - final WAL log = hrs.getWAL(info); - log.sync(); - } - if (cleanShutdown) { - for (HRegionInfo info : hris) { - final WAL log = hrs.getWAL(info); - log.shutdown(); - } - } - for (int i = 0; i < n; i++) { - LOG.info("region " + hris.get(i).getRegionNameAsString() + " has " + counts[i] + " edits"); - } - return; - } - - private int countWAL(Path log, FileSystem fs, Configuration conf) - throws IOException { - int count = 0; - WAL.Reader in = WALFactory.createReader(fs, log, conf); - try { - WAL.Entry e; - while ((e = in.next()) != null) { - if (!WALEdit.isMetaEditFamily(e.getEdit().getCells().get(0))) { - count++; - } - } - } finally { - try { - in.close(); - } catch (IOException exception) { - LOG.warn("Problem closing wal: " + exception.getMessage()); - LOG.debug("exception details.", exception); - } - } - return count; - } - - private void blockUntilNoRIT(ZooKeeperWatcher zkw, HMaster master) throws Exception { - TEST_UTIL.waitUntilNoRegionsInTransition(60000); - } - - private void putData(Region region, byte[] startRow, int numRows, byte [] qf, - byte [] ...families) - throws IOException { - for(int i = 0; i < numRows; i++) { - Put put = new Put(Bytes.add(startRow, Bytes.toBytes(i))); - for(byte [] family : families) { - put.add(family, qf, null); - } - region.put(put); - } - } - - /** - * Load table with puts and deletes with expected values so that we can verify later - */ - private void prepareData(final Table t, final byte[] f, final byte[] column) throws IOException { - byte[] k = new byte[3]; - - // add puts - List<Put> puts = new ArrayList<>(); - for (byte b1 = 'a'; b1 <= 'z'; b1++) { - for (byte b2 = 'a'; b2 <= 'z'; b2++) { - for (byte b3 = 'a'; b3 <= 'z'; b3++) { - k[0] = b1; - k[1] = b2; - k[2] = b3; - Put put = new Put(k); - put.add(f, column, k); - puts.add(put); - } - } - } - t.put(puts); - // add deletes - for (byte b3 = 'a'; b3 <= 'z'; b3++) { - k[0] = 'a'; - k[1] = 'a'; - k[2] = b3; - Delete del = new Delete(k); - t.delete(del); - } - } - - private void waitForCounter(AtomicLong ctr, long oldval, long newval, - long timems) { - long curt = System.currentTimeMillis(); - long endt = curt + timems; - while (curt < endt) { - if (ctr.get() == oldval) { - Thread.yield(); - curt = System.currentTimeMillis(); - } else { - assertEquals(newval, ctr.get()); - return; - } - } - assertTrue(false); - } - - private void abortMaster(MiniHBaseCluster cluster) throws InterruptedException { - for (MasterThread mt : cluster.getLiveMasterThreads()) { - if (mt.getMaster().isActiveMaster()) { - mt.getMaster().abort("Aborting for tests", new Exception("Trace info")); - mt.join(); - break; - } - } - LOG.debug("Master is aborted"); - } - - /** - * Find a RS that has regions of a table. - * @param hasMetaRegion when true, the returned RS has hbase:meta region as well - * @param tableName - * @return - * @throws Exception - */ - private HRegionServer findRSToKill(boolean hasMetaRegion, String tableName) throws Exception { - List<RegionServerThread> rsts = cluster.getLiveRegionServerThreads(); - List<HRegionInfo> regions = null; - HRegionServer hrs = null; - - for (RegionServerThread rst: rsts) { - hrs = rst.getRegionServer(); - while (rst.isAlive() && !hrs.isOnline()) { - Thread.sleep(100); - } - if (!rst.isAlive()) { - continue; - } - boolean isCarryingMeta = false; - boolean foundTableRegion = false; - regions = ProtobufUtil.getOnlineRegions(hrs.getRSRpcServices()); - for (HRegionInfo region : regions) { - if (region.isMetaRegion()) { - isCarryingMeta = true; - } - if (tableName == null || region.getTable().getNameAsString().equals(tableName)) { - foundTableRegion = true; - } - if (foundTableRegion && (isCarryingMeta || !hasMetaRegion)) { - break; - } - } - if (isCarryingMeta && hasMetaRegion) { - // clients ask for a RS with META - if (!foundTableRegion) { - final HRegionServer destRS = hrs; - // the RS doesn't have regions of the specified table so we need move one to this RS - List<HRegionInfo> tableRegions = - TEST_UTIL.getHBaseAdmin().getTableRegions(TableName.valueOf(tableName)); - final HRegionInfo hri = tableRegions.get(0); - TEST_UTIL.getHBaseAdmin().move(hri.getEncodedNameAsBytes(), - Bytes.toBytes(destRS.getServerName().getServerName())); - // wait for region move completes - final RegionStates regionStates = - TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager().getRegionStates(); - TEST_UTIL.waitFor(45000, 200, new Waiter.Predicate<Exception>() { - @Override - public boolean evaluate() throws Exception { - ServerName sn = regionStates.getRegionServerOfRegion(hri); - return (sn != null && sn.equals(destRS.getServerName())); - } - }); - } - return hrs; - } else if (hasMetaRegion || isCarryingMeta) { - continue; - } - if (foundTableRegion) break; - } - - return hrs; - } -}