Author: stack Date: Sat May 1 00:16:29 2010 New Revision: 939885 URL: http://svn.apache.org/viewvc?rev=939885&view=rev Log: HBASE-2421 Put hangs for 10 retries on failed region servers
Modified: hadoop/hbase/branches/0.20/CHANGES.txt hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HConnection.java hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HConnectionManager.java hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HTable.java hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java hadoop/hbase/branches/0.20/src/test/org/apache/hadoop/hbase/TestMultiParallelPut.java Modified: hadoop/hbase/branches/0.20/CHANGES.txt URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.20/CHANGES.txt?rev=939885&r1=939884&r2=939885&view=diff ============================================================================== --- hadoop/hbase/branches/0.20/CHANGES.txt (original) +++ hadoop/hbase/branches/0.20/CHANGES.txt Sat May 1 00:16:29 2010 @@ -108,6 +108,8 @@ Release 0.20.4 - Unreleased HBASE-2497 ProcessServerShutdown throws NullPointerException for offline regions (Miklos Kurucz via Stack) HBASE-2499 Race condition when disabling a table leaves regions in transition + HBASE-2421 Put hangs for 10 retries on failed region servers + (Todd Lipcon via Stack) IMPROVEMENTS HBASE-2180 Bad read performance from synchronizing hfile.fddatainputstream Modified: hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HConnection.java URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HConnection.java?rev=939885&r1=939884&r2=939885&view=diff ============================================================================== --- hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HConnection.java (original) +++ hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HConnection.java Sat May 1 00:16:29 2010 @@ -192,7 +192,7 @@ public interface HConnection { * @throws IOException * @throws RuntimeException */ - public <T> T getRegionServerForWithoutRetries(ServerCallable<T> callable) + public <T> T getRegionServerWithoutRetries(ServerCallable<T> callable) throws IOException, RuntimeException; @@ -219,5 +219,5 @@ public interface HConnection { public void processBatchOfPuts(List<Put> list, final byte[] tableName, ExecutorService pool) throws IOException; - + } Modified: hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HConnectionManager.java URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HConnectionManager.java?rev=939885&r1=939884&r2=939885&view=diff ============================================================================== --- hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HConnectionManager.java (original) +++ hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HConnectionManager.java Sat May 1 00:16:29 2010 @@ -681,7 +681,7 @@ public class HConnectionManager implemen // This block guards against two threads trying to load the meta // region at the same time. The first will load the meta region and // the second will use the value that the first one found. - synchronized(regionLockObject) { + synchronized (regionLockObject) { // Check the cache again for a hit in case some other thread made the // same query while we were waiting on the lock. If not supposed to // be using the cache, delete any existing cached location so it won't @@ -1082,15 +1082,19 @@ public class HConnectionManager implemen return null; } - public <T> T getRegionServerForWithoutRetries(ServerCallable<T> callable) + public <T> T getRegionServerWithoutRetries(ServerCallable<T> callable) throws IOException, RuntimeException { try { callable.instantiateServer(false); return callable.call(); } catch (Throwable t) { - t = translateException(t); + Throwable t2 = translateException(t); + if (t2 instanceof IOException) { + throw (IOException)t2; + } else { + throw new RuntimeException(t2); + } } - return null; } private HRegionLocation @@ -1305,8 +1309,28 @@ public class HConnectionManager implemen } } + /** + * Process a batch of Puts on the given executor service. + * + * @param list the puts to make - successful puts will be removed. + * @param pool thread pool to execute requests on + * + * In the case of an exception, we take different actions depending on the + * situation: + * - If the exception is a DoNotRetryException, we rethrow it and leave the + * 'list' parameter in an indeterminate state. + * - If the 'list' parameter is a singleton, we directly throw the specific + * exception for that put. + * - Otherwise, we throw a generic exception indicating that an error occurred. + * The 'list' parameter is mutated to contain those puts that did not succeed. + */ public void processBatchOfPuts(List<Put> list, - final byte[] tableName, ExecutorService pool) throws IOException { + final byte[] tableName, + ExecutorService pool) throws IOException { + boolean singletonList = list.size() == 1; + Throwable singleRowCause = null; + List<Put> permFails = new ArrayList<Put>(); + for ( int tries = 0 ; tries < numRetries && !list.isEmpty(); ++tries) { Collections.sort(list); Map<HServerAddress, MultiPut> regionPuts = @@ -1372,10 +1396,20 @@ public class HConnectionManager implemen LOG.debug("Failed all from " + request.address, e); failed.addAll(request.allPuts()); } catch (ExecutionException e) { - System.out.println(e); // all go into the failed list. LOG.debug("Failed all from " + request.address, e); failed.addAll(request.allPuts()); + + // Just give up, leaving the batch put list in an untouched/semi-committed state + if (e.getCause() instanceof DoNotRetryIOException) { + throw (DoNotRetryIOException) e.getCause(); + } + + if (singletonList) { + // be richer for reporting in a 1 row case. + singleRowCause = e.getCause(); + } + } } list.clear(); @@ -1391,15 +1425,20 @@ public class HConnectionManager implemen " ms!"); try { Thread.sleep(sleepTime); - } catch (InterruptedException e) { - + } catch (InterruptedException ignored) { } } } + if (!list.isEmpty()) { + if (singletonList && singleRowCause != null) { + throw new IOException(singleRowCause); + } + + // ran out of retries and didnt succeed everything! throw new RetriesExhaustedException("Still had " + list.size() + " puts left after retrying " + - numRetries + " times. Should have detail on which Regions failed the most"); + numRetries + " times."); } } @@ -1410,7 +1449,7 @@ public class HConnectionManager implemen final HConnection connection = this; return new Callable<MultiPutResponse>() { public MultiPutResponse call() throws IOException { - return getRegionServerWithRetries( + return getRegionServerWithoutRetries( new ServerCallable<MultiPutResponse>(connection, tableName, null) { public MultiPutResponse call() throws IOException { MultiPutResponse resp = server.multiPut(puts); Modified: hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HTable.java URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HTable.java?rev=939885&r1=939884&r2=939885&view=diff ============================================================================== --- hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HTable.java (original) +++ hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HTable.java Sat May 1 00:16:29 2010 @@ -702,10 +702,10 @@ public class HTable { connection.processBatchOfPuts(writeBuffer, tableName, pool); } finally { - // the write buffer was adjsuted by processBatchOfPuts + // the write buffer was adjusted by processBatchOfPuts currentWriteBufferSize = 0; - for (int i = 0; i < writeBuffer.size(); i++) { - currentWriteBufferSize += writeBuffer.get(i).heapSize(); + for (Put aPut : writeBuffer) { + currentWriteBufferSize += aPut.heapSize(); } } } Modified: hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java?rev=939885&r1=939884&r2=939885&view=diff ============================================================================== --- hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java (original) +++ hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java Sat May 1 00:16:29 2010 @@ -234,7 +234,7 @@ public class HRegionServer implements HC // Run HDFS shutdown thread on exit if this is set. We clear this out when // doing a restart() to prevent closing of HDFS. - private final AtomicBoolean shutdownHDFS = new AtomicBoolean(true); + public final AtomicBoolean shutdownHDFS = new AtomicBoolean(true); private final String machineName; Modified: hadoop/hbase/branches/0.20/src/test/org/apache/hadoop/hbase/TestMultiParallelPut.java URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.20/src/test/org/apache/hadoop/hbase/TestMultiParallelPut.java?rev=939885&r1=939884&r2=939885&view=diff ============================================================================== --- hadoop/hbase/branches/0.20/src/test/org/apache/hadoop/hbase/TestMultiParallelPut.java (original) +++ hadoop/hbase/branches/0.20/src/test/org/apache/hadoop/hbase/TestMultiParallelPut.java Sat May 1 00:16:29 2010 @@ -20,15 +20,15 @@ package org.apache.hadoop.hbase; +import org.apache.hadoop.hbase.client.Get; +import org.apache.hadoop.hbase.client.HBaseAdmin; import org.apache.hadoop.hbase.client.HTable; import org.apache.hadoop.hbase.client.Put; -import org.apache.hadoop.hbase.client.Get; import org.apache.hadoop.hbase.client.Result; -import org.apache.hadoop.hbase.client.HBaseAdmin; import org.apache.hadoop.hbase.util.Bytes; -import java.util.List; import java.util.ArrayList; +import java.util.List; public class TestMultiParallelPut extends MultiRegionTable { private static final byte[] VALUE = Bytes.toBytes("value"); @@ -58,7 +58,14 @@ public class TestMultiParallelPut extend List<byte[]> keys = new ArrayList<byte[]>(); - public void testMultiPut() throws Exception { + public void testParallelPut() throws Exception { + doATest(false); + } + public void testParallelPutWithRSAbort() throws Exception { + doATest(true); + } + + public void doATest(boolean doAbort) throws Exception { HTable table = new HTable(TEST_TABLE); table.setAutoFlush(false); @@ -73,6 +80,19 @@ public class TestMultiParallelPut extend table.flushCommits(); + if (doAbort) { + cluster.abortRegionServer(0); + + // try putting more keys after the abort. + for ( byte [] k : keys ) { + Put put = new Put(k); + put.add(BYTES_FAMILY, QUALIFIER, VALUE); + + table.put(put); + } + table.flushCommits(); + } + for (byte [] k : keys ) { Get get = new Get(k); get.addColumn(BYTES_FAMILY, QUALIFIER); @@ -88,10 +108,15 @@ public class TestMultiParallelPut extend HBaseAdmin admin = new HBaseAdmin(conf); ClusterStatus cs = admin.getClusterStatus(); - assertEquals(2, cs.getServers()); + int expectedServerCount = 2; + if (doAbort) + expectedServerCount = 1; + + assertEquals(expectedServerCount, cs.getServers()); for ( HServerInfo info : cs.getServerInfo()) { System.out.println(info); assertTrue( info.getLoad().getNumberOfRegions() > 10); } } + }