[ https://issues.apache.org/jira/browse/PHOENIX-2667?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15141517#comment-15141517 ]
James Taylor commented on PHOENIX-2667: --------------------------------------- bq. We can break up the reads for the index work, and do the "read from region" bits from the handler thread, and do the rest from the thread pool. wdyt? +1 as long as we know that a split won't occur during the "read from region". Handling that case would be tricky. > Race condition between IndexBuilder and Split for region lock > ------------------------------------------------------------- > > Key: PHOENIX-2667 > URL: https://issues.apache.org/jira/browse/PHOENIX-2667 > Project: Phoenix > Issue Type: Bug > Reporter: Enis Soztutar > > In a production cluster, we have seen a condition where the split did not > finish for 30+ minutes. Also due to this, no request was being serviced in > this time frame affectively making the region offline. > The jstack provides 3 types of threads waiting on the regions read or write > locks. > First, the handlers are all blocked on trying to acquire the read lock on the > region in multi(), most of the handlers are like this: > {code} > Thread 2328: (state = BLOCKED) > - sun.misc.Unsafe.park(boolean, long) @bci=0 (Compiled frame; information > may be imprecise) > - java.util.concurrent.locks.LockSupport.parkNanos(java.lang.Object, long) > @bci=20, line=226 (Compiled frame) > - > java.util.concurrent.locks.AbstractQueuedSynchronizer.doAcquireSharedNanos(int, > long) @bci=122, line=1033 (Compiled frame) > - > java.util.concurrent.locks.AbstractQueuedSynchronizer.tryAcquireSharedNanos(int, > long) @bci=25, line=1326 (Compiled frame) > - java.util.concurrent.locks.ReentrantReadWriteLock$ReadLock.tryLock(long, > java.util.concurrent.TimeUnit) @bci=10, line=873 (Compiled frame) > - > org.apache.hadoop.hbase.regionserver.HRegion.lock(java.util.concurrent.locks.Lock, > int) @bci=27, line=7754 (Interpreted frame) > - > org.apache.hadoop.hbase.regionserver.HRegion.lock(java.util.concurrent.locks.Lock) > @bci=3, line=7741 (Interpreted frame) > - > org.apache.hadoop.hbase.regionserver.HRegion.startRegionOperation(org.apache.hadoop.hbase.regionserver.Region$Operation) > @bci=211, line=7650 (Interpreted frame) > - > org.apache.hadoop.hbase.regionserver.HRegion.batchMutate(org.apache.hadoop.hbase.regionserver.HRegion$BatchOperationInProgress) > @bci=21, line=2803 (Interpreted frame) > - > org.apache.hadoop.hbase.regionserver.HRegion.batchMutate(org.apache.hadoop.hbase.client.Mutation[], > long, long) @bci=12, line=2760 (Compiled frame) > - > org.apache.hadoop.hbase.regionserver.RSRpcServices.doBatchOp(org.apache.hadoop.hbase.protobuf.generated.ClientProtos$RegionActionResult$Builder, > org.apache.hadoop.hbase.regionserver.Region, org.apache. > - > org.apache.hadoop.hbase.regionserver.RSRpcServices.doNonAtomicRegionMutation(org.apache.hadoop.hbase.regionserver.Region, > org.apache.hadoop.hbase.quotas.OperationQuota, > org.apache.hadoop.hbase.protobuf > - > org.apache.hadoop.hbase.regionserver.RSRpcServices.multi(com.google.protobuf.RpcController, > org.apache.hadoop.hbase.protobuf.generated.ClientProtos$MultiRequest) > @bci=407, line=2032 (Compiled frame) > - > org.apache.hadoop.hbase.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(com.google.protobuf.Descriptors$MethodDescriptor, > com.google.protobuf.RpcController, com.google.protobuf.Messa > - > org.apache.hadoop.hbase.ipc.RpcServer.call(com.google.protobuf.BlockingService, > com.google.protobuf.Descriptors$MethodDescriptor, > com.google.protobuf.Message, org.apache.hadoop.hbase.CellScanner, long, > - org.apache.hadoop.hbase.ipc.CallRunner.run() @bci=345, line=101 (Compiled > frame) > - > org.apache.hadoop.hbase.ipc.RpcExecutor.consumerLoop(java.util.concurrent.BlockingQueue) > @bci=54, line=130 (Compiled frame) > - org.apache.hadoop.hbase.ipc.RpcExecutor$1.run() @bci=20, line=107 > (Interpreted frame) > - java.lang.Thread.run() @bci=11, line=745 (Interpreted frame) > {code} > second, the IndexBuilder threads from Phoenix index are also blocked waiting > on the region read locks: > {code} > Thread 17566: (state = BLOCKED) > - sun.misc.Unsafe.park(boolean, long) @bci=0 (Compiled frame; information > may be imprecise) > - java.util.concurrent.locks.LockSupport.parkNanos(java.lang.Object, long) > @bci=20, line=226 (Compiled frame) > - > java.util.concurrent.locks.AbstractQueuedSynchronizer.doAcquireSharedNanos(int, > long) @bci=122, line=1033 (Compiled frame) > - > java.util.concurrent.locks.AbstractQueuedSynchronizer.tryAcquireSharedNanos(int, > long) @bci=25, line=1326 (Compiled frame) > - java.util.concurrent.locks.ReentrantReadWriteLock$ReadLock.tryLock(long, > java.util.concurrent.TimeUnit) @bci=10, line=873 (Compiled frame) > - > org.apache.hadoop.hbase.regionserver.HRegion.lock(java.util.concurrent.locks.Lock, > int) @bci=27, line=7754 (Interpreted frame) > - > org.apache.hadoop.hbase.regionserver.HRegion.lock(java.util.concurrent.locks.Lock) > @bci=3, line=7741 (Interpreted frame) > - > org.apache.hadoop.hbase.regionserver.HRegion.startRegionOperation(org.apache.hadoop.hbase.regionserver.Region$Operation) > @bci=211, line=7650 (Interpreted frame) > - > org.apache.hadoop.hbase.regionserver.HRegion.getScanner(org.apache.hadoop.hbase.client.Scan, > java.util.List) @bci=4, line=2484 (Interpreted frame) > - > org.apache.hadoop.hbase.regionserver.HRegion.getScanner(org.apache.hadoop.hbase.client.Scan) > @bci=3, line=2479 (Compiled frame) > - > org.apache.phoenix.hbase.index.covered.data.LocalTable.getCurrentRowState(org.apache.hadoop.hbase.client.Mutation, > java.util.Collection) @bci=43, line=63 (Compiled frame) > - > org.apache.phoenix.hbase.index.covered.LocalTableState.ensureLocalStateInitialized(java.util.Collection) > @bci=29, line=158 (Compiled frame) > - > org.apache.phoenix.hbase.index.covered.LocalTableState.getIndexedColumnsTableState(java.util.Collection) > @bci=2, line=126 (Compiled frame) > - > org.apache.phoenix.index.PhoenixIndexCodec.getIndexUpdates(org.apache.phoenix.hbase.index.covered.TableState, > boolean) @bci=213, line=162 (Compiled frame) > - > org.apache.phoenix.index.PhoenixIndexCodec.getIndexDeletes(org.apache.phoenix.hbase.index.covered.TableState) > @bci=3, line=120 (Compiled frame) > - > org.apache.phoenix.hbase.index.covered.CoveredColumnsIndexBuilder.addDeleteUpdatesToMap(org.apache.phoenix.hbase.index.covered.update.IndexUpdateManager, > org.apache.phoenix.hbase.index.covered.LocalTab > - > org.apache.phoenix.hbase.index.covered.CoveredColumnsIndexBuilder.addCleanupForCurrentBatch(org.apache.phoenix.hbase.index.covered.update.IndexUpdateManager, > long, org.apache.phoenix.hbase.index.covere > - > org.apache.phoenix.hbase.index.covered.CoveredColumnsIndexBuilder.addMutationsForBatch(org.apache.phoenix.hbase.index.covered.update.IndexUpdateManager, > org.apache.phoenix.hbase.index.covered.Batch, or > - > org.apache.phoenix.hbase.index.covered.CoveredColumnsIndexBuilder.batchMutationAndAddUpdates(org.apache.phoenix.hbase.index.covered.update.IndexUpdateManager, > org.apache.hadoop.hbase.client.Mutation) @ > - > org.apache.phoenix.hbase.index.covered.CoveredColumnsIndexBuilder.getIndexUpdate(org.apache.hadoop.hbase.client.Mutation) > @bci=11, line=99 (Compiled frame) > - org.apache.phoenix.hbase.index.builder.IndexBuildManager$1.call() @bci=11, > line=133 (Compiled frame) > - org.apache.phoenix.hbase.index.builder.IndexBuildManager$1.call() @bci=1, > line=129 (Compiled frame) > - java.util.concurrent.FutureTask.run() @bci=42, line=262 (Compiled frame) > - > java.util.concurrent.ThreadPoolExecutor.runWorker(java.util.concurrent.ThreadPoolExecutor$Worker) > @bci=95, line=1145 (Compiled frame) > - java.util.concurrent.ThreadPoolExecutor$Worker.run() @bci=5, line=615 > (Interpreted frame) > - java.lang.Thread.run() @bci=11, line=745 (Interpreted frame) > {code} > Finally, the splitting of the region is being executed, which is trying to > close the region. This operation acquires the regions write lock, but it is > also blocked: > {code} > Thread 18417: (state = BLOCKED) > - sun.misc.Unsafe.park(boolean, long) @bci=0 (Compiled frame; information > may be imprecise) > - java.util.concurrent.locks.LockSupport.park(java.lang.Object) @bci=14, > line=186 (Compiled frame) > - > java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt() > @bci=1, line=834 (Compiled frame) > - > java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireQueued(java.util.concurrent.locks.AbstractQueuedSynchronizer$Node, > int) @bci=67, line=867 (Compiled frame) > - java.util.concurrent.locks.AbstractQueuedSynchronizer.acquire(int) > @bci=17, line=1197 (Compiled frame) > - java.util.concurrent.locks.ReentrantReadWriteLock$WriteLock.lock() @bci=5, > line=945 (Interpreted frame) > - org.apache.hadoop.hbase.regionserver.HRegion.doClose(boolean, > org.apache.hadoop.hbase.monitoring.MonitoredTask) @bci=278, line=1401 > (Interpreted frame) > - org.apache.hadoop.hbase.regionserver.HRegion.close(boolean) @bci=62, > line=1349 (Interpreted frame) > - > org.apache.hadoop.hbase.regionserver.SplitTransactionImpl.stepsBeforePONR(org.apache.hadoop.hbase.Server, > org.apache.hadoop.hbase.regionserver.RegionServerServices, boolean) > @bci=238, line=394 (Interpr > - > org.apache.hadoop.hbase.regionserver.SplitTransactionImpl.createDaughters(org.apache.hadoop.hbase.Server, > org.apache.hadoop.hbase.regionserver.RegionServerServices, > org.apache.hadoop.hbase.security.Use > - > org.apache.hadoop.hbase.regionserver.SplitTransactionImpl.execute(org.apache.hadoop.hbase.Server, > org.apache.hadoop.hbase.regionserver.RegionServerServices, > org.apache.hadoop.hbase.security.User) @bci= > - > org.apache.hadoop.hbase.regionserver.SplitRequest.doSplitting(org.apache.hadoop.hbase.security.User) > @bci=357, line=82 (Interpreted frame) > - org.apache.hadoop.hbase.regionserver.SplitRequest.run() @bci=74, line=154 > (Interpreted frame) > - > java.util.concurrent.ThreadPoolExecutor.runWorker(java.util.concurrent.ThreadPoolExecutor$Worker) > @bci=95, line=1145 (Compiled frame) > - java.util.concurrent.ThreadPoolExecutor$Worker.run() @bci=5, line=615 > (Interpreted frame) > - java.lang.Thread.run() @bci=11, line=745 (Interpreted frame) > {code} > This looks like a race between the split thread and the global index updates > racing for the region lock. -- This message was sent by Atlassian JIRA (v6.3.4#6332)