[
https://issues.apache.org/jira/browse/HBASE-22665?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16880112#comment-16880112
]
Yechao Chen edited comment on HBASE-22665 at 7/8/19 8:33 AM:
-------------------------------------------------------------
I checked the code and the jstack output.
The WAL log roller is stuck, so AbstractFSWAL.shutdown is waiting for
AbstractFSWAL.rollWriter
to release the lock rollWriterLock.
The sequence seems to be:
1、AbstractFSWAL.rollWriter called rollWriterLock.lock();
2、AsyncFSWAL.doReplaceWriter called waitForSafePoint();
3、waitForSafePoint() can't finish
4、AbstractFSWAL.shutdown called rollWriterLock.lock();(waiting)
5、The RegionServer (rs) process can't be aborted
The log roller thread is parked {color:#ff0000}at
org.apache.hadoop.hbase.regionserver.wal.AsyncFSWAL.waitForSafePoint(AsyncFSWAL.java:628){color}
"regionserver/hbase-slave-216-99:16020.logRoller" #297 daemon prio=5 os_prio=0
tid=0x00007f202a4952c0 nid=0x34c2 waiting on condition [0x00007f0fdd19f000]
java.lang.Thread.State: WAITING (parking)
at sun.misc.Unsafe.park(Native Method)
- parking to wait for <0x00007f18d60b93a8> (a
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
at
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitUninterruptibly(AbstractQueuedSynchronizer.java:1976)
{color:#ff0000}at
org.apache.hadoop.hbase.regionserver.wal.AsyncFSWAL.waitForSafePoint(AsyncFSWAL.java:628){color}
at
org.apache.hadoop.hbase.regionserver.wal.AsyncFSWAL.doReplaceWriter(AsyncFSWAL.java:656)
at
org.apache.hadoop.hbase.regionserver.wal.AsyncFSWAL.doReplaceWriter(AsyncFSWAL.java:124)
at
org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.replaceWriter(AbstractFSWAL.java:699)
at
org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.rollWriter(AbstractFSWAL.java:759)
at org.apache.hadoop.hbase.regionserver.LogRoller.run(LogRoller.java:184)
at java.lang.Thread.run(Thread.java:745)
"regionserver/hbase-slave-216-99:16020" #25 daemon prio=5 os_prio=0
tid=0x00007f204282c600 nid=0x34aa waiting on condition [0x00007f0fe044d000]
java.lang.Thread.State: WAITING (parking)
at sun.misc.Unsafe.park(Native Method)
- parking to wait for <0x00007f18a49b2bb8> (a
java.util.concurrent.locks.ReentrantLock$FairSync)
at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
at
java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:836)
at
java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireQueued(AbstractQueuedSynchronizer.java:870)
at
java.util.concurrent.locks.AbstractQueuedSynchronizer.acquire(AbstractQueuedSynchronizer.java:1199)
at
java.util.concurrent.locks.ReentrantLock$FairSync.lock(ReentrantLock.java:224)
{color:#FF0000}at
java.util.concurrent.locks.ReentrantLock.lock(ReentrantLock.java:285){color}
{color:#FF0000} at
org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.shutdown(AbstractFSWAL.java:815){color}
at
org.apache.hadoop.hbase.wal.AbstractFSWALProvider.shutdown(AbstractFSWALProvider.java:168)
at
org.apache.hadoop.hbase.wal.RegionGroupingProvider.shutdown(RegionGroupingProvider.java:221)
at org.apache.hadoop.hbase.wal.WALFactory.shutdown(WALFactory.java:239)
at
org.apache.hadoop.hbase.regionserver.HRegionServer.shutdownWAL(HRegionServer.java:1445)
at
org.apache.hadoop.hbase.regionserver.HRegionServer.run(HRegionServer.java:1117)
at java.lang.Thread.run(Thread.java:745)
!image-2019-07-08-16-07-37-664.png!
!image-2019-07-08-16-08-26-777.png!
!image-2019-07-08-16-14-43-455.png!
was (Author: chenyechao):
I checked the code and the jstack output.
The WAL log roller is stuck, so AbstractFSWAL.shutdown is waiting for
AbstractFSWAL.rollWriter
to release the lock rollWriterLock.
The sequence seems to be:
1、AbstractFSWAL.rollWriter called rollWriterLock.lock();
2、AsyncFSWAL.doReplaceWriter called waitForSafePoint();
3、waitForSafePoint() can't finish
4、AbstractFSWAL.shutdown called rollWriterLock.lock();(waiting)
5、The rs process can't be aborted
with {color:#FF0000}at
org.apache.hadoop.hbase.regionserver.wal.AsyncFSWAL.waitForSafePoint(AsyncFSWAL.java:628){color}
"regionserver/hbase-slave-216-99:16020.logRoller" #297 daemon prio=5 os_prio=0
tid=0x00007f202a4952c0 nid=0x34c2 waiting on condition [0x00007f0fdd19f000]
java.lang.Thread.State: WAITING (parking)
at sun.misc.Unsafe.park(Native Method)
- parking to wait for <0x00007f18d60b93a8> (a
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
at
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitUninterruptibly(AbstractQueuedSynchronizer.java:1976)
{color:#FF0000}at
org.apache.hadoop.hbase.regionserver.wal.AsyncFSWAL.waitForSafePoint(AsyncFSWAL.java:628){color}
at
org.apache.hadoop.hbase.regionserver.wal.AsyncFSWAL.doReplaceWriter(AsyncFSWAL.java:656)
at
org.apache.hadoop.hbase.regionserver.wal.AsyncFSWAL.doReplaceWriter(AsyncFSWAL.java:124)
at
org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.replaceWriter(AbstractFSWAL.java:699)
at
org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.rollWriter(AbstractFSWAL.java:759)
at org.apache.hadoop.hbase.regionserver.LogRoller.run(LogRoller.java:184)
at java.lang.Thread.run(Thread.java:745)
"regionserver/hbase-slave-216-99:16020" #25 daemon prio=5 os_prio=0
tid=0x00007f204282c600 nid=0x34aa waiting on condition [0x00007f0fe044d000]
java.lang.Thread.State: WAITING (parking)
at sun.misc.Unsafe.park(Native Method)
- parking to wait for <0x00007f18a49b2bb8> (a
java.util.concurrent.locks.ReentrantLock$FairSync)
at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
at
java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:836)
at
java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireQueued(AbstractQueuedSynchronizer.java:870)
at
java.util.concurrent.locks.AbstractQueuedSynchronizer.acquire(AbstractQueuedSynchronizer.java:1199)
at
java.util.concurrent.locks.ReentrantLock$FairSync.lock(ReentrantLock.java:224)
at java.util.concurrent.locks.ReentrantLock.lock(ReentrantLock.java:285)
at
org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.shutdown(AbstractFSWAL.java:815)
at
org.apache.hadoop.hbase.wal.AbstractFSWALProvider.shutdown(AbstractFSWALProvider.java:168)
at
org.apache.hadoop.hbase.wal.RegionGroupingProvider.shutdown(RegionGroupingProvider.java:221)
at org.apache.hadoop.hbase.wal.WALFactory.shutdown(WALFactory.java:239)
at
org.apache.hadoop.hbase.regionserver.HRegionServer.shutdownWAL(HRegionServer.java:1445)
at
org.apache.hadoop.hbase.regionserver.HRegionServer.run(HRegionServer.java:1117)
at java.lang.Thread.run(Thread.java:745)
!image-2019-07-08-16-07-37-664.png!
!image-2019-07-08-16-08-26-777.png!
!image-2019-07-08-16-14-43-455.png!
> RegionServer abort failed when AbstractFSWAL.shutdown hang
> ----------------------------------------------------------
>
> Key: HBASE-22665
> URL: https://issues.apache.org/jira/browse/HBASE-22665
> Project: HBase
> Issue Type: Bug
> Environment: HBase 2.1.2
> Hadoop 3.1.x
> centos 7.4
> Reporter: Yechao Chen
> Priority: Major
> Attachments: image-2019-07-08-16-07-37-664.png,
> image-2019-07-08-16-08-26-777.png, image-2019-07-08-16-14-43-455.png,
> jstack_20190625, jstack_20190704_1, jstack_20190704_2
>
>
> We use HBase 2.1.2. When a RegionServer (rs) is under heavy QPS, it aborts with an error like
> "Caused by: org.apache.hadoop.hbase.exceptions.TimeoutIOException: Failed to
> get sync result after 300000 ms for txid=36380334, WAL system stuck?"
>
> The RegionServer abort then fails to complete because AbstractFSWAL.shutdown hangs.
>
> The jstack output always shows the RegionServer thread hanging in "AbstractFSWAL.shutdown":
> "regionserver/hbase-slave-216-99:16020" #25 daemon prio=5 os_prio=0
> tid=0x00007f204282c600 nid=0x34aa waiting on condition [0x00007f0fe044d000]
> java.lang.Thread.State: WAITING (parking)
> at sun.misc.Unsafe.park(Native Method)
> - parking to wait for <0x00007f18a49b2bb8> (a
> java.util.concurrent.locks.ReentrantLock$FairSync)
> at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
> at
> java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:836)
> at
> java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireQueued(AbstractQueuedSynchronizer.java:870)
> at
> java.util.concurrent.locks.AbstractQueuedSynchronizer.acquire(AbstractQueuedSynchronizer.java:1199)
> at
> java.util.concurrent.locks.ReentrantLock$FairSync.lock(ReentrantLock.java:224)
> {color:#FF0000}at
> java.util.concurrent.locks.ReentrantLock.lock(ReentrantLock.java:285){color}
> {color:#FF0000} at
> org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.shutdown(AbstractFSWAL.java:815){color}
> at
> org.apache.hadoop.hbase.wal.AbstractFSWALProvider.shutdown(AbstractFSWALProvider.java:168)
> at
> org.apache.hadoop.hbase.wal.RegionGroupingProvider.shutdown(RegionGroupingProvider.java:221)
> at org.apache.hadoop.hbase.wal.WALFactory.shutdown(WALFactory.java:239)
> at
> org.apache.hadoop.hbase.regionserver.HRegionServer.shutdownWAL(HRegionServer.java:1445)
> {color:#FF0000}at
> org.apache.hadoop.hbase.regionserver.HRegionServer.run(HRegionServer.java:1117){color}
> {color:#FF0000} at java.lang.Thread.run(Thread.java:745){color}
>
>
>
>
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)