hi 张老师,
I dumped the blocked threads with arthas; the stuck snapshot-cleaner thread looks like this:

"dir-scan-pool4-thread-10" Id=1448 RUNNABLE
    at org.apache.hadoop.hbase.TableName.createTableNameIfNecessary(TableName.java:377)
    at org.apache.hadoop.hbase.TableName.valueOf(TableName.java:505)
    at org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil.toTableName(ProtobufUtil.java:2175)
    at org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil.toRegionInfo(ProtobufUtil.java:3114)
    at org.apache.hadoop.hbase.snapshot.SnapshotReferenceUtil.visitRegionStoreFiles(SnapshotReferenceUtil.java:134)
    at org.apache.hadoop.hbase.snapshot.SnapshotReferenceUtil.visitTableStoreFiles(SnapshotReferenceUtil.java:121)
    at org.apache.hadoop.hbase.snapshot.SnapshotReferenceUtil.getHFileNames(SnapshotReferenceUtil.java:348)
    at org.apache.hadoop.hbase.snapshot.SnapshotReferenceUtil.getHFileNames(SnapshotReferenceUtil.java:331)
    at org.apache.hadoop.hbase.master.snapshot.SnapshotHFileCleaner$1.filesUnderSnapshot(SnapshotHFileCleaner.java:108)
    at org.apache.hadoop.hbase.master.snapshot.SnapshotFileCache.getSnapshotsInProgress(SnapshotFileCache.java:285)
    at org.apache.hadoop.hbase.master.snapshot.SnapshotFileCache.getUnreferencedFiles(SnapshotFileCache.java:215)
    - locked org.apache.hadoop.hbase.master.snapshot.SnapshotFileCache@ab834f8
    at org.apache.hadoop.hbase.master.snapshot.SnapshotHFileCleaner.getDeletableFiles(SnapshotHFileCleaner.java:69)
    - locked org.apache.hadoop.hbase.master.snapshot.SnapshotHFileCleaner@5c750233    <---- but blocks 9 other threads!
    at org.apache.hadoop.hbase.master.cleaner.CleanerChore.checkAndDeleteFiles(CleanerChore.java:295)
    at org.apache.hadoop.hbase.master.cleaner.CleanerChore.lambda$traverseAndDelete$1(CleanerChore.java:387)
    at org.apache.hadoop.hbase.master.cleaner.CleanerChore$$Lambda$323/299183520.act(Unknown Source)
    at org.apache.hadoop.hbase.master.cleaner.CleanerChore.deleteAction(CleanerChore.java:442)
    at org.apache.hadoop.hbase.master.cleaner.CleanerChore.traverseAndDelete(CleanerChore.java:387)
    at org.apache.hadoop.hbase.master.cleaner.CleanerChore.lambda$null$2(CleanerChore.java:396)
    at org.apache.hadoop.hbase.master.cleaner.CleanerChore$$Lambda$319/2057648148.run(Unknown Source)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:748)

    Number of locked synchronizers = 2
    - java.util.concurrent.locks.ReentrantReadWriteLock$FairSync@358dbc1b
    - java.util.concurrent.ThreadPoolExecutor$Worker@236be816
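To make the locking pattern above concrete: getDeletableFiles is a synchronized method on the single SnapshotHFileCleaner instance shared by the whole dir-scan pool, so while one worker is inside it refreshing the snapshot file cache (a slow walk over the snapshot manifests), every other worker parks on the same monitor. A minimal, self-contained sketch of that shape (the class name, the 30-second sleep, and the method bodies are illustrative stand-ins, not the actual HBase code; only the pool size of 10 mirrors our dump):

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

// Illustrative stand-in for the cleaner: one shared instance, one
// synchronized method, ten pool workers -- the same shape as
// dir-scan-pool4 contending on SnapshotHFileCleaner.getDeletableFiles.
public class CleanerContentionSketch {

    static class SharedCleaner {
        // Only one worker can be inside at a time; the rest show up
        // as BLOCKED (on object monitor) in a thread dump.
        synchronized void getDeletableFiles() throws InterruptedException {
            // Stand-in for the slow snapshot-manifest scan.
            TimeUnit.SECONDS.sleep(30);
        }
    }

    public static void main(String[] args) {
        SharedCleaner cleaner = new SharedCleaner(); // single shared instance
        ExecutorService dirScanPool = Executors.newFixedThreadPool(10);
        for (int i = 0; i < 10; i++) {
            dirScanPool.submit(() -> {
                try {
                    cleaner.getDeletableFiles(); // worker 1 runs, workers 2-10 block
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            });
        }
        dirScanPool.shutdown();
    }
}

Running jstack against this toy program mid-sleep shows the same picture as our dumps: one RUNNABLE thread holding the monitor and nine BLOCKED threads waiting to lock it.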
The snapshot-related thread stacks from jstack are as follows:

"SnapshotHandlerChoreCleaner" #855 daemon prio=5 os_prio=0 tid=0x0000000000c20800 nid=0x15a4 waiting on condition [0x00007f7b1803c000]
   java.lang.Thread.State: TIMED_WAITING (parking)
    at sun.misc.Unsafe.park(Native Method)
    - parking to wait for <0x00007f8123cefc98> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
    at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215)
    at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078)
    at java.util.concurrent.ScheduledThreadPoolExecutor$DelayedWorkQueue.take(ScheduledThreadPoolExecutor.java:1093)
    at java.util.concurrent.ScheduledThreadPoolExecutor$DelayedWorkQueue.take(ScheduledThreadPoolExecutor.java:809)
    at java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1067)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1127)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:748)

"MASTER_SNAPSHOT_OPERATIONS-master/hmaster01-bdxs-hb1:60000-0" #13548 daemon prio=5 os_prio=0 tid=0x00007f89ff492000 nid=0xd761 waiting on condition [0x00007f7b09fa0000]
   java.lang.Thread.State: WAITING (parking)
    at sun.misc.Unsafe.park(Native Method)
    - parking to wait for <0x00007f8129ea3188> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
    at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
    at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039)
    at java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:442)
    at java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1067)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1127)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:748)

"dir-scan-pool4-thread-10" #1448 daemon prio=5 os_prio=0 tid=0x00007f89fdbd5800 nid=0x3375 waiting for monitor entry [0x00007f7aef60d000]
   java.lang.Thread.State: BLOCKED (on object monitor)
    at org.apache.hadoop.hbase.master.snapshot.SnapshotHFileCleaner.getDeletableFiles(SnapshotHFileCleaner.java:69)
    - waiting to lock <0x00007f813c8e5e38> (a org.apache.hadoop.hbase.master.snapshot.SnapshotHFileCleaner)
    at org.apache.hadoop.hbase.master.cleaner.CleanerChore.checkAndDeleteFiles(CleanerChore.java:295)
    at org.apache.hadoop.hbase.master.cleaner.CleanerChore.lambda$traverseAndDelete$1(CleanerChore.java:387)
    at org.apache.hadoop.hbase.master.cleaner.CleanerChore$$Lambda$323/299183520.act(Unknown Source)
    at org.apache.hadoop.hbase.master.cleaner.CleanerChore.deleteAction(CleanerChore.java:442)
    at org.apache.hadoop.hbase.master.cleaner.CleanerChore.traverseAndDelete(CleanerChore.java:387)
    at org.apache.hadoop.hbase.master.cleaner.CleanerChore.lambda$null$2(CleanerChore.java:396)
    at org.apache.hadoop.hbase.master.cleaner.CleanerChore$$Lambda$319/2057648148.run(Unknown Source)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:748)

......
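(Aside: the owner/waiter relationship that jstack prints as "- locked <...>" versus "waiting to lock <...>" can also be pulled in-process via the standard ThreadMXBean API, which is roughly what tools like arthas build on. A minimal standalone sketch, with no HBase-specific APIs involved:

import java.lang.management.ManagementFactory;
import java.lang.management.ThreadInfo;
import java.lang.management.ThreadMXBean;

// Prints, for every BLOCKED thread, which monitor it is waiting on and
// which thread currently owns that monitor.
public class BlockedThreadReport {
    public static void main(String[] args) {
        ThreadMXBean mx = ManagementFactory.getThreadMXBean();
        for (ThreadInfo ti : mx.dumpAllThreads(true, true)) {
            if (ti.getThreadState() == Thread.State.BLOCKED) {
                System.out.printf("%s is waiting on %s held by %s%n",
                    ti.getThreadName(), ti.getLockName(), ti.getLockOwnerName());
            }
        }
    }
}

On our dump this would report nine dir-scan workers all waiting on the same SnapshotHFileCleaner monitor.)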
To summarize the rest of the dump: dir-scan-pool4-thread-2 through thread-10 are all in BLOCKED state, while dir-scan-pool4-thread-1 is RUNNABLE and holds the monitor (0x00007f813c8e5e38) that they are waiting to lock:

"dir-scan-pool4-thread-1" #1385 daemon prio=5 os_prio=0 tid=0x00007f89e2579800 nid=0x31b3 runnable [0x00007f7b09093000]
   java.lang.Thread.State: RUNNABLE
    at org.apache.hadoop.hbase.TableName.createTableNameIfNecessary(TableName.java:377)
    at org.apache.hadoop.hbase.TableName.valueOf(TableName.java:505)
    at org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil.toTableName(ProtobufUtil.java:2175)
    at org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil.toRegionInfo(ProtobufUtil.java:3114)
    at org.apache.hadoop.hbase.snapshot.SnapshotReferenceUtil.visitRegionStoreFiles(SnapshotReferenceUtil.java:134)
    at org.apache.hadoop.hbase.snapshot.SnapshotReferenceUtil.visitTableStoreFiles(SnapshotReferenceUtil.java:121)
    at org.apache.hadoop.hbase.snapshot.SnapshotReferenceUtil.getHFileNames(SnapshotReferenceUtil.java:348)
    at org.apache.hadoop.hbase.snapshot.SnapshotReferenceUtil.getHFileNames(SnapshotReferenceUtil.java:331)
    at org.apache.hadoop.hbase.master.snapshot.SnapshotHFileCleaner$1.filesUnderSnapshot(SnapshotHFileCleaner.java:108)
    at org.apache.hadoop.hbase.master.snapshot.SnapshotFileCache.getSnapshotsInProgress(SnapshotFileCache.java:285)
    at org.apache.hadoop.hbase.master.snapshot.SnapshotFileCache.getUnreferencedFiles(SnapshotFileCache.java:215)
    - locked <0x00007f8136e18198> (a org.apache.hadoop.hbase.master.snapshot.SnapshotFileCache)
    at org.apache.hadoop.hbase.master.snapshot.SnapshotHFileCleaner.getDeletableFiles(SnapshotHFileCleaner.java:69)
    - locked <0x00007f813c8e5e38> (a org.apache.hadoop.hbase.master.snapshot.SnapshotHFileCleaner)
    at org.apache.hadoop.hbase.master.cleaner.CleanerChore.checkAndDeleteFiles(CleanerChore.java:295)
    at org.apache.hadoop.hbase.master.cleaner.CleanerChore.lambda$traverseAndDelete$1(CleanerChore.java:387)
    at org.apache.hadoop.hbase.master.cleaner.CleanerChore$$Lambda$323/299183520.act(Unknown Source)
    at org.apache.hadoop.hbase.master.cleaner.CleanerChore.deleteAction(CleanerChore.java:442)
    at org.apache.hadoop.hbase.master.cleaner.CleanerChore.traverseAndDelete(CleanerChore.java:387)
    at org.apache.hadoop.hbase.master.cleaner.CleanerChore.lambda$null$2(CleanerChore.java:396)
    at org.apache.hadoop.hbase.master.cleaner.CleanerChore$$Lambda$319/2057648148.run(Unknown Source)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:748)

I would appreciate it if you could take another look, 张老师. Many thanks.

leojie <leo...@apache.org> wrote on Wed, Mar 29, 2023 at 14:54:

> hi 张老师
> I dumped the thread stacks with jstack and the blocked threads with arthas; the stuck snapshot-cleaner thread is shown in the image below:
>
> [image: image.png]
> The jstack file is attached. I could not quite make sense of the thread stacks and would appreciate your guidance, 张老师.
>
>
> leojie <leo...@apache.org> wrote on Tue, Mar 28, 2023 at 19:19:
>
>> Thank you very much for the reply, 张老师. The cluster switched its master today, and the problem appeared afterwards. I will check with jstack whether any snapshot-related thread is stuck.
>>
>> 张铎(Duo Zhang) <palomino...@gmail.com> wrote on Mon, Mar 27, 2023 at 21:22:
>>
>>> That log message means a snapshot is currently executing, so the cleaner will not run. You could check whether a running snapshot operation is stuck, or something along those lines.
>>>
>>> leojie <leo...@apache.org> wrote on Mon, Mar 27, 2023 at 20:53:
>>>
>>> > hi all,
>>> >
>>> > Asking the community for help with an HBase issue. Description: in our snapshot-based scan use case, some large tables take snapshots fairly frequently (e.g., daily), but after these snapshots' metadata is deleted, the HFiles the snapshots referenced do not appear to be cleaned up. We can see this in our cluster's archive directory, whose space usage keeps climbing rapidly. In the HMaster we found only the following seemingly relevant log line:
>>> >
>>> > [image: image.png]
>>> > 2023-03-27 13:07:10,939 WARN [dir-scan-pool4-thread-5] snapshot.SnapshotFileCache: Not checking unreferenced files since snapshot is running, it will skip to clean the HFiles this time
>>> >
>>> > Beyond that there are no useful logs.
>>> > After we switch the HMaster service, the cleaner threads seem to start working again and a large amount of archive-directory space is released; this shows up very clearly in our cluster capacity metrics.
>>> > We are running HBase 2.2.6. Could the community advise on what might cause this, and whether there is a similar PR that could be applied to this version? Many thanks.