These are all cleaner threads. Reaching this point means they are still scanning to check whether files can be deleted, rather than seeing a snapshot in progress and skipping the run outright, no?
It may also be worth checking whether deletion is simply not keeping up. As I recall, the performance of the cache inside TableName has been somewhat problematic; there is a related issue.
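[Editor's note: TableName.valueOf / TableName.createTableNameIfNecessary is the hot frame in the RUNNABLE stacks quoted below. As a rough illustration of why an interning cache can become a bottleneck, here is a minimal sketch assuming a linear-scan, copy-on-write set; this is illustrative only, not the actual HBase implementation:

    import java.util.Set;
    import java.util.concurrent.CopyOnWriteArraySet;

    // Hypothetical interning cache in the style Duo describes: every
    // lookup is an O(n) scan, and every miss copies the whole backing
    // array on insert. Called in a tight loop over many regions (e.g.
    // while walking frequent-snapshot manifests), this dominates CPU.
    public final class NameInterner {
      private static final Set<String> CACHE = new CopyOnWriteArraySet<>();

      public static String intern(String name) {
        for (String cached : CACHE) { // linear scan on every call
          if (cached.equals(name)) {
            return cached;
          }
        }
        CACHE.add(name); // copy-on-write: O(n) array copy per miss
        return name;
      }
    }

A cache like this is fine for a handful of stable table names but degrades badly under many distinct lookups, which matches the RUNNABLE stacks below.]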
On Wed, Mar 29, 2023 at 15:06, leojie <leo...@apache.org> wrote:

> Hi Mr. Zhang,
> I used arthas to print the blocked threads; the stuck snapshot-cleaning thread looks like this:
>
> "dir-scan-pool4-thread-10" Id=1448 RUNNABLE
>     at org.apache.hadoop.hbase.TableName.createTableNameIfNecessary(TableName.java:377)
>     at org.apache.hadoop.hbase.TableName.valueOf(TableName.java:505)
>     at org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil.toTableName(ProtobufUtil.java:2175)
>     at org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil.toRegionInfo(ProtobufUtil.java:3114)
>     at org.apache.hadoop.hbase.snapshot.SnapshotReferenceUtil.visitRegionStoreFiles(SnapshotReferenceUtil.java:134)
>     at org.apache.hadoop.hbase.snapshot.SnapshotReferenceUtil.visitTableStoreFiles(SnapshotReferenceUtil.java:121)
>     at org.apache.hadoop.hbase.snapshot.SnapshotReferenceUtil.getHFileNames(SnapshotReferenceUtil.java:348)
>     at org.apache.hadoop.hbase.snapshot.SnapshotReferenceUtil.getHFileNames(SnapshotReferenceUtil.java:331)
>     at org.apache.hadoop.hbase.master.snapshot.SnapshotHFileCleaner$1.filesUnderSnapshot(SnapshotHFileCleaner.java:108)
>     at org.apache.hadoop.hbase.master.snapshot.SnapshotFileCache.getSnapshotsInProgress(SnapshotFileCache.java:285)
>     at org.apache.hadoop.hbase.master.snapshot.SnapshotFileCache.getUnreferencedFiles(SnapshotFileCache.java:215)
>     - locked org.apache.hadoop.hbase.master.snapshot.SnapshotFileCache@ab834f8
>     at org.apache.hadoop.hbase.master.snapshot.SnapshotHFileCleaner.getDeletableFiles(SnapshotHFileCleaner.java:69)
>     - locked org.apache.hadoop.hbase.master.snapshot.SnapshotHFileCleaner@5c750233    <---- but blocks 9 other threads!
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.checkAndDeleteFiles(CleanerChore.java:295)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.lambda$traverseAndDelete$1(CleanerChore.java:387)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore$$Lambda$323/299183520.act(Unknown Source)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.deleteAction(CleanerChore.java:442)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.traverseAndDelete(CleanerChore.java:387)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.lambda$null$2(CleanerChore.java:396)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore$$Lambda$319/2057648148.run(Unknown Source)
>     at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
>     at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
>     at java.lang.Thread.run(Thread.java:748)
>
>     Number of locked synchronizers = 2
>     - java.util.concurrent.locks.ReentrantReadWriteLock$FairSync@358dbc1b
>     - java.util.concurrent.ThreadPoolExecutor$Worker@236be816
>
> The snapshot-related thread stacks in the jstack output are as follows:
>
> "SnapshotHandlerChoreCleaner" #855 daemon prio=5 os_prio=0 tid=0x0000000000c20800 nid=0x15a4 waiting on condition [0x00007f7b1803c000]
>    java.lang.Thread.State: TIMED_WAITING (parking)
>     at sun.misc.Unsafe.park(Native Method)
>     - parking to wait for <0x00007f8123cefc98> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
>     at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215)
>     at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078)
>     at java.util.concurrent.ScheduledThreadPoolExecutor$DelayedWorkQueue.take(ScheduledThreadPoolExecutor.java:1093)
>     at java.util.concurrent.ScheduledThreadPoolExecutor$DelayedWorkQueue.take(ScheduledThreadPoolExecutor.java:809)
>     at java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1067)
>     at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1127)
>     at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
>     at java.lang.Thread.run(Thread.java:748)
>
> "MASTER_SNAPSHOT_OPERATIONS-master/hmaster01-bdxs-hb1:60000-0" #13548 daemon prio=5 os_prio=0 tid=0x00007f89ff492000 nid=0xd761 waiting on condition [0x00007f7b09fa0000]
>    java.lang.Thread.State: WAITING (parking)
>     at sun.misc.Unsafe.park(Native Method)
>     - parking to wait for <0x00007f8129ea3188> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
>     at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
>     at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039)
>     at java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:442)
>     at java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1067)
>     at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1127)
>     at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
>     at java.lang.Thread.run(Thread.java:748)
>
> "dir-scan-pool4-thread-10" #1448 daemon prio=5 os_prio=0 tid=0x00007f89fdbd5800 nid=0x3375 waiting for monitor entry [0x00007f7aef60d000]
>    java.lang.Thread.State: BLOCKED (on object monitor)
>     at org.apache.hadoop.hbase.master.snapshot.SnapshotHFileCleaner.getDeletableFiles(SnapshotHFileCleaner.java:69)
>     - waiting to lock <0x00007f813c8e5e38> (a org.apache.hadoop.hbase.master.snapshot.SnapshotHFileCleaner)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.checkAndDeleteFiles(CleanerChore.java:295)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.lambda$traverseAndDelete$1(CleanerChore.java:387)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore$$Lambda$323/299183520.act(Unknown Source)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.deleteAction(CleanerChore.java:442)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.traverseAndDelete(CleanerChore.java:387)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.lambda$null$2(CleanerChore.java:396)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore$$Lambda$319/2057648148.run(Unknown Source)
>     at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
>     at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
>     at java.lang.Thread.run(Thread.java:748)
>
> ......
> (dir-scan-pool4-thread-2 through dir-scan-pool4-thread-10 are all in the BLOCKED state.)
>
> "dir-scan-pool4-thread-1" #1385 daemon prio=5 os_prio=0 tid=0x00007f89e2579800 nid=0x31b3 runnable [0x00007f7b09093000]
>    java.lang.Thread.State: RUNNABLE
>     at org.apache.hadoop.hbase.TableName.createTableNameIfNecessary(TableName.java:377)
>     at org.apache.hadoop.hbase.TableName.valueOf(TableName.java:505)
>     at org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil.toTableName(ProtobufUtil.java:2175)
>     at org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil.toRegionInfo(ProtobufUtil.java:3114)
>     at org.apache.hadoop.hbase.snapshot.SnapshotReferenceUtil.visitRegionStoreFiles(SnapshotReferenceUtil.java:134)
>     at org.apache.hadoop.hbase.snapshot.SnapshotReferenceUtil.visitTableStoreFiles(SnapshotReferenceUtil.java:121)
>     at org.apache.hadoop.hbase.snapshot.SnapshotReferenceUtil.getHFileNames(SnapshotReferenceUtil.java:348)
>     at org.apache.hadoop.hbase.snapshot.SnapshotReferenceUtil.getHFileNames(SnapshotReferenceUtil.java:331)
>     at org.apache.hadoop.hbase.master.snapshot.SnapshotHFileCleaner$1.filesUnderSnapshot(SnapshotHFileCleaner.java:108)
>     at org.apache.hadoop.hbase.master.snapshot.SnapshotFileCache.getSnapshotsInProgress(SnapshotFileCache.java:285)
>     at org.apache.hadoop.hbase.master.snapshot.SnapshotFileCache.getUnreferencedFiles(SnapshotFileCache.java:215)
>     - locked <0x00007f8136e18198> (a org.apache.hadoop.hbase.master.snapshot.SnapshotFileCache)
>     at org.apache.hadoop.hbase.master.snapshot.SnapshotHFileCleaner.getDeletableFiles(SnapshotHFileCleaner.java:69)
>     - locked <0x00007f813c8e5e38> (a org.apache.hadoop.hbase.master.snapshot.SnapshotHFileCleaner)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.checkAndDeleteFiles(CleanerChore.java:295)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.lambda$traverseAndDelete$1(CleanerChore.java:387)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore$$Lambda$323/299183520.act(Unknown Source)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.deleteAction(CleanerChore.java:442)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.traverseAndDelete(CleanerChore.java:387)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.lambda$null$2(CleanerChore.java:396)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore$$Lambda$319/2057648148.run(Unknown Source)
>     at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
>     at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
>     at java.lang.Thread.run(Thread.java:748)
>
> Could you please take another look? Many thanks.
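[Editor's note: the dumps above show a classic coarse-lock pattern: one dir-scan thread holds the SnapshotHFileCleaner monitor while it rebuilds the snapshot reference cache, and the other nine pool threads sit BLOCKED on that same monitor. A minimal sketch of the pattern, illustrative only and not the actual HBase code:

    import java.util.ArrayList;
    import java.util.HashSet;
    import java.util.List;
    import java.util.Set;

    // Sketch of the contention seen in the dump: the whole "is this file
    // still referenced by a snapshot?" check runs under one monitor, so a
    // single slow cache rebuild serializes every scanning thread.
    public class CoarseLockedCleaner {
      private final Set<String> referencedBySnapshots = new HashSet<>();

      // All dir-scan threads funnel through here. While one thread is in
      // refreshCache() (walking every snapshot manifest), the rest park
      // as BLOCKED, exactly like dir-scan-pool4-thread-2..10 above.
      public synchronized List<String> getDeletableFiles(List<String> candidates) {
        refreshCache();
        List<String> deletable = new ArrayList<>();
        for (String file : candidates) {
          if (!referencedBySnapshots.contains(file)) {
            deletable.add(file);
          }
        }
        return deletable;
      }

      private void refreshCache() {
        // In the real cleaner this re-reads snapshot manifests and resolves
        // a TableName for every region, the hot path in the RUNNABLE stack.
      }
    }

Note this is not a deadlock: as Duo points out above, the lock holder is RUNNABLE and making progress, just slowly enough that cleanup cannot keep up.]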
> On Wed, Mar 29, 2023 at 14:54, leojie <leo...@apache.org> wrote:
>
>> Hi Mr. Zhang,
>> I printed the thread stacks with jstack and the blocked threads with arthas; the stuck snapshot-cleaning thread is shown in the image below:
>>
>> [image: image.png]
>> The jstack file is attached. I couldn't quite make sense of the thread stacks, so I'd appreciate some further guidance.
>>
>> On Tue, Mar 28, 2023 at 19:19, leojie <leo...@apache.org> wrote:
>>
>>> Thank you very much for the reply. The cluster's master failed over today, after which this problem appeared. I will use jstack to check whether any snapshot-related threads are stuck.
>>>
>>> On Mon, Mar 27, 2023 at 21:22, 张铎(Duo Zhang) <palomino...@gmail.com> wrote:
>>>
>>>> That log message means a snapshot is in progress, so the cleaner will not run. It is worth checking whether a running snapshot operation is stuck, or something along those lines.
>>>>
>>>> On Mon, Mar 27, 2023 at 20:53, leojie <leo...@apache.org> wrote:
>>>>
>>>> > hi all,
>>>> >
>>>> > I'd like to ask the community for help with an HBase problem, described as follows: in our snapshot-scan scenario, some large tables take snapshots fairly frequently (e.g. daily). After the snapshot metadata is deleted, the HFiles referenced by those snapshots do not seem to be cleaned up; we can see this because the space used by our cluster's archive directory keeps climbing. In the HMaster log, the only seemingly related entries are:
>>>> > [image: image.png]
>>>> > 2023-03-27 13:07:10,939 WARN [dir-scan-pool4-thread-5]
>>>> > snapshot.SnapshotFileCache: Not checking unreferenced files since snapshot
>>>> > is running, it will skip to clean the HFiles this time
>>>> >
>>>> > Beyond that there are no useful logs.
>>>> > After we fail over the HMaster service, the cleanup threads seem to start working again and a large amount of archive-directory space is released; this shows up very clearly in our cluster capacity metrics.
>>>> > We are running HBase 2.2.6. Could the community help with what might cause this, and whether there is a similar PR that could be applied to this version? Many thanks.
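[Editor's note: the WARN line quoted above is logged when SnapshotFileCache decides it cannot safely classify files while a snapshot appears to be running, so it skips cleanup for that pass. A hypothetical sketch of that control flow, purely to illustrate why a signal that never clears would starve the cleaner until a master failover; this is not the HBase 2.2.6 implementation:

    import java.util.Collections;
    import java.util.List;

    public class SnapshotFileCacheSketch {
      // If this flag (or whatever real signal backs it) goes stale, e.g.
      // because a snapshot operation is stuck, every cleaner pass bails
      // out and archived HFiles are never reclaimed.
      private volatile boolean snapshotRunning;

      public List<String> getUnreferencedFiles(List<String> candidates) {
        if (snapshotRunning) {
          System.out.println("WARN Not checking unreferenced files since"
              + " snapshot is running, it will skip to clean the HFiles this time");
          return Collections.emptyList(); // treat everything as referenced
        }
        return filterAgainstReferenceCache(candidates);
      }

      private List<String> filterAgainstReferenceCache(List<String> candidates) {
        // Stub: the real cache compares candidates against the set of
        // HFiles referenced by completed and in-progress snapshots.
        return Collections.emptyList();
      }
    }

This would match the observed symptom: the skip repeats on every chore run, archive usage climbs, and failing over the HMaster (which rebuilds the cache state) lets cleanup resume.]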