[ https://issues.apache.org/jira/browse/HBASE-15234?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15221804#comment-15221804 ]
Hudson commented on HBASE-15234:
--------------------------------
FAILURE: Integrated in HBase-1.4 #66 (See [https://builds.apache.org/job/HBase-1.4/66/])
HBASE-15234 Don't abort ReplicationLogCleaner on ZooKeeper errors (garyh: rev d7d12aedd4497c81d10f8c7ad14a8145fc91d2ff)
* hbase-server/src/main/java/org/apache/hadoop/hbase/replication/master/ReplicationLogCleaner.java
* hbase-server/src/test/java/org/apache/hadoop/hbase/master/cleaner/TestLogsCleaner.java
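To make the intent of the change concrete, here is a minimal sketch of the non-aborting pattern the commit title describes. It uses hypothetical names (ReplicationQueuesReader, loadWalsInUse, NonAbortingLogCleanerSketch) rather than the real HBase classes: when the ZooKeeper read fails, log a warning and report nothing as deletable for this run, so the cleaner chore stays alive and retries on its next scheduled execution.
{code:java}
// Sketch only, with made-up names; not the actual HBASE-15234 patch.
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

public class NonAbortingLogCleanerSketch {

  /** Hypothetical stand-in for the ZooKeeper read that can fail transiently. */
  interface ReplicationQueuesReader {
    Set<String> loadWalsInUse() throws Exception;
  }

  private final ReplicationQueuesReader reader;

  NonAbortingLogCleanerSketch(ReplicationQueuesReader reader) {
    this.reader = reader;
  }

  /**
   * Returns the WAL names that are safe to delete. On a transient ZooKeeper
   * failure, nothing is deemed deletable for this run; the cleaner chore keeps
   * running and retries on its next scheduled execution instead of aborting.
   */
  Set<String> getDeletableFiles(Set<String> candidates) {
    Set<String> walsInUse;
    try {
      walsInUse = reader.loadWalsInUse();
    } catch (Exception e) {
      System.err.println("Failed to read replication queues from ZooKeeper, "
          + "skipping checking deletable files this run: " + e);
      return Collections.emptySet();   // skip this run instead of aborting
    }
    Set<String> deletable = new HashSet<>(candidates);
    deletable.removeAll(walsInUse);    // keep any WAL still referenced by a queue
    return deletable;
  }

  public static void main(String[] args) {
    // Simulate a ZooKeeper ConnectionLoss: the cleaner skips the run and survives.
    NonAbortingLogCleanerSketch cleaner = new NonAbortingLogCleanerSketch(() -> {
      throw new Exception("KeeperErrorCode = ConnectionLoss for /hbase/replication/rs");
    });
    Set<String> candidates = new HashSet<>();
    candidates.add("example-wal.1");   // dummy WAL names for illustration
    candidates.add("example-wal.2");
    System.out.println("Deletable this run: " + cleaner.getDeletableFiles(candidates));
  }
}
{code}
The trade-off is that WALs linger in oldWALs for one extra cleaner interval after a transient ZooKeeper hiccup, which is much cheaper than leaving the chore stopped until the master is restarted.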
> ReplicationLogCleaner can abort due to transient ZK issues
> ----------------------------------------------------------
>
> Key: HBASE-15234
> URL: https://issues.apache.org/jira/browse/HBASE-15234
> Project: HBase
> Issue Type: Bug
> Components: master, Replication
> Reporter: Gary Helmling
> Assignee: Gary Helmling
> Priority: Critical
> Fix For: 1.3.0, 0.98.19, 1.1.5, 1.2.2
>
> Attachments: HBASE-15234.001.patch, HBASE-15234.002.patch
>
>
> The ReplicationLogCleaner delegate for the LogCleaner chore can abort due to
> transient errors reading the replication znodes, leaving the log cleaner
> chore stopped while the master keeps running. Logs then build up in the
> oldWALs directory and can eventually hit HDFS storage or file-count limits.
> We've seen this happen in a couple of clusters when a rolling restart was
> performed on the ZooKeeper peers (only one restarted at a time).
> The full stack trace when the log cleaner aborts is:
> {noformat}
> 16/02/02 15:22:39 WARN zookeeper.ZKUtil: replicationLogCleaner-0x1522c8b93c2fbae, quorum=XXXXXXXXXXXXXXXXXXXX, baseZNode=/hbase Unable to get data of znode /hbase/replication/rs
> org.apache.zookeeper.KeeperException$ConnectionLossException: KeeperErrorCode = ConnectionLoss for /hbase/replication/rs
>     at org.apache.zookeeper.KeeperException.create(KeeperException.java:99)
>     at org.apache.zookeeper.KeeperException.create(KeeperException.java:51)
>     at org.apache.zookeeper.ZooKeeper.getData(ZooKeeper.java:1155)
>     at org.apache.hadoop.hbase.zookeeper.RecoverableZooKeeper.getData(RecoverableZooKeeper.java:359)
>     at org.apache.hadoop.hbase.zookeeper.ZKUtil.getDataNoWatch(ZKUtil.java:713)
>     at org.apache.hadoop.hbase.replication.ReplicationQueuesClientZKImpl.getQueuesZNodeCversion(ReplicationQueuesClientZKImpl.java:80)
>     at org.apache.hadoop.hbase.replication.master.ReplicationLogCleaner.loadWALsFromQueues(ReplicationLogCleaner.java:99)
>     at org.apache.hadoop.hbase.replication.master.ReplicationLogCleaner.getDeletableFiles(ReplicationLogCleaner.java:70)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.checkAndDeleteFiles(CleanerChore.java:233)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.checkAndDeleteEntries(CleanerChore.java:157)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.chore(CleanerChore.java:124)
>     at org.apache.hadoop.hbase.ScheduledChore.run(ScheduledChore.java:185)
>     at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
>     at java.util.concurrent.FutureTask.runAndReset(FutureTask.java:308)
>     at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$301(ScheduledThreadPoolExecutor.java:180)
>     at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:294)
>     at org.apache.hadoop.hbase.JitterScheduledThreadPoolExecutorImpl$JitteredRunnableScheduledFuture.run(JitterScheduledThreadPoolExecutorImpl.java:110)
>     at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
>     at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
>     at java.lang.Thread.run(Thread.java:745)
> 16/02/02 15:22:39 ERROR zookeeper.ZooKeeperWatcher: replicationLogCleaner-0x1522c8b93c2fbae, quorum=XXXXXXXXXXXXXXXXXXXX, baseZNode=/hbase Received unexpected KeeperException, re-throwing exception
> org.apache.zookeeper.KeeperException$ConnectionLossException: KeeperErrorCode = ConnectionLoss for /hbase/replication/rs
>     at org.apache.zookeeper.KeeperException.create(KeeperException.java:99)
>     at org.apache.zookeeper.KeeperException.create(KeeperException.java:51)
>     at org.apache.zookeeper.ZooKeeper.getData(ZooKeeper.java:1155)
>     at org.apache.hadoop.hbase.zookeeper.RecoverableZooKeeper.getData(RecoverableZooKeeper.java:359)
>     at org.apache.hadoop.hbase.zookeeper.ZKUtil.getDataNoWatch(ZKUtil.java:713)
>     at org.apache.hadoop.hbase.replication.ReplicationQueuesClientZKImpl.getQueuesZNodeCversion(ReplicationQueuesClientZKImpl.java:80)
>     at org.apache.hadoop.hbase.replication.master.ReplicationLogCleaner.loadWALsFromQueues(ReplicationLogCleaner.java:99)
>     at org.apache.hadoop.hbase.replication.master.ReplicationLogCleaner.getDeletableFiles(ReplicationLogCleaner.java:70)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.checkAndDeleteFiles(CleanerChore.java:233)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.checkAndDeleteEntries(CleanerChore.java:157)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.chore(CleanerChore.java:124)
>     at org.apache.hadoop.hbase.ScheduledChore.run(ScheduledChore.java:185)
>     at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
>     at java.util.concurrent.FutureTask.runAndReset(FutureTask.java:308)
>     at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$301(ScheduledThreadPoolExecutor.java:180)
>     at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:294)
>     at org.apache.hadoop.hbase.JitterScheduledThreadPoolExecutorImpl$JitteredRunnableScheduledFuture.run(JitterScheduledThreadPoolExecutorImpl.java:110)
>     at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
>     at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
>     at java.lang.Thread.run(Thread.java:745)
> 16/02/02 15:22:39 WARN master.ReplicationLogCleaner: Aborting ReplicationLogCleaner because Failed to get stat of replication rs node
> org.apache.zookeeper.KeeperException$ConnectionLossException: KeeperErrorCode = ConnectionLoss for /hbase/replication/rs
>     at org.apache.zookeeper.KeeperException.create(KeeperException.java:99)
>     at org.apache.zookeeper.KeeperException.create(KeeperException.java:51)
>     at org.apache.zookeeper.ZooKeeper.getData(ZooKeeper.java:1155)
>     at org.apache.hadoop.hbase.zookeeper.RecoverableZooKeeper.getData(RecoverableZooKeeper.java:359)
>     at org.apache.hadoop.hbase.zookeeper.ZKUtil.getDataNoWatch(ZKUtil.java:713)
>     at org.apache.hadoop.hbase.replication.ReplicationQueuesClientZKImpl.getQueuesZNodeCversion(ReplicationQueuesClientZKImpl.java:80)
>     at org.apache.hadoop.hbase.replication.master.ReplicationLogCleaner.loadWALsFromQueues(ReplicationLogCleaner.java:99)
>     at org.apache.hadoop.hbase.replication.master.ReplicationLogCleaner.getDeletableFiles(ReplicationLogCleaner.java:70)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.checkAndDeleteFiles(CleanerChore.java:233)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.checkAndDeleteEntries(CleanerChore.java:157)
>     at org.apache.hadoop.hbase.master.cleaner.CleanerChore.chore(CleanerChore.java:124)
>     at org.apache.hadoop.hbase.ScheduledChore.run(ScheduledChore.java:185)
>     at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
>     at java.util.concurrent.FutureTask.runAndReset(FutureTask.java:308)
>     at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$301(ScheduledThreadPoolExecutor.java:180)
>     at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:294)
>     at org.apache.hadoop.hbase.JitterScheduledThreadPoolExecutorImpl$JitteredRunnableScheduledFuture.run(JitterScheduledThreadPoolExecutorImpl.java:110)
>     at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
>     at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
>     at java.lang.Thread.run(Thread.java:745)
> 16/02/02 15:22:40 WARN master.ReplicationLogCleaner: Failed to read zookeeper, skipping checking deletable files
> {noformat}
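As a general aside (this is not the specific abort path fixed by the patch), the chore in the stack trace above runs on a ScheduledThreadPoolExecutor, and a periodic task scheduled that way is silently never rescheduled once one of its runs throws. The toy example below, with made-up names, shows that behavior and why transient ZooKeeper errors have to be handled inside the delegate rather than allowed to escape:
{code:java}
// Illustration only: standard java.util.concurrent behavior, not HBase code.
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

public class ScheduledTaskFailureDemo {
  public static void main(String[] args) throws InterruptedException {
    ScheduledExecutorService pool = Executors.newScheduledThreadPool(1);
    AtomicInteger runs = new AtomicInteger();

    pool.scheduleAtFixedRate(() -> {
      int n = runs.incrementAndGet();
      System.out.println("run " + n);
      if (n == 2) {
        // Simulate a transient failure (e.g. a ZooKeeper ConnectionLoss)
        // escaping the task body.
        throw new RuntimeException("transient failure");
      }
    }, 0, 100, TimeUnit.MILLISECONDS);

    Thread.sleep(1000);
    pool.shutdownNow();
    // Prints 2: the task is never scheduled again after the run that threw.
    System.out.println("total runs: " + runs.get());
  }
}
{code}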