[ https://issues.apache.org/jira/browse/HDFS-9908?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15219013#comment-15219013 ]
Hadoop QA commented on HDFS-9908:
---------------------------------

(x) *-1 overall*

|| Vote || Subsystem || Runtime || Comment ||
| 0 | reexec | 0m 17s | Docker mode activated. |
| +1 | @author | 0m 0s | The patch does not contain any @author tags. |
| +1 | test4tests | 0m 0s | The patch appears to include 2 new or modified test files. |
| 0 | mvndep | 0m 15s | Maven dependency ordering for branch |
| +1 | mvninstall | 6m 54s | trunk passed |
| +1 | compile | 6m 3s | trunk passed with JDK v1.8.0_74 |
| +1 | compile | 6m 53s | trunk passed with JDK v1.7.0_95 |
| +1 | checkstyle | 1m 6s | trunk passed |
| +1 | mvnsite | 1m 46s | trunk passed |
| +1 | mvneclipse | 0m 28s | trunk passed |
| +1 | findbugs | 3m 28s | trunk passed |
| +1 | javadoc | 1m 59s | trunk passed with JDK v1.8.0_74 |
| +1 | javadoc | 2m 49s | trunk passed with JDK v1.7.0_95 |
| 0 | mvndep | 0m 15s | Maven dependency ordering for patch |
| +1 | mvninstall | 1m 27s | the patch passed |
| +1 | compile | 5m 37s | the patch passed with JDK v1.8.0_74 |
| +1 | javac | 5m 37s | the patch passed |
| +1 | compile | 6m 36s | the patch passed with JDK v1.7.0_95 |
| +1 | javac | 6m 36s | the patch passed |
| +1 | checkstyle | 1m 5s | the patch passed |
| +1 | mvnsite | 1m 46s | the patch passed |
| +1 | mvneclipse | 0m 29s | the patch passed |
| +1 | whitespace | 0m 0s | Patch has no whitespace issues. |
| +1 | findbugs | 3m 57s | the patch passed |
| +1 | javadoc | 1m 59s | the patch passed with JDK v1.8.0_74 |
| +1 | javadoc | 2m 55s | the patch passed with JDK v1.7.0_95 |
| -1 | unit | 6m 48s | hadoop-common in the patch failed with JDK v1.8.0_74. |
| -1 | unit | 69m 16s | hadoop-hdfs in the patch failed with JDK v1.8.0_74. |
| -1 | unit | 6m 53s | hadoop-common in the patch failed with JDK v1.7.0_95. |
| -1 | unit | 72m 8s | hadoop-hdfs in the patch failed with JDK v1.7.0_95. |
| -1 | asflicense | 0m 27s | Patch generated 3 ASF License warnings. |
| | | 215m 5s | |

|| Reason || Tests ||
| JDK v1.8.0_74 Failed junit tests | hadoop.hdfs.server.datanode.TestDataNodeMetrics |
| | hadoop.hdfs.server.namenode.snapshot.TestOpenFilesWithSnapshot |
| | hadoop.hdfs.TestDFSUpgradeFromImage |
| JDK v1.8.0_74 Timed out junit tests | org.apache.hadoop.util.TestNativeLibraryChecker |
| JDK v1.7.0_95 Failed junit tests | hadoop.hdfs.TestHFlush |
| | hadoop.hdfs.server.namenode.snapshot.TestOpenFilesWithSnapshot |
| JDK v1.7.0_95 Timed out junit tests | org.apache.hadoop.util.TestNativeLibraryChecker |
| | org.apache.hadoop.hdfs.TestLeaseRecovery2 |

|| Subsystem || Report/Notes ||
| Docker | Image:yetus/hadoop:fbe3e86 |
| JIRA Patch URL | https://issues.apache.org/jira/secure/attachment/12796140/HDFS-9908.007.patch |
| JIRA Issue | HDFS-9908 |
| Optional Tests | asflicense compile javac javadoc mvninstall mvnsite unit findbugs checkstyle |
| uname | Linux 2df70faf93fc 3.13.0-36-lowlatency #63-Ubuntu SMP PREEMPT Wed Sep 3 21:56:12 UTC 2014 x86_64 x86_64 x86_64 GNU/Linux |
| Build tool | maven |
| Personality | /testptch/hadoop/patchprocess/precommit/personality/provided.sh |
| git revision | trunk / 09d63d5 |
| Default Java | 1.7.0_95 |
| Multi-JDK versions | /usr/lib/jvm/java-8-oracle:1.8.0_74 /usr/lib/jvm/java-7-openjdk-amd64:1.7.0_95 |
| findbugs | v3.0.0 |
| unit | https://builds.apache.org/job/PreCommit-HDFS-Build/15005/artifact/patchprocess/patch-unit-hadoop-common-project_hadoop-common-jdk1.8.0_74.txt |
| unit | https://builds.apache.org/job/PreCommit-HDFS-Build/15005/artifact/patchprocess/patch-unit-hadoop-hdfs-project_hadoop-hdfs-jdk1.8.0_74.txt |
| unit | https://builds.apache.org/job/PreCommit-HDFS-Build/15005/artifact/patchprocess/patch-unit-hadoop-common-project_hadoop-common-jdk1.7.0_95.txt |
| unit | https://builds.apache.org/job/PreCommit-HDFS-Build/15005/artifact/patchprocess/patch-unit-hadoop-hdfs-project_hadoop-hdfs-jdk1.7.0_95.txt |
| unit test logs | https://builds.apache.org/job/PreCommit-HDFS-Build/15005/artifact/patchprocess/patch-unit-hadoop-common-project_hadoop-common-jdk1.8.0_74.txt |
| | https://builds.apache.org/job/PreCommit-HDFS-Build/15005/artifact/patchprocess/patch-unit-hadoop-hdfs-project_hadoop-hdfs-jdk1.8.0_74.txt |
| | https://builds.apache.org/job/PreCommit-HDFS-Build/15005/artifact/patchprocess/patch-unit-hadoop-common-project_hadoop-common-jdk1.7.0_95.txt |
| | https://builds.apache.org/job/PreCommit-HDFS-Build/15005/artifact/patchprocess/patch-unit-hadoop-hdfs-project_hadoop-hdfs-jdk1.7.0_95.txt |
| JDK v1.7.0_95 Test Results | https://builds.apache.org/job/PreCommit-HDFS-Build/15005/testReport/ |
| asflicense | https://builds.apache.org/job/PreCommit-HDFS-Build/15005/artifact/patchprocess/patch-asflicense-problems.txt |
| modules | C: hadoop-common-project/hadoop-common hadoop-hdfs-project/hadoop-hdfs U: . |
| Console output | https://builds.apache.org/job/PreCommit-HDFS-Build/15005/console |
| Powered by | Apache Yetus 0.2.0 http://yetus.apache.org |

This message was automatically generated.

> Datanode should tolerate disk scan failure during NN handshake
> ---------------------------------------------------------------
>
>     Key: HDFS-9908
>     URL: https://issues.apache.org/jira/browse/HDFS-9908
>     Project: Hadoop HDFS
>     Issue Type: Bug
>     Components: datanode
>     Affects Versions: 2.5.0
>     Environment: CDH5.3.3
>     Reporter: Wei-Chiu Chuang
>     Assignee: Wei-Chiu Chuang
>     Attachments: HDFS-9908.001.patch, HDFS-9908.002.patch, HDFS-9908.003.patch, HDFS-9908.004.patch, HDFS-9908.005.patch, HDFS-9908.006.patch, HDFS-9908.007.patch
>
> The DN may treat a disk scan failure exception as an NN handshake exception, and this can prevent a DN from joining the cluster even if most of its disks are healthy.
>
> During the NN handshake, the DN initializes its block pools. It creates a lock file per disk and then scans the volumes. However, if the scan throws an exception due to a disk failure, the DN assumes the exception means the NN is inconsistent with the local storage (see {{DataNode#initBlockPool}}). As a result, it attempts to reconnect to the NN again.
>
> However, at this point the DN has not deleted its lock files on the disks. When it reconnects to the NN, it concludes that the same disks are already in use, and the handshake fails again because none of the disks can be used (due to locking), and so on repeatedly. This happens even if the DN has multiple disks and only one of them fails; the DN cannot connect to the NN despite having just one failing disk. Note that it is possible to successfully create a lock file on a disk and then hit an error while scanning that disk.
>
> We saw this on a CDH 5.3.3 cluster (which is based on Apache Hadoop 2.5.0; we still see the same bug on the 3.0.0 trunk branch). The root cause is that the DN treats an internal error (a single disk failure) as an external one (an NN handshake failure), and we should fix that.
>
> {code:title=DataNode.java}
>   /**
>    * One of the Block Pools has successfully connected to its NN.
>    * This initializes the local storage for that block pool,
>    * checks consistency of the NN's cluster ID, etc.
>    *
>    * If this is the first block pool to register, this also initializes
>    * the datanode-scoped storage.
>    *
>    * @param bpos Block pool offer service
>    * @throws IOException if the NN is inconsistent with the local storage.
>    */
>   void initBlockPool(BPOfferService bpos) throws IOException {
>     NamespaceInfo nsInfo = bpos.getNamespaceInfo();
>     if (nsInfo == null) {
>       throw new IOException("NamespaceInfo not found: Block pool " + bpos
>           + " should have retrieved namespace info before initBlockPool.");
>     }
>
>     setClusterId(nsInfo.clusterID, nsInfo.getBlockPoolID());
>
>     // Register the new block pool with the BP manager.
>     blockPoolManager.addBlockPool(bpos);
>
>     // In the case that this is the first block pool to connect, initialize
>     // the dataset, block scanners, etc.
>     initStorage(nsInfo);
>
>     // Exclude failed disks before initializing the block pools to avoid startup
>     // failures.
>     checkDiskError();
>
>     data.addBlockPool(nsInfo.getBlockPoolID(), conf);  <----- this line throws the disk error exception
>     blockScanner.enableBlockPoolId(bpos.getBlockPoolId());
>     initDirectoryScanner(conf);
>   }
> {code}
>
> {{FsVolumeList#addBlockPool}} is the source of the exception.
>
> {code:title=FsVolumeList.java}
>   void addBlockPool(final String bpid, final Configuration conf) throws IOException {
>     long totalStartTime = Time.monotonicNow();
>
>     final List<IOException> exceptions = Collections.synchronizedList(
>         new ArrayList<IOException>());
>     List<Thread> blockPoolAddingThreads = new ArrayList<Thread>();
>     for (final FsVolumeImpl v : volumes) {
>       Thread t = new Thread() {
>         public void run() {
>           try (FsVolumeReference ref = v.obtainReference()) {
>             FsDatasetImpl.LOG.info("Scanning block pool " + bpid +
>                 " on volume " + v + "...");
>             long startTime = Time.monotonicNow();
>             v.addBlockPool(bpid, conf);
>             long timeTaken = Time.monotonicNow() - startTime;
>             FsDatasetImpl.LOG.info("Time taken to scan block pool " + bpid +
>                 " on " + v + ": " + timeTaken + "ms");
>           } catch (ClosedChannelException e) {
>             // ignore.
>           } catch (IOException ioe) {
>             FsDatasetImpl.LOG.info("Caught exception while scanning " + v +
>                 ". Will throw later.", ioe);
>             exceptions.add(ioe);
>           }
>         }
>       };
>       blockPoolAddingThreads.add(t);
>       t.start();
>     }
>     for (Thread t : blockPoolAddingThreads) {
>       try {
>         t.join();
>       } catch (InterruptedException ie) {
>         throw new IOException(ie);
>       }
>     }
>     if (!exceptions.isEmpty()) {
>       throw exceptions.get(0);  <----- here's the origin of the exception
>     }
>
>     long totalTimeTaken = Time.monotonicNow() - totalStartTime;
>     FsDatasetImpl.LOG.info("Total time to scan all replicas for block pool " +
>         bpid + ": " + totalTimeTaken + "ms");
>   }
> {code}
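To make the reconnection loop described above concrete, the following is a small, self-contained sketch (not Hadoop code; the paths, class name, and lock-file name are made up for illustration) of how per-directory lock files behave when a failed attempt never releases them: the retry then sees every directory as already in use, which is the symptom reported here.

{code:title=LockedRetryDemo.java}
import java.io.IOException;
import java.nio.channels.FileChannel;
import java.nio.channels.FileLock;
import java.nio.channels.OverlappingFileLockException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
import java.util.List;

public class LockedRetryDemo {

  // Locks acquired by earlier attempts; never released, mirroring the bug.
  static final List<FileLock> HELD = new ArrayList<>();

  // Mark a storage directory as in use with a lock file, roughly the way a
  // DN guards each data directory.
  static FileLock lockDir(Path dir) throws IOException {
    FileChannel ch = FileChannel.open(dir.resolve("in_use.lock"),
        StandardOpenOption.CREATE, StandardOpenOption.WRITE);
    try {
      FileLock lock = ch.tryLock();
      if (lock == null) {
        throw new IOException(dir + " is locked by another process");
      }
      return lock;
    } catch (OverlappingFileLockException e) {
      // The lock is still held inside this JVM: a previous failed attempt
      // never released it, so the directory looks "already in use".
      throw new IOException(dir + " appears to be already in use", e);
    }
  }

  // One handshake attempt: lock every directory first, then "scan" them.
  static void attemptHandshake(List<Path> dirs, boolean scanFails)
      throws IOException {
    for (Path dir : dirs) {
      HELD.add(lockDir(dir));
    }
    if (scanFails) {
      // The scan fails and we bail out WITHOUT releasing the locks.
      throw new IOException("disk error while scanning a block pool");
    }
  }

  public static void main(String[] args) throws IOException {
    List<Path> dirs = new ArrayList<>();
    dirs.add(Paths.get("/tmp/dn-data1"));   // hypothetical data directories
    dirs.add(Paths.get("/tmp/dn-data2"));
    for (Path d : dirs) {
      Files.createDirectories(d);
    }
    try {
      attemptHandshake(dirs, true);         // attempt 1: the scan fails
    } catch (IOException e) {
      System.err.println("attempt 1 failed: " + e.getMessage());
    }
    try {
      attemptHandshake(dirs, false);        // retry: every dir now looks in use
    } catch (IOException e) {
      System.err.println("attempt 2 failed: " + e.getMessage());
    }
  }
}
{code}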
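The actual change is in the attached HDFS-9908 patches rather than inlined here, so the sketch below is only a rough, standalone illustration of the general direction the summary suggests (tolerating a disk scan failure instead of failing the whole handshake), not the real patch. All class and method names are hypothetical: per-volume scan failures are collected, the healthy volumes keep going, and the operation fails only if no usable volume remains.

{code:title=TolerantVolumeScan.java}
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class TolerantVolumeScan {

  // Hypothetical stand-in for a data volume; scanning it may fail.
  interface Volume {
    String name();
    void scanBlockPool(String bpid) throws IOException;
  }

  // Scan every volume in parallel, collect per-volume failures, and give up
  // only when no healthy volume is left.
  static void addBlockPool(List<Volume> volumes, String bpid)
      throws IOException {
    final List<IOException> exceptions =
        Collections.synchronizedList(new ArrayList<>());
    final List<Volume> failed =
        Collections.synchronizedList(new ArrayList<>());
    List<Thread> threads = new ArrayList<>();

    for (final Volume v : volumes) {
      Thread t = new Thread(() -> {
        try {
          v.scanBlockPool(bpid);
        } catch (IOException ioe) {
          // Record the failure instead of letting it abort the handshake.
          exceptions.add(ioe);
          failed.add(v);
        }
      });
      threads.add(t);
      t.start();
    }
    for (Thread t : threads) {
      try {
        t.join();
      } catch (InterruptedException ie) {
        Thread.currentThread().interrupt();
        throw new IOException(ie);
      }
    }

    // Fail only if every volume failed; otherwise continue with the healthy
    // subset and report the bad disks separately.
    if (!exceptions.isEmpty() && failed.size() == volumes.size()) {
      throw exceptions.get(0);
    }
    for (Volume v : failed) {
      System.err.println("Excluding failed volume: " + v.name());
    }
  }

  public static void main(String[] args) throws IOException {
    List<Volume> vols = new ArrayList<>();
    vols.add(volume("disk0", false));
    vols.add(volume("disk1", true));   // simulate a single failing disk
    vols.add(volume("disk2", false));
    addBlockPool(vols, "BP-1234");     // succeeds despite the bad disk
    System.out.println("block pool added with the remaining healthy volumes");
  }

  static Volume volume(String name, boolean failing) {
    return new Volume() {
      @Override public String name() { return name; }
      @Override public void scanBlockPool(String bpid) throws IOException {
        if (failing) {
          throw new IOException("disk error scanning " + name + " for " + bpid);
        }
      }
    };
  }
}
{code}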