[
https://issues.apache.org/jira/browse/HDFS-9908?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15219013#comment-15219013
]
Hadoop QA commented on HDFS-9908:
---------------------------------
| (x) *{color:red}-1 overall{color}* |
\\
\\
|| Vote || Subsystem || Runtime || Comment ||
| {color:blue}0{color} | {color:blue} reexec {color} | {color:blue} 0m 17s
{color} | {color:blue} Docker mode activated. {color} |
| {color:green}+1{color} | {color:green} @author {color} | {color:green} 0m 0s
{color} | {color:green} The patch does not contain any @author tags. {color} |
| {color:green}+1{color} | {color:green} test4tests {color} | {color:green} 0m
0s {color} | {color:green} The patch appears to include 2 new or modified test
files. {color} |
| {color:blue}0{color} | {color:blue} mvndep {color} | {color:blue} 0m 15s
{color} | {color:blue} Maven dependency ordering for branch {color} |
| {color:green}+1{color} | {color:green} mvninstall {color} | {color:green} 6m
54s {color} | {color:green} trunk passed {color} |
| {color:green}+1{color} | {color:green} compile {color} | {color:green} 6m 3s
{color} | {color:green} trunk passed with JDK v1.8.0_74 {color} |
| {color:green}+1{color} | {color:green} compile {color} | {color:green} 6m 53s
{color} | {color:green} trunk passed with JDK v1.7.0_95 {color} |
| {color:green}+1{color} | {color:green} checkstyle {color} | {color:green} 1m
6s {color} | {color:green} trunk passed {color} |
| {color:green}+1{color} | {color:green} mvnsite {color} | {color:green} 1m 46s
{color} | {color:green} trunk passed {color} |
| {color:green}+1{color} | {color:green} mvneclipse {color} | {color:green} 0m
28s {color} | {color:green} trunk passed {color} |
| {color:green}+1{color} | {color:green} findbugs {color} | {color:green} 3m
28s {color} | {color:green} trunk passed {color} |
| {color:green}+1{color} | {color:green} javadoc {color} | {color:green} 1m 59s
{color} | {color:green} trunk passed with JDK v1.8.0_74 {color} |
| {color:green}+1{color} | {color:green} javadoc {color} | {color:green} 2m 49s
{color} | {color:green} trunk passed with JDK v1.7.0_95 {color} |
| {color:blue}0{color} | {color:blue} mvndep {color} | {color:blue} 0m 15s
{color} | {color:blue} Maven dependency ordering for patch {color} |
| {color:green}+1{color} | {color:green} mvninstall {color} | {color:green} 1m
27s {color} | {color:green} the patch passed {color} |
| {color:green}+1{color} | {color:green} compile {color} | {color:green} 5m 37s
{color} | {color:green} the patch passed with JDK v1.8.0_74 {color} |
| {color:green}+1{color} | {color:green} javac {color} | {color:green} 5m 37s
{color} | {color:green} the patch passed {color} |
| {color:green}+1{color} | {color:green} compile {color} | {color:green} 6m 36s
{color} | {color:green} the patch passed with JDK v1.7.0_95 {color} |
| {color:green}+1{color} | {color:green} javac {color} | {color:green} 6m 36s
{color} | {color:green} the patch passed {color} |
| {color:green}+1{color} | {color:green} checkstyle {color} | {color:green} 1m
5s {color} | {color:green} the patch passed {color} |
| {color:green}+1{color} | {color:green} mvnsite {color} | {color:green} 1m 46s
{color} | {color:green} the patch passed {color} |
| {color:green}+1{color} | {color:green} mvneclipse {color} | {color:green} 0m
29s {color} | {color:green} the patch passed {color} |
| {color:green}+1{color} | {color:green} whitespace {color} | {color:green} 0m
0s {color} | {color:green} Patch has no whitespace issues. {color} |
| {color:green}+1{color} | {color:green} findbugs {color} | {color:green} 3m
57s {color} | {color:green} the patch passed {color} |
| {color:green}+1{color} | {color:green} javadoc {color} | {color:green} 1m 59s
{color} | {color:green} the patch passed with JDK v1.8.0_74 {color} |
| {color:green}+1{color} | {color:green} javadoc {color} | {color:green} 2m 55s
{color} | {color:green} the patch passed with JDK v1.7.0_95 {color} |
| {color:red}-1{color} | {color:red} unit {color} | {color:red} 6m 48s {color}
| {color:red} hadoop-common in the patch failed with JDK v1.8.0_74. {color} |
| {color:red}-1{color} | {color:red} unit {color} | {color:red} 69m 16s {color}
| {color:red} hadoop-hdfs in the patch failed with JDK v1.8.0_74. {color} |
| {color:red}-1{color} | {color:red} unit {color} | {color:red} 6m 53s {color}
| {color:red} hadoop-common in the patch failed with JDK v1.7.0_95. {color} |
| {color:red}-1{color} | {color:red} unit {color} | {color:red} 72m 8s {color}
| {color:red} hadoop-hdfs in the patch failed with JDK v1.7.0_95. {color} |
| {color:red}-1{color} | {color:red} asflicense {color} | {color:red} 0m 27s
{color} | {color:red} Patch generated 3 ASF License warnings. {color} |
| {color:black}{color} | {color:black} {color} | {color:black} 215m 5s {color}
| {color:black} {color} |
\\
\\
|| Reason || Tests ||
| JDK v1.8.0_74 Failed junit tests |
hadoop.hdfs.server.datanode.TestDataNodeMetrics |
| | hadoop.hdfs.server.namenode.snapshot.TestOpenFilesWithSnapshot |
| | hadoop.hdfs.TestDFSUpgradeFromImage |
| JDK v1.8.0_74 Timed out junit tests |
org.apache.hadoop.util.TestNativeLibraryChecker |
| JDK v1.7.0_95 Failed junit tests | hadoop.hdfs.TestHFlush |
| | hadoop.hdfs.server.namenode.snapshot.TestOpenFilesWithSnapshot |
| JDK v1.7.0_95 Timed out junit tests |
org.apache.hadoop.util.TestNativeLibraryChecker |
| | org.apache.hadoop.hdfs.TestLeaseRecovery2 |
\\
\\
|| Subsystem || Report/Notes ||
| Docker | Image:yetus/hadoop:fbe3e86 |
| JIRA Patch URL |
https://issues.apache.org/jira/secure/attachment/12796140/HDFS-9908.007.patch |
| JIRA Issue | HDFS-9908 |
| Optional Tests | asflicense compile javac javadoc mvninstall mvnsite
unit findbugs checkstyle |
| uname | Linux 2df70faf93fc 3.13.0-36-lowlatency #63-Ubuntu SMP PREEMPT Wed
Sep 3 21:56:12 UTC 2014 x86_64 x86_64 x86_64 GNU/Linux |
| Build tool | maven |
| Personality | /testptch/hadoop/patchprocess/precommit/personality/provided.sh
|
| git revision | trunk / 09d63d5 |
| Default Java | 1.7.0_95 |
| Multi-JDK versions | /usr/lib/jvm/java-8-oracle:1.8.0_74
/usr/lib/jvm/java-7-openjdk-amd64:1.7.0_95 |
| findbugs | v3.0.0 |
| unit |
https://builds.apache.org/job/PreCommit-HDFS-Build/15005/artifact/patchprocess/patch-unit-hadoop-common-project_hadoop-common-jdk1.8.0_74.txt
|
| unit |
https://builds.apache.org/job/PreCommit-HDFS-Build/15005/artifact/patchprocess/patch-unit-hadoop-hdfs-project_hadoop-hdfs-jdk1.8.0_74.txt
|
| unit |
https://builds.apache.org/job/PreCommit-HDFS-Build/15005/artifact/patchprocess/patch-unit-hadoop-common-project_hadoop-common-jdk1.7.0_95.txt
|
| unit |
https://builds.apache.org/job/PreCommit-HDFS-Build/15005/artifact/patchprocess/patch-unit-hadoop-hdfs-project_hadoop-hdfs-jdk1.7.0_95.txt
|
| unit test logs |
https://builds.apache.org/job/PreCommit-HDFS-Build/15005/artifact/patchprocess/patch-unit-hadoop-common-project_hadoop-common-jdk1.8.0_74.txt
https://builds.apache.org/job/PreCommit-HDFS-Build/15005/artifact/patchprocess/patch-unit-hadoop-hdfs-project_hadoop-hdfs-jdk1.8.0_74.txt
https://builds.apache.org/job/PreCommit-HDFS-Build/15005/artifact/patchprocess/patch-unit-hadoop-common-project_hadoop-common-jdk1.7.0_95.txt
https://builds.apache.org/job/PreCommit-HDFS-Build/15005/artifact/patchprocess/patch-unit-hadoop-hdfs-project_hadoop-hdfs-jdk1.7.0_95.txt
|
| JDK v1.7.0_95 Test Results |
https://builds.apache.org/job/PreCommit-HDFS-Build/15005/testReport/ |
| asflicense |
https://builds.apache.org/job/PreCommit-HDFS-Build/15005/artifact/patchprocess/patch-asflicense-problems.txt
|
| modules | C: hadoop-common-project/hadoop-common
hadoop-hdfs-project/hadoop-hdfs U: . |
| Console output |
https://builds.apache.org/job/PreCommit-HDFS-Build/15005/console |
| Powered by | Apache Yetus 0.2.0 http://yetus.apache.org |
This message was automatically generated.
> Datanode should tolerate disk scan failure during NN handshake
> --------------------------------------------------------------
>
> Key: HDFS-9908
> URL: https://issues.apache.org/jira/browse/HDFS-9908
> Project: Hadoop HDFS
> Issue Type: Bug
> Components: datanode
> Affects Versions: 2.5.0
> Environment: CDH5.3.3
> Reporter: Wei-Chiu Chuang
> Assignee: Wei-Chiu Chuang
> Attachments: HDFS-9908.001.patch, HDFS-9908.002.patch,
> HDFS-9908.003.patch, HDFS-9908.004.patch, HDFS-9908.005.patch,
> HDFS-9908.006.patch, HDFS-9908.007.patch
>
>
> DN may treat a disk scan failure exception as an NN handshake exception, and
> this can prevent a DN from joining a cluster even if most of its disks are healthy.
> During NN handshake, DN initializes block pools. It will create a lock file
> per disk, and then scan the volumes. However, if the scanning throws
> exceptions due to disk failure, DN will think it's an exception because NN is
> inconsistent with the local storage (see {{DataNode#initBlockPool}}). As a
> result, it will attempt to reconnect to NN again.
> However, at this point, DN has not deleted its lock files on the disks. If it
> reconnects to NN again, it will think the same disks are already being used,
> and then it will fail handshake again because all disks cannot be used (due
> to locking), and so on, repeatedly. This will happen even if the DN has multiple
> disks, and only one of them fails. The DN will not be able to connect to NN
> despite just one failing disk. Note that it is possible to successfully
> create a lock file on a disk, and then hit an error scanning the disk.
> We saw this on a CDH 5.3.3 cluster (which is based on Apache Hadoop 2.5.0,
> and we still see the same bug in 3.0.0 trunk branch). The root cause is that
> DN treats an internal error (single disk failure) as an external one (NN
> handshake failure) and we should fix it.
> {code:title=DataNode.java}
> /**
> * One of the Block Pools has successfully connected to its NN.
> * This initializes the local storage for that block pool,
> * checks consistency of the NN's cluster ID, etc.
> *
> * If this is the first block pool to register, this also initializes
> * the datanode-scoped storage.
> *
> * @param bpos Block pool offer service
> * @throws IOException if the NN is inconsistent with the local storage.
> */
> void initBlockPool(BPOfferService bpos) throws IOException {
> NamespaceInfo nsInfo = bpos.getNamespaceInfo();
> if (nsInfo == null) {
> throw new IOException("NamespaceInfo not found: Block pool " + bpos
> + " should have retrieved namespace info before initBlockPool.");
> }
>
> setClusterId(nsInfo.clusterID, nsInfo.getBlockPoolID());
> // Register the new block pool with the BP manager.
> blockPoolManager.addBlockPool(bpos);
>
> // In the case that this is the first block pool to connect, initialize
> // the dataset, block scanners, etc.
> initStorage(nsInfo);
> // Exclude failed disks before initializing the block pools to avoid startup
> // failures.
> checkDiskError();
> data.addBlockPool(nsInfo.getBlockPoolID(), conf); <----- this line
> throws disk error exception
> blockScanner.enableBlockPoolId(bpos.getBlockPoolId());
> initDirectoryScanner(conf);
> }
> {code}
> {{FsVolumeList#addBlockPool}} is the source of exception.
> {code:title=FsVolumeList.java}
> void addBlockPool(final String bpid, final Configuration conf) throws
> IOException {
> long totalStartTime = Time.monotonicNow();
>
> final List<IOException> exceptions = Collections.synchronizedList(
> new ArrayList<IOException>());
> List<Thread> blockPoolAddingThreads = new ArrayList<Thread>();
> for (final FsVolumeImpl v : volumes) {
> Thread t = new Thread() {
> public void run() {
> try (FsVolumeReference ref = v.obtainReference()) {
> FsDatasetImpl.LOG.info("Scanning block pool " + bpid +
> " on volume " + v + "...");
> long startTime = Time.monotonicNow();
> v.addBlockPool(bpid, conf);
> long timeTaken = Time.monotonicNow() - startTime;
> FsDatasetImpl.LOG.info("Time taken to scan block pool " + bpid +
> " on " + v + ": " + timeTaken + "ms");
> } catch (ClosedChannelException e) {
> // ignore.
> } catch (IOException ioe) {
> FsDatasetImpl.LOG.info("Caught exception while scanning " + v +
> ". Will throw later.", ioe);
> exceptions.add(ioe);
> }
> }
> };
> blockPoolAddingThreads.add(t);
> t.start();
> }
> for (Thread t : blockPoolAddingThreads) {
> try {
> t.join();
> } catch (InterruptedException ie) {
> throw new IOException(ie);
> }
> }
> if (!exceptions.isEmpty()) {
> throw exceptions.get(0); <----- here's the origin of the exception
> }
>
> long totalTimeTaken = Time.monotonicNow() - totalStartTime;
> FsDatasetImpl.LOG.info("Total time to scan all replicas for block pool " +
> bpid + ": " + totalTimeTaken + "ms");
> }
> {code}
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)