[
https://issues.apache.org/jira/browse/HDFS-9908?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15207329#comment-15207329
]
Hadoop QA commented on HDFS-9908:
---------------------------------
| (x) *{color:red}-1 overall{color}* |
\\
\\
|| Vote || Subsystem || Runtime || Comment ||
| {color:blue}0{color} | {color:blue} reexec {color} | {color:blue} 0m 10s
{color} | {color:blue} Docker mode activated. {color} |
| {color:green}+1{color} | {color:green} @author {color} | {color:green} 0m 0s
{color} | {color:green} The patch does not contain any @author tags. {color} |
| {color:green}+1{color} | {color:green} test4tests {color} | {color:green} 0m
0s {color} | {color:green} The patch appears to include 1 new or modified test
files. {color} |
| {color:blue}0{color} | {color:blue} mvndep {color} | {color:blue} 0m 14s
{color} | {color:blue} Maven dependency ordering for branch {color} |
| {color:green}+1{color} | {color:green} mvninstall {color} | {color:green} 6m
42s {color} | {color:green} trunk passed {color} |
| {color:green}+1{color} | {color:green} compile {color} | {color:green} 6m 1s
{color} | {color:green} trunk passed with JDK v1.8.0_74 {color} |
| {color:green}+1{color} | {color:green} compile {color} | {color:green} 6m 43s
{color} | {color:green} trunk passed with JDK v1.7.0_95 {color} |
| {color:green}+1{color} | {color:green} checkstyle {color} | {color:green} 1m
6s {color} | {color:green} trunk passed {color} |
| {color:green}+1{color} | {color:green} mvnsite {color} | {color:green} 1m 46s
{color} | {color:green} trunk passed {color} |
| {color:green}+1{color} | {color:green} mvneclipse {color} | {color:green} 0m
27s {color} | {color:green} trunk passed {color} |
| {color:green}+1{color} | {color:green} findbugs {color} | {color:green} 3m
25s {color} | {color:green} trunk passed {color} |
| {color:green}+1{color} | {color:green} javadoc {color} | {color:green} 1m 57s
{color} | {color:green} trunk passed with JDK v1.8.0_74 {color} |
| {color:green}+1{color} | {color:green} javadoc {color} | {color:green} 2m 49s
{color} | {color:green} trunk passed with JDK v1.7.0_95 {color} |
| {color:blue}0{color} | {color:blue} mvndep {color} | {color:blue} 0m 13s
{color} | {color:blue} Maven dependency ordering for patch {color} |
| {color:green}+1{color} | {color:green} mvninstall {color} | {color:green} 1m
27s {color} | {color:green} the patch passed {color} |
| {color:green}+1{color} | {color:green} compile {color} | {color:green} 6m 0s
{color} | {color:green} the patch passed with JDK v1.8.0_74 {color} |
| {color:green}+1{color} | {color:green} javac {color} | {color:green} 6m 0s
{color} | {color:green} the patch passed {color} |
| {color:green}+1{color} | {color:green} compile {color} | {color:green} 6m 40s
{color} | {color:green} the patch passed with JDK v1.7.0_95 {color} |
| {color:green}+1{color} | {color:green} javac {color} | {color:green} 6m 40s
{color} | {color:green} the patch passed {color} |
| {color:red}-1{color} | {color:red} checkstyle {color} | {color:red} 1m 7s
{color} | {color:red} root: patch generated 1 new + 214 unchanged - 0 fixed =
215 total (was 214) {color} |
| {color:green}+1{color} | {color:green} mvnsite {color} | {color:green} 1m 46s
{color} | {color:green} the patch passed {color} |
| {color:green}+1{color} | {color:green} mvneclipse {color} | {color:green} 0m
27s {color} | {color:green} the patch passed {color} |
| {color:green}+1{color} | {color:green} whitespace {color} | {color:green} 0m
0s {color} | {color:green} Patch has no whitespace issues. {color} |
| {color:red}-1{color} | {color:red} findbugs {color} | {color:red} 1m 50s
{color} | {color:red} hadoop-common-project/hadoop-common generated 13 new + 0
unchanged - 0 fixed = 13 total (was 0) {color} |
| {color:green}+1{color} | {color:green} javadoc {color} | {color:green} 1m 59s
{color} | {color:green} the patch passed with JDK v1.8.0_74 {color} |
| {color:green}+1{color} | {color:green} javadoc {color} | {color:green} 2m 50s
{color} | {color:green} the patch passed with JDK v1.7.0_95 {color} |
| {color:red}-1{color} | {color:red} unit {color} | {color:red} 20m 57s {color}
| {color:red} hadoop-common in the patch failed with JDK v1.8.0_74. {color} |
| {color:red}-1{color} | {color:red} unit {color} | {color:red} 56m 27s {color}
| {color:red} hadoop-hdfs in the patch failed with JDK v1.8.0_74. {color} |
| {color:red}-1{color} | {color:red} unit {color} | {color:red} 7m 24s {color}
| {color:red} hadoop-common in the patch failed with JDK v1.7.0_95. {color} |
| {color:green}+1{color} | {color:green} unit {color} | {color:green} 55m 24s
{color} | {color:green} hadoop-hdfs in the patch passed with JDK v1.7.0_95.
{color} |
| {color:red}-1{color} | {color:red} asflicense {color} | {color:red} 0m 26s
{color} | {color:red} Patch generated 2 ASF License warnings. {color} |
| {color:black}{color} | {color:black} {color} | {color:black} 199m 49s {color}
| {color:black} {color} |
\\
\\
|| Reason || Tests ||
| FindBugs | module:hadoop-common-project/hadoop-common |
| | Dead store to emptyResults in
org.apache.hadoop.security.LdapGroupsMapping.getGroups(String) At
LdapGroupsMapping.java:org.apache.hadoop.security.LdapGroupsMapping.getGroups(String)
At LdapGroupsMapping.java:[line 214] |
| | Inconsistent synchronization of
org.apache.hadoop.security.LdapGroupsMapping.bindPassword; locked 75% of time
Unsynchronized access at LdapGroupsMapping.java:75% of time Unsynchronized
access at LdapGroupsMapping.java:[line 323] |
| | Inconsistent synchronization of
org.apache.hadoop.security.LdapGroupsMapping.bindUser; locked 50% of time
Unsynchronized access at LdapGroupsMapping.java:50% of time Unsynchronized
access at LdapGroupsMapping.java:[line 322] |
| | Inconsistent synchronization of
org.apache.hadoop.security.LdapGroupsMapping.groupNameAttr; locked 66% of time
Unsynchronized access at LdapGroupsMapping.java:66% of time Unsynchronized
access at LdapGroupsMapping.java:[line 297] |
| | Inconsistent synchronization of
org.apache.hadoop.security.LdapGroupsMapping.groupSearchFilter; locked 50% of
time Unsynchronized access at LdapGroupsMapping.java:50% of time
Unsynchronized access at LdapGroupsMapping.java:[line 288] |
| | Inconsistent synchronization of
org.apache.hadoop.security.LdapGroupsMapping.isPosix; locked 50% of time
Unsynchronized access at LdapGroupsMapping.java:50% of time Unsynchronized
access at LdapGroupsMapping.java:[line 268] |
| | Inconsistent synchronization of
org.apache.hadoop.security.LdapGroupsMapping.keystore; locked 50% of time
Unsynchronized access at LdapGroupsMapping.java:50% of time Unsynchronized
access at LdapGroupsMapping.java:[line 318] |
| | Inconsistent synchronization of
org.apache.hadoop.security.LdapGroupsMapping.keystorePass; locked 75% of time
Unsynchronized access at LdapGroupsMapping.java:75% of time Unsynchronized
access at LdapGroupsMapping.java:[line 319] |
| | Inconsistent synchronization of
org.apache.hadoop.security.LdapGroupsMapping.ldapUrl; locked 75% of time
Unsynchronized access at LdapGroupsMapping.java:75% of time Unsynchronized
access at LdapGroupsMapping.java:[line 312] |
| | Inconsistent synchronization of
org.apache.hadoop.security.LdapGroupsMapping.posixGidAttr; locked 50% of time
Unsynchronized access at LdapGroupsMapping.java:50% of time Unsynchronized
access at LdapGroupsMapping.java:[line 271] |
| | Inconsistent synchronization of
org.apache.hadoop.security.LdapGroupsMapping.posixUidAttr; locked 66% of time
Unsynchronized access at LdapGroupsMapping.java:66% of time Unsynchronized
access at LdapGroupsMapping.java:[line 272] |
| | Inconsistent synchronization of
org.apache.hadoop.security.LdapGroupsMapping.useSsl; locked 50% of time
Unsynchronized access at LdapGroupsMapping.java:50% of time Unsynchronized
access at LdapGroupsMapping.java:[line 316] |
| | Inconsistent synchronization of
org.apache.hadoop.security.LdapGroupsMapping.userSearchFilter; locked 66% of
time Unsynchronized access at LdapGroupsMapping.java:66% of time
Unsynchronized access at LdapGroupsMapping.java:[line 258] |
| JDK v1.8.0_74 Failed junit tests |
hadoop.security.TestLdapGroupsMappingWithPosixGroup |
| | hadoop.net.TestDNS |
| | hadoop.security.TestLdapGroupsMapping |
| | hadoop.hdfs.shortcircuit.TestShortCircuitCache |
| | hadoop.hdfs.server.namenode.TestEditLog |
| | hadoop.hdfs.server.datanode.fsdataset.impl.TestLazyPersistReplicaRecovery
|
| | hadoop.hdfs.TestFileAppend |
| JDK v1.8.0_74 Timed out junit tests |
org.apache.hadoop.util.TestNativeLibraryChecker |
| | org.apache.hadoop.http.TestHttpServerLifecycle |
| JDK v1.7.0_95 Failed junit tests |
hadoop.security.TestLdapGroupsMappingWithPosixGroup |
| | hadoop.security.TestLdapGroupsMapping |
| JDK v1.7.0_95 Timed out junit tests |
org.apache.hadoop.util.TestNativeLibraryChecker |
\\
\\
|| Subsystem || Report/Notes ||
| Docker | Image:yetus/hadoop:fbe3e86 |
| JIRA Patch URL |
https://issues.apache.org/jira/secure/attachment/12794795/HDFS-9908.003.patch |
| JIRA Issue | HDFS-9908 |
| Optional Tests | asflicense compile javac javadoc mvninstall mvnsite
unit findbugs checkstyle |
| uname | Linux 5569b62de057 3.13.0-36-lowlatency #63-Ubuntu SMP PREEMPT Wed
Sep 3 21:56:12 UTC 2014 x86_64 x86_64 x86_64 GNU/Linux |
| Build tool | maven |
| Personality | /testptch/hadoop/patchprocess/precommit/personality/provided.sh
|
| git revision | trunk / e7ed05e |
| Default Java | 1.7.0_95 |
| Multi-JDK versions | /usr/lib/jvm/java-8-oracle:1.8.0_74
/usr/lib/jvm/java-7-openjdk-amd64:1.7.0_95 |
| findbugs | v3.0.0 |
| checkstyle |
https://builds.apache.org/job/PreCommit-HDFS-Build/14895/artifact/patchprocess/diff-checkstyle-root.txt
|
| findbugs |
https://builds.apache.org/job/PreCommit-HDFS-Build/14895/artifact/patchprocess/new-findbugs-hadoop-common-project_hadoop-common.html
|
| unit |
https://builds.apache.org/job/PreCommit-HDFS-Build/14895/artifact/patchprocess/patch-unit-hadoop-common-project_hadoop-common-jdk1.8.0_74.txt
|
| unit |
https://builds.apache.org/job/PreCommit-HDFS-Build/14895/artifact/patchprocess/patch-unit-hadoop-hdfs-project_hadoop-hdfs-jdk1.8.0_74.txt
|
| unit |
https://builds.apache.org/job/PreCommit-HDFS-Build/14895/artifact/patchprocess/patch-unit-hadoop-common-project_hadoop-common-jdk1.7.0_95.txt
|
| unit test logs |
https://builds.apache.org/job/PreCommit-HDFS-Build/14895/artifact/patchprocess/patch-unit-hadoop-common-project_hadoop-common-jdk1.8.0_74.txt
https://builds.apache.org/job/PreCommit-HDFS-Build/14895/artifact/patchprocess/patch-unit-hadoop-hdfs-project_hadoop-hdfs-jdk1.8.0_74.txt
https://builds.apache.org/job/PreCommit-HDFS-Build/14895/artifact/patchprocess/patch-unit-hadoop-common-project_hadoop-common-jdk1.7.0_95.txt
|
| JDK v1.7.0_95 Test Results |
https://builds.apache.org/job/PreCommit-HDFS-Build/14895/testReport/ |
| asflicense |
https://builds.apache.org/job/PreCommit-HDFS-Build/14895/artifact/patchprocess/patch-asflicense-problems.txt
|
| modules | C: hadoop-common-project/hadoop-common
hadoop-hdfs-project/hadoop-hdfs U: . |
| Console output |
https://builds.apache.org/job/PreCommit-HDFS-Build/14895/console |
| Powered by | Apache Yetus 0.2.0 http://yetus.apache.org |
This message was automatically generated.
> Datanode should tolerate disk scan failure during NN handshake
> --------------------------------------------------------------
>
> Key: HDFS-9908
> URL: https://issues.apache.org/jira/browse/HDFS-9908
> Project: Hadoop HDFS
> Issue Type: Bug
> Components: datanode
> Affects Versions: 2.5.0
> Environment: CDH5.3.3
> Reporter: Wei-Chiu Chuang
> Assignee: Wei-Chiu Chuang
> Attachments: HDFS-9908.001.patch, HDFS-9908.002.patch,
> HDFS-9908.003.patch
>
>
> DN may treat a disk scan failure exception as an NN handshake exception, and
> this can prevent a DN from joining a cluster even if most of its disks are healthy.
> During NN handshake, DN initializes block pools. It will create a lock file
> per disk, and then scan the volumes. However, if the scanning throws
> exceptions due to disk failure, DN will think it's an exception because NN is
> inconsistent with the local storage (see {{DataNode#initBlockPool}}. As a
> result, it will attempt to reconnect to NN again.
> However, at this point, DN has not deleted its lock files on the disks. If it
> reconnects to NN again, it will think the same disks are already being used,
> and then it will fail handshake again because all disks cannot be used (due
> to locking), and this repeats. This will happen even if the DN has multiple
> disks, and only one of them fails. The DN will not be able to connect to NN
> despite just one failing disk. Note that it is possible to successfully
> create a lock file on a disk, and then have an error scanning the disk.
> We saw this on a CDH 5.3.3 cluster (which is based on Apache Hadoop 2.5.0,
> and we still see the same bug in 3.0.0 trunk branch). The root cause is that
> DN treats an internal error (single disk failure) as an external one (NN
> handshake failure) and we should fix it.
> {code:title=DataNode.java}
> /**
> * One of the Block Pools has successfully connected to its NN.
> * This initializes the local storage for that block pool,
> * checks consistency of the NN's cluster ID, etc.
> *
> * If this is the first block pool to register, this also initializes
> * the datanode-scoped storage.
> *
> * @param bpos Block pool offer service
> * @throws IOException if the NN is inconsistent with the local storage.
> */
> void initBlockPool(BPOfferService bpos) throws IOException {
> NamespaceInfo nsInfo = bpos.getNamespaceInfo();
> if (nsInfo == null) {
> throw new IOException("NamespaceInfo not found: Block pool " + bpos
> + " should have retrieved namespace info before initBlockPool.");
> }
>
> setClusterId(nsInfo.clusterID, nsInfo.getBlockPoolID());
> // Register the new block pool with the BP manager.
> blockPoolManager.addBlockPool(bpos);
>
> // In the case that this is the first block pool to connect, initialize
> // the dataset, block scanners, etc.
> initStorage(nsInfo);
> // Exclude failed disks before initializing the block pools to avoid
> startup
> // failures.
> checkDiskError();
> data.addBlockPool(nsInfo.getBlockPoolID(), conf); <----- this line
> throws disk error exception
> blockScanner.enableBlockPoolId(bpos.getBlockPoolId());
> initDirectoryScanner(conf);
> }
> {code}
> {{FsVolumeList#addBlockPool}} is the source of exception.
> {code:title=FsVolumeList.java}
> void addBlockPool(final String bpid, final Configuration conf) throws
> IOException {
> long totalStartTime = Time.monotonicNow();
>
> final List<IOException> exceptions = Collections.synchronizedList(
> new ArrayList<IOException>());
> List<Thread> blockPoolAddingThreads = new ArrayList<Thread>();
> for (final FsVolumeImpl v : volumes) {
> Thread t = new Thread() {
> public void run() {
> try (FsVolumeReference ref = v.obtainReference()) {
> FsDatasetImpl.LOG.info("Scanning block pool " + bpid +
> " on volume " + v + "...");
> long startTime = Time.monotonicNow();
> v.addBlockPool(bpid, conf);
> long timeTaken = Time.monotonicNow() - startTime;
> FsDatasetImpl.LOG.info("Time taken to scan block pool " + bpid +
> " on " + v + ": " + timeTaken + "ms");
> } catch (ClosedChannelException e) {
> // ignore.
> } catch (IOException ioe) {
> FsDatasetImpl.LOG.info("Caught exception while scanning " + v +
> ". Will throw later.", ioe);
> exceptions.add(ioe);
> }
> }
> };
> blockPoolAddingThreads.add(t);
> t.start();
> }
> for (Thread t : blockPoolAddingThreads) {
> try {
> t.join();
> } catch (InterruptedException ie) {
> throw new IOException(ie);
> }
> }
> if (!exceptions.isEmpty()) {
> throw exceptions.get(0); <----- here's the origin of the exception
> }
>
> long totalTimeTaken = Time.monotonicNow() - totalStartTime;
> FsDatasetImpl.LOG.info("Total time to scan all replicas for block pool " +
> bpid + ": " + totalTimeTaken + "ms");
> }
> {code}
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)