[
https://issues.apache.org/jira/browse/HDFS-13596?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16826162#comment-16826162
]
Yuriy Malygin commented on HDFS-13596:
--------------------------------------
I'm testing last patch on a cluster with Kerberos:
# download sources from [https://github.com/apache/hadoop]
# apply patch _HDFS-13596.007.patch_
# build patched version of _hadoop-3.3.0-SNAPSHOT_
# prepare running cluster (_hadoop-2.7.3_) to _rollingUpgrade_
# run _hadoop-3.3.0-SNAPSHOT_ in _rollingUpgrade_ mode
# upload test data to HDFS
# restart NN and all is fine
# stop cluster for rollback to hadoop-2.7.3
# hadoop-2.7.3 NN start fails:
{code:java}
INFO | jvm 1 | 2019/04/25 18:01:24 | STARTUP_MSG: build =
https://git-wip-us.apache.org/repos/asf/hadoop.git -r
baa91f7c6bc9cb92be5982de4719c1c8af91ccff; compiled by 'vinodkv' on
2016-08-18T01:01Z
INFO | jvm 1 | 2019/04/25 18:01:24 | STARTUP_MSG: java = 1.8.0_102
INFO | jvm 1 | 2019/04/25 18:01:24 |
************************************************************/
INFO | jvm 1 | 2019/04/25 18:01:24 | 2019-04-25 18:01:24,767 INFO
[Thread-3] NameNode - registered UNIX signal handlers for [TERM, HUP, INT]
INFO | jvm 1 | 2019/04/25 18:01:24 | 2019-04-25 18:01:24,769 INFO
[Thread-3] NameNode - createNameNode [-rollingUpgrade, rollback]
{code}
{code:java}
INFO | jvm 1 | 2019/04/25 18:01:30 | 2019-04-25 18:01:30,342 DEBUG
[Thread-3] FSImage - Planning to load edit log stream:
org.apache.hadoop.hdfs.server.namenode.RedundantEditLogInputStream@53468702
INFO | jvm 1 | 2019/04/25 18:01:30 | 2019-04-25 18:01:30,342 DEBUG
[Thread-3] FSImage - Planning to load edit log stream:
org.apache.hadoop.hdfs.server.namenode.RedundantEditLogInputStream@5e350e83
INFO | jvm 1 | 2019/04/25 18:01:30 | 2019-04-25 18:01:30,342 DEBUG
[Thread-3] FSImage - Planning to load edit log stream:
org.apache.hadoop.hdfs.server.namenode.RedundantEditLogInputStream@cf86362
INFO | jvm 1 | 2019/04/25 18:01:30 | 2019-04-25 18:01:30,342 DEBUG
[Thread-3] FSImage - Planning to load edit log stream:
org.apache.hadoop.hdfs.server.namenode.RedundantEditLogInputStream@840d683
INFO | jvm 1 | 2019/04/25 18:01:30 | 2019-04-25 18:01:30,342 INFO
[Thread-3] FSImage - Planning to load image:
FSImageFile(file=/one/hadoop-data/dfs/current/fsimage_rollback_0000000000000000677,
cpktTxId=0000000000000000677)
INFO | jvm 1 | 2019/04/25 18:01:30 | Total time for which application
threads were stopped: 0.0007557 seconds, Stopping threads took: 0.0001181
seconds
INFO | jvm 1 | 2019/04/25 18:01:30 | 2019-04-25 18:01:30,377 ERROR
[Thread-3] FSImage - Failed to load image from
FSImageFile(file=/one/hadoop-data/dfs/current/fsimage_rollback_0000000000000000677,
cpktTxId=0000000000000000677)
INFO | jvm 1 | 2019/04/25 18:01:30 |
java.lang.ArrayIndexOutOfBoundsException: 536870913
INFO | jvm 1 | 2019/04/25 18:01:30 | at
org.apache.hadoop.hdfs.server.namenode.FSImageFormatProtobuf$Loader.loadStringTableSection(FSImageFormatProtobuf.java:318)
INFO | jvm 1 | 2019/04/25 18:01:30 | at
org.apache.hadoop.hdfs.server.namenode.FSImageFormatProtobuf$Loader.loadInternal(FSImageFormatProtobuf.java:251)
INFO | jvm 1 | 2019/04/25 18:01:30 | at
org.apache.hadoop.hdfs.server.namenode.FSImageFormatProtobuf$Loader.load(FSImageFormatProtobuf.java:182)
INFO | jvm 1 | 2019/04/25 18:01:30 | at
org.apache.hadoop.hdfs.server.namenode.FSImageFormat$LoaderDelegator.load(FSImageFormat.java:226)
INFO | jvm 1 | 2019/04/25 18:01:30 | at
org.apache.hadoop.hdfs.server.namenode.FSImage.loadFSImage(FSImage.java:963)
INFO | jvm 1 | 2019/04/25 18:01:30 | at
org.apache.hadoop.hdfs.server.namenode.FSImage.loadFSImage(FSImage.java:947)
INFO | jvm 1 | 2019/04/25 18:01:30 | at
org.apache.hadoop.hdfs.server.namenode.FSImage.loadFSImageFile(FSImage.java:746)
INFO | jvm 1 | 2019/04/25 18:01:30 | at
org.apache.hadoop.hdfs.server.namenode.FSImage.loadFSImage(FSImage.java:677)
INFO | jvm 1 | 2019/04/25 18:01:30 | at
org.apache.hadoop.hdfs.server.namenode.FSImage.recoverTransitionRead(FSImage.java:294)
INFO | jvm 1 | 2019/04/25 18:01:30 | at
org.apache.hadoop.hdfs.server.namenode.FSNamesystem.loadFSImage(FSNamesystem.java:976)
INFO | jvm 1 | 2019/04/25 18:01:30 | at
org.apache.hadoop.hdfs.server.namenode.FSNamesystem.loadFromDisk(FSNamesystem.java:681)
INFO | jvm 1 | 2019/04/25 18:01:30 | at
org.apache.hadoop.hdfs.server.namenode.NameNode.loadNamesystem(NameNode.java:585)
INFO | jvm 1 | 2019/04/25 18:01:30 | at
org.apache.hadoop.hdfs.server.namenode.NameNode.initialize(NameNode.java:645)
INFO | jvm 1 | 2019/04/25 18:01:30 | at
org.apache.hadoop.hdfs.server.namenode.NameNode.<init>(NameNode.java:812)
INFO | jvm 1 | 2019/04/25 18:01:30 | at
org.apache.hadoop.hdfs.server.namenode.NameNode.<init>(NameNode.java:796)
INFO | jvm 1 | 2019/04/25 18:01:30 | at
org.apache.hadoop.hdfs.server.namenode.NameNode.createNameNode(NameNode.java:1493)
INFO | jvm 1 | 2019/04/25 18:01:30 | at
org.apache.hadoop.hdfs.server.namenode.NameNode.main(NameNode.java:1559)
{code}
{code:java}
INFO | jvm 1 | 2019/04/25 18:01:30 | 2019-04-25 18:01:30,726 ERROR
[Thread-3] NameNode - Failed to start namenode.
INFO | jvm 1 | 2019/04/25 18:01:30 | java.io.IOException: Failed to load
FSImage file, see error(s) above for more info.
INFO | jvm 1 | 2019/04/25 18:01:30 | at
org.apache.hadoop.hdfs.server.namenode.FSImage.loadFSImage(FSImage.java:692)
INFO | jvm 1 | 2019/04/25 18:01:30 | at
org.apache.hadoop.hdfs.server.namenode.FSImage.recoverTransitionRead(FSImage.java:294)
INFO | jvm 1 | 2019/04/25 18:01:30 | at
org.apache.hadoop.hdfs.server.namenode.FSNamesystem.loadFSImage(FSNamesystem.java:976)
INFO | jvm 1 | 2019/04/25 18:01:30 | at
org.apache.hadoop.hdfs.server.namenode.FSNamesystem.loadFromDisk(FSNamesystem.java:681)
INFO | jvm 1 | 2019/04/25 18:01:30 | at
org.apache.hadoop.hdfs.server.namenode.NameNode.loadNamesystem(NameNode.java:585)
INFO | jvm 1 | 2019/04/25 18:01:30 | at
org.apache.hadoop.hdfs.server.namenode.NameNode.initialize(NameNode.java:645)
INFO | jvm 1 | 2019/04/25 18:01:30 | at
org.apache.hadoop.hdfs.server.namenode.NameNode.<init>(NameNode.java:812)
INFO | jvm 1 | 2019/04/25 18:01:30 | at
org.apache.hadoop.hdfs.server.namenode.NameNode.<init>(NameNode.java:796)
INFO | jvm 1 | 2019/04/25 18:01:30 | at
org.apache.hadoop.hdfs.server.namenode.NameNode.createNameNode(NameNode.java:1493)
INFO | jvm 1 | 2019/04/25 18:01:30 | at
org.apache.hadoop.hdfs.server.namenode.NameNode.main(NameNode.java:1559)
INFO | jvm 1 | 2019/04/25 18:01:30 | 2019-04-25 18:01:30,728 INFO
[Thread-3] ExitUtil - Exiting with status 1
{code}
> NN restart fails after RollingUpgrade from 2.x to 3.x
> -----------------------------------------------------
>
> Key: HDFS-13596
> URL: https://issues.apache.org/jira/browse/HDFS-13596
> Project: Hadoop HDFS
> Issue Type: Bug
> Components: hdfs
> Reporter: Hanisha Koneru
> Assignee: Fei Hui
> Priority: Critical
> Attachments: HDFS-13596.001.patch, HDFS-13596.002.patch,
> HDFS-13596.003.patch, HDFS-13596.004.patch, HDFS-13596.005.patch,
> HDFS-13596.006.patch, HDFS-13596.007.patch
>
>
> After rollingUpgrade NN from 2.x and 3.x, if the NN is restarted, it fails
> while replaying edit logs.
> * After NN is started with rollingUpgrade, the layoutVersion written to
> editLogs (before finalizing the upgrade) is the pre-upgrade layout version
> (so as to support downgrade).
> * When writing transactions to log, NN writes as per the current layout
> version. In 3.x, erasureCoding bits are added to the editLog transactions.
> * So any edit log written after the upgrade and before finalizing the
> upgrade will have the old layout version but the new format of transactions.
> * When NN is restarted and the edit logs are replayed, the NN reads the old
> layout version from the editLog file. When parsing the transactions, it
> assumes that the transactions are also from the previous layout and hence
> skips parsing the erasureCoding bits.
> * This cascades into reading the wrong set of bits for other fields and
> leads to NN shutting down.
> Sample error output:
> {code:java}
> java.lang.IllegalArgumentException: Invalid clientId - length is 0 expected
> length 16
> at com.google.common.base.Preconditions.checkArgument(Preconditions.java:88)
> at org.apache.hadoop.ipc.RetryCache$CacheEntry.<init>(RetryCache.java:74)
> at org.apache.hadoop.ipc.RetryCache$CacheEntry.<init>(RetryCache.java:86)
> at
> org.apache.hadoop.ipc.RetryCache$CacheEntryWithPayload.<init>(RetryCache.java:163)
> at
> org.apache.hadoop.ipc.RetryCache.addCacheEntryWithPayload(RetryCache.java:322)
> at
> org.apache.hadoop.hdfs.server.namenode.FSNamesystem.addCacheEntryWithPayload(FSNamesystem.java:960)
> at
> org.apache.hadoop.hdfs.server.namenode.FSEditLogLoader.applyEditLogOp(FSEditLogLoader.java:397)
> at
> org.apache.hadoop.hdfs.server.namenode.FSEditLogLoader.loadEditRecords(FSEditLogLoader.java:249)
> at
> org.apache.hadoop.hdfs.server.namenode.FSEditLogLoader.loadFSEdits(FSEditLogLoader.java:158)
> at org.apache.hadoop.hdfs.server.namenode.FSImage.loadEdits(FSImage.java:888)
> at
> org.apache.hadoop.hdfs.server.namenode.FSImage.loadFSImage(FSImage.java:745)
> at
> org.apache.hadoop.hdfs.server.namenode.FSImage.recoverTransitionRead(FSImage.java:323)
> at
> org.apache.hadoop.hdfs.server.namenode.FSNamesystem.loadFSImage(FSNamesystem.java:1086)
> at
> org.apache.hadoop.hdfs.server.namenode.FSNamesystem.loadFromDisk(FSNamesystem.java:714)
> at
> org.apache.hadoop.hdfs.server.namenode.NameNode.loadNamesystem(NameNode.java:632)
> at
> org.apache.hadoop.hdfs.server.namenode.NameNode.initialize(NameNode.java:694)
> at org.apache.hadoop.hdfs.server.namenode.NameNode.<init>(NameNode.java:937)
> at org.apache.hadoop.hdfs.server.namenode.NameNode.<init>(NameNode.java:910)
> at
> org.apache.hadoop.hdfs.server.namenode.NameNode.createNameNode(NameNode.java:1643)
> at org.apache.hadoop.hdfs.server.namenode.NameNode.main(NameNode.java:1710)
> 2018-05-17 19:10:06,522 WARN
> org.apache.hadoop.hdfs.server.namenode.FSNamesystem: Encountered exception
> loading fsimage
> java.io.IOException: java.lang.IllegalStateException: Cannot skip to less
> than the current value (=16389), where newValue=16388
> at
> org.apache.hadoop.hdfs.server.namenode.FSDirectory.resetLastInodeId(FSDirectory.java:1945)
> at
> org.apache.hadoop.hdfs.server.namenode.FSEditLogLoader.loadEditRecords(FSEditLogLoader.java:298)
> at
> org.apache.hadoop.hdfs.server.namenode.FSEditLogLoader.loadFSEdits(FSEditLogLoader.java:158)
> at org.apache.hadoop.hdfs.server.namenode.FSImage.loadEdits(FSImage.java:888)
> at
> org.apache.hadoop.hdfs.server.namenode.FSImage.loadFSImage(FSImage.java:745)
> at
> org.apache.hadoop.hdfs.server.namenode.FSImage.recoverTransitionRead(FSImage.java:323)
> at
> org.apache.hadoop.hdfs.server.namenode.FSNamesystem.loadFSImage(FSNamesystem.java:1086)
> at
> org.apache.hadoop.hdfs.server.namenode.FSNamesystem.loadFromDisk(FSNamesystem.java:714)
> at
> org.apache.hadoop.hdfs.server.namenode.NameNode.loadNamesystem(NameNode.java:632)
> at
> org.apache.hadoop.hdfs.server.namenode.NameNode.initialize(NameNode.java:694)
> at org.apache.hadoop.hdfs.server.namenode.NameNode.<init>(NameNode.java:937)
> at org.apache.hadoop.hdfs.server.namenode.NameNode.<init>(NameNode.java:910)
> at
> org.apache.hadoop.hdfs.server.namenode.NameNode.createNameNode(NameNode.java:1643)
> at org.apache.hadoop.hdfs.server.namenode.NameNode.main(NameNode.java:1710)
> Caused by: java.lang.IllegalStateException: Cannot skip to less than the
> current value (=16389), where newValue=16388
> at org.apache.hadoop.util.SequentialNumber.skipTo(SequentialNumber.java:58)
> at
> org.apache.hadoop.hdfs.server.namenode.FSDirectory.resetLastInodeId(FSDirectory.java:1943)
> {code}
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]