[
https://issues.apache.org/jira/browse/HDFS-13596?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16820034#comment-16820034
]
yao commented on HDFS-13596:
----------------------------
After a rolling upgrade of the NN and DN nodes to 3.2.0, running the example
pi job with a 3.2.0 client fails.
Sample of the write failure:
{quote}19/04/17 11:45:00 INFO client.AHSProxy: Connecting to Application
History server at slave01.jd.com/192.168.1.101:10200
19/04/17 11:45:00 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding
for path: /yarn/staging/hdfs/.staging/job_1555472525840_0001
19/04/17 11:45:00 INFO mapreduce.JobSubmitter: Cleaning up the staging area
/yarn/staging/hdfs/.staging/job_1555472525840_0001
org.apache.hadoop.security.AccessControlException: setErasureCodingPolicy not
supported.
at
org.apache.hadoop.hdfs.server.namenode.FSNamesystem.checkErasureCodingSupported(FSNamesystem.java:7797)
at
org.apache.hadoop.hdfs.server.namenode.FSNamesystem.setErasureCodingPolicy(FSNamesystem.java:7530)
at
org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.setErasureCodingPolicy(NameNodeRpcServer.java:2171)
at
org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.setErasureCodingPolicy(ClientNamenodeProtocolServerSideTranslatorPB.java:1598)
at
org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java)
at
org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:524)
at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1025)
at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:876)
at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:822)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1730)
at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2682)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at
sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at
sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at
org.apache.hadoop.ipc.RemoteException.instantiateException(RemoteException.java:121)
at
org.apache.hadoop.ipc.RemoteException.unwrapRemoteException(RemoteException.java:88)
at org.apache.hadoop.hdfs.DFSClient.setErasureCodingPolicy(DFSClient.java:2748)
at
org.apache.hadoop.hdfs.DistributedFileSystem$65.doCall(DistributedFileSystem.java:2852)
at
org.apache.hadoop.hdfs.DistributedFileSystem$65.doCall(DistributedFileSystem.java:2849)
at
org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)
at
org.apache.hadoop.hdfs.DistributedFileSystem.setErasureCodingPolicy(DistributedFileSystem.java:2867)
at
org.apache.hadoop.mapreduce.JobResourceUploader.disableErasureCodingForPath(JobResourceUploader.java:885)
at
org.apache.hadoop.mapreduce.JobResourceUploader.uploadResourcesInternal(JobResourceUploader.java:176)
at
org.apache.hadoop.mapreduce.JobResourceUploader.uploadResources(JobResourceUploader.java:133)
at
org.apache.hadoop.mapreduce.JobSubmitter.copyAndConfigureFiles(JobSubmitter.java:99)
at
org.apache.hadoop.mapreduce.JobSubmitter.submitJobInternal(JobSubmitter.java:194)
at org.apache.hadoop.mapreduce.Job$11.run(Job.java:1570)
at org.apache.hadoop.mapreduce.Job$11.run(Job.java:1567)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1730)
at org.apache.hadoop.mapreduce.Job.submit(Job.java:1567)
at org.apache.hadoop.mapreduce.Job.waitForCompletion(Job.java:1588)
at
org.apache.hadoop.examples.QuasiMonteCarlo.estimatePi(QuasiMonteCarlo.java:307)
at org.apache.hadoop.examples.QuasiMonteCarlo.run(QuasiMonteCarlo.java:360)
at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:76)
at org.apache.hadoop.examples.QuasiMonteCarlo.main(QuasiMonteCarlo.java:368)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at
org.apache.hadoop.util.ProgramDriver$ProgramDescription.invoke(ProgramDriver.java:71)
at org.apache.hadoop.util.ProgramDriver.run(ProgramDriver.java:144)
at org.apache.hadoop.examples.ExampleDriver.main(ExampleDriver.java:74)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.hadoop.util.RunJar.run(RunJar.java:323)
at org.apache.hadoop.util.RunJar.main(RunJar.java:236)
Caused by:
org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.security.AccessControlException):
setErasureCodingPolicy not supported.
at
org.apache.hadoop.hdfs.server.namenode.FSNamesystem.checkErasureCodingSupported(FSNamesystem.java:7797)
at
org.apache.hadoop.hdfs.server.namenode.FSNamesystem.setErasureCodingPolicy(FSNamesystem.java:7530)
at
org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.setErasureCodingPolicy(NameNodeRpcServer.java:2171)
at
org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.setErasureCodingPolicy(ClientNamenodeProtocolServerSideTranslatorPB.java:1598)
at
org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java)
at
org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:524)
at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1025)
at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:876)
at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:822)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1730)
at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2682)
at org.apache.hadoop.ipc.Client.getRpcResponse(Client.java:1511)
at org.apache.hadoop.ipc.Client.call(Client.java:1457)
at org.apache.hadoop.ipc.Client.call(Client.java:1367)
at
org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:228)
at
org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:116)
at com.sun.proxy.$Proxy9.setErasureCodingPolicy(Unknown Source)
at
org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.setErasureCodingPolicy(ClientNamenodeProtocolTranslatorPB.java:1603)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at
org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:422)
at
org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeMethod(RetryInvocationHandler.java:165)
at
org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invoke(RetryInvocationHandler.java:157)
at
org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeOnce(RetryInvocationHandler.java:95)
at
org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:359)
at com.sun.proxy.$Proxy10.setErasureCodingPolicy(Unknown Source)
at org.apache.hadoop.hdfs.DFSClient.setErasureCodingPolicy(DFSClient.java:2746)
... 33 more
{quote}
I found that class JobResourceUploader has a disableErasureCodingForPath method,
which invokes setErasureCodingPolicy. When the NameNode handles the
setErasureCodingPolicy call, checkErasureCodingSupported is invoked and throws
this exception.
So I changed the FSNamesystem.setErasureCodingPolicy() method, adding an if
statement that skips the check for the 3-replica (replication) policy, as follows:
{quote}if(!ErasureCodeConstants.REPLICATION_POLICY_NAME.equals(ecPolicyName)) {
checkErasureCodingSupported(operationName);
}
{quote}
It works.
> NN restart fails after RollingUpgrade from 2.x to 3.x
> -----------------------------------------------------
>
> Key: HDFS-13596
> URL: https://issues.apache.org/jira/browse/HDFS-13596
> Project: Hadoop HDFS
> Issue Type: Bug
> Components: hdfs
> Reporter: Hanisha Koneru
> Assignee: Fei Hui
> Priority: Critical
> Attachments: HDFS-13596.001.patch, HDFS-13596.002.patch,
> HDFS-13596.003.patch, HDFS-13596.004.patch, HDFS-13596.005.patch,
> HDFS-13596.006.patch, HDFS-13596.007.patch
>
>
> After a rolling upgrade of the NN from 2.x to 3.x, if the NN is restarted, it fails
> while replaying edit logs.
> * After NN is started with rollingUpgrade, the layoutVersion written to
> editLogs (before finalizing the upgrade) is the pre-upgrade layout version
> (so as to support downgrade).
> * When writing transactions to log, NN writes as per the current layout
> version. In 3.x, erasureCoding bits are added to the editLog transactions.
> * So any edit log written after the upgrade and before finalizing the
> upgrade will have the old layout version but the new format of transactions.
> * When NN is restarted and the edit logs are replayed, the NN reads the old
> layout version from the editLog file. When parsing the transactions, it
> assumes that the transactions are also from the previous layout and hence
> skips parsing the erasureCoding bits.
> * This cascades into reading the wrong set of bits for other fields and
> leads to NN shutting down.
> Sample error output:
> {code:java}
> java.lang.IllegalArgumentException: Invalid clientId - length is 0 expected
> length 16
> at com.google.common.base.Preconditions.checkArgument(Preconditions.java:88)
> at org.apache.hadoop.ipc.RetryCache$CacheEntry.<init>(RetryCache.java:74)
> at org.apache.hadoop.ipc.RetryCache$CacheEntry.<init>(RetryCache.java:86)
> at
> org.apache.hadoop.ipc.RetryCache$CacheEntryWithPayload.<init>(RetryCache.java:163)
> at
> org.apache.hadoop.ipc.RetryCache.addCacheEntryWithPayload(RetryCache.java:322)
> at
> org.apache.hadoop.hdfs.server.namenode.FSNamesystem.addCacheEntryWithPayload(FSNamesystem.java:960)
> at
> org.apache.hadoop.hdfs.server.namenode.FSEditLogLoader.applyEditLogOp(FSEditLogLoader.java:397)
> at
> org.apache.hadoop.hdfs.server.namenode.FSEditLogLoader.loadEditRecords(FSEditLogLoader.java:249)
> at
> org.apache.hadoop.hdfs.server.namenode.FSEditLogLoader.loadFSEdits(FSEditLogLoader.java:158)
> at org.apache.hadoop.hdfs.server.namenode.FSImage.loadEdits(FSImage.java:888)
> at
> org.apache.hadoop.hdfs.server.namenode.FSImage.loadFSImage(FSImage.java:745)
> at
> org.apache.hadoop.hdfs.server.namenode.FSImage.recoverTransitionRead(FSImage.java:323)
> at
> org.apache.hadoop.hdfs.server.namenode.FSNamesystem.loadFSImage(FSNamesystem.java:1086)
> at
> org.apache.hadoop.hdfs.server.namenode.FSNamesystem.loadFromDisk(FSNamesystem.java:714)
> at
> org.apache.hadoop.hdfs.server.namenode.NameNode.loadNamesystem(NameNode.java:632)
> at
> org.apache.hadoop.hdfs.server.namenode.NameNode.initialize(NameNode.java:694)
> at org.apache.hadoop.hdfs.server.namenode.NameNode.<init>(NameNode.java:937)
> at org.apache.hadoop.hdfs.server.namenode.NameNode.<init>(NameNode.java:910)
> at
> org.apache.hadoop.hdfs.server.namenode.NameNode.createNameNode(NameNode.java:1643)
> at org.apache.hadoop.hdfs.server.namenode.NameNode.main(NameNode.java:1710)
> 2018-05-17 19:10:06,522 WARN
> org.apache.hadoop.hdfs.server.namenode.FSNamesystem: Encountered exception
> loading fsimage
> java.io.IOException: java.lang.IllegalStateException: Cannot skip to less
> than the current value (=16389), where newValue=16388
> at
> org.apache.hadoop.hdfs.server.namenode.FSDirectory.resetLastInodeId(FSDirectory.java:1945)
> at
> org.apache.hadoop.hdfs.server.namenode.FSEditLogLoader.loadEditRecords(FSEditLogLoader.java:298)
> at
> org.apache.hadoop.hdfs.server.namenode.FSEditLogLoader.loadFSEdits(FSEditLogLoader.java:158)
> at org.apache.hadoop.hdfs.server.namenode.FSImage.loadEdits(FSImage.java:888)
> at
> org.apache.hadoop.hdfs.server.namenode.FSImage.loadFSImage(FSImage.java:745)
> at
> org.apache.hadoop.hdfs.server.namenode.FSImage.recoverTransitionRead(FSImage.java:323)
> at
> org.apache.hadoop.hdfs.server.namenode.FSNamesystem.loadFSImage(FSNamesystem.java:1086)
> at
> org.apache.hadoop.hdfs.server.namenode.FSNamesystem.loadFromDisk(FSNamesystem.java:714)
> at
> org.apache.hadoop.hdfs.server.namenode.NameNode.loadNamesystem(NameNode.java:632)
> at
> org.apache.hadoop.hdfs.server.namenode.NameNode.initialize(NameNode.java:694)
> at org.apache.hadoop.hdfs.server.namenode.NameNode.<init>(NameNode.java:937)
> at org.apache.hadoop.hdfs.server.namenode.NameNode.<init>(NameNode.java:910)
> at
> org.apache.hadoop.hdfs.server.namenode.NameNode.createNameNode(NameNode.java:1643)
> at org.apache.hadoop.hdfs.server.namenode.NameNode.main(NameNode.java:1710)
> Caused by: java.lang.IllegalStateException: Cannot skip to less than the
> current value (=16389), where newValue=16388
> at org.apache.hadoop.util.SequentialNumber.skipTo(SequentialNumber.java:58)
> at
> org.apache.hadoop.hdfs.server.namenode.FSDirectory.resetLastInodeId(FSDirectory.java:1943)
> {code}
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]