[ 
https://issues.apache.org/jira/browse/YARN-6013?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15822290#comment-15822290
 ] 

Steven Rand commented on YARN-6013:
-----------------------------------

Relevant part of AM container log at DEBUG level:

{code}
2017-01-13 14:27:45,422 DEBUG [RMCommunicator Allocator] 
org.apache.hadoop.security.SaslRpcClient: Sending sasl message state: NEGOTIATE

2017-01-13 14:27:45,423 DEBUG [RMCommunicator Allocator] 
org.apache.hadoop.security.SaslRpcClient: Get token info proto:interface 
org.apache.hadoop.yarn.api.ApplicationMasterProtocolPB 
info:org.apache.hadoop.yarn.security.SchedulerSecurityInfo$1@76856ec2
2017-01-13 14:27:45,423 DEBUG [RMCommunicator Allocator] 
org.apache.hadoop.yarn.security.AMRMTokenSelector: Looking for a token with 
service 10.0.22.125:8030
2017-01-13 14:27:45,423 DEBUG [RMCommunicator Allocator] 
org.apache.hadoop.yarn.security.AMRMTokenSelector: Token kind is 
YARN_AM_RM_TOKEN and the token's service name is 10.0.22.125:8030
2017-01-13 14:27:45,423 DEBUG [RMCommunicator Allocator] 
org.apache.hadoop.security.SaslRpcClient: Creating SASL DIGEST-MD5(TOKEN)  
client to authenticate to service at default
2017-01-13 14:27:45,423 DEBUG [RMCommunicator Allocator] 
org.apache.hadoop.security.SaslRpcClient: Use TOKEN authentication for protocol 
ApplicationMasterProtocolPB
2017-01-13 14:27:45,423 DEBUG [RMCommunicator Allocator] 
org.apache.hadoop.security.SaslRpcClient: SASL client callback: setting 
username: Cg0KCQgDELeEw8iZKxABEOq0jBE=
2017-01-13 14:27:45,423 DEBUG [RMCommunicator Allocator] 
org.apache.hadoop.security.SaslRpcClient: SASL client callback: setting 
userPassword
2017-01-13 14:27:45,423 DEBUG [RMCommunicator Allocator] 
org.apache.hadoop.security.SaslRpcClient: SASL client callback: setting realm: 
default
2017-01-13 14:27:45,423 DEBUG [RMCommunicator Allocator] 
org.apache.hadoop.security.SaslRpcClient: Sending sasl message state: INITIATE
token: 
"charset=utf-8,username=\"Cg0KCQgDELeEw8iZKxABEOq0jBE=\",realm=\"default\",nonce=\"WR1cP6XR4aRmnTNDvPNIJVyQoVcKs/L31wZ/2aTq\",nc=00000001,cnonce=\"h2O79JUxneBTofVfLlzS4BnkU1k2QWeFV8K0f6V7\",digest-uri=\"/default\",maxbuf=65536,response=5c07c8484a146548ee71bc1347451c23,qop=auth-conf,cipher=\"3des\""
auths {
  method: "TOKEN"
  mechanism: "DIGEST-MD5"
  protocol: ""
  serverId: "default"
}

2017-01-13 14:27:45,425 DEBUG [RMCommunicator Allocator] 
org.apache.hadoop.ipc.Client: Negotiated QOP is :auth-conf
2017-01-13 14:27:45,425 DEBUG [IPC Parameter Sending Thread #0] 
org.apache.hadoop.ipc.Client: IPC Client (1943634922) connection to 
<RM_HOSTNAME>/10.0.22.125:8030 from <YARN_USER> sending #25
2017-01-13 14:27:45,426 DEBUG [IPC Client (1943634922) connection to 
<RM_HOSTNAME>/10.0.22.125:8030 from <YARN_USER>] org.apache.hadoop.ipc.Client: 
IPC Client (1943634922) connection to <RM_HOSTNAME>/10.0.22.125:8030 from 
<YARN_USER>: starting, having connections 2
2017-01-13 14:27:45,426 DEBUG [IPC Parameter Sending Thread #0] 
org.apache.hadoop.security.SaslRpcClient: wrapping token of length:306
2017-01-13 14:27:45,427 DEBUG [IPC Parameter Sending Thread #0] 
org.apache.hadoop.security.SaslRpcClient: Sending sasl message state: WRAP
token: 
"i\236:\323o!\016C\032\032\303c%I\036\305\245\253J\271\311\226\213\246\234\260\341\315\371p\017\271\027k\223\367D\326\276\210e\330\237\207\247\r\002\037\004\031l\231q\203\232rnx\271\317h\017\247\357\325_}\233s,\250\340\002\345\3318\364\307\246\032+\fI\307\366\351\303l\006\030\tY\251\205WE\227;\276\022Z\210\363W\317\252\376$\316\243\214\313\004\317hv\033\333\211\230w\273s\375S\260\262\205}\343\033\362m\006\a\250\236e\266\034\362\352\317\211=F\367#H6\237\322\232$]\217%\340\260Y\330\034\302\266\315\201*=//\350\220U\276lk\345\253o\021\242e\365M&\200\037\235?O\224\371\317\2322{f\262\270\323\b\247\264\364\231\243\257\342\334e\036d\001ez\266`R\362\026~\340.A\356]E\331\005\214\033E\223\202\t\275\275h\355\v\240B\300\265\335\303a\021\326
 
\331\247\206f|\211H|E~\334;\210\371p\370\222^\241O\005;3\rNY\2456\006\257\222\220K;\363\222J\311\006\030\243\337\340pG/\357w\017\2276Z~\313\303\241\246\016\232_\345D\374\343o\266\231yF\371\000\001\000\000\000\000"

2017-01-13 14:27:45,427 DEBUG [IPC Client (1943634922) connection to 
<RM_HOSTNAME>/10.0.22.125:8030 from <YARN_USER>] 
org.apache.hadoop.security.SaslRpcClient: reading next wrapped RPC packet
2017-01-13 14:27:45,430 DEBUG [IPC Client (1943634922) connection to 
<RM_HOSTNAME>/10.0.22.125:8030 from <YARN_USER>] 
org.apache.hadoop.security.SaslRpcClient: unwrapping token of length:3574
2017-01-13 14:27:45,431 DEBUG [IPC Client (1943634922) connection to 
<RM_HOSTNAME>/10.0.22.125:8030 from <YARN_USER>] org.apache.hadoop.ipc.Client: 
closing ipc connection to <RM_HOSTNAME>/10.0.22.125:8030: null
java.io.EOFException
        at java.io.DataInputStream.readInt(DataInputStream.java:392)
        at 
org.apache.hadoop.ipc.Client$IpcStreams.readResponse(Client.java:1785)
        at 
org.apache.hadoop.ipc.Client$Connection.receiveRpcResponse(Client.java:1156)
        at org.apache.hadoop.ipc.Client$Connection.run(Client.java:1053)
2017-01-13 14:27:45,431 DEBUG [IPC Client (1943634922) connection to 
<RM_HOSTNAME>/10.0.22.125:8030 from <YARN_USER>] org.apache.hadoop.ipc.Client: 
IPC Client (1943634922) connection to <RM_HOSTNAME>/10.0.22.125:8030 from 
<YARN_USER>: closed
2017-01-13 14:27:45,431 DEBUG [IPC Client (1943634922) connection to 
<RM_HOSTNAME>/10.0.22.125:8030 from <YARN_USER>] org.apache.hadoop.ipc.Client: 
IPC Client (1943634922) connection to <RM_HOSTNAME>/10.0.22.125:8030 from 
<YARN_USER>: stopped, remaining connections 1
2017-01-13 14:27:45,431 INFO [RMCommunicator Allocator] 
org.apache.hadoop.io.retry.RetryInvocationHandler: Exception while invoking 
ApplicationMasterProtocolPBClientImpl.allocate over null. Retrying after 
sleeping for 30000ms.
java.io.EOFException: End of File Exception between local host is: 
"<AM_HOST>/10.0.22.190"; destination host is: "<RM_HOSTNAME>":8030; : 
java.io.EOFException; For more details see:  
http://wiki.apache.org/hadoop/EOFException
        at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
        at 
sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
        ... rest of stack trace from above ...
{code}

The part that sticks out to me is {{org.apache.hadoop.security.SaslRpcClient: 
unwrapping token of length:3574}}, indicating that the response received from 
the RM is of a valid length. However, the EOFException comes from {{int length 
= in.readInt();}}, meaning that after we've unwrapped the RPC response and 
created a {{DataInputStream}} from it, the length of the DataInputStream is 
less than 4 bytes.

This seems to indicate a problem in {{SaslRpcClient}} (maybe 
SaslRpcClient#getInputStream?), but other RPC calls from the AM to the RM and 
back are working just fine (e.g., before we got around to allocating containers, 
the AM was able to successfully call {{registerApplicationMaster}}).

> ApplicationMasterProtocolPBClientImpl.allocate fails with EOFException when 
> RPC privacy is enabled
> --------------------------------------------------------------------------------------------------
>
>                 Key: YARN-6013
>                 URL: https://issues.apache.org/jira/browse/YARN-6013
>             Project: Hadoop YARN
>          Issue Type: Bug
>          Components: client, yarn
>    Affects Versions: 2.8.0
>            Reporter: Steven Rand
>            Priority: Critical
>         Attachments: yarn-rm-log.txt
>
>
> When privacy is enabled for RPC (hadoop.rpc.protection = privacy), 
> {{ApplicationMasterProtocolPBClientImpl.allocate}} sometimes (but not always) 
> fails with an EOFException. I've reproduced this with Spark 2.0.2 built 
> against latest branch-2.8 and with a simple distcp job on latest branch-2.8.
> Steps to reproduce using distcp:
> 1. Set hadoop.rpc.protection equal to privacy
> 2. Write data to HDFS. I did this with Spark as follows: 
> {code}
> sc.parallelize(1 to (5*1024*1024)).map(k => Seq(k, 
> org.apache.commons.lang.RandomStringUtils.random(1024, 
> "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWxyZ0123456789")).mkString("|")).toDF().repartition(100).write.parquet("hdfs:///tmp/testData")
> {code}
> 3. Attempt to distcp that data to another location in HDFS. For example:
> {code}
> hadoop distcp -Dmapreduce.framework.name=yarn hdfs:///tmp/testData 
> hdfs:///tmp/testDataCopy
> {code}
> I observed this error in the ApplicationMaster's syslog:
> {code}
> 2016-12-19 19:13:50,097 INFO [eventHandlingThread] 
> org.apache.hadoop.mapreduce.jobhistory.JobHistoryEventHandler: Event Writer 
> setup for JobId: job_1482189777425_0004, File: 
> hdfs://<namenode_host>:8020/tmp/hadoop-yarn/staging/<hdfs_user>/.staging/job_1482189777425_0004/job_1482189777425_0004_1.jhist
> 2016-12-19 19:13:51,004 INFO [RMCommunicator Allocator] 
> org.apache.hadoop.mapreduce.v2.app.rm.RMContainerAllocator: Before 
> Scheduling: PendingReds:0 ScheduledMaps:4 ScheduledReds:0 AssignedMaps:0 
> AssignedReds:0 CompletedMaps:0 CompletedReds:0 ContAlloc:0 ContRel:0 
> HostLocal:0 RackLocal:0
> 2016-12-19 19:13:51,031 INFO [RMCommunicator Allocator] 
> org.apache.hadoop.mapreduce.v2.app.rm.RMContainerRequestor: getResources() 
> for application_1482189777425_0004: ask=1 release= 0 newContainers=0 
> finishedContainers=0 resourcelimit=<memory:22528, vCores:23> knownNMs=3
> 2016-12-19 19:13:52,043 INFO [RMCommunicator Allocator] 
> org.apache.hadoop.io.retry.RetryInvocationHandler: Exception while invoking 
> ApplicationMasterProtocolPBClientImpl.allocate over null. Retrying after 
> sleeping for 30000ms.
> java.io.EOFException: End of File Exception between local host is: 
> "<application_master_host>/<ip_addr>"; destination host is: "<rm_host>":8030; 
> : java.io.EOFException; For more details see:  
> http://wiki.apache.org/hadoop/EOFException
>       at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
>       at 
> sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
>       at 
> sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
>       at java.lang.reflect.Constructor.newInstance(Constructor.java:422)
>       at org.apache.hadoop.net.NetUtils.wrapWithMessage(NetUtils.java:801)
>       at org.apache.hadoop.net.NetUtils.wrapException(NetUtils.java:765)
>       at org.apache.hadoop.ipc.Client.getRpcResponse(Client.java:1486)
>       at org.apache.hadoop.ipc.Client.call(Client.java:1428)
>       at org.apache.hadoop.ipc.Client.call(Client.java:1338)
>       at 
> org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:227)
>       at 
> org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:116)
>       at com.sun.proxy.$Proxy80.allocate(Unknown Source)
>       at 
> org.apache.hadoop.yarn.api.impl.pb.client.ApplicationMasterProtocolPBClientImpl.allocate(ApplicationMasterProtocolPBClientImpl.java:77)
>       at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
>       at 
> sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
>       at 
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>       at java.lang.reflect.Method.invoke(Method.java:497)
>       at 
> org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:398)
>       at 
> org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeMethod(RetryInvocationHandler.java:163)
>       at 
> org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invoke(RetryInvocationHandler.java:155)
>       at 
> org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeOnce(RetryInvocationHandler.java:95)
>       at 
> org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:335)
>       at com.sun.proxy.$Proxy81.allocate(Unknown Source)
>       at 
> org.apache.hadoop.mapreduce.v2.app.rm.RMContainerRequestor.makeRemoteRequest(RMContainerRequestor.java:204)
>       at 
> org.apache.hadoop.mapreduce.v2.app.rm.RMContainerAllocator.getResources(RMContainerAllocator.java:735)
>       at 
> org.apache.hadoop.mapreduce.v2.app.rm.RMContainerAllocator.heartbeat(RMContainerAllocator.java:269)
>       at 
> org.apache.hadoop.mapreduce.v2.app.rm.RMCommunicator$AllocatorRunnable.run(RMCommunicator.java:281)
>       at java.lang.Thread.run(Thread.java:745)
> Caused by: java.io.EOFException
>       at java.io.DataInputStream.readInt(DataInputStream.java:392)
>       at 
> org.apache.hadoop.ipc.Client$IpcStreams.readResponse(Client.java:1785)
>       at 
> org.apache.hadoop.ipc.Client$Connection.receiveRpcResponse(Client.java:1156)
>       at org.apache.hadoop.ipc.Client$Connection.run(Client.java:1053)
> {code}
> Marking as "critical" since this blocks YARN users from encrypting RPC in 
> their Hadoop clusters.



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to