Hi, I am running an MR job with AvroMultipleOutputs on Hadoop 2.3.0, and I am facing the following issue. What could be the problem?
1) The job gets stuck at reduce 100% and then fails with a LeaseExpiredException. 2) Every time, only 3 of the 100 reducers fail. 3) I verified that no other process is accessing the MR output path, which could otherwise lead to a lease exception. 4) I even tried with speculative execution set to false, and my output paths are relative. Below is my stack trace. 14/04/29 04:31:12 INFO mapreduce.Job: map 100% reduce 99% 14/04/29 04:35:48 INFO mapreduce.Job: map 100% reduce 100% 14/04/29 04:42:58 INFO mapreduce.Job: Task Id : attempt_1397655922515_87333_r_000069_0, Status : FAILED AttemptID:attempt_1397655922515_87333_r_000069_0 Timed out after 600 secs 14/04/29 04:42:59 INFO mapreduce.Job: map 100% reduce 99% 14/04/29 04:44:12 INFO mapreduce.Job: map 100% reduce 100% 14/04/29 04:54:58 INFO mapreduce.Job: Task Id : attempt_1397655922515_87333_r_000056_0, Status : FAILED AttemptID:attempt_1397655922515_87333_r_000056_0 Timed out after 600 secs 14/04/29 04:54:59 INFO mapreduce.Job: map 100% reduce 99% 14/04/29 04:55:32 INFO mapreduce.Job: map 100% reduce 100% 14/04/29 04:56:29 INFO mapreduce.Job: Task Id : attempt_1397655922515_87333_r_000069_1, Status : FAILED AttemptID:attempt_1397655922515_87333_r_000069_1 Timed out after 600 secs 14/04/29 04:56:30 INFO mapreduce.Job: map 100% reduce 99% 14/04/29 04:58:11 INFO mapreduce.Job: map 100% reduce 100% 2014-04-28 06:15:35,198 ERROR [Thread-6] org.apache.hadoop.hdfs.DFSClient: Failed to close file /tmp/supersessionOutput63/2014/04/20/15/_temporary/1/_temporary/attempt_1397655922515_79608_r_000073_1/error-r-00073.avro org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.hdfs.server.namenode.LeaseExpiredException): No lease on /tmp/supersessionOutput63/2014/04/20/15/_temporary/1/_temporary/attempt_1397655922515_79608_r_000073_1/error-r-00073.avro: File does not exist. Holder DFSClient_attempt_1397655922515_79608_r_000073_1_-929899663_1 does not have any open files. 
at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.checkLease(FSNamesystem.java:2755) at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.completeFileInternal(FSNamesystem.java:2817) at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.completeFile(FSNamesystem.java:2799) at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.complete(NameNodeRpcServer.java:611) at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.complete(ClientNamenodeProtocolServerSideTranslatorPB.java:428) at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java:59586) at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:585) at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:928) at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2060) at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2056) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:396) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1547) at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2054) at org.apache.hadoop.ipc.Client.call(Client.java:1347) at org.apache.hadoop.ipc.Client.call(Client.java:1300) at org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:206) at $Proxy10.complete(Unknown Source) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25) at java.lang.reflect.Method.invoke(Method.java:597) at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:186) at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:102) at $Proxy10.complete(Unknown Source) at 
org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.complete(ClientNamenodeProtocolTranslatorPB.java:371) at org.apache.hadoop.hdfs.DFSOutputStream.completeFile(DFSOutputStream.java:1900) at org.apache.hadoop.hdfs.DFSOutputStream.close(DFSOutputStream.java:1886) at org.apache.hadoop.hdfs.DFSClient.closeAllFilesBeingWritten(DFSClient.java:773) at org.apache.hadoop.hdfs.DFSClient.close(DFSClient.java:790) at org.apache.hadoop.hdfs.DistributedFileSystem.close(DistributedFileSystem.java:847) at org.apache.hadoop.fs.FileSystem$Cache.closeAll(FileSystem.java:2524) at org.apache.hadoop.fs.FileSystem$Cache$ClientFinalizer.run(FileSystem.java:2541) at org.apache.hadoop.util.ShutdownHookManager$1.run(ShutdownHookManager.java:54) 2014-04-28 06:15:35,203 WARN [Thread-51] org.apache.hadoop.hdfs.DFSClient: DataStreamer Exception org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.hdfs.server.namenode.LeaseExpiredException): No lease on /tmp/supersessionOutput63/2014/04/20/15/_temporary/1/_temporary/attempt_1397655922515_79608_r_000073_1/part-r-00073.avro: File does not exist. Holder DFSClient_attempt_1397655922515_79608_r_000073_1_-929899663_1 does not have any open files. 
at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.checkLease(FSNamesystem.java:2755) at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.analyzeFileState(FSNamesystem.java:2567) at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getAdditionalBlock(FSNamesystem.java:2480) at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.addBlock(NameNodeRpcServer.java:555) at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.addBlock(ClientNamenodeProtocolServerSideTranslatorPB.java:387) at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java:59582) at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:585) at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:928) at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2060) at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2056) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:396) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1547) at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2054) at org.apache.hadoop.ipc.Client.call(Client.java:1347) at org.apache.hadoop.ipc.Client.call(Client.java:1300) at org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:206) at $Proxy10.addBlock(Unknown Source) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25) at java.lang.reflect.Method.invoke(Method.java:597) at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:186) at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:102) at $Proxy10.addBlock(Unknown Source) at 
org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.addBlock(ClientNamenodeProtocolTranslatorPB.java:330) at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.locateFollowingBlock(DFSOutputStream.java:1226) at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.nextBlockOutputStream(DFSOutputStream.java:1078) at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.run(DFSOutputStream.java:514) 2014-04-28 06:15:35,204 ERROR [Thread-6] org.apache.hadoop.hdfs.DFSClient: Failed to close file /tmp/supersessionOutput63/2014/04/20/15/_temporary/1/_temporary/attempt_1397655922515_79608_r_000073_1/part-r-00073.avro org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.hdfs.server.namenode.LeaseExpiredException): No lease on /tmp/supersessionOutput63/2014/04/20/15/_temporary/1/_temporary/attempt_1397655922515_79608_r_000073_1/part-r-00073.avro: File does not exist. Holder DFSClient_attempt_1397655922515_79608_r_000073_1_-929899663_1 does not have any open files. at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.checkLease(FSNamesystem.java:2755) at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.analyzeFileState(FSNamesystem.java:2567) at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getAdditionalBlock(FSNamesystem.java:2480) at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.addBlock(NameNodeRpcServer.java:555) at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.addBlock(ClientNamenodeProtocolServerSideTranslatorPB.java:387) at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java:59582) at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:585) at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:928) at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2060) at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2056) at 
java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:396) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1547) at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2054) at org.apache.hadoop.ipc.Client.call(Client.java:1347) at org.apache.hadoop.ipc.Client.call(Client.java:1300) at org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:206) at $Proxy10.addBlock(Unknown Source) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25) at java.lang.reflect.Method.invoke(Method.java:597) at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:186) at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:102) at $Proxy10.addBlock(Unknown Source) at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.addBlock(ClientNamenodeProtocolTranslatorPB.java:330) at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.locateFollowingBlock(DFSOutputStream.java:1226) at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.nextBlockOutputStream(DFSOutputStream.java:1078) at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.run(DFSOutputStream.java:514) 2014-04-28 06:15:35,222 ERROR [Thread-6] org.apache.hadoop.hdfs.DFSClient: Failed to close file /tmp/supersessionOutput63/2014/04/20/15/_temporary/1/_temporary/attempt_1397655922515_79608_r_000073_1/stage1-r-00073.avro org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.hdfs.server.namenode.LeaseExpiredException): No lease on /tmp/supersessionOutput63/2014/04/20/15/_temporary/1/_temporary/attempt_1397655922515_79608_r_000073_1/stage1-r-00073.avro: File does not exist. 
Holder DFSClient_attempt_1397655922515_79608_r_000073_1_-929899663_1 does not have any open files. at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.checkLease(FSNamesystem.java:2755) at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.completeFileInternal(FSNamesystem.java:2817) at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.completeFile(FSNamesystem.java:2799) at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.complete(NameNodeRpcServer.java:611) at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.complete(ClientNamenodeProtocolServerSideTranslatorPB.java:428) at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java:59586) at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:585) at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:928) at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2060) at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2056) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:396) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1547) at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2054) at org.apache.hadoop.ipc.Client.call(Client.java:1347) at org.apache.hadoop.ipc.Client.call(Client.java:1300) at org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:206) at $Proxy10.complete(Unknown Source) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25) at java.lang.reflect.Method.invoke(Method.java:597) at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:186) at 
org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:102) at $Proxy10.complete(Unknown Source) at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.complete(ClientNamenodeProtocolTranslatorPB.java:371) at org.apache.hadoop.hdfs.DFSOutputStream.completeFile(DFSOutputStream.java:1900) at org.apache.hadoop.hdfs.DFSOutputStream.close(DFSOutputStream.java:1886) at org.apache.hadoop.hdfs.DFSClient.closeAllFilesBeingWritten(DFSClient.java:773) at org.apache.hadoop.hdfs.DFSClient.close(DFSClient.java:790) at org.apache.hadoop.hdfs.DistributedFileSystem.close(DistributedFileSystem.java:847) at org.apache.hadoop.fs.FileSystem$Cache.closeAll(FileSystem.java:2524) at org.apache.hadoop.fs.FileSystem$Cache$ClientFinalizer.run(FileSystem.java:2541) at org.apache.hadoop.util.ShutdownHookManager$1.run(ShutdownHookManager.java:54)
