[
https://issues.apache.org/jira/browse/YARN-4731?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15167106#comment-15167106
]
Bibin A Chundatt commented on YARN-4731:
----------------------------------------
[~vvasudev]
I have checked the attached patch
*Issue 1*
# Container localization files are getting deleted properly
*Issues that still exist*
# Signal to container is throwing an exception in LCE
{noformat}
2016-02-25 13:08:20,442 INFO
org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.DelegatingLinuxContainerRuntime:
Using container runtime: DefaultLinuxContainerRuntime
2016-02-25 13:08:20,447 WARN
org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor:
Shell execution returned exit code: 9. Privileged Execution Operation Output:
main : command provided 2
main : run as user is yarn
main : requested yarn user is yarn
Full command array for failed execution:
[/opt/bibin/dsperf/HAINSTALL/install/hadoop/nodemanager/bin/container-executor,
yarn, yarn, 2, 23524, 9]
2016-02-25 13:08:20,447 WARN
org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.DefaultLinuxContainerRuntime:
Signal container failed. Exception:
org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationException:
ExitCodeException exitCode=9:
at
org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor.executePrivilegedOperation(PrivilegedOperationExecutor.java:173)
at
org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.DefaultLinuxContainerRuntime.signalContainer(DefaultLinuxContainerRuntime.java:132)
at
org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.DelegatingLinuxContainerRuntime.signalContainer(DelegatingLinuxContainerRuntime.java:109)
at
org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor.signalContainer(LinuxContainerExecutor.java:513)
at
org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor$DelayedProcessKiller.run(ContainerExecutor.java:532)
Caused by: ExitCodeException exitCode=9:
at org.apache.hadoop.util.Shell.runCommand(Shell.java:927)
at org.apache.hadoop.util.Shell.run(Shell.java:838)
at
org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:1117)
at
org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor.executePrivilegedOperation(PrivilegedOperationExecutor.java:150)
... 4 more
{noformat}
# Container initialization error was thrown
{noformat}
2016-02-25 13:08:20,183 INFO
org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.DelegatingLinuxContainerRuntime:
Using container runtime: DefaultLinuxContainerRuntime
2016-02-25 13:08:20,191 WARN
org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor:
Shell execution returned exit code: 143. Privileged Execution Operation Output:
main : command provided 1
main : run as user is yarn
main : requested yarn user is yarn
Getting exit code file...
Creating script paths...
Writing pid file...
Writing to tmp file
/opt/bibin/dsperf/HAINSTALL/nmlocal/nmPrivate/application_1456385661741_0001/container_1456385661741_0001_01_000003/container_1456385661741_0001_01_000003.pid.tmp
Writing to cgroup task files...
Creating local dirs...
Launching container...
Getting exit code file...
Creating script paths...
Full command array for failed execution:
[nice, -n, 0,
/opt/bibin/dsperf/HAINSTALL/install/hadoop/nodemanager/bin/container-executor,
yarn, yarn, 1, application_1456385661741_0001,
container_1456385661741_0001_01_000003,
/opt/bibin/dsperf/HAINSTALL/nmlocal/usercache/yarn/appcache/application_1456385661741_0001/container_1456385661741_0001_01_000003,
/opt/bibin/dsperf/HAINSTALL/nmlocal/nmPrivate/application_1456385661741_0001/container_1456385661741_0001_01_000003/launch_container.sh,
/opt/bibin/dsperf/HAINSTALL/nmlocal/nmPrivate/application_1456385661741_0001/container_1456385661741_0001_01_000003/container_1456385661741_0001_01_000003.tokens,
/opt/bibin/dsperf/HAINSTALL/nmlocal/nmPrivate/application_1456385661741_0001/container_1456385661741_0001_01_000003/container_1456385661741_0001_01_000003.pid,
/opt/bibin/dsperf/HAINSTALL/nmlocal, /opt/bibin/dsperf/HAINSTALL/nmlog,
cgroups=/cgroups/cpu/hadoop-yarn/container_1456385661741_0001_01_000003/tasks]
2016-02-25 13:08:20,191 WARN
org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.DefaultLinuxContainerRuntime:
Launch container failed. Exception:
org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationException:
ExitCodeException exitCode=143:
at
org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor.executePrivilegedOperation(PrivilegedOperationExecutor.java:173)
at
org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.DefaultLinuxContainerRuntime.launchContainer(DefaultLinuxContainerRuntime.java:103)
at
org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.DelegatingLinuxContainerRuntime.launchContainer(DelegatingLinuxContainerRuntime.java:100)
at
org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor.launchContainer(LinuxContainerExecutor.java:408)
at
org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.call(ContainerLaunch.java:319)
at
org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.call(ContainerLaunch.java:88)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Caused by: ExitCodeException exitCode=143:
at org.apache.hadoop.util.Shell.runCommand(Shell.java:927)
at org.apache.hadoop.util.Shell.run(Shell.java:838)
at
org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:1117)
at
org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor.executePrivilegedOperation(PrivilegedOperationExecutor.java:150)
... 9 more
2016-02-25 13:08:20,192 WARN
org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor: Exit code
from container container_1456385661741_0001_01_000003 is : 143
{noformat}
[~vvasudev] : Should I raise a separate JIRA for these issues, or will they be
handled as part of this JIRA?
> Linux container executor fails to delete nmlocal folders
> --------------------------------------------------------
>
> Key: YARN-4731
> URL: https://issues.apache.org/jira/browse/YARN-4731
> Project: Hadoop YARN
> Issue Type: Bug
> Reporter: Bibin A Chundatt
> Assignee: Varun Vasudev
> Priority: Critical
> Attachments: YARN-4731.001.patch
>
>
> Enable LCE and CGroups
> Submit a mapreduce job
> {noformat}
> 2016-02-24 18:56:46,889 INFO
> org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor: Deleting
> absolute path :
> /opt/bibin/dsperf/HAINSTALL/nmlocal/usercache/dsperf/appcache/application_1456319010019_0003/container_e02_1456319010019_0003_01_000001
> 2016-02-24 18:56:46,894 WARN
> org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor:
> Shell execution returned exit code: 255. Privileged Execution Operation
> Output:
> main : command provided 3
> main : run as user is dsperf
> main : requested yarn user is dsperf
> failed to rmdir job.jar: Not a directory
> Error while deleting
> /opt/bibin/dsperf/HAINSTALL/nmlocal/usercache/dsperf/appcache/application_1456319010019_0003/container_e02_1456319010019_0003_01_000001:
> 20 (Not a directory)
> Full command array for failed execution:
> [/opt/bibin/dsperf/HAINSTALL/install/hadoop/nodemanager/bin/container-executor,
> dsperf, dsperf, 3,
> /opt/bibin/dsperf/HAINSTALL/nmlocal/usercache/dsperf/appcache/application_1456319010019_0003/container_e02_1456319010019_0003_01_000001]
> 2016-02-24 18:56:46,894 ERROR
> org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor:
> DeleteAsUser for
> /opt/bibin/dsperf/HAINSTALL/nmlocal/usercache/dsperf/appcache/application_1456319010019_0003/container_e02_1456319010019_0003_01_000001
> returned with exit code: 255
> org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationException:
> ExitCodeException exitCode=255:
> at
> org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor.executePrivilegedOperation(PrivilegedOperationExecutor.java:173)
> at
> org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor.executePrivilegedOperation(PrivilegedOperationExecutor.java:199)
> at
> org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor.deleteAsUser(LinuxContainerExecutor.java:569)
> at
> org.apache.hadoop.yarn.server.nodemanager.DeletionService$FileDeletionTask.run(DeletionService.java:265)
> at
> java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
> at java.util.concurrent.FutureTask.run(FutureTask.java:266)
> at
> java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
> at
> java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
> at java.lang.Thread.run(Thread.java:745)
> Caused by: ExitCodeException exitCode=255:
> at org.apache.hadoop.util.Shell.runCommand(Shell.java:927)
> at org.apache.hadoop.util.Shell.run(Shell.java:838)
> at
> org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:1117)
> at
> org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor.executePrivilegedOperation(PrivilegedOperationExecutor.java:150)
> ... 10 more
> {noformat}
> As a result, the nodemanager-local directories are not getting deleted for each
> application
> {noformat}
> total 36
> drwxr-s--- 4 hdfs hadoop 4096 Feb 25 08:25 ./
> drwxr-s--- 7 hdfs hadoop 4096 Feb 25 08:25 ../
> -rw------- 1 hdfs hadoop 340 Feb 25 08:25 container_tokens
> lrwxrwxrwx 1 hdfs hadoop 111 Feb 25 08:25 job.jar ->
> /opt/bibin/dsperf/HAINSTALL/nmlocal/usercache/hdfs/appcache/application_1456364845478_0004/filecache/11/job.jar/
> lrwxrwxrwx 1 hdfs hadoop 111 Feb 25 08:25 job.xml ->
> /opt/bibin/dsperf/HAINSTALL/nmlocal/usercache/hdfs/appcache/application_1456364845478_0004/filecache/13/job.xml*
> drwxr-s--- 2 hdfs hadoop 4096 Feb 25 08:25 jobSubmitDir/
> -rwx------ 1 hdfs hadoop 5348 Feb 25 08:25 launch_container.sh*
> drwxr-s--- 2 hdfs hadoop 4096 Feb 25 08:25 tmp/
> {noformat}
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)