[
https://issues.apache.org/jira/browse/MESOS-4279?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15091844#comment-15091844
]
Qian Zhang commented on MESOS-4279:
-----------------------------------
[~bydga], Here is my slave's state.json:
{code}
{
"attributes": {},
"build_date": "2015-12-18 09:38:58",
"build_time": 1450402738.0,
"build_user": "stack",
"completed_frameworks": [],
"flags": {
"appc_store_dir": "/tmp/mesos/store/appc",
"authenticatee": "crammd5",
"cgroups_cpu_enable_pids_and_tids_count": "false",
"cgroups_enable_cfs": "false",
"cgroups_hierarchy": "/sys/fs/cgroup",
"cgroups_limit_swap": "false",
"cgroups_root": "mesos",
"container_disk_watch_interval": "15secs",
"containerizers": "mesos,docker",
"default_role": "*",
"disk_watch_interval": "1mins",
"docker": "docker",
"docker_auth_server": "auth.docker.io",
"docker_auth_server_port": "443",
"docker_kill_orphans": "true",
"docker_local_archives_dir": "/tmp/mesos/images/docker",
"docker_puller": "local",
"docker_puller_timeout": "60",
"docker_registry": "registry-1.docker.io",
"docker_registry_port": "443",
"docker_remove_delay": "6hrs",
"docker_socket": "/var/run/docker.sock",
"docker_stop_timeout": "10secs",
"docker_store_dir": "/tmp/mesos/store/docker",
"enforce_container_disk_quota": "false",
"executor_registration_timeout": "1mins",
"executor_shutdown_grace_period": "5secs",
"fetcher_cache_dir": "/tmp/mesos/fetch",
"fetcher_cache_size": "2GB",
"frameworks_home": "",
"gc_delay": "1weeks",
"gc_disk_headroom": "0.1",
"hadoop_home": "",
"help": "false",
"hostname_lookup": "true",
"image_provisioner_backend": "copy",
"initialize_driver_logging": "true",
"isolation": "cgroups/cpu,cgroups/mem",
"launcher_dir": "/home/stack/mesos-community/build/src",
"logbufsecs": "0",
"logging_level": "INFO",
"master": "zk://192.168.122.171:2181/mesos",
"oversubscribed_resources_interval": "15secs",
"perf_duration": "10secs",
"perf_interval": "1mins",
"port": "5051",
"qos_correction_interval_min": "0ns",
"quiet": "false",
"recover": "reconnect",
"recovery_timeout": "15mins",
"registration_backoff_factor": "1secs",
"revocable_cpu_low_priority": "true",
"sandbox_directory": "/mnt/mesos/sandbox",
"strict": "true",
"switch_user": "true",
"systemd_runtime_directory": "/run/systemd/system",
"version": "false",
"work_dir": "/tmp/mesos"
},
"git_branch": "refs/heads/master",
"git_sha": "09a2fb3ee5bf95e8a05fd3ac03e7d9b6782989dd",
"hostname": "mesos",
"id": "0e66b344-aee2-45be-b5ec-d606f3a14dfb-S0",
"master_hostname": "mesos",
"pid": "slave(1)@192.168.122.171:5051",
"resources": {
"cpus": 4.0,
"disk": 36813.0,
"mem": 2929.0,
"ports": "[31000-32000]"
},
"start_time": 1452492906.79797,
"version": "0.27.0"
}
{code}
And my master's state.json:
{code}
{
"activated_slaves": 1.0,
"build_date": "2015-12-18 09:38:58",
"build_time": 1450402738.0,
"build_user": "stack",
"completed_frameworks": [],
"deactivated_slaves": 0.0,
"elected_time": 1452000266.21308,
"flags": {
"allocation_interval": "1secs",
"allocator": "HierarchicalDRF",
"authenticate": "false",
"authenticate_slaves": "false",
"authenticators": "crammd5",
"authorizers": "local",
"framework_sorter": "drf",
"help": "false",
"hostname_lookup": "true",
"initialize_driver_logging": "true",
"ip": "192.168.122.171",
"log_auto_initialize": "true",
"logbufsecs": "0",
"logging_level": "INFO",
"max_slave_ping_timeouts": "5",
"port": "5050",
"quiet": "false",
"quorum": "1",
"recovery_slave_removal_limit": "100%",
"registry": "replicated_log",
"registry_fetch_timeout": "1mins",
"registry_store_timeout": "5secs",
"registry_strict": "false",
"root_submissions": "true",
"slave_ping_timeout": "15secs",
"slave_reregister_timeout": "10mins",
"user_sorter": "drf",
"version": "false",
"webui_dir": "/home/stack/mesos-community/build/../src/webui",
"work_dir": "/home/stack/workdir",
"zk": "zk://192.168.122.171:2181/mesos",
"zk_session_timeout": "10secs"
},
"git_branch": "refs/heads/master",
"git_sha": "09a2fb3ee5bf95e8a05fd3ac03e7d9b6782989dd",
"hostname": "mesos",
"id": "0e66b344-aee2-45be-b5ec-d606f3a14dfb",
"leader": "[email protected]:5050",
"orphan_tasks": [],
"pid": "[email protected]:5050",
"slaves": [
{
"active": true,
"attributes": {},
"hostname": "mesos",
"id": "0e66b344-aee2-45be-b5ec-d606f3a14dfb-S0",
"offered_resources": {
"cpus": 0,
"disk": 0,
"mem": 0
},
"pid": "slave(1)@192.168.122.171:5051",
"registered_time": 1452342166.53401,
"reregistered_time": 1452492908.11292,
"reserved_resources": {},
"resources": {
"cpus": 4.0,
"disk": 36813.0,
"mem": 2929.0,
"ports": "[31000-32000]"
},
"unreserved_resources": {
"cpus": 4.0,
"disk": 36813.0,
"mem": 2929.0,
"ports": "[31000-32000]"
},
"used_resources": {
"cpus": 0.1,
"disk": 0,
"mem": 16.0,
"ports": "[31274-31274]"
},
"version": "0.27.0"
}
],
"start_time": 1452000266.03415,
"unregistered_frameworks": [],
"version": "0.27.0"
}
{code}
And {{/var/log/upstart/docker.log}} during app's restart:
{code}
INFO[5096569] GET /v1.21/containers/mesos-4279:latest/json
ERRO[5096569] Handler for GET /v1.21/containers/mesos-4279:latest/json returned
error: no such id: mesos-4279:latest
ERRO[5096569] HTTP Error err=no such id:
mesos-4279:latest statusCode=404
INFO[5096569] GET /v1.21/images/mesos-4279:latest/json
INFO[5096569] GET
/v1.21/containers/mesos-0e66b344-aee2-45be-b5ec-d606f3a14dfb-S0.b824a7ad-35c6-48ab-9378-b180ed431fb5/json
ERRO[5096569] Handler for GET
/v1.21/containers/mesos-0e66b344-aee2-45be-b5ec-d606f3a14dfb-S0.b824a7ad-35c6-48ab-9378-b180ed431fb5/json
returned error: no such id:
mesos-0e66b344-aee2-45be-b5ec-d606f3a14dfb-S0.b824a7ad-35c6-48ab-9378-b180ed431fb5
ERRO[5096569] HTTP Error err=no such id:
mesos-0e66b344-aee2-45be-b5ec-d606f3a14dfb-S0.b824a7ad-35c6-48ab-9378-b180ed431fb5
statusCode=404
INFO[5096569] GET
/v1.21/images/mesos-0e66b344-aee2-45be-b5ec-d606f3a14dfb-S0.b824a7ad-35c6-48ab-9378-b180ed431fb5/json
ERRO[5096569] Handler for GET
/v1.21/images/mesos-0e66b344-aee2-45be-b5ec-d606f3a14dfb-S0.b824a7ad-35c6-48ab-9378-b180ed431fb5/json
returned error: No such image:
mesos-0e66b344-aee2-45be-b5ec-d606f3a14dfb-S0.b824a7ad-35c6-48ab-9378-b180ed431fb5
ERRO[5096569] HTTP Error err=No such image:
mesos-0e66b344-aee2-45be-b5ec-d606f3a14dfb-S0.b824a7ad-35c6-48ab-9378-b180ed431fb5
statusCode=404
INFO[5096569] POST
/v1.21/containers/create?name=mesos-0e66b344-aee2-45be-b5ec-d606f3a14dfb-S0.b824a7ad-35c6-48ab-9378-b180ed431fb5
WARN[5096569] Your kernel does not support swap limit capabilities, memory
limited without swap.
INFO[5096570] POST
/v1.21/containers/9ee5d5100d39ef3f5f628ac8d5f0cc44b1e1f10ab6801468373ab8d795be420d/attach?stderr=1&stdout=1&stream=1
INFO[5096570] POST
/v1.21/containers/9ee5d5100d39ef3f5f628ac8d5f0cc44b1e1f10ab6801468373ab8d795be420d/start
INFO[5096570] GET
/v1.21/containers/mesos-0e66b344-aee2-45be-b5ec-d606f3a14dfb-S0.b824a7ad-35c6-48ab-9378-b180ed431fb5/json
INFO[5096570] POST
/v1.21/containers/mesos-0e66b344-aee2-45be-b5ec-d606f3a14dfb-S0.c8df404c-506a-4a59-b534-d3d7f0d62d48/stop?t=10
ERRO[5096570] attach: stdout: write unix @: broken pipe
INFO[5096573] GET
/v1.21/containers/mesos-0e66b344-aee2-45be-b5ec-d606f3a14dfb-S0.c8df404c-506a-4a59-b534-d3d7f0d62d48/json
INFO[5096594] GET /v1.21/images/json
INFO[5096596] GET /v1.21/containers/json
{code}
> Graceful restart of docker task
> -------------------------------
>
> Key: MESOS-4279
> URL: https://issues.apache.org/jira/browse/MESOS-4279
> Project: Mesos
> Issue Type: Bug
> Components: containerization, docker
> Affects Versions: 0.25.0
> Reporter: Martin Bydzovsky
> Assignee: Qian Zhang
>
> I'm implementing a graceful restarts of our mesos-marathon-docker setup and I
> came to a following issue:
> (it was already discussed on
https://github.com/mesosphere/marathon/issues/2876 and the guys from Mesosphere
got to a point that it's probably a Docker containerizer problem...)
> To sum it up:
> When I deploy a simple Python script to all mesos-slaves:
> {code}
> #!/usr/bin/python
> from time import sleep
> import signal
> import sys
> import datetime
> def sigterm_handler(_signo, _stack_frame):
> print "got %i" % _signo
> print datetime.datetime.now().time()
> sys.stdout.flush()
> sleep(2)
> print datetime.datetime.now().time()
> print "ending"
> sys.stdout.flush()
> sys.exit(0)
> signal.signal(signal.SIGTERM, sigterm_handler)
> signal.signal(signal.SIGINT, sigterm_handler)
> try:
> print "Hello"
> i = 0
> while True:
> i += 1
> print datetime.datetime.now().time()
> print "Iteration #%i" % i
> sys.stdout.flush()
> sleep(1)
> finally:
> print "Goodbye"
> {code}
> and I run it through Marathon like
> {code:javascript}
> data = {
> args: ["/tmp/script.py"],
> instances: 1,
> cpus: 0.1,
> mem: 256,
> id: "marathon-test-api"
> }
> {code}
> During the app restart I get the expected result - the task receives SIGTERM and
> dies peacefully (during my script-specified 2-second period)
> But when I wrap this Python script in a Docker image:
> {code}
> FROM node:4.2
> RUN mkdir /app
> ADD . /app
> WORKDIR /app
> ENTRYPOINT []
> {code}
> and run appropriate application by Marathon:
> {code:javascript}
> data = {
> args: ["./script.py"],
> container: {
> type: "DOCKER",
> docker: {
> image: "bydga/marathon-test-api"
> },
> forcePullImage: yes
> },
> cpus: 0.1,
> mem: 256,
> instances: 1,
> id: "marathon-test-api"
> }
> {code}
> The task during restart (issued from Marathon) dies immediately without
> having a chance to do any cleanup.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)