[ 
https://issues.apache.org/jira/browse/MESOS-6577?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Marc Villacorta updated MESOS-6577:
-----------------------------------
    Environment: 
{code:none}
core@kato-2 ~ $ cat /etc/kato.env 
   KATO_CLUSTER_ID=cell-1-dub
   KATO_QUORUM_COUNT=3
   KATO_ROLES='quorum master worker '
   KATO_HOST_NAME=kato
   KATO_HOST_ID=2
   KATO_ZK=quorum-1:2181,quorum-2:2181,quorum-3:2181
   KATO_ALERT_MANAGERS=http://master-1:9093,http://master-2:9093,http://master-3:9093
   KATO_DOMAIN=cell-1.dub.xnood.com
   KATO_MESOS_DOMAIN=cell-1.dub.mesos
   KATO_HOST_IP=10.136.64.12 
   KATO_QUORUM=2
   DOCKER_VERSION=1.12.3
{code}

{code:none}
core@kato-2 ~ $ cat /etc/systemd/system/mesos-agent.service
[Unit]
Description=Mesos agent
After=go-dnsmasq.service

[Service]
Slice=machine.slice
Restart=always
RestartSec=10
TimeoutStartSec=0
KillMode=mixed
EnvironmentFile=/etc/kato.env
ExecStartPre=/usr/bin/sh -c "[ -d /var/lib/mesos/agent ] || mkdir -p /var/lib/mesos/agent"
ExecStartPre=/usr/bin/sh -c "[ -d /etc/certs ] || mkdir -p /etc/certs"
ExecStartPre=/usr/bin/sh -c "[ -d /etc/cni ] || mkdir -p /etc/cni"
ExecStartPre=/opt/bin/zk-alive ${KATO_QUORUM_COUNT}
ExecStartPre=/usr/bin/rkt fetch quay.io/kato/mesos:v1.0.1-${DOCKER_VERSION}-2
ExecStartPre=/usr/bin/docker pull quay.io/kato/mesos:v1.0.1-${DOCKER_VERSION}-2
ExecStart=/usr/bin/rkt run \
 --net=host \
 --dns=host \
 --hosts-entry=host \
 --volume cni,kind=host,source=/etc/cni \
 --mount volume=cni,target=/etc/cni \
 --volume certs,kind=host,source=/etc/certs \
 --mount volume=certs,target=/etc/certs \
 --volume docker,kind=host,source=/var/run/docker.sock \
 --mount volume=docker,target=/var/run/docker.sock \
 --volume data,kind=host,source=/var/lib/mesos \
 --mount volume=data,target=/var/lib/mesos \
 --stage1-name=coreos.com/rkt/stage1-fly \
 quay.io/kato/mesos:v1.0.1-${DOCKER_VERSION}-2 --exec /usr/sbin/mesos-agent -- \
 --no-systemd_enable_support \
 --docker_mesos_image=quay.io/kato/mesos:v1.0.1-${DOCKER_VERSION}-2 \
 --hostname=worker-${KATO_HOST_ID}.${KATO_DOMAIN} \
 --ip=${KATO_HOST_IP} \
 --containerizers=docker \
 --executor_registration_timeout=2mins \
 --master=zk://${KATO_ZK}/mesos \
 --work_dir=/var/lib/mesos/agent \
 --log_dir=/var/log/mesos/agent \
 --network_cni_config_dir=/etc/cni \
 --network_cni_plugins_dir=/var/lib/mesos/cni-plugins

[Install]
WantedBy=kato.target
{code}

{code:none}
core@kato-2 ~ $ docker version
Client:
 Version:      1.12.3
 API version:  1.24
 Go version:   go1.6.3
 Git commit:   34a2ead
 Built:        
 OS/Arch:      linux/amd64

Server:
 Version:      1.12.3
 API version:  1.24
 Go version:   go1.6.3
 Git commit:   34a2ead
 Built:        
 OS/Arch:      linux/amd64
{code}

  was:
{code:none}
core@kato-2 ~ $ cat /etc/systemd/system/mesos-agent.service
[Unit]
Description=Mesos agent
After=go-dnsmasq.service

[Service]
Slice=machine.slice
Restart=always
RestartSec=10
TimeoutStartSec=0
KillMode=mixed
EnvironmentFile=/etc/kato.env
ExecStartPre=/usr/bin/sh -c "[ -d /var/lib/mesos/agent ] || mkdir -p /var/lib/mesos/agent"
ExecStartPre=/usr/bin/sh -c "[ -d /etc/certs ] || mkdir -p /etc/certs"
ExecStartPre=/usr/bin/sh -c "[ -d /etc/cni ] || mkdir -p /etc/cni"
ExecStartPre=/opt/bin/zk-alive ${KATO_QUORUM_COUNT}
ExecStartPre=/usr/bin/rkt fetch quay.io/kato/mesos:v1.0.1-${DOCKER_VERSION}-2
ExecStartPre=/usr/bin/docker pull quay.io/kato/mesos:v1.0.1-${DOCKER_VERSION}-2
ExecStart=/usr/bin/rkt run \
 --net=host \
 --dns=host \
 --hosts-entry=host \
 --volume cni,kind=host,source=/etc/cni \
 --mount volume=cni,target=/etc/cni \
 --volume certs,kind=host,source=/etc/certs \
 --mount volume=certs,target=/etc/certs \
 --volume docker,kind=host,source=/var/run/docker.sock \
 --mount volume=docker,target=/var/run/docker.sock \
 --volume data,kind=host,source=/var/lib/mesos \
 --mount volume=data,target=/var/lib/mesos \
 --stage1-name=coreos.com/rkt/stage1-fly \
 quay.io/kato/mesos:v1.0.1-${DOCKER_VERSION}-2 --exec /usr/sbin/mesos-agent -- \
 --no-systemd_enable_support \
 --docker_mesos_image=quay.io/kato/mesos:v1.0.1-${DOCKER_VERSION}-2 \
 --hostname=worker-${KATO_HOST_ID}.${KATO_DOMAIN} \
 --ip=${KATO_HOST_IP} \
 --containerizers=docker \
 --executor_registration_timeout=2mins \
 --master=zk://${KATO_ZK}/mesos \
 --work_dir=/var/lib/mesos/agent \
 --log_dir=/var/log/mesos/agent \
 --network_cni_config_dir=/etc/cni \
 --network_cni_plugins_dir=/var/lib/mesos/cni-plugins

[Install]
WantedBy=kato.target
{code}

{code:none}
core@kato-2 ~ $ docker version
Client:
 Version:      1.12.3
 API version:  1.24
 Go version:   go1.6.3
 Git commit:   34a2ead
 Built:        
 OS/Arch:      linux/amd64

Server:
 Version:      1.12.3
 API version:  1.24
 Go version:   go1.6.3
 Git commit:   34a2ead
 Built:        
 OS/Arch:      linux/amd64
{code}


> Failed to run docker inspect
> ----------------------------
>
>                 Key: MESOS-6577
>                 URL: https://issues.apache.org/jira/browse/MESOS-6577
>             Project: Mesos
>          Issue Type: Bug
>          Components: containerization, docker
>    Affects Versions: 1.0.1
>         Environment: {code:none}
> core@kato-2 ~ $ cat /etc/kato.env 
>    KATO_CLUSTER_ID=cell-1-dub
>    KATO_QUORUM_COUNT=3
>    KATO_ROLES='quorum master worker '
>    KATO_HOST_NAME=kato
>    KATO_HOST_ID=2
>    KATO_ZK=quorum-1:2181,quorum-2:2181,quorum-3:2181
>    KATO_ALERT_MANAGERS=http://master-1:9093,http://master-2:9093,http://master-3:9093
>    KATO_DOMAIN=cell-1.dub.xnood.com
>    KATO_MESOS_DOMAIN=cell-1.dub.mesos
>    KATO_HOST_IP=10.136.64.12 
>    KATO_QUORUM=2
>    DOCKER_VERSION=1.12.3
> {code}
> {code:none}
> core@kato-2 ~ $ cat /etc/systemd/system/mesos-agent.service
> [Unit]
> Description=Mesos agent
> After=go-dnsmasq.service
> [Service]
> Slice=machine.slice
> Restart=always
> RestartSec=10
> TimeoutStartSec=0
> KillMode=mixed
> EnvironmentFile=/etc/kato.env
> ExecStartPre=/usr/bin/sh -c "[ -d /var/lib/mesos/agent ] || mkdir -p /var/lib/mesos/agent"
> ExecStartPre=/usr/bin/sh -c "[ -d /etc/certs ] || mkdir -p /etc/certs"
> ExecStartPre=/usr/bin/sh -c "[ -d /etc/cni ] || mkdir -p /etc/cni"
> ExecStartPre=/opt/bin/zk-alive ${KATO_QUORUM_COUNT}
> ExecStartPre=/usr/bin/rkt fetch quay.io/kato/mesos:v1.0.1-${DOCKER_VERSION}-2
> ExecStartPre=/usr/bin/docker pull quay.io/kato/mesos:v1.0.1-${DOCKER_VERSION}-2
> ExecStart=/usr/bin/rkt run \
>  --net=host \
>  --dns=host \
>  --hosts-entry=host \
>  --volume cni,kind=host,source=/etc/cni \
>  --mount volume=cni,target=/etc/cni \
>  --volume certs,kind=host,source=/etc/certs \
>  --mount volume=certs,target=/etc/certs \
>  --volume docker,kind=host,source=/var/run/docker.sock \
>  --mount volume=docker,target=/var/run/docker.sock \
>  --volume data,kind=host,source=/var/lib/mesos \
>  --mount volume=data,target=/var/lib/mesos \
>  --stage1-name=coreos.com/rkt/stage1-fly \
>  quay.io/kato/mesos:v1.0.1-${DOCKER_VERSION}-2 --exec /usr/sbin/mesos-agent -- \
>  --no-systemd_enable_support \
>  --docker_mesos_image=quay.io/kato/mesos:v1.0.1-${DOCKER_VERSION}-2 \
>  --hostname=worker-${KATO_HOST_ID}.${KATO_DOMAIN} \
>  --ip=${KATO_HOST_IP} \
>  --containerizers=docker \
>  --executor_registration_timeout=2mins \
>  --master=zk://${KATO_ZK}/mesos \
>  --work_dir=/var/lib/mesos/agent \
>  --log_dir=/var/log/mesos/agent \
>  --network_cni_config_dir=/etc/cni \
>  --network_cni_plugins_dir=/var/lib/mesos/cni-plugins
> [Install]
> WantedBy=kato.target
> {code}
> {code:none}
> core@kato-2 ~ $ docker version
> Client:
>  Version:      1.12.3
>  API version:  1.24
>  Go version:   go1.6.3
>  Git commit:   34a2ead
>  Built:        
>  OS/Arch:      linux/amd64
> Server:
>  Version:      1.12.3
>  API version:  1.24
>  Go version:   go1.6.3
>  Git commit:   34a2ead
>  Built:        
>  OS/Arch:      linux/amd64
> {code}
>            Reporter: Marc Villacorta
>
> I am running a _rocketized_ mesos agent.
> I am using the docker containerizer.
> My executors are _dockerized_.
> The very first time I deploy a sample platform I get some errors like the one 
> below:
> {code:none}
> Failed to launch container: Failed to run 'docker -H 
> unix:///var/run/docker.sock inspect 
> mesos-84a9df2b-be0e-459e-afc9-b95d4e8ced57-S0.0116a0a2-ccaf-4f1a-846c-361ec4e4a179':
>  exited with status 1; stderr='Error: No such image, container or task: 
> mesos-84a9df2b-be0e-459e-afc9-b95d4e8ced57-S0.0116a0a2-ccaf-4f1a-846c-361ec4e4a179
>  '
> {code}
> But when I check with {{docker ps}} I can see the supposedly missing 
> container and I can even successfully run {{docker inspect}} on it. Then 
> Marathon reschedules the task and I get a duplicate. Neither Mesos nor 
> Marathon lists any duplicate (only Docker does).
> Restarting the mesos-agent wipes out the reported missing container leaving 
> the other ones alive.
> When all my nodes have the docker image layers cached I can deploy the sample 
> platform smoothly and I don't get the previous errors.
> If a container needs a remote volume attached (EBS via REX-Ray), the error 
> happens every time, regardless of whether the image layers are cached.
> Reading the code I suspect it is related to the _retryInterval_ of 
> _Docker::inspect_ 
> https://github.com/apache/mesos/blob/2e013890e47c30053b7b83cd205b432376589216/src/docker/docker.cpp#L950-L952
>  but there is no option to modify this setting.



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to