[ https://issues.apache.org/jira/browse/MESOS-4869?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15193774#comment-15193774 ]

Anthony Scalisi edited comment on MESOS-4869 at 3/14/16 6:05 PM:
-----------------------------------------------------------------

What do you mean? Here's a host with 6 tasks, without Mesos doing the health
checks, for example:

{noformat}
scalp@mesos-slave-i-d00b6017 $ free -m
             total       used       free     shared    buffers     cached
Mem:         16047      15306        740          0       3174       2547
-/+ buffers/cache:       9583       6463
Swap:            0          0          0


root@mesos-slave-i-d00b6017 # docker stats --no-stream
CONTAINER           CPU %               MEM USAGE / LIMIT     MEM %               NET I/O               BLOCK I/O
33cb349404e1        3.23%               897.8 MB / 1.611 GB   55.74%              4.859 GB / 4.625 GB   53.25 kB / 61.44 kB
61eba49cf71d        3.22%               1.166 GB / 1.611 GB   72.41%              5.49 GB / 5.155 GB    106.5 kB / 118.8 kB
630739e12032        3.76%               1.163 GB / 1.611 GB   72.22%              3.891 GB / 3.657 GB   348.2 kB / 118.8 kB
b5b9da9facfb        2.84%               901.9 MB / 1.611 GB   55.99%              2.254 GB / 2.153 GB   0 B / 118.8 kB
dcd2a73f71a9        3.55%               1.29 GB / 1.611 GB    80.10%              2.726 GB / 2.672 GB   0 B / 118.8 kB
de923d88a781        3.17%               889.5 MB / 1.611 GB   55.23%              3.817 GB / 3.645 GB   36.86 kB / 61.44 kB
{noformat}

Or another host with 11 tasks:

{noformat}
root@mesos-slave-i-0fe036d7 # free -m
             total       used       free     shared    buffers     cached
Mem:         16047      15189        857          0       1347        688
-/+ buffers/cache:      13153       2893
Swap:            0          0          0

root@mesos-slave-i-0fe036d7 # docker stats --no-stream
CONTAINER           CPU %               MEM USAGE / LIMIT     MEM %               NET I/O               BLOCK I/O
1527ccec3562        0.39%               46.75 MB / 134.2 MB   34.83%              318.5 MB / 283.5 MB   634.9 kB / 0 B
16c0afe372f1        3.12%               1.139 GB / 1.611 GB   70.69%              5.443 GB / 5.139 GB   1.757 MB / 118.8 kB
2aaac6a34f3b        3.50%               1.34 GB / 1.611 GB    83.18%              9.928 GB / 9.006 GB   2.646 MB / 118.8 kB
4bda58242e66        2.57%               875.5 MB / 1.611 GB   54.36%              4.853 GB / 4.632 GB   135.2 kB / 61.44 kB
67ed575e6f44        2.14%               1.171 GB / 1.611 GB   72.73%              3.878 GB / 3.664 GB   4.739 MB / 118.8 kB
87010c4fa547        4.23%               1.208 GB / 1.611 GB   74.99%              313.5 MB / 419.1 MB   213 kB / 94.21 kB
8ca7c160b196        1.73%               730.4 MB / 1.611 GB   45.35%              305.6 MB / 447.7 MB   0 B / 61.44 kB
cbac44b2663c        4.66%               1.088 GB / 1.611 GB   67.53%              16.48 GB / 14.91 GB   262.1 kB / 61.44 kB
d0fe165aecac        3.02%               901.2 MB / 1.611 GB   55.95%              1.573 GB / 1.555 GB   106.5 kB / 61.44 kB
df668f59a149        3.57%               1.143 GB / 1.611 GB   70.98%              2.732 GB / 2.681 GB   1.888 MB / 118.8 kB
e0fc97fa33cf        3.43%               1.034 GB / 1.611 GB   64.21%              3.823 GB / 3.655 GB   2.433 MB / 61.44 kB
{noformat}
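
(To tally the containers against *free*, assuming the two-column MEM USAGE format shown above, something like this works:)

{noformat}
root@mesos-slave-i-0fe036d7 # docker stats --no-stream | awk 'NR > 1 { v = $3; if ($4 == "GB") v *= 1024; sum += v } END { printf("containers total: %.0f MB\n", sum) }'
{noformat}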

If you were referring to the actual Mesos processes:

{noformat}
root@mesos-slave-i-0fe036d7 # ps awwuxf | egrep "mesos-docker|mesos-slave" | egrep -v "grep|node"
root     27470  0.3  0.3 962568 51020 ?        Ssl  Mar11  14:46 /usr/sbin/mesos-slave --master=zk://10.92.21.247:2181,10.92.31.170:2181,10.92.41.178:2181/mesos --log_dir=/var/log/mesos --containerizers=docker,mesos --docker_stop_timeout=30secs --executor_registration_timeout=5mins --executor_shutdown_grace_period=90secs --gc_delay=1weeks --hostname=mesos-slave-i-0fe036d7.gz-prod.us-west-2a.gearzero.us --ip=10.92.22.241 --isolation=cgroups/cpu,cgroups/mem --logbufsecs=1 --recover=reconnect --strict=false --work_dir=/opt/mesos --attributes=az:us-west-2a --resources=cpus:4;mem:16047;ports:[31000-32000]
root     27511  0.0  0.0   5916   596 ?        S    Mar11   0:00  \_ logger -p user.info -t mesos-slave[27470]
root     27512  0.0  0.0   5916  1884 ?        S    Mar11   0:00  \_ logger -p user.err -t mesos-slave[27470]
root     28907  0.1  0.0 802068  5360 ?        Ssl  Mar11   7:02  \_ mesos-docker-executor --container=mesos-29e183be-f611-41b4-824c-2d05b052231b-S3.f552977a-040c-41a2-bb60-0e441c6491ef --docker=docker --docker_socket=/var/run/docker.sock --help=false --launcher_dir=/usr/libexec/mesos --mapped_directory=/mnt/mesos/sandbox --sandbox_directory=/opt/mesos/slaves/29e183be-f611-41b4-824c-2d05b052231b-S3/frameworks/8ace1cd7-5a79-40f6-99cd-62c87ce2ef49-0001/executors/prod_talkk_metric-green.cac70614-e7d1-11e5-a617-02429957d388/runs/f552977a-040c-41a2-bb60-0e441c6491ef --stop_timeout=30secs
root     29193  0.1  0.0 802596  5816 ?        Ssl  Mar11   7:02  \_ mesos-docker-executor --container=mesos-29e183be-f611-41b4-824c-2d05b052231b-S3.5ada3858-b09b-4a5e-a320-b3c66bb237a6 --docker=docker --docker_socket=/var/run/docker.sock --help=false --launcher_dir=/usr/libexec/mesos --mapped_directory=/mnt/mesos/sandbox --sandbox_directory=/opt/mesos/slaves/29e183be-f611-41b4-824c-2d05b052231b-S3/frameworks/8ace1cd7-5a79-40f6-99cd-62c87ce2ef49-0001/executors/prod_talkk_push-green.cac70613-e7d1-11e5-a617-02429957d388/runs/5ada3858-b09b-4a5e-a320-b3c66bb237a6 --stop_timeout=30secs
root     29373  0.1  0.0 802596  5172 ?        Ssl  Mar11   7:00  \_ mesos-docker-executor --container=mesos-29e183be-f611-41b4-824c-2d05b052231b-S3.2a93bbbb-0daa-4f22-bbb8-aa7a92791918 --docker=docker --docker_socket=/var/run/docker.sock --help=false --launcher_dir=/usr/libexec/mesos --mapped_directory=/mnt/mesos/sandbox --sandbox_directory=/opt/mesos/slaves/29e183be-f611-41b4-824c-2d05b052231b-S3/frameworks/8ace1cd7-5a79-40f6-99cd-62c87ce2ef49-0001/executors/prod_talkk_email-green.cac6b7f0-e7d1-11e5-a617-02429957d388/runs/2a93bbbb-0daa-4f22-bbb8-aa7a92791918 --stop_timeout=30secs
root     29538  0.1  0.0 802068  5768 ?        Ssl  Mar11   6:59  \_ mesos-docker-executor --container=mesos-29e183be-f611-41b4-824c-2d05b052231b-S3.c7aa0614-8afa-404a-a6a7-a591f3f20371 --docker=docker --docker_socket=/var/run/docker.sock --help=false --launcher_dir=/usr/libexec/mesos --mapped_directory=/mnt/mesos/sandbox --sandbox_directory=/opt/mesos/slaves/29e183be-f611-41b4-824c-2d05b052231b-S3/frameworks/8ace1cd7-5a79-40f6-99cd-62c87ce2ef49-0001/executors/prod_talkk_search-green.cac6df01-e7d1-11e5-a617-02429957d388/runs/c7aa0614-8afa-404a-a6a7-a591f3f20371 --stop_timeout=30secs
root     30831  0.1  0.0 802068  6012 ?        Ssl  Mar11   6:59  \_ mesos-docker-executor --container=mesos-29e183be-f611-41b4-824c-2d05b052231b-S3.2c086ae3-c0b9-4069-983a-d8efc37ff220 --docker=docker --docker_socket=/var/run/docker.sock --help=false --launcher_dir=/usr/libexec/mesos --mapped_directory=/mnt/mesos/sandbox --sandbox_directory=/opt/mesos/slaves/29e183be-f611-41b4-824c-2d05b052231b-S3/frameworks/8ace1cd7-5a79-40f6-99cd-62c87ce2ef49-0001/executors/prod_talkk_user-green.6c806415-e7d2-11e5-a617-02429957d388/runs/2c086ae3-c0b9-4069-983a-d8efc37ff220 --stop_timeout=30secs
root     30989  0.1  0.0 802068  5508 ?        Ssl  Mar11   6:59  \_ mesos-docker-executor --container=mesos-29e183be-f611-41b4-824c-2d05b052231b-S3.79d0c2bd-c3d5-4e47-a7de-74ce713dd6cf --docker=docker --docker_socket=/var/run/docker.sock --help=false --launcher_dir=/usr/libexec/mesos --mapped_directory=/mnt/mesos/sandbox --sandbox_directory=/opt/mesos/slaves/29e183be-f611-41b4-824c-2d05b052231b-S3/frameworks/8ace1cd7-5a79-40f6-99cd-62c87ce2ef49-0001/executors/prod_talkk_identity-green.6d185e58-e7d2-11e5-a617-02429957d388/runs/79d0c2bd-c3d5-4e47-a7de-74ce713dd6cf --stop_timeout=30secs
root     31132  0.1  0.0 802068  5612 ?        Ssl  Mar11   6:59  \_ mesos-docker-executor --container=mesos-29e183be-f611-41b4-824c-2d05b052231b-S3.5e370455-97fc-476c-9cc4-c300c472a002 --docker=docker --docker_socket=/var/run/docker.sock --help=false --launcher_dir=/usr/libexec/mesos --mapped_directory=/mnt/mesos/sandbox --sandbox_directory=/opt/mesos/slaves/29e183be-f611-41b4-824c-2d05b052231b-S3/frameworks/8ace1cd7-5a79-40f6-99cd-62c87ce2ef49-0001/executors/prod_talkk_integration-green.6d185e57-e7d2-11e5-a617-02429957d388/runs/5e370455-97fc-476c-9cc4-c300c472a002 --stop_timeout=30secs
root     31292  0.1  0.0 802596  5688 ?        Ssl  Mar11   7:00  \_ mesos-docker-executor --container=mesos-29e183be-f611-41b4-824c-2d05b052231b-S3.ff592e36-b9da-48e6-9d6c-960edba25050 --docker=docker --docker_socket=/var/run/docker.sock --help=false --launcher_dir=/usr/libexec/mesos --mapped_directory=/mnt/mesos/sandbox --sandbox_directory=/opt/mesos/slaves/29e183be-f611-41b4-824c-2d05b052231b-S3/frameworks/8ace1cd7-5a79-40f6-99cd-62c87ce2ef49-0001/executors/prod_talkk_media-green.6d183746-e7d2-11e5-a617-02429957d388/runs/ff592e36-b9da-48e6-9d6c-960edba25050 --stop_timeout=30secs
root      2292  0.1  0.0 802068  5388 ?        Ssl  Mar11   6:57  \_ mesos-docker-executor --container=mesos-29e183be-f611-41b4-824c-2d05b052231b-S3.7caeae52-f0f3-43b9-b89a-fee798841757 --docker=docker --docker_socket=/var/run/docker.sock --help=false --launcher_dir=/usr/libexec/mesos --mapped_directory=/mnt/mesos/sandbox --sandbox_directory=/opt/mesos/slaves/29e183be-f611-41b4-824c-2d05b052231b-S3/frameworks/8ace1cd7-5a79-40f6-99cd-62c87ce2ef49-0001/executors/prod_talkk_chat-green.037a7cc1-e7d4-11e5-a617-02429957d388/runs/7caeae52-f0f3-43b9-b89a-fee798841757 --stop_timeout=30secs
root      2068  0.1  0.0 802068  5216 ?        Ssl  Mar12   6:31  \_ mesos-docker-executor --container=mesos-29e183be-f611-41b4-824c-2d05b052231b-S3.9bd3e685-8ed1-442c-84bb-7e8c0a37acfe --docker=docker --docker_socket=/var/run/docker.sock --help=false --launcher_dir=/usr/libexec/mesos --mapped_directory=/mnt/mesos/sandbox --sandbox_directory=/opt/mesos/slaves/29e183be-f611-41b4-824c-2d05b052231b-S3/frameworks/8ace1cd7-5a79-40f6-99cd-62c87ce2ef49-0001/executors/prod_talkk_notification-green.383537a4-e7fa-11e5-a617-02429957d388/runs/9bd3e685-8ed1-442c-84bb-7e8c0a37acfe --stop_timeout=30secs
{noformat}
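
(The executors themselves are tiny; a rough sum of their resident memory, assuming GNU ps:)

{noformat}
root@mesos-slave-i-0fe036d7 # ps axo rss=,comm= | awk '$2 ~ /^mesos-docker-ex/ { s += $1 } END { printf("executors total: %.1f MB\n", s / 1024) }'
{noformat}

With ten executors at roughly 5-6 MB RSS each, that's on the order of 55 MB total, nowhere near what *free* reports as used.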


> /usr/libexec/mesos/mesos-health-check using/leaking a lot of memory
> -------------------------------------------------------------------
>
>                 Key: MESOS-4869
>                 URL: https://issues.apache.org/jira/browse/MESOS-4869
>             Project: Mesos
>          Issue Type: Bug
>    Affects Versions: 0.27.1
>            Reporter: Anthony Scalisi
>            Priority: Critical
>
> We switched our health checks in Marathon from HTTP to COMMAND:
> {noformat}
> "healthChecks": [
>     {
>       "protocol": "COMMAND",
>       "path": "/ops/ping",
>       "command": { "value": "curl --silent -f -X GET http://$HOST:$PORT0/ops/ping > /dev/null" },
>       "gracePeriodSeconds": 90,
>       "intervalSeconds": 2,
>       "portIndex": 0,
>       "timeoutSeconds": 5,
>       "maxConsecutiveFailures": 3
>     }
>   ]
> {noformat}
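> (For reference, the HTTP version we switched away from looked roughly like this:)
> {noformat}
> "healthChecks": [
>     {
>       "protocol": "HTTP",
>       "path": "/ops/ping",
>       "gracePeriodSeconds": 90,
>       "intervalSeconds": 2,
>       "portIndex": 0,
>       "timeoutSeconds": 5,
>       "maxConsecutiveFailures": 3
>     }
>   ]
> {noformat}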
> All our applications have the same health check (and /ops/ping endpoint).
> Even though we have the issue on all our Mesos slaves, I'm going to focus on a 
> particular one: *mesos-slave-i-e3a9c724*.
> The slave has 16 gigs of memory, with about 12 gigs allocated for 8 tasks:
> !https://i.imgur.com/gbRf804.png!
> Here is a *docker ps* on it:
> {noformat}
> root@mesos-slave-i-e3a9c724 # docker ps
> CONTAINER ID        IMAGE               COMMAND                  CREATED             STATUS              PORTS                     NAMES
> 4f7c0aa8d03a        java:8              "/bin/sh -c 'JAVA_OPT"   6 hours ago         Up 6 hours          0.0.0.0:31926->8080/tcp   mesos-29e183be-f611-41b4-824c-2d05b052231b-S6.3dbb1004-5bb8-432f-8fd8-b863bd29341d
> 66f2fc8f8056        java:8              "/bin/sh -c 'JAVA_OPT"   6 hours ago         Up 6 hours          0.0.0.0:31939->8080/tcp   mesos-29e183be-f611-41b4-824c-2d05b052231b-S6.60972150-b2b1-45d8-8a55-d63e81b8372a
> f7382f241fce        java:8              "/bin/sh -c 'JAVA_OPT"   6 hours ago         Up 6 hours          0.0.0.0:31656->8080/tcp   mesos-29e183be-f611-41b4-824c-2d05b052231b-S6.39731a2f-d29e-48d1-9927-34ab8c5f557d
> 880934c0049e        java:8              "/bin/sh -c 'JAVA_OPT"   24 hours ago        Up 24 hours         0.0.0.0:31371->8080/tcp   mesos-29e183be-f611-41b4-824c-2d05b052231b-S6.23dfe408-ab8f-40be-bf6f-ce27fe885ee0
> 5eab1f8dac4a        java:8              "/bin/sh -c 'JAVA_OPT"   46 hours ago        Up 46 hours         0.0.0.0:31500->8080/tcp   mesos-29e183be-f611-41b4-824c-2d05b052231b-S6.5ac75198-283f-4349-a220-9e9645b313e7
> b63740fe56e7        java:8              "/bin/sh -c 'JAVA_OPT"   46 hours ago        Up 46 hours         0.0.0.0:31382->8080/tcp   mesos-29e183be-f611-41b4-824c-2d05b052231b-S6.5d417f16-df24-49d5-a5b0-38a7966460fe
> 5c7a9ea77b0e        java:8              "/bin/sh -c 'JAVA_OPT"   2 days ago          Up 2 days           0.0.0.0:31186->8080/tcp   mesos-29e183be-f611-41b4-824c-2d05b052231b-S6.b05043c5-44fc-40bf-aea2-10354e8f5ab4
> 53065e7a31ad        java:8              "/bin/sh -c 'JAVA_OPT"   2 days ago          Up 2 days           0.0.0.0:31839->8080/tcp   mesos-29e183be-f611-41b4-824c-2d05b052231b-S6.f0a3f4c5-ecdb-4f97-bede-d744feda670c
> {noformat}
> Here is a *docker stats* on it:
> {noformat}
> root@mesos-slave-i-e3a9c724  # docker stats
> CONTAINER           CPU %               MEM USAGE / LIMIT     MEM %               NET I/O               BLOCK I/O
> 4f7c0aa8d03a        2.93%               797.3 MB / 1.611 GB   49.50%              1.277 GB / 1.189 GB   155.6 kB / 151.6 kB
> 53065e7a31ad        8.30%               738.9 MB / 1.611 GB   45.88%              419.6 MB / 554.3 MB   98.3 kB / 61.44 kB
> 5c7a9ea77b0e        4.91%               1.081 GB / 1.611 GB   67.10%              423 MB / 526.5 MB     3.219 MB / 61.44 kB
> 5eab1f8dac4a        3.13%               1.007 GB / 1.611 GB   62.53%              2.737 GB / 2.564 GB   6.566 MB / 118.8 kB
> 66f2fc8f8056        3.15%               768.1 MB / 1.611 GB   47.69%              258.5 MB / 252.8 MB   1.86 MB / 151.6 kB
> 880934c0049e        10.07%              735.1 MB / 1.611 GB   45.64%              1.451 GB / 1.399 GB   573.4 kB / 94.21 kB
> b63740fe56e7        12.04%              629 MB / 1.611 GB     39.06%              10.29 GB / 9.344 GB   8.102 MB / 61.44 kB
> f7382f241fce        6.21%               505 MB / 1.611 GB     31.36%              153.4 MB / 151.9 MB   5.837 MB / 94.21 kB
> {noformat}
> Not much else is running on the slave, yet the used memory doesn't map to the 
> tasks' memory:
> {noformat}
> Mem:16047M used:13340M buffers:1139M cache:776M
> {noformat}
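> (One way to cross-check, assuming GNU ps: sum the resident set sizes of every process and compare against the used figure above:)
> {noformat}
> root@mesos-slave-i-e3a9c724 # ps -eo rss= | awk '{ s += $1 } END { printf("total RSS: %.0f MB\n", s / 1024) }'
> {noformat}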
> If I exec into a container (*java:8* image), I can see the shell calls that run 
> the curl specified in the health check; they execute and exit as expected.
> The only change that coincided with the memory woes was moving the health 
> checks to Mesos, so I decided to take a look:
> {noformat}
> root@mesos-slave-i-e3a9c724 # ps awwx | grep health_check | grep -v grep
>  2504 ?        Sl    47:33 /usr/libexec/mesos/mesos-health-check --executor=(1)@10.92.32.63:53432 --health_check_json={"command":{"shell":true,"value":"docker exec mesos-29e183be-f611-41b4-824c-2d05b052231b-S6.f0a3f4c5-ecdb-4f97-bede-d744feda670c sh -c \" curl --silent -f -X GET http:\/\/$HOST:$PORT0\/ops\/ping > \/dev\/null \""},"consecutive_failures":3,"delay_seconds":0.0,"grace_period_seconds":90.0,"interval_seconds":2.0,"timeout_seconds":5.0} --task_id=prod_talkk_email-green.b086206a-e000-11e5-a617-02429957d388
>  4220 ?        Sl    47:26 /usr/libexec/mesos/mesos-health-check --executor=(1)@10.92.32.63:54982 --health_check_json={"command":{"shell":true,"value":"docker exec mesos-29e183be-f611-41b4-824c-2d05b052231b-S6.b05043c5-44fc-40bf-aea2-10354e8f5ab4 sh -c \" curl --silent -f -X GET http:\/\/$HOST:$PORT0\/ops\/ping > \/dev\/null \""},"consecutive_failures":3,"delay_seconds":0.0,"grace_period_seconds":90.0,"interval_seconds":2.0,"timeout_seconds":5.0} --task_id=prod_talkk_chat-green.ed53ec41-e000-11e5-a617-02429957d388
>  7444 ?        Sl     1:31 /usr/libexec/mesos/mesos-health-check --executor=(1)@10.92.32.63:59422 --health_check_json={"command":{"shell":true,"value":"docker exec mesos-29e183be-f611-41b4-824c-2d05b052231b-S6.60972150-b2b1-45d8-8a55-d63e81b8372a sh -c \" curl --silent -f -X GET http:\/\/$HOST:$PORT0\/ops\/ping > \/dev\/null \""},"consecutive_failures":3,"delay_seconds":0.0,"grace_period_seconds":90.0,"interval_seconds":2.0,"timeout_seconds":5.0} --task_id=prod_talkk_identity-green.aeb2ef3b-e219-11e5-a617-02429957d388
> 10368 ?        Sl     1:30 /usr/libexec/mesos/mesos-health-check --executor=(1)@10.92.32.63:40981 --health_check_json={"command":{"shell":true,"value":"docker exec mesos-29e183be-f611-41b4-824c-2d05b052231b-S6.3dbb1004-5bb8-432f-8fd8-b863bd29341d sh -c \" curl --silent -f -X GET http:\/\/$HOST:$PORT0\/ops\/ping > \/dev\/null \""},"consecutive_failures":3,"delay_seconds":0.0,"grace_period_seconds":90.0,"interval_seconds":2.0,"timeout_seconds":5.0} --task_id=prod_talkk_channel-green.c6fbd2ac-e219-11e5-a617-02429957d388
> 12399 ?        Sl     9:45 /usr/libexec/mesos/mesos-health-check --executor=(1)@10.92.32.63:44815 --health_check_json={"command":{"shell":true,"value":"docker exec mesos-29e183be-f611-41b4-824c-2d05b052231b-S6.23dfe408-ab8f-40be-bf6f-ce27fe885ee0 sh -c \" curl --silent -f -X GET http:\/\/$HOST:$PORT0\/ops\/ping > \/dev\/null \""},"consecutive_failures":3,"delay_seconds":0.0,"grace_period_seconds":90.0,"interval_seconds":2.0,"timeout_seconds":5.0} --task_id=prod_talkk_integration-green.143865d5-e17d-11e5-a617-02429957d388
> 13538 ?        Sl    24:54 /usr/libexec/mesos/mesos-health-check --executor=(1)@10.92.32.63:56598 --health_check_json={"command":{"shell":true,"value":"docker exec mesos-29e183be-f611-41b4-824c-2d05b052231b-S6.5d417f16-df24-49d5-a5b0-38a7966460fe sh -c \" curl --silent -f -X GET http:\/\/$HOST:$PORT0\/ops\/ping > \/dev\/null \""},"consecutive_failures":3,"delay_seconds":0.0,"grace_period_seconds":90.0,"interval_seconds":2.0,"timeout_seconds":5.0} --task_id=prod_talkk_metric-green.75296986-e0c7-11e5-a617-02429957d388
> 32034 ?        Sl     1:31 /usr/libexec/mesos/mesos-health-check --executor=(1)@10.92.32.63:48119 --health_check_json={"command":{"shell":true,"value":"docker exec mesos-29e183be-f611-41b4-824c-2d05b052231b-S6.39731a2f-d29e-48d1-9927-34ab8c5f557d sh -c \" curl --silent -f -X GET http:\/\/$HOST:$PORT0\/ops\/ping > \/dev\/null \""},"consecutive_failures":3,"delay_seconds":0.0,"grace_period_seconds":90.0,"interval_seconds":2.0,"timeout_seconds":5.0} --task_id=prod_talkk_push-green.601337e6-e219-11e5-a617-02429957d388
> {noformat}
> The memory usage is really bad:
> {noformat}
> root@mesos-slave-i-e3a9c724 # ps -eo size,pid,user,command --sort -size | grep health_check | awk '{ hr=$1/1024 ; printf("%13.2f Mb ",hr) } { for ( x=4 ; x<=NF ; x++ ) { printf("%s ",$x) } print "" }'
>       2185.39 Mb /usr/libexec/mesos/mesos-health-check --executor=(1)@10.92.32.63:53432 --health_check_json={"command":{"shell":true,"value":"docker exec mesos-29e183be-f611-41b4-824c-2d05b052231b-S6.f0a3f4c5-ecdb-4f97-bede-d744feda670c sh -c \" curl --silent -f -X GET http:\/\/$HOST:$PORT0\/ops\/ping > \/dev\/null \""},"consecutive_failures":3,"delay_seconds":0.0,"grace_period_seconds":90.0,"interval_seconds":2.0,"timeout_seconds":5.0} --task_id=prod_talkk_email-green.b086206a-e000-11e5-a617-02429957d388
>       2185.39 Mb /usr/libexec/mesos/mesos-health-check --executor=(1)@10.92.32.63:54982 --health_check_json={"command":{"shell":true,"value":"docker exec mesos-29e183be-f611-41b4-824c-2d05b052231b-S6.b05043c5-44fc-40bf-aea2-10354e8f5ab4 sh -c \" curl --silent -f -X GET http:\/\/$HOST:$PORT0\/ops\/ping > \/dev\/null \""},"consecutive_failures":3,"delay_seconds":0.0,"grace_period_seconds":90.0,"interval_seconds":2.0,"timeout_seconds":5.0} --task_id=prod_talkk_chat-green.ed53ec41-e000-11e5-a617-02429957d388
>       1673.39 Mb /usr/libexec/mesos/mesos-health-check --executor=(1)@10.92.32.63:56598 --health_check_json={"command":{"shell":true,"value":"docker exec mesos-29e183be-f611-41b4-824c-2d05b052231b-S6.5d417f16-df24-49d5-a5b0-38a7966460fe sh -c \" curl --silent -f -X GET http:\/\/$HOST:$PORT0\/ops\/ping > \/dev\/null \""},"consecutive_failures":3,"delay_seconds":0.0,"grace_period_seconds":90.0,"interval_seconds":2.0,"timeout_seconds":5.0} --task_id=prod_talkk_metric-green.75296986-e0c7-11e5-a617-02429957d388
>       1161.39 Mb /usr/libexec/mesos/mesos-health-check --executor=(1)@10.92.32.63:44815 --health_check_json={"command":{"shell":true,"value":"docker exec mesos-29e183be-f611-41b4-824c-2d05b052231b-S6.23dfe408-ab8f-40be-bf6f-ce27fe885ee0 sh -c \" curl --silent -f -X GET http:\/\/$HOST:$PORT0\/ops\/ping > \/dev\/null \""},"consecutive_failures":3,"delay_seconds":0.0,"grace_period_seconds":90.0,"interval_seconds":2.0,"timeout_seconds":5.0} --task_id=prod_talkk_integration-green.143865d5-e17d-11e5-a617-02429957d388
>        649.39 Mb /usr/libexec/mesos/mesos-health-check --executor=(1)@10.92.32.63:59422 --health_check_json={"command":{"shell":true,"value":"docker exec mesos-29e183be-f611-41b4-824c-2d05b052231b-S6.60972150-b2b1-45d8-8a55-d63e81b8372a sh -c \" curl --silent -f -X GET http:\/\/$HOST:$PORT0\/ops\/ping > \/dev\/null \""},"consecutive_failures":3,"delay_seconds":0.0,"grace_period_seconds":90.0,"interval_seconds":2.0,"timeout_seconds":5.0} --task_id=prod_talkk_identity-green.aeb2ef3b-e219-11e5-a617-02429957d388
>        649.39 Mb /usr/libexec/mesos/mesos-health-check --executor=(1)@10.92.32.63:40981 --health_check_json={"command":{"shell":true,"value":"docker exec mesos-29e183be-f611-41b4-824c-2d05b052231b-S6.3dbb1004-5bb8-432f-8fd8-b863bd29341d sh -c \" curl --silent -f -X GET http:\/\/$HOST:$PORT0\/ops\/ping > \/dev\/null \""},"consecutive_failures":3,"delay_seconds":0.0,"grace_period_seconds":90.0,"interval_seconds":2.0,"timeout_seconds":5.0} --task_id=prod_talkk_channel-green.c6fbd2ac-e219-11e5-a617-02429957d388
>        649.39 Mb /usr/libexec/mesos/mesos-health-check --executor=(1)@10.92.32.63:48119 --health_check_json={"command":{"shell":true,"value":"docker exec mesos-29e183be-f611-41b4-824c-2d05b052231b-S6.39731a2f-d29e-48d1-9927-34ab8c5f557d sh -c \" curl --silent -f -X GET http:\/\/$HOST:$PORT0\/ops\/ping > \/dev\/null \""},"consecutive_failures":3,"delay_seconds":0.0,"grace_period_seconds":90.0,"interval_seconds":2.0,"timeout_seconds":5.0} --task_id=prod_talkk_push-green.601337e6-e219-11e5-a617-02429957d388
>          0.32 Mb grep --color=auto health_check
> {noformat}
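> (Caveat on the above: *size* in ps is the swappable/virtual size, not resident memory; a resident view, again assuming GNU ps, would be something like:)
> {noformat}
> root@mesos-slave-i-e3a9c724 # ps -eo rss,pid,args --sort -rss | grep '[m]esos-health-check' | awk '{ printf("%10.2f Mb  pid %s  %s\n", $1/1024, $2, $3) }'
> {noformat}
> Either way, the footprint clearly grows with how long each check has been running.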
> Killing the *mesos-health-check* process for each container fixes our memory 
> issues (though I assume health checks will no longer be reported, or something 
> along those lines):
> {noformat}
> root@mesos-slave-i-e3a9c724 # date ; free -m ; ps awwx | grep health_check | grep -v grep | awk '{print $1}' | xargs -I% -P1 kill % ; date ; free -m
> Fri Mar  4 21:20:55 UTC 2016
>              total       used       free     shared    buffers     cached
> Mem:         16047      13538       2508          0       1140        774
> -/+ buffers/cache:      11623       4423
> Swap:            0          0          0
> Fri Mar  4 21:20:56 UTC 2016
>              total       used       free     shared    buffers     cached
> Mem:         16047       9101       6945          0       1140        774
> -/+ buffers/cache:       7186       8860
> Swap:            0          0          0
> {noformat}
> We're reverting to Marathon doing the health checks for now, but I'd like to 
> emphasize that this is happening across all our slaves (it's not an isolated 
> issue).
> Thanks for looking into it :)



