Repository: aurora
Updated Branches:
  refs/heads/master 34be63158 -> f2acf53ff


http://git-wip-us.apache.org/repos/asf/aurora/blob/f2acf53f/src/test/python/apache/aurora/client/cli/test_kill.py
----------------------------------------------------------------------
diff --git a/src/test/python/apache/aurora/client/cli/test_kill.py b/src/test/python/apache/aurora/client/cli/test_kill.py
index 0e859dc..bb78ee2 100644
--- a/src/test/python/apache/aurora/client/cli/test_kill.py
+++ b/src/test/python/apache/aurora/client/cli/test_kill.py
@@ -59,7 +59,8 @@ class TestKillCommand(AuroraClientCommandTest):
 
     fake_context = FakeAuroraCommandContext()
     fake_context.set_options(mock_options)
-
+    fake_context.add_expected_query_result(
+      self.create_query_call_result(), job_key=self.TEST_JOBKEY)
     mock_api = fake_context.get_api('test')
     mock_api.kill_job.return_value = AuroraClientCommandTest.create_blank_response(
       ResponseCode.JOB_UPDATING_ERROR, "Error.")
@@ -86,6 +87,8 @@ class TestKillCommand(AuroraClientCommandTest):
     fake_context.set_options(mock_options)
 
     fake_context.add_expected_query_result(
+      self.create_query_call_result(), job_key=self.TEST_JOBKEY)
+    fake_context.add_expected_query_result(
       AuroraClientCommandTest.create_query_call_result(
         AuroraClientCommandTest.create_scheduled_task(1, ScheduleStatus.RUNNING)))
 
@@ -115,6 +118,8 @@ class TestKillCommand(AuroraClientCommandTest):
     fake_context = FakeAuroraCommandContext()
     fake_context.set_options(mock_options)
 
+    fake_context.add_expected_query_result(
+      self.create_query_call_result(), job_key=self.TEST_JOBKEY)
     fake_context.add_expected_query_result(AuroraClientCommandTest.create_empty_task_result())
 
     with pytest.raises(Context.CommandError) as e:
@@ -131,6 +136,8 @@ class TestKillCommand(AuroraClientCommandTest):
     fake_context = FakeAuroraCommandContext()
     fake_context.set_options(mock_options)
 
+    fake_context.add_expected_query_result(
+      self.create_query_call_result(), job_key=self.TEST_JOBKEY)
     fake_context.add_expected_query_result(AuroraClientCommandTest.create_empty_task_result())
 
     command.execute(fake_context)
@@ -147,6 +154,8 @@ class TestKillCommand(AuroraClientCommandTest):
     fake_context = FakeAuroraCommandContext()
     fake_context.set_options(mock_options)
 
+    fake_context.add_expected_query_result(
+      self.create_query_call_result(), job_key=self.TEST_JOBKEY)
     fake_context.add_expected_query_result(AuroraClientCommandTest.create_empty_task_result())
 
     command.execute(fake_context)
@@ -170,6 +179,8 @@ class TestKillCommand(AuroraClientCommandTest):
     fake_context.add_config(config)
 
     mock_api = fake_context.get_api('test')
+    fake_context.add_expected_query_result(
+      self.create_query_call_result(), job_key=self.TEST_JOBKEY)
     fake_context.add_expected_query_result(AuroraClientCommandTest.create_empty_task_result())
     mock_api.kill_job.return_value = self.create_simple_success_response()
 
@@ -194,6 +205,8 @@ class TestKillCommand(AuroraClientCommandTest):
     fake_context.add_config(config)
 
     fake_context.add_expected_query_result(
+      self.create_query_call_result(), job_key=self.TEST_JOBKEY)
+    fake_context.add_expected_query_result(
       AuroraClientCommandTest.create_query_call_result(
         AuroraClientCommandTest.create_scheduled_task(1, ScheduleStatus.RUNNING)))
 
@@ -222,6 +235,8 @@ class TestKillAllCommand(AuroraClientCommandTest):
     fake_context.add_config(config)
 
     mock_api = fake_context.get_api('test')
+    fake_context.add_expected_query_result(
+      self.create_query_call_result(), job_key=self.TEST_JOBKEY)
     fake_context.add_expected_query_result(AuroraClientCommandTest.create_empty_task_result())
     mock_api.kill_job.return_value = self.create_simple_success_response()
 
@@ -272,8 +287,9 @@ class TestClientKillCommand(AuroraClientCommandTest):
     assert api.kill_job.mock_calls == [call(cls.TEST_JOBKEY, None, config=None, message=None)]
 
   @classmethod
-  def assert_query(cls, fake_api):
-    calls = [call(TaskQuery(jobKeys=[cls.TEST_JOBKEY.to_thrift()], statuses=ACTIVE_STATES))]
+  def assert_query(cls, fake_api, times=1):
+    calls = [call(TaskQuery(jobKeys=[cls.TEST_JOBKEY.to_thrift()], statuses=ACTIVE_STATES))
+      for _ in range(times)]
     assert fake_api.query_no_configs.mock_calls == calls
 
   def test_killall_job(self):
@@ -321,6 +337,8 @@ class TestClientKillCommand(AuroraClientCommandTest):
        patch('apache.aurora.client.cli.jobs.Job.create_context', return_value=mock_context),
        patch('apache.aurora.client.cli.jobs.JobMonitor', return_value=mock_monitor)) as (_, m):
       api = mock_context.get_api('west')
+      mock_context.add_expected_query_result(
+        self.create_query_call_result(), job_key=self.TEST_JOBKEY)
       api.kill_job.return_value = self.create_simple_success_response()
       cmd = AuroraCommandLine()
       cmd.execute(['job', 'kill', '--no-batching', self.get_instance_spec('0,2,4-6')])
@@ -328,6 +346,7 @@ class TestClientKillCommand(AuroraClientCommandTest):
       instances = [0, 2, 4, 5, 6]
       self.assert_kill_calls(api, instances=instances, message=None)
       self.assert_wait_calls(mock_monitor, m.terminal, instances=instances)
+      self.assert_query(api)
 
   def test_kill_job_with_invalid_instances_strict(self):
     """Test kill client-side API logic."""
@@ -350,12 +369,15 @@ class TestClientKillCommand(AuroraClientCommandTest):
        patch('apache.aurora.client.cli.jobs.Job.create_context', return_value=mock_context),
        patch('apache.aurora.client.cli.jobs.JobMonitor', return_value=mock_monitor)) as (_, m):
       api = mock_context.get_api('west')
+      mock_context.add_expected_query_result(
+        self.create_query_call_result(), job_key=self.TEST_JOBKEY)
       api.kill_job.return_value = self.create_simple_success_response()
       cmd = AuroraCommandLine()
       cmd.execute(['job', 'kill', '--no-batching', self.get_instance_spec('0,2,4-6,11-13')])
       instances = [0, 2, 4, 5, 6, 11, 12, 13]
       self.assert_kill_calls(api, instances=instances, message=None)
       self.assert_wait_calls(mock_monitor, m.terminal, instances=instances)
+      self.assert_query(api)
 
   def test_kill_job_with_instances_batched(self):
     """Test kill client-side API logic."""
@@ -366,7 +388,9 @@ class TestClientKillCommand(AuroraClientCommandTest):
        patch('apache.aurora.client.cli.jobs.JobMonitor', return_value=mock_monitor)) as (_, m):
       api = mock_context.get_api('west')
       mock_context.add_expected_query_result(
-          self.create_query_call_result(), job_key=self.TEST_JOBKEY)
+        self.create_query_call_result(), job_key=self.TEST_JOBKEY)
+      mock_context.add_expected_query_result(
+        self.create_query_call_result(), job_key=self.TEST_JOBKEY)
 
       api.kill_job.return_value = self.create_simple_success_response()
       cmd = AuroraCommandLine()
@@ -374,7 +398,7 @@ class TestClientKillCommand(AuroraClientCommandTest):
 
       self.assert_kill_calls(api, instance_range=range(7))
       self.assert_wait_calls(mock_monitor, m.terminal, instance_range=range(7))
-      self.assert_query(api)
+      self.assert_query(api, times=2)
 
   def test_kill_job_with_instances_batched_maxerrors(self):
     """Test kill client-side API logic."""
@@ -385,7 +409,9 @@ class TestClientKillCommand(AuroraClientCommandTest):
        patch('apache.aurora.client.cli.jobs.JobMonitor', return_value=mock_monitor)):
       api = mock_context.get_api('west')
       mock_context.add_expected_query_result(
-          self.create_query_call_result(), job_key=self.TEST_JOBKEY)
+        self.create_query_call_result(), job_key=self.TEST_JOBKEY)
+      mock_context.add_expected_query_result(
+        self.create_query_call_result(), job_key=self.TEST_JOBKEY)
 
       api.kill_job.return_value = self.create_simple_success_response()
       cmd = AuroraCommandLine()
@@ -393,7 +419,7 @@ class TestClientKillCommand(AuroraClientCommandTest):
 
       # We should have aborted after the second batch.
       self.assert_kill_calls(api, instance_range=range(2), message=None)
-      self.assert_query(api)
+      self.assert_query(api, times=2)
 
   def test_kill_job_with_empty_instances_batched(self):
     """Test kill client-side API logic."""
@@ -401,6 +427,8 @@ class TestClientKillCommand(AuroraClientCommandTest):
     with contextlib.nested(
        patch('apache.aurora.client.cli.jobs.Job.create_context', return_value=mock_context)):
       api = mock_context.get_api('west')
+      mock_context.add_expected_query_result(
+        self.create_query_call_result(), job_key=self.TEST_JOBKEY)
       # set up an empty instance list in the getTasksWithoutConfigs response
       status_response = self.create_simple_success_response()
       status_response.result = Result(scheduleStatusResult=ScheduleStatusResult(tasks=[]))
@@ -410,6 +438,7 @@ class TestClientKillCommand(AuroraClientCommandTest):
       cmd.execute(['job', 'kill', self.get_instance_spec('0,2,4-13')])
 
       assert api.kill_job.call_count == 0
+      self.assert_query(api, times=2)
 
   def test_killall_job_output(self):
     """Test kill output."""
@@ -433,6 +462,8 @@ class TestClientKillCommand(AuroraClientCommandTest):
        patch('apache.aurora.client.cli.jobs.Job.create_context', return_value=mock_context),
        patch('apache.aurora.client.cli.jobs.JobMonitor', return_value=self.get_monitor_mock())):
       api = mock_context.get_api('west')
+      mock_context.add_expected_query_result(
+        self.create_query_call_result(), job_key=self.TEST_JOBKEY)
       mock_context.add_expected_query_result(self.create_query_call_result())
       api.kill_job.return_value = self.create_simple_success_response()
       cmd = AuroraCommandLine()
@@ -441,6 +472,7 @@ class TestClientKillCommand(AuroraClientCommandTest):
     assert mock_context.get_out() == ['Successfully killed instances [0, 2, 4, 5, 6]',
         'Job kill succeeded']
     assert mock_context.get_err() == []
+    self.assert_query(api, times=2)
 
   def test_kill_job_with_instances_batched_maxerrors_output(self):
     """Test kill client-side API logic."""
@@ -450,6 +482,8 @@ class TestClientKillCommand(AuroraClientCommandTest):
        patch('apache.aurora.client.cli.jobs.Job.create_context', return_value=mock_context),
        patch('apache.aurora.client.cli.jobs.JobMonitor', return_value=mock_monitor)):
       api = mock_context.get_api('west')
+      mock_context.add_expected_query_result(
+        self.create_query_call_result(), job_key=self.TEST_JOBKEY)
       mock_context.add_expected_query_result(self.create_query_call_result())
       api.kill_job.return_value = self.create_simple_success_response()
       cmd = AuroraCommandLine()
@@ -461,6 +495,7 @@ class TestClientKillCommand(AuroraClientCommandTest):
          'Instances [0, 2, 4, 5, 6] were not killed in time',
          'Instances [7, 8, 9, 10, 11] were not killed in time',
          'Exceeded maximum number of errors while killing instances']
+      self.assert_query(api, times=2)
 
   def test_kill_job_with_message(self):
     """Test kill client-side API logic."""
@@ -470,6 +505,8 @@ class TestClientKillCommand(AuroraClientCommandTest):
        patch('apache.aurora.client.cli.jobs.Job.create_context', return_value=mock_context),
        patch('apache.aurora.client.cli.jobs.JobMonitor', return_value=mock_monitor)) as (_, m):
       api = mock_context.get_api('west')
+      mock_context.add_expected_query_result(
+        self.create_query_call_result(), job_key=self.TEST_JOBKEY)
       api.kill_job.return_value = self.create_simple_success_response()
       cmd = AuroraCommandLine()
       message = 'Test message'
@@ -477,3 +514,4 @@ class TestClientKillCommand(AuroraClientCommandTest):
 
       instances = [0]
       self.assert_kill_calls(api, instances=instances, message=message)
+      self.assert_query(api)
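
The hunks above all follow one pattern: the kill command now performs an up-front query for the job's active tasks, so every test seeds one extra expected query result, and assert_query() gained a times parameter so batched paths can verify both that query and the per-batch query. A minimal sketch of the seeding pattern, assuming FakeAuroraCommandContext serves queued results in FIFO order (its internals are not shown in this diff):

  # Sketch only; mirrors the pattern the patch repeats in each test.
  fake_context = FakeAuroraCommandContext()
  fake_context.set_options(mock_options)

  # Consumed first, by the kill command's new up-front active-task query:
  fake_context.add_expected_query_result(
    self.create_query_call_result(), job_key=self.TEST_JOBKEY)
  # Consumed second, by the query the individual test actually exercises:
  fake_context.add_expected_query_result(
    AuroraClientCommandTest.create_empty_task_result())

Batched kills consequently observe two queries, hence the assert_query(api, times=2) assertions in those tests.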

http://git-wip-us.apache.org/repos/asf/aurora/blob/f2acf53f/src/test/sh/org/apache/aurora/e2e/http_example.py
----------------------------------------------------------------------
diff --git a/src/test/sh/org/apache/aurora/e2e/http_example.py b/src/test/sh/org/apache/aurora/e2e/http_example.py
index ba7d114..140cc75 100644
--- a/src/test/sh/org/apache/aurora/e2e/http_example.py
+++ b/src/test/sh/org/apache/aurora/e2e/http_example.py
@@ -14,7 +14,6 @@
 from __future__ import print_function
 
 from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
-from SocketServer import ThreadingMixIn
 from sys import argv
 from threading import Thread
 
@@ -41,6 +40,7 @@ def start_server(port, handler_class):
   print('Listening on port %s' % port)
   server.serve_forever()
 
+
 request_thread = Thread(target=start_server, args=[int(argv[1]), RequestHandler])
 health_thread = Thread(target=start_server, args=[int(argv[2]), HealthHandler])
 

http://git-wip-us.apache.org/repos/asf/aurora/blob/f2acf53f/src/test/sh/org/apache/aurora/e2e/partition_aware.aurora
----------------------------------------------------------------------
diff --git a/src/test/sh/org/apache/aurora/e2e/partition_aware.aurora b/src/test/sh/org/apache/aurora/e2e/partition_aware.aurora
index 7ea9fad..98732e4 100644
--- a/src/test/sh/org/apache/aurora/e2e/partition_aware.aurora
+++ b/src/test/sh/org/apache/aurora/e2e/partition_aware.aurora
@@ -1,3 +1,17 @@
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
 # run the script
 hello_world = Process(
   name = 'hello_world',

http://git-wip-us.apache.org/repos/asf/aurora/blob/f2acf53f/src/test/sh/org/apache/aurora/e2e/sla_coordinator.py
----------------------------------------------------------------------
diff --git a/src/test/sh/org/apache/aurora/e2e/sla_coordinator.py b/src/test/sh/org/apache/aurora/e2e/sla_coordinator.py
new file mode 100644
index 0000000..24aa8f5
--- /dev/null
+++ b/src/test/sh/org/apache/aurora/e2e/sla_coordinator.py
@@ -0,0 +1,60 @@
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from __future__ import print_function
+
+from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
+from threading import Thread
+from urlparse import urlparse, parse_qs
+
+import json
+import sys
+
+
+class RequestHandler(BaseHTTPRequestHandler):
+  def do_POST(self):
+    task = parse_qs(urlparse(self.path).query)['task'][0]
+    body = self.rfile.read(int(self.headers.getheader('content-length', 0)))
+
+    # Only allow draining instance-0
+    allow = True if task == 'devcluster/vagrant/test/coordinator/0' else False
+
+    self.send_response(200)
+    self.send_header('Content-Type', 'application/json')
+    self.end_headers()
+    self.wfile.write(json.dumps({'drain': allow}))
+
+    print('Request received for {}'.format(task))
+    print(json.dumps(json.loads(body), indent=2, sort_keys=True))
+    print('Responded: {}'.format(allow))
+    sys.stdout.flush()
+
+
+def start_server(instance, port, handler_class):
+  # This service will act as its own SLA Coordinator.
+  # We need a static port for the coordinator service so the scheduler can communicate with it
+  # when checking SLA. Two tasks cannot bind to the same port, so instance-1 binds to 8080 and
+  # instance-0, the one that will be acked for draining, binds to a random port.
+  if instance != 1:
+    port = 0
+
+  server = HTTPServer(('', port), handler_class)
+  print('Listening on port %s' % port)
+  sys.stdout.flush()
+  server.serve_forever()
+
+
+thread = Thread(target=start_server, args=[int(sys.argv[1]), 8080, RequestHandler])
+
+thread.start()
+thread.join()
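
To poke the coordinator by hand while its job is running, a request shaped like the one the handler parses can be sent as below. This is a sketch: the handler only reads the task query parameter and logs the body, and only the scheduler's real drain request defines the actual payload, so the body here is a placeholder.

  # Python 2 sketch, matching the coordinator's own stdlib choices.
  import json
  import urllib2

  req = urllib2.Request(
    'http://localhost:8080/?task=devcluster/vagrant/test/coordinator/0',
    data=json.dumps({'taskKey': 'placeholder'}),  # hypothetical body shape
    headers={'Content-Type': 'application/json'})
  print(urllib2.urlopen(req).read())  # instance-0 is acked: {"drain": true}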

http://git-wip-us.apache.org/repos/asf/aurora/blob/f2acf53f/src/test/sh/org/apache/aurora/e2e/sla_policy.aurora
----------------------------------------------------------------------
diff --git a/src/test/sh/org/apache/aurora/e2e/sla_policy.aurora b/src/test/sh/org/apache/aurora/e2e/sla_policy.aurora
new file mode 100644
index 0000000..d819778
--- /dev/null
+++ b/src/test/sh/org/apache/aurora/e2e/sla_policy.aurora
@@ -0,0 +1,61 @@
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+hello = Process(
+  name = 'hello',
+  cmdline = """
+    while true; do
+      echo hello world
+      sleep 10
+    done
+  """)
+
+copy_coordinator = Process(
+  name = 'copy_coordinator',
+  cmdline = 'cp /vagrant/src/test/sh/org/apache/aurora/e2e/sla_coordinator.py .'
+)
+
+run_coordinator = Process(
+  name = 'run_coordinator',
+  cmdline = 'python sla_coordinator.py {{mesos.instance}}')
+
+task = SequentialTask(
+  processes = [hello],
+  resources = Resources(cpu = 0.1, ram = 1*MB, disk = 8*MB))
+
+coordinator_task = SequentialTask(
+  processes = [copy_coordinator, run_coordinator],
+  resources = Resources(cpu = 0.1, ram = 1*MB, disk = 8*MB))
+
+service = Service(
+  task = task,
+  tier = 'preferred',
+  instances = 2,
+  cluster = 'devcluster',
+  role = 'vagrant',
+  environment = 'test')
+
+coordinator_service = Service(
+  task = coordinator_task,
+  tier = 'preferred',
+  instances = 2,
+  cluster = 'devcluster',
+  role = 'vagrant',
+  environment = 'test')
+
+jobs = [
+  service(name = 'count', sla_policy=CountSlaPolicy(count=1, duration_secs=60)),
+  service(name = 'percentage', sla_policy=PercentageSlaPolicy(percentage=50, duration_secs=60)),
+  coordinator_service(name = 'coordinator', sla_policy=CoordinatorSlaPolicy(coordinator_url='http://localhost:8080')),
+]
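
For orientation: with 2 instances per service, each of these policies permits exactly one task to be drained at a time, which is what the e2e test below asserts (1 RUNNING, 1 PENDING mid-drain). The arithmetic, based on the documented policy semantics (at least count instances, or at least percentage percent of instances, must stay RUNNING; the coordinator is consulted per task):

  # My arithmetic, not code from this patch.
  import math

  instances = 2
  drainable_by_count = instances - 1  # CountSlaPolicy(count=1): keep >= 1 RUNNING
  drainable_by_pct = instances - int(math.ceil(50 / 100.0 * instances))  # keep >= 50%
  # CoordinatorSlaPolicy: the coordinator above only acks instance-0, so also 1.
  print(drainable_by_count, drainable_by_pct)  # 1 1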

http://git-wip-us.apache.org/repos/asf/aurora/blob/f2acf53f/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh
----------------------------------------------------------------------
diff --git a/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh b/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh
index 888efe4..24f4448 100755
--- a/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh
+++ b/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh
@@ -33,11 +33,16 @@ _curl() { curl --silent --fail --retry 4 --retry-delay 10 "$@" ; }
 tear_down() {
   set +x  # Disable command echo, as this makes it more difficult to see which command failed.
 
-  for job in http_example http_example_watch_secs http_example_revocable http_example_docker http_example_unified_appc http_example_unified_docker; do
-    aurora update abort devcluster/vagrant/test/$job >/dev/null 2>&1 || true
-    aurora job killall --no-batching devcluster/vagrant/test/$job >/dev/null 2>&1
+  local _jobs=$(aurora job list $TEST_CLUSTER/$TEST_ROLE | grep $TEST_ROLE)
+
+  for job in ${_jobs[@]}; do
+    aurora update abort $job >/dev/null 2>&1 || true
+    aurora job killall --no-batching $job >/dev/null 2>&1
   done
 
+  aurora_admin set_quota $TEST_CLUSTER $TEST_ROLE 0 0m 0m
+  aurora_admin host_activate --hosts=$TEST_SLAVE_IP $TEST_CLUSTER
+
   sudo mv /etc/aurora/clusters.json.old /etc/aurora/clusters.json >/dev/null 2>&1 || true
 }
 
@@ -241,12 +246,50 @@ wait_until_task_status() {
       _success=1
       break
     else
-      sleep 1
+      sleep 20
+    fi
+  done
+
+  if [[ "$_success" -ne "1" ]]; then
+    echo "Task did not transition to $_expected_state within timeout."
+    exit 1
+  fi
+}
+
+assert_host_status() {
+  local _host=$1 _cluster=$2 _expected_state=$3
+
+  local _state=$(aurora_admin host_status --hosts=$_host $_cluster 2>&1 | tail -n1 | awk -F' ' '{print $6}')
+
+  if [[ $_state != $_expected_state ]]; then
+    echo "Expected host $_host to be in state $_expected_state, but found $_state"
+    exit 1
+  fi
+}
+
+wait_until_task_counts() {
+  # Poll the job, waiting for it to reach the expected RUNNING and PENDING task counts
+  local _jobkey=$1 _expected_running=$2 _expected_pending=$3
+  local _num_running=0
+  local _num_pending=0
+  local _success=0
+
+  for i in $(seq 1 120); do
+    # The || keeps a non-zero exit here from terminating the script, which would trigger `trap collect_result`.
+    _num_running=$(aurora job status $_jobkey --write-json | jq -r ".[0].active[].status" | grep "RUNNING" | wc -l) || echo $?
+    _num_pending=$(aurora job status $_jobkey --write-json | jq -r ".[0].active[].status" | grep "PENDING" | wc -l) || echo $?
+
+    if [[ $_num_running == $_expected_running ]] && [[ $_num_pending == $_expected_pending ]]; then
+      _success=1
+      break
+    else
+      echo "Waiting for job $_jobkey to have $_expected_running RUNNING and $_expected_pending PENDING tasks."
+      sleep 20
     fi
   done
 
   if [[ "$_success" -ne "1" ]]; then
-    echo "Task did not transition to $_expected_state within two minutes."
+    echo "Job $_jobkey did not have $_expected_running RUNNING tasks and $_expected_pending PENDING tasks within timeout."
     exit 1
   fi
 }
@@ -341,6 +384,73 @@ test_partition_awareness() {
   aurora job killall $_delay_jobkey
 }
 
+run_sla_aware_maintenance() {
+  local _config=$1
+  local _cluster=$2
+  local _jobkey=$3
+
+  aurora job create $_jobkey $_config --wait-until RUNNING
+
+  # assert the task counts: the job should have 2 RUNNING tasks
+  wait_until_task_counts $_jobkey 2 0
+
+  # check that the host starts with no maintenance mode
+  assert_host_status $TEST_SLAVE_IP $_cluster "NONE"
+
+  # trigger an SLA-aware drain with the default timeout of 2hr,
+  # so only the allowed number of tasks (1 per job) should drain
+  aurora_admin sla_host_drain --hosts=$TEST_SLAVE_IP $_cluster
+
+  # force a scheduler restart and make sure that the maintenance request is still satisfied
+  sudo systemctl restart aurora-scheduler
+
+  # host must have maintenance mode set
+  assert_host_status $TEST_SLAVE_IP $_cluster "DRAINING"
+
+  # tasks get drained as allowed by the sla policy
+  wait_until_task_counts $_jobkey 1 1
+
+  # for coordinator sla check specific task states
+  if [[ $_jobkey == $TEST_JOB_COORDINATOR_SLA ]]; then
+    assert_task_status $_jobkey "0" PENDING
+    assert_task_status $_jobkey "1" RUNNING
+  fi
+
+  # host must have maintenance mode set and should be waiting in DRAINING
+  assert_host_status $TEST_SLAVE_IP $_cluster "DRAINING"
+
+  # force sla aware drain with zero timeout
+  aurora_admin sla_host_drain --force_drain_timeout=0s --hosts=$TEST_SLAVE_IP $_cluster
+
+  # tasks get drained as allowed by the sla policy
+  wait_until_task_counts $_jobkey 0 2
+
+  # activate host again
+  aurora_admin host_activate --hosts=$TEST_SLAVE_IP $_cluster
+
+  # assert the task counts: the job should again have 2 RUNNING tasks
+  wait_until_task_counts $_jobkey 2 0
+
+  # clean up
+  aurora job killall $_jobkey
+}
+
+test_sla_aware_maintenance() {
+  local _config=$1
+  local _cluster=$2
+  local _role=$3
+  local _count_jobkey=$4
+  local _percentage_jobkey=$5
+  local _coordinator_jobkey=$6
+
+  # add quota for each job (plus extra for executor overhead) since only preferred-tier jobs get an SLA policy
+  aurora_admin increase_quota $_cluster $_role 1.0 10m 50m
+
+  run_sla_aware_maintenance $_config $_cluster $_count_jobkey
+  run_sla_aware_maintenance $_config $_cluster $_percentage_jobkey
+  run_sla_aware_maintenance $_config $_cluster $_coordinator_jobkey
+}
+
 test_announce() {
   local _role=$1 _env=$2 _job=$3
 
@@ -744,6 +854,10 @@ TEST_PARTITION_AWARENESS_CONFIG_FILE=$TEST_ROOT/partition_aware.aurora
 TEST_JOB_PA_DEFAULT=$TEST_CLUSTER/$TEST_ROLE/$TEST_ENV/partition_aware_default
 TEST_JOB_PA_DISABLED=$TEST_CLUSTER/$TEST_ROLE/$TEST_ENV/partition_aware_disabled
 TEST_JOB_PA_DELAY=$TEST_CLUSTER/$TEST_ROLE/$TEST_ENV/partition_aware_delay
+TEST_SLA_POLICY_CONFIG_FILE=$TEST_ROOT/sla_policy.aurora
+TEST_JOB_COUNT_SLA=$TEST_CLUSTER/$TEST_ROLE/$TEST_ENV/count
+TEST_JOB_PERCENTAGE_SLA=$TEST_CLUSTER/$TEST_ROLE/$TEST_ENV/percentage
+TEST_JOB_COORDINATOR_SLA=$TEST_CLUSTER/$TEST_ROLE/$TEST_ENV/coordinator
 
 BASE_ARGS=(
   $TEST_CLUSTER
@@ -792,6 +906,15 @@ TEST_PARTITION_AWARENESS_ARGS=(
   $TEST_JOB_PA_DELAY
 )
 
+TEST_SLA_AWARE_MAINTENANCE_ARGS=(
+  $TEST_SLA_POLICY_CONFIG_FILE
+  $TEST_CLUSTER
+  $TEST_ROLE
+  $TEST_JOB_COUNT_SLA
+  $TEST_JOB_PERCENTAGE_SLA
+  $TEST_JOB_COORDINATOR_SLA
+)
+
 TEST_JOB_KILL_MESSAGE_ARGS=("${TEST_JOB_ARGS[@]}" "--message='Test message'")
 
 trap collect_result EXIT
@@ -800,6 +923,8 @@ aurorabuild all
 setup_ssh
 setup_docker_registry
 
+test_sla_aware_maintenance "${TEST_SLA_AWARE_MAINTENANCE_ARGS[@]}"
+
 test_partition_awareness "${TEST_PARTITION_AWARENESS_ARGS[@]}"
 
 test_version
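
A note on the polling in wait_until_task_counts: the jq path ".[0].active[].status" implies that aurora job status --write-json emits a list whose first element carries an active array of task objects with a status field. For local debugging, the same counts can be derived in Python (a sketch under that assumed shape; the jobkey is TEST_JOB_COUNT_SLA from above):

  import json
  import subprocess

  out = subprocess.check_output(
    ['aurora', 'job', 'status', 'devcluster/vagrant/test/count', '--write-json'])
  statuses = [t['status'] for t in json.loads(out)[0]['active']]
  print(statuses.count('RUNNING'), statuses.count('PENDING'))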

http://git-wip-us.apache.org/repos/asf/aurora/blob/f2acf53f/ui/src/main/js/components/TaskConfigSummary.js
----------------------------------------------------------------------
diff --git a/ui/src/main/js/components/TaskConfigSummary.js b/ui/src/main/js/components/TaskConfigSummary.js
index 64880f4..f03d44d 100644
--- a/ui/src/main/js/components/TaskConfigSummary.js
+++ b/ui/src/main/js/components/TaskConfigSummary.js
@@ -70,6 +70,46 @@ function PartitionPolicy({ config }) {
     <td>{config.partitionPolicy.delaySecs}</td>
   </tr>];
 }
+
+function SlaPolicy({ config }) {
+  if (isNully(config.slaPolicy)) {
+    return null;
+  }
+
+  if (!isNully(config.slaPolicy.countSlaPolicy)) {
+    return [<tr>
+        <th rowSpan='2'>Count SLA Policy</th>
+        <td>count</td>
+        <td>{config.slaPolicy.countSlaPolicy.count}</td>
+      </tr>,
+      <tr>
+        <td>duration secs</td>
+        <td>{config.slaPolicy.countSlaPolicy.durationSecs}</td>
+      </tr>];
+  } else if (!isNully(config.slaPolicy.percentageSlaPolicy)) {
+    return [<tr>
+        <th rowSpan='2'>Percentage SLA Policy</th>
+        <td>percentage</td>
+        <td>{config.slaPolicy.percentageSlaPolicy.percentage + '%'}</td>
+      </tr>,
+      <tr>
+        <td>duration secs</td>
+        <td>{config.slaPolicy.percentageSlaPolicy.durationSecs}</td>
+      </tr>];
+  } else if (!isNully(config.slaPolicy.coordinatorSlaPolicy)) {
+    return [<tr>
+        <th rowSpan='2'>Coordinator SLA Policy</th>
+        <td>coordinator url</td>
+        <td>{config.slaPolicy.coordinatorSlaPolicy.coordinatorUrl}</td>
+      </tr>,
+      <tr>
+        <td>status key</td>
+        <td>{config.slaPolicy.coordinatorSlaPolicy.statusKey}</td>
+      </tr>];
+  }
+
+  return null;
+}
 /* eslint-enable */
 
 export function CronConfigSummary({ cronJob }) {
@@ -108,6 +148,7 @@ export function CronConfigSummary({ cronJob }) {
       </tr>
       <Metadata config={config} />
       <PartitionPolicy config={config} />
+      <SlaPolicy config={config} />
       <tr>
         <th>Contact</th>
         <td colSpan='2'>{config.contactEmail}</td>
@@ -136,6 +177,7 @@ export default function TaskConfigSummary({ config, instances }) {
       </tr>
       <Metadata config={config} />
       <PartitionPolicy config={config} />
+      <SlaPolicy config={config} />
       <tr>
         <th>Contact</th>
         <td colSpan='2'>{config.contactEmail}</td>
