This is an automated email from the ASF dual-hosted git repository.

potiuk pushed a commit to branch add-gcp-cloud-init
in repository https://gitbox.apache.org/repos/asf/airflow-ci-infra.git
commit e20c7b07d10d012f12ea974e3c5656fabf85005a
Author: Jarek Potiuk <[email protected]>
AuthorDate: Sun May 2 18:54:44 2021 +0200

    Cloud init is adapted to run GCE instance
---
 cloud-init.yml                                | 66 +++++++++++++++++-----
 gcp/README.md                                 | 45 +++++++++++++++
 gcp/metrics/gcp_create_metrics_descriptor.py  | 56 ++++++++++++++++++
 .../metrics/gcp_delete_metrics_descriptor.py  | 23 +++++++-
 gcp/metrics/gcp_write_metrics_data.py         | 57 +++++++++++++++++++
 lambdas/scale_out_runner/app.py               |  7 ++-
 lambdas/scale_out_runner/requirements.txt     |  1 +
 requirements.txt                              |  2 +
 scripts/runner-supervisor.py                  |  2 +-
 9 files changed, 240 insertions(+), 19 deletions(-)

diff --git a/cloud-init.yml b/cloud-init.yml
index 05d71d5..af14d2c 100644
--- a/cloud-init.yml
+++ b/cloud-init.yml
@@ -39,14 +39,21 @@ runcmd:
     - -c
     - |
       set -eu -o pipefail
-      echo "AWS_DEFAULT_REGION=$(cloud-init query region)" >> /etc/environment
+      if [[ $(cloud-init query cloud_name) == "aws" ]]; then
+        echo "AWS_DEFAULT_REGION=$(cloud-init query region)" >> /etc/environment
+      fi
+      if [[ $(cloud-init query cloud_name) == "gce" ]]; then
+        echo "GCP_DEFAULT_REGION=$(cloud-init query region)" >> /etc/environment
+      fi
       # Set an env var (that is visible in runners) that will let us know we are on a self-hosted runner
       echo 'AIRFLOW_SELF_HOSTED_RUNNER="[\"self-hosted\"]"' >> /etc/environment
       set -a
       . /etc/environment
       set +a
-      echo "ASG_GROUP_NAME=$(aws ec2 describe-tags --filter Name=resource-id,Values=$(cloud-init query instance_id) Name=key,Values=aws:autoscaling:groupName \
-        | jq -r '@sh "\(.Tags[0].Value)"')" >> /etc/environment
+      if [[ $(cloud-init query cloud_name) == "aws" ]]; then
+        echo "ASG_GROUP_NAME=$(aws ec2 describe-tags --filter Name=resource-id,Values=$(cloud-init query instance_id) Name=key,Values=aws:autoscaling:groupName \
+          | jq -r '@sh "\(.Tags[0].Value)"')" >> /etc/environment
+      fi
   - [systemctl, daemon-reload]
   - - bash
@@ -75,10 +82,27 @@ runcmd:
       . /etc/environment
       set +a
-      aws s3 cp s3://airflow-ci-assets/runner-supervisor.py /opt/runner-supervisor/bin/runner-supervisor
+      if [[ $(cloud-init query cloud_name) == "aws" ]]; then
+        aws s3 cp s3://airflow-ci-assets/runner-supervisor.py /opt/runner-supervisor/bin/runner-supervisor
+      fi
+      if [[ $(cloud-init query cloud_name) == "gce" ]]; then
+        gsutil cp gs://airflow-ci-assets/runner-supervisor.py /opt/runner-supervisor/bin/runner-supervisor
+      fi
       chmod 755 /opt/runner-supervisor/bin/runner-supervisor
-    - 2.277.1-airflow3
+    -
+    - bash
+    - -c
+    - |
+      set -eu -o pipefail
+      if [[ $(cloud-init query cloud_name) == "gce" ]]; then
+        gsutil cp gs://airflow-ci-assets/requirements.txt /opt/requirements.txt
+        python3 -mvenv /opt/gcp-metrics-writer
+        /opt/gcp-metrics-writer/bin/pip install -r /opt/requirements.txt
+        gsutil cp gs://airflow-ci-assets/gcp_write_metrics_data.py /opt/gcp-metrics-writer/bin/gcp_write_metrics_data
+        chmod a+x /opt/gcp-metrics-writer/bin/gcp_write_metrics_data
+      fi
   - [systemctl, enable, --now, iptables.service]
   # Restart docker after applying the user firewall -- else some rules/chains might be lost!
  - [systemctl, restart, docker.service]
@@ -120,9 +144,15 @@ write_files:
       docker ps -qa | xargs --verbose --no-run-if-empty docker rm -fv

       echo "Log in to a paid docker user to get unlimited docker pulls"
-      aws ssm get-parameter --with-decryption --name /runners/apache/airflow/dockerPassword | \
-        jq .Parameter.Value -r | \
-        sudo -u runner docker login --username airflowcirunners --password-stdin
+      if [[ $(cloud-init query cloud_name) == "aws" ]]; then
+        aws ssm get-parameter --with-decryption --name /runners/apache/airflow/dockerPassword | \
+          jq .Parameter.Value -r | \
+          sudo -u runner docker login --username airflowcirunners --password-stdin
+      fi
+      if [[ $(cloud-init query cloud_name) == "gce" ]]; then
+        gcloud secrets versions access latest --secret "runners-apache-airflow-dockerPassword" | \
+          sudo -u runner docker login --username airflowcirunners --password-stdin
+      fi

       if [[ -d ~runner/actions-runner/_work/airflow/airflow ]]; then
         cd ~runner/actions-runner/_work/airflow/airflow
@@ -134,7 +164,8 @@ write_files:
           git submodule deinit --all -f && \
           git submodule foreach git clean -fxd && \
           git clean -fxd \
-          "
+          "
+        fi
       fi
     owner: root:root
@@ -176,7 +207,11 @@ write_files:
       runner ALL=(ALL) NOPASSWD:/usr/sbin/swapoff -a, /usr/bin/rm -f /swapfile, /usr/bin/apt clean
  - path: /etc/iptables/rules.v4
    content: |
+      #
       # Generated by iptables-save v1.8.4 on Thu Jan 14 13:59:27 2021
+      # The Metadata server IP address is the same for AWS and GCP: 169.254.169.254
+      # Which is pretty cool.
+      #
       *filter
       :INPUT ACCEPT [833:75929]
       :FORWARD DROP [0:0]
@@ -188,18 +223,23 @@
       -A DOCKER-USER -j RETURN
       COMMIT
-  - path: /usr/local/sbin/actions-runner-ec2-reporting
+  - path: /usr/local/sbin/actions-runner-reporting
    permissions: '0775'
    content: |
      #!/bin/bash
      set -eu -o pipefail
      if pgrep -c Runner.Worker >/dev/null; then
        # Only report metric when we're doing something -- no point paying to submit zeros
-        aws cloudwatch put-metric-data --metric-name jobs-running --value "$(pgrep -c Runner.Worker)" --namespace github.actions
+        if [[ $(cloud-init query cloud_name) == "aws" ]]; then
+          aws cloudwatch put-metric-data --metric-name jobs-running --value "$(pgrep -c Runner.Worker)" --namespace github.actions
+        fi
+        if [[ $(cloud-init query cloud_name) == "gce" ]]; then
+          /opt/gcp-metrics-writer/bin/python /opt/gcp-metrics-writer/bin/gcp_write_metrics_data --value "$(pgrep -c Runner.Worker)"
+        fi
      fi
-  - path: /etc/cron.d/cloudwatch-metrics-github-runners
+  - path: /etc/cron.d/metrics-github-runners
    content: |
-      */1 * * * * nobody /usr/local/sbin/actions-runner-ec2-reporting
+      */1 * * * * nobody /usr/local/sbin/actions-runner-reporting
  - path: /etc/systemd/system/actions.runner-supervisor.service
    content: |
diff --git a/gcp/README.md b/gcp/README.md
new file mode 100644
index 0000000..ca74299
--- /dev/null
+++ b/gcp/README.md
@@ -0,0 +1,45 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Notes from setting up GCP version of Airflow CI runner
+
+These are notes taken while setting up the GCP version of the runner.
+
+1. Created a new service account without any permissions:
+   [email protected]
+
+2. Created custom roles with these permissions:
+
+   * Monitoring Metric Writer
+     * monitoring.timeSeries.create
+
+3. Created the `runners-apache-airflow-dockerPassword` secret with the same value as in AWS.
+
+4. Assigned roles to the "airflow-ci-runner" service account:
+
+   * Monitoring Metric Writer
+   * Secret Manager Secret Accessor
+
+5. Created the `airflow-ci-assets` GCS bucket with "public read" permissions.
+
+6. Copied these files there (they need to be copied every time they are changed):
+   * gcp_write_metrics_data.py
+   * get-runner-creds.py
+   * requirements.txt
+   * runner-supervisor.py
diff --git a/gcp/metrics/gcp_create_metrics_descriptor.py b/gcp/metrics/gcp_create_metrics_descriptor.py
new file mode 100755
index 0000000..fda60be
--- /dev/null
+++ b/gcp/metrics/gcp_create_metrics_descriptor.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import click
+from google.api import label_pb2 as ga_label, metric_pb2 as ga_metric
+from google.cloud import monitoring_v3
+
+DEFAULT_PROJECT = 'apache-airflow-ci-cd'
+DEFAULT_ZONE = 'us-central1-a'
+CUSTOM_METRICS_TYPE = 'custom.googleapis.com/github-actions/jobs-running'
+
+
[email protected]()
[email protected]('--project', default=DEFAULT_PROJECT)
+def main(project: str):
+    client = monitoring_v3.MetricServiceClient()
+    project_name = f"projects/{project}"
+    descriptor = ga_metric.MetricDescriptor()
+    descriptor.display_name = "GitHub Actions jobs"
+    descriptor.type = CUSTOM_METRICS_TYPE
+    descriptor.metric_kind = ga_metric.MetricDescriptor.MetricKind.GAUGE
+    descriptor.value_type = ga_metric.MetricDescriptor.ValueType.INT64
+    descriptor.description = "Number of Jobs running for GitHub Actions."
+
+    label_instance_id = ga_label.LabelDescriptor()
+    label_instance_id.key = "instance_id"
+    label_instance_id.value_type = ga_label.LabelDescriptor.ValueType.STRING
+    label_instance_id.description = "The instance_id"
+    label_zone = ga_label.LabelDescriptor()
+    label_zone.key = "zone"
+    label_zone.value_type = ga_label.LabelDescriptor.ValueType.STRING
+    label_zone.description = "The zone"
+    descriptor.labels.append(label_instance_id)
+    descriptor.labels.append(label_zone)
+
+    descriptor = client.create_metric_descriptor(name=project_name, metric_descriptor=descriptor)
+    print(f"Created {descriptor.name}.")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/lambdas/scale_out_runner/requirements.txt b/gcp/metrics/gcp_delete_metrics_descriptor.py
old mode 100644
new mode 100755
similarity index 54%
copy from lambdas/scale_out_runner/requirements.txt
copy to gcp/metrics/gcp_delete_metrics_descriptor.py
index c5402f0..c7e6a0f
--- a/lambdas/scale_out_runner/requirements.txt
+++ b/gcp/metrics/gcp_delete_metrics_descriptor.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
@@ -15,5 +16,23 @@
 # specific language governing permissions and limitations
 # under the License.

-boto3
-chalice
+import click
+from google.cloud import monitoring_v3
+
+DEFAULT_PROJECT = 'apache-airflow-ci-cd'
+DEFAULT_ZONE = 'us-central1-a'
+CUSTOM_METRICS_TYPE = 'custom.googleapis.com/github-actions/jobs-running'
+
+
[email protected]()
[email protected]('--project', default=DEFAULT_PROJECT)
[email protected]('--zone', default=DEFAULT_ZONE)
+def main(project: str, zone: str):
+    client = monitoring_v3.MetricServiceClient()
+    descriptor_name = f"projects/{project}/metricDescriptors/{CUSTOM_METRICS_TYPE}"
+    client.delete_metric_descriptor(name=descriptor_name)
+    print(f"Deleted metric descriptor {descriptor_name}.")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/gcp/metrics/gcp_write_metrics_data.py b/gcp/metrics/gcp_write_metrics_data.py
new file mode 100755
index 0000000..56020f6
--- /dev/null
+++ b/gcp/metrics/gcp_write_metrics_data.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import time
+
+import click
+import requests
+from google.cloud import monitoring_v3
+
+DEFAULT_PROJECT = 'apache-airflow-ci-cd'
+DEFAULT_ZONE = 'us-central1-a'
+CUSTOM_METRICS_TYPE = 'custom.googleapis.com/github-actions/jobs-running'
+
+
[email protected]()
[email protected]('--project', default=DEFAULT_PROJECT)
[email protected]('--instance')
[email protected]('--value', type=int, default=1)
+def main(project: str, instance: str, value):
+    client = monitoring_v3.MetricServiceClient()
+    project_name = f"projects/{project}"
+
+    if not instance:
+        instance = requests.get(
+            "http://metadata/computeMetadata/v1/instance/id", headers={'Metadata-Flavor': 'Google'}
+        ).text
+    series = monitoring_v3.TimeSeries()
+    series.metric.type = CUSTOM_METRICS_TYPE
+    series.resource.type = "gce_instance"
+    series.resource.labels["instance_id"] = instance
+    series.resource.labels["zone"] = DEFAULT_ZONE
+    now = time.time()
+    seconds = int(now)
+    nanos = int((now - seconds) * 10 ** 9)
+    interval = monitoring_v3.TimeInterval({"end_time": {"seconds": seconds, "nanos": nanos}})
+    point = monitoring_v3.Point({"interval": interval, "value": {"int64_value": value}})
+    series.points = [point]
+    client.create_time_series(name=project_name, time_series=[series])
+    print(f"Reported {CUSTOM_METRICS_TYPE} with value {value}")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/lambdas/scale_out_runner/app.py b/lambdas/scale_out_runner/app.py
index 2f40fb1..e011794 100644
--- a/lambdas/scale_out_runner/app.py
+++ b/lambdas/scale_out_runner/app.py
@@ -29,7 +29,8 @@ from chalice.app import Request
 app = Chalice(app_name='scale_out_runner')
 app.log.setLevel(logging.INFO)

-ASG_GROUP_NAME = os.getenv('ASG_NAME', 'AshbRunnerASG')
+AWS_ASG_GROUP_NAME = os.getenv('AWS_ASG_NAME', 'AshbRunnerASG')
+GCP_ASG_GROUP_NAME = os.getenv('GCP_ASG_NAME', 'AshbRunnerASG')
 TABLE_NAME = os.getenv('COUNTER_TABLE', 'GithubRunnerQueue')

 _commiters = set()
 GH_WEBHOOK_TOKEN = None
@@ -180,7 +181,7 @@ def scale_asg_if_needed(num_queued_jobs: int) -> dict:
     asg = boto3.client('autoscaling')

     resp = asg.describe_auto_scaling_groups(
-        AutoScalingGroupNames=[ASG_GROUP_NAME],
+        AutoScalingGroupNames=[AWS_ASG_GROUP_NAME],
     )
     asg_info = resp['AutoScalingGroups'][0]
@@ -199,7 +200,7 @@ def scale_asg_if_needed(num_queued_jobs: int) -> dict:
     if new_size <= max_size or current < max_size:
         try:
             new_size = min(new_size, max_size)
-            asg.set_desired_capacity(AutoScalingGroupName=ASG_GROUP_NAME, DesiredCapacity=new_size)
+            asg.set_desired_capacity(AutoScalingGroupName=AWS_ASG_GROUP_NAME, DesiredCapacity=new_size)
             return {'new_capcity': new_size}
         except asg.exceptions.ScalingActivityInProgressFault as e:
             return {'error': str(e)}
diff --git a/lambdas/scale_out_runner/requirements.txt b/lambdas/scale_out_runner/requirements.txt
index c5402f0..ac067a9 100644
--- a/lambdas/scale_out_runner/requirements.txt
+++ b/lambdas/scale_out_runner/requirements.txt
@@ -17,3 +17,4 @@

 boto3
 chalice
+google-cloud-compute
diff --git a/requirements.txt b/requirements.txt
index 69d30f2..b8b2e0b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,6 +18,8 @@
 boto3
 click~=7.1
 chalice
+google-cloud-monitoring
+google-cloud-compute
 pytest~=6.0
 python-dynamodb-lock
 psutil
diff --git a/scripts/runner-supervisor.py b/scripts/runner-supervisor.py
index d3d0e0c..60fb5aa 100755
--- a/scripts/runner-supervisor.py
+++ b/scripts/runner-supervisor.py
@@ -137,7 +137,7 @@ def main(repo, output_folder, user):
     # Just keep trying until we get some credentials.
     while True:
-        # Have each runner try to get a credential in a random order.
+        # Have each runner try to get a credential in a random order.
         possibles = get_possible_credentials(repo)
         random.shuffle(possibles)
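
A quick way to sanity-check the metrics pipeline in this commit is to read the custom metric back. The following is a minimal sketch, not part of the commit; it assumes the same apache-airflow-ci-cd project and custom.googleapis.com/github-actions/jobs-running metric type used by the scripts above, plus Application Default Credentials with monitoring read access:

    #!/usr/bin/env python3
    # Minimal read-back sketch (assumes the project/metric names from the
    # commit above): list the points reported in the last hour, e.g. to
    # confirm that gcp_write_metrics_data is actually submitting data.
    import time

    from google.cloud import monitoring_v3

    PROJECT = 'apache-airflow-ci-cd'
    CUSTOM_METRICS_TYPE = 'custom.googleapis.com/github-actions/jobs-running'

    client = monitoring_v3.MetricServiceClient()
    now = int(time.time())
    interval = monitoring_v3.TimeInterval(
        {"end_time": {"seconds": now}, "start_time": {"seconds": now - 3600}}
    )
    results = client.list_time_series(
        request={
            "name": f"projects/{PROJECT}",
            "filter": f'metric.type = "{CUSTOM_METRICS_TYPE}"',
            "interval": interval,
            "view": monitoring_v3.ListTimeSeriesRequest.TimeSeriesView.FULL,
        }
    )
    for series in results:
        for point in series.points:
            print(series.resource.labels["instance_id"], point.value.int64_value)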
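google-cloud-compute is added to both requirements files here, but scale_out_runner itself still only resizes the AWS ASG. A hypothetical sketch of the GCE counterpart -- resizing a managed instance group -- is below; the group name and zone are illustrative assumptions, not values from this commit:

    #!/usr/bin/env python3
    # Hypothetical GCE analogue of asg.set_desired_capacity(): resize a
    # managed instance group. MIG_NAME and ZONE are assumptions for
    # illustration only -- this commit does not create such a group.
    from google.cloud import compute_v1

    PROJECT = 'apache-airflow-ci-cd'
    ZONE = 'us-central1-a'
    MIG_NAME = 'airflow-ci-runner-group'  # hypothetical


    def set_desired_capacity(new_size: int) -> None:
        client = compute_v1.InstanceGroupManagersClient()
        # resize() plays the role of the ASG DesiredCapacity update.
        client.resize(
            project=PROJECT,
            zone=ZONE,
            instance_group_manager=MIG_NAME,
            size=new_size,
        )


    if __name__ == '__main__':
        set_desired_capacity(2)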
