[incubator-mxnet-ci] branch master updated: Add jenkins run statistics (#4)

marcoabreu Thu, 15 Aug 2019 14:25:48 -0700

This is an automated email from the ASF dual-hosted git repository.

marcoabreu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet-ci.git



The following commit(s) were added to refs/heads/master by this push:
     new e1e4dc4  Add jenkins run statistics (#4)
e1e4dc4 is described below

commit e1e4dc4d9a042ba1466487151eab132988bc04ae
Author: Marco de Abreu <marcoab...@users.noreply.github.com>
AuthorDate: Thu Aug 15 23:25:36 2019 +0200

    Add jenkins run statistics (#4)
---
 services/jenkins-run-statistics/.gitignore       |  46 ++++
 services/jenkins-run-statistics/README.md        |  37 +++
 services/jenkins-run-statistics/__init__.py      |   0
 services/jenkins-run-statistics/aws_utils.py     |  69 ++++++
 services/jenkins-run-statistics/deploy_lambda.sh |  35 +++
 services/jenkins-run-statistics/environment.yml  |  12 +
 services/jenkins-run-statistics/jenkins_utils.py | 259 +++++++++++++++++++
 services/jenkins-run-statistics/requirements.txt |   3 +
 services/jenkins-run-statistics/serverless.yml   |  77 ++++++
 services/jenkins-run-statistics/statistics.py    | 300 +++++++++++++++++++++++
 10 files changed, 838 insertions(+)

diff --git a/services/jenkins-run-statistics/.gitignore 
b/services/jenkins-run-statistics/.gitignore
new file mode 100644
index 0000000..c021710
--- /dev/null
+++ b/services/jenkins-run-statistics/.gitignore
@@ -0,0 +1,46 @@
+*~
+__pycache__/
+
+# Logs
+logs
+*.log
+npm-debug.log
+
+# Runtime data
+pids
+*.pid
+*.seed
+dist
+
+# Directory for instrumented libs generated by jscoverage/JSCover
+lib-cov
+
+# Coverage directory used by tools like istanbul
+coverage
+
+# Grunt intermediate storage 
(http://gruntjs.com/creating-plugins#storing-task-files)
+.grunt
+
+# node-waf configuration
+.lock-wscript
+
+# Compiled binary addons (http://nodejs.org/api/addons.html)
+build/Release
+
+# Dependency directory
+# 
https://www.npmjs.org/doc/misc/npm-faq.html#should-i-check-my-node_modules-folder-into-git
+node_modules
+package-lock.json
+
+#IDE Stuff
+**/.idea
+
+#OS STUFF
+.DS_Store
+.tmp
+
+#SERVERLESS STUFF
+admin.env
+.env
+_meta
+.serverless
diff --git a/services/jenkins-run-statistics/README.md 
b/services/jenkins-run-statistics/README.md
new file mode 100644
index 0000000..c6d6e38
--- /dev/null
+++ b/services/jenkins-run-statistics/README.md
@@ -0,0 +1,37 @@
+<!--- Licensed to the Apache Software Foundation (ASF) under one -->
+<!--- or more contributor license agreements.  See the NOTICE file -->
+<!--- distributed with this work for additional information -->
+<!--- regarding copyright ownership.  The ASF licenses this file -->
+<!--- to you under the Apache License, Version 2.0 (the -->
+<!--- "License"); you may not use this file except in compliance -->
+<!--- with the License.  You may obtain a copy of the License at -->
+
+<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
+
+<!--- Unless required by applicable law or agreed to in writing, -->
+<!--- software distributed under the License is distributed on an -->
+<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
+<!--- KIND, either express or implied.  See the License for the -->
+<!--- specific language governing permissions and limitations -->
+<!--- under the License. -->
+
+# Jenkins run statistics
+This script automatically generates CloudWatch metrics regarding the duration 
of Jenkins runs.
+
+## Metrics
+The metrics can be found in CloudWatch metrics. Check the environment.yml for 
the metric namespace.
+
+## Logs
+The logs are available in CloudWatch logs. Check the serverless.yml for the 
log namespace.
+
+## Limitations
+This tool processes all runs that are in the Jenkins database, but CloudWatch 
Metrics only accepts data points that are at most 14 days old. Thus, any runs 
that are older will be skipped. Please also note that metrics older than 24 
hours may take up to 48 hours until they are visible in the web 
interface. Consult 
https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/cloudwatch_concepts.html#Metric
 and https://docs.aws.amazon.com/AmazonCloudWatch/latest/APIReference/API [...]
+
+If this Lambda function times out because there is too much data, it will 
automatically recover from that state and continue the work at the same point. 
This is achieved through the DynamoDB backend in combination with Jenkins 
reporting the last build time of each job.
+
+## Set up
+- Install the Serverless framework
+
+## Execution
+Run deploy_lambda.sh
+
diff --git a/services/jenkins-run-statistics/__init__.py 
b/services/jenkins-run-statistics/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/services/jenkins-run-statistics/aws_utils.py 
b/services/jenkins-run-statistics/aws_utils.py
new file mode 100644
index 0000000..f0a1628
--- /dev/null
+++ b/services/jenkins-run-statistics/aws_utils.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import time
+from datetime import datetime, timedelta, timezone
+import logging
+from collections import namedtuple
+
+import boto3
+from botocore.exceptions import ClientError
+
+
+AwsServiceObjectsTuple = namedtuple('AwsServiceObjectsTuple', ['dynamo_db', 
'cloudwatch'])
+
+CLOUDWATCH_MAXIMUM_LOOKBACK_TIMEFRAME_SECONDS = 60 * 60 * 24 * 7 * 2
+
+
+def generate_aws_service_objects(region_name):
+    """
+    Generate the AWS Boto objects used by this service
+    :param region_name: AWS region in which both the DynamoDB resource and the CloudWatch client are created
+    :return: AwsServiceObjectsTuple object holding the DynamoDB resource and the CloudWatch client
+    """
+    dynamo_db = boto3.resource('dynamodb', region_name=region_name)
+    # Pin CloudWatch to the same region as DynamoDB instead of silently relying on the ambient default region.
+    cloudwatch = boto3.client('cloudwatch', region_name=region_name)
+
+    return AwsServiceObjectsTuple(dynamo_db=dynamo_db, cloudwatch=cloudwatch)
+
+
+def publish_cloudwatch_metric(cloudwatch, metric_namespace, metric_name, value, unix_timestamp, dimensions,
+                              unit='Milliseconds'):
+    """
+    Publish a single data point to CloudWatch metrics
+    :param cloudwatch: Handle to Boto CloudWatch
+    :param metric_namespace: CloudWatch namespace the metric is published to
+    :param metric_name: Name of the metric
+    :param value: Value of the data point
+    :param unix_timestamp: Time of the data point in seconds since the epoch
+    :param dimensions: Dictionary mapping dimension names to dimension values
+    :param unit: CloudWatch unit of the value
+    :return: Nothing
+    """
+    # CloudWatch does not allow submission older than 2 weeks.
+    if time.time() - unix_timestamp >= CLOUDWATCH_MAXIMUM_LOOKBACK_TIMEFRAME_SECONDS:
+        logging.info('Skipping submission of CloudWatch metric that was older than 2 weeks.')
+        return
+
+    # Build the dimensions up front with names that do not shadow the 'value' parameter.
+    metric_dimensions = [{'Name': dimension_name, 'Value': dimension_value}
+                         for dimension_name, dimension_value in dimensions.items()]
+
+    try:
+        cloudwatch.put_metric_data(
+            MetricData=[
+                {
+                    'MetricName': metric_name,
+                    'Dimensions': metric_dimensions,
+                    'Unit': unit,
+                    'Value': value,
+                    # Timezone-aware UTC timestamp; utcfromtimestamp() is deprecated and returns a naive datetime.
+                    'Timestamp': datetime.fromtimestamp(unix_timestamp, tz=timezone.utc)
+                }
+            ],
+            Namespace=metric_namespace
+        )
+    except ClientError as e:
+        if e.response['Error']['Code'] == 'InvalidParameterValue':
+            # Too-old timestamps are already filtered out above, so do not claim that this is the cause. Log what
+            # CloudWatch actually rejected and carry on with the remaining metrics.
+            logging.exception('CloudWatch rejected metric %s with InvalidParameterValue, skipping submission:',
+                              metric_name)
+        else:
+            raise
diff --git a/services/jenkins-run-statistics/deploy_lambda.sh 
b/services/jenkins-run-statistics/deploy_lambda.sh
new file mode 100755
index 0000000..03c52b4
--- /dev/null
+++ b/services/jenkins-run-statistics/deploy_lambda.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -e
+
+# Ask interactively which stage to deploy. -r keeps backslashes in the answer literal.
+echo "Deployment stage (test, prod)"
+read -r config_dir
+
+if [ "$config_dir" == "test" ]; then
+    echo "Deploying to test"
+    export AWS_PROFILE=mxnet-ci-dev
+    sls deploy -s test
+elif [ "$config_dir" == "prod" ]; then
+    echo "Deploying to prod"
+    export AWS_PROFILE=mxnet-ci
+    sls deploy -s prod
+else
+    # Fail with a non-zero exit code so that a typo is not mistaken for a successful deployment.
+    echo "Unrecognized stage: ${config_dir}" >&2
+    exit 1
+fi
diff --git a/services/jenkins-run-statistics/environment.yml 
b/services/jenkins-run-statistics/environment.yml
new file mode 100644
index 0000000..b519adb
--- /dev/null
+++ b/services/jenkins-run-statistics/environment.yml
@@ -0,0 +1,12 @@
+test:
+    JENKINS_URL: "http://jenkins.mxnet-ci.amazon-ml.com/"
+    REGION: us-west-2
+    CLOUDWATCH_METRIC_NAMESPACE: 'marco-jenkins-run-statistics4'
+    DYNAMODB_TABLE_NAME: 'marco-test-jenkins-run-statistics4'
+
+prod:
+    JENKINS_URL: "http://jenkins.mxnet-ci.amazon-ml.com/"
+    REGION: us-west-2
+    CLOUDWATCH_METRIC_NAMESPACE: 'JenkinsRunStatistics'
+    DYNAMODB_TABLE_NAME: 'JenkinsRunStatistics'
+
diff --git a/services/jenkins-run-statistics/jenkins_utils.py 
b/services/jenkins-run-statistics/jenkins_utils.py
new file mode 100644
index 0000000..6f06e6f
--- /dev/null
+++ b/services/jenkins-run-statistics/jenkins_utils.py
@@ -0,0 +1,259 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import json
+import logging
+import os
+import ast
+import re
+import ssl
+import sys
+import time
+import urllib.request
+from typing import Optional
+from datetime import datetime, timedelta, timezone
+from typing import Dict, List
+import dateutil
+
+import boto3
+from botocore.exceptions import ClientError
+import botocore
+
+import dateutil
+import dateutil.parser
+import dateutil.tz
+
+import requests
+from requests_xml import XMLSession
+
+JENKINS_ALL_RUNS_API = 'view/all/cc.xml?recursive'
+JENKINS_RUN_METADATA_API = '{job_url}{run_id}/api/python'
+JENKINS_RUN_BLUEOCEAN_API = 
'{jenkins_url}blue/rest/organizations/jenkins/{pipeline_paths}/runs/{run_id}/'
+JENKINS_JOB_METADATA_API = '{jenkins_url}{job_paths}/api/python'
+
+REGEX_URL_EXTRACT_JOB_NAME = re.compile(r'job\/([^\/]+)')
+
+
+class JenkinsJob(object):
+    """
+    Object representing a Jenkins Job
+    """
+
+    def __init__(self, jenkins_url, last_run_id, job_url, full_job_name, last_build_time):
+        """
+        :param jenkins_url: Base URL of the Jenkins server
+        :param last_run_id: ID of the most recent run that Jenkins reports for this job
+        :param job_url: URL of this job
+        :param full_job_name: Full name of this job as reported by Jenkins
+        :param last_build_time: Timestamp string of the last build, parsed with dateutil
+        """
+        self.jenkins_url = jenkins_url
+        self.last_run_id = last_run_id
+        self.job_url = job_url
+        self.full_job_name = full_job_name
+        self.last_scanned_run_id = 0
+        self.last_build_time = dateutil.parser.parse(last_build_time)
+        self.job_hierarchy = None  # Will be retrieved later if required
+
+    def __repr__(self):
+        return f'{self.full_job_name} @ {self.job_url}'
+
+    def update_last_scanned_run_id(self, last_scanned_run_id):
+        """
+        Update the last scanned run id of this job.
+        :param last_scanned_run_id: ID of the last scanned run
+        :return: Nothing
+        """
+        self.last_scanned_run_id = last_scanned_run_id
+
+    def get_job_hierarchy(self):
+        """
+        Query the jenkins API to get the real job hierarchy - e.g. which part of the job name is a folder, which one
+        is the job name and which one is the branch name (if applicable). This is necessary because there are multiple
+        methods to define Jenkins jobs.
+        :return: Dictionary with the keys 'job_name' and 'branch_name' (the latter is None if not applicable)
+        :raises Exception: If the meta data of the parent job can not be parsed
+        """
+        if self.job_hierarchy:
+            # Cached result
+            return self.job_hierarchy
+
+        # By looking at the parent job, if applicable, we can see whether we are currently part of a multi-branch job.
+        # If we are, we have to take the last part of the job name as branch name instead.
+        job_groups = REGEX_URL_EXTRACT_JOB_NAME.findall(self.job_url)
+
+        self.job_hierarchy = {}
+        if len(job_groups) > 1:
+            # This job has a parent. Inspect it.
+            job_paths = '/'.join(['job/' + job for job in job_groups[:-1]])
+            url = JENKINS_JOB_METADATA_API.format(jenkins_url=self.jenkins_url, job_paths=job_paths)
+
+            try:
+                metadata = ast.literal_eval(
+                    requests.get(
+                        url=url,
+                        params={'tree': '_class,fullName'}, allow_redirects=False).text)
+            except (SyntaxError, ValueError) as e:
+                # literal_eval raises SyntaxError for payloads that are not Python (e.g. an HTML error page) and
+                # ValueError for expressions that are not plain literals. Keep the cause attached for debugging.
+                raise Exception(f'Unable to retrieve meta data for parent job of {self} at {url}') from e
+
+            if metadata['_class'] == 'org.jenkinsci.plugins.workflow.multibranch.WorkflowMultiBranchProject':
+                logging.debug('%s is part of a MultiBranchProject', self)
+                branch_name = job_groups[-1]  # Last entry is the branch name
+            else:
+                logging.debug('%s is probably not part of a MultiBranchProject since the parent class is a %s. Thus, '
+                              'considering it as independent job.', self, metadata['_class'])
+                branch_name = None
+
+            job_name = metadata['fullName']
+        else:
+            logging.debug('%s has no parent, considering it a standalone job', self)
+            branch_name = None
+            job_name = job_groups[0]
+
+        self.job_hierarchy['job_name'] = job_name
+        self.job_hierarchy['branch_name'] = branch_name
+
+        return self.job_hierarchy
+
+    def get_outstanding_jenkins_runs(self):
+        """
+        Retrieve a list of Jenkins runs that have not been processed yet
+        :return: Array of JenkinsRuns, from the run after the last scanned one up to and including the last run
+        """
+        # range() excludes its upper bound, so add 1 to include the most recent run itself. Otherwise the latest run
+        # would only be picked up once a newer one exists and jobs with a single run would never be recorded.
+        return [JenkinsRun(parent_job=self, run_id=run_id) for run_id in
+                range(self.last_scanned_run_id + 1, self.last_run_id + 1)]
+
+
+class JenkinsRun(object):
+    """
+    Object representing a Jenkins Run
+    """
+
+    def __init__(self, parent_job, run_id):
+        """
+        :param parent_job: JenkinsJob this run belongs to
+        :param run_id: ID of this run within the parent job
+        """
+        self.parent_job = parent_job
+        self.run_id = run_id
+
+    def __repr__(self):
+        return f'{self.parent_job.full_job_name} #{self.run_id}'
+
+    def retrieve_metadata(self, tree_filter_string):
+        """
+        Retrieve this run's metadata.
+        :param tree_filter_string: A string that limits which fields are being retrieved for performance reasons.
+                                   This is a Jenkins Rest API feature.
+        :return: Dictionary containing the requested meta data or None if the response could not be parsed
+        """
+        url = JENKINS_RUN_METADATA_API.format(job_url=self.parent_job.job_url, run_id=self.run_id)
+        try:
+            return ast.literal_eval(
+                requests.get(url=url, params={'tree': tree_filter_string}, allow_redirects=False).text)
+        except (SyntaxError, ValueError):
+            # Jenkins prints a 404 as HTML with a 200 code... literal_eval reports such payloads as SyntaxError and
+            # expressions that are not plain literals as ValueError.
+            logging.debug('Run %s does not exist, skipping...', self)
+            return None
+
+    def _get_blue_ocean_api(self):
+        """
+        Get blue ocean API endpoint for this run
+        :return: URL
+        """
+        job_groups = REGEX_URL_EXTRACT_JOB_NAME.findall(self.parent_job.job_url)
+
+        pipeline_paths = '/'.join(['pipelines/' + job for job in job_groups])
+        return JENKINS_RUN_BLUEOCEAN_API.format(jenkins_url=self.parent_job.jenkins_url, pipeline_paths=pipeline_paths,
+                                                run_id=self.run_id)
+
+    def retrieve_nodes(self):
+        """
+        Retrieve all Jenkins nodes associated with this run.
+        :return: List of JenkinsNode or None if the nodes could not be retrieved
+        """
+        try:
+            response = requests.get(url=self._get_blue_ocean_api() + 'nodes',
+                                    allow_redirects=True).json()
+        except json.decoder.JSONDecodeError:
+            # Jenkins sometimes prints a 404 as HTML with a 200 code...
+            return None
+
+        # Compare by value: 'is not' on an int literal only works by accident of CPython's small-integer cache.
+        if 'code' in response and response['code'] != 200:
+            logging.error('Error retrieving nodes for run %s: %s', self, response['message'])
+            return None
+
+        jenkins_nodes = []
+
+        for json_node_entry in response:
+            if not json_node_entry['state']:
+                logging.debug('Step %s of %s is empty, skipping', json_node_entry['displayName'], self)
+                logging.debug(json_node_entry)
+                continue
+
+            jenkins_nodes.append(JenkinsNode(parent_run=self, json_node_entry=json_node_entry))
+
+        return jenkins_nodes
+
+
+class JenkinsNode(object):
+    """
+    A single node of a Jenkins run, built from one entry of the Blue Ocean nodes response
+    """
+    def __init__(self, parent_run, json_node_entry):
+        """
+        :param parent_run: JenkinsRun this node belongs to
+        :param json_node_entry: Dictionary describing this node as returned by Jenkins
+        """
+        entry = json_node_entry
+        self.parent_run = parent_run
+        self.result = entry['result']
+        self.type = entry['type']
+        self.display_name = entry['displayName']
+        # Keep the start time as unix timestamp in seconds
+        started_at = dateutil.parser.parse(entry['startTime'])
+        self.start_timestamp = started_at.timestamp()
+        self.duration_ms = entry['durationInMillis']
+        links = entry['_links']
+        self._steps_api_link = links['steps']['href']
+
+    def get_steps(self):
+        """
+        Fetch the steps that were executed as part of this Jenkins node
+        :return: List of JenkinsStep or None if the response was not valid JSON
+        """
+        steps_url = self.parent_run.parent_job.jenkins_url + self._steps_api_link
+        try:
+            json_steps = requests.get(url=steps_url, allow_redirects=True).json()
+        except json.decoder.JSONDecodeError:
+            # Jenkins sometimes prints a 404 as HTML with a 200 code...
+            return None
+
+        steps = []
+        for json_step_entry in json_steps:
+            steps.append(JenkinsStep(parent_step=self, json_step_entry=json_step_entry))
+        return steps
+
+
+class JenkinsStep(object):
+    """
+    A single step that was executed as part of a Jenkins node
+    """
+    def __init__(self, parent_step, json_step_entry):
+        """
+        :param parent_step: Owner of this step. JenkinsNode.get_steps passes the node itself here.
+        :param json_step_entry: Dictionary describing this step as returned by Jenkins
+        """
+        duration_ms = json_step_entry['durationInMillis']
+        self.parent_step = parent_step
+        self.duration_ms = duration_ms
+
+
+def _retrieve_jenkins_jobs(jenkins_url):
+    """
+    Fetch the overview of all jobs, including their last run id, from the Jenkins server
+    :param jenkins_url: Base URL of the Jenkins server. The API path is appended to it as-is.
+    :return: List of JenkinsJobs, one per Project element of the response
+    """
+    # Example entries of the response:
+    #
+    # <Project activity="Sleeping" lastBuildStatus="Success" lastBuildLabel="756"
+    # webUrl="http://jenkins.mxnet-ci.amazon-ml.com/job/Broken_Link_Checker_Pipeline/"
+    # name="Broken_Link_Checker_Pipeline" lastBuildTime="2018-11-30T01:12:59Z"/>
+    #
+    # <Project activity="Sleeping" lastBuildStatus="Success" lastBuildLabel="1"
+    # webUrl="http://jenkins.mxnet-ci.amazon-ml.com/job/incubator-mxnet/job/PR-10008/"
+    # name="incubator-mxnet » PR-10008" lastBuildTime="2018-03-06T18:19:44Z"/>
+    response = XMLSession().get(url=jenkins_url + JENKINS_ALL_RUNS_API)
+
+    jenkins_jobs = []
+    for project in response.xml.xpath('//Project'):
+        attributes = project.attrs
+        jenkins_jobs.append(JenkinsJob(jenkins_url=jenkins_url,
+                                       last_run_id=int(attributes['lastBuildLabel']),
+                                       job_url=attributes['webUrl'],
+                                       full_job_name=attributes['name'],
+                                       last_build_time=attributes['lastBuildTime']))
+
+    return jenkins_jobs
diff --git a/services/jenkins-run-statistics/requirements.txt 
b/services/jenkins-run-statistics/requirements.txt
new file mode 100644
index 0000000..669fb02
--- /dev/null
+++ b/services/jenkins-run-statistics/requirements.txt
@@ -0,0 +1,3 @@
+boto3
+requests-xml
+python-dateutil
\ No newline at end of file
diff --git a/services/jenkins-run-statistics/serverless.yml 
b/services/jenkins-run-statistics/serverless.yml
new file mode 100644
index 0000000..965a3b7
--- /dev/null
+++ b/services/jenkins-run-statistics/serverless.yml
@@ -0,0 +1,77 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Lambda configuration for jenkins run statistics
+
+service: JenkinsRunStatistics
+
+plugins:
+  - serverless-python-requirements
+
+provider:
+  name: aws
+  runtime: python3.7
+  region: us-west-2
+  stage: ${opt:stage}
+  environment: ${file(environment.yml):${self:provider.stage}}
+  iamRoleStatements:
+    - Effect: Allow
+      Action:
+        - dynamodb:Query
+        - dynamodb:Scan
+        - dynamodb:GetItem
+        - dynamodb:PutItem
+        - dynamodb:UpdateItem
+        - dynamodb:DeleteItem
+      Resource: "arn:aws:dynamodb:${opt:region, 
self:provider.region}:*:table/${self:provider.environment.DYNAMODB_TABLE_NAME}"
+
+    # Grant CloudWatch metric access
+    - Effect: "Allow"
+      Action:
+        - "cloudwatch:PutMetricData"
+      Resource:
+        - "*"
+
+functions:
+  jenkins_run_statistics:
+    name: JenkinsRunStatisticsHandler
+    handler: statistics.lambda_handler
+    reservedConcurrency: 1
+    timeout: 290
+    events:
+      - schedule: rate(5 minutes)
+
+resources:
+  Resources:
+    JenkinsRunsDynamoDbTable:
+      Type: 'AWS::DynamoDB::Table'
+      DeletionPolicy: Retain
+      Properties:
+        AttributeDefinitions:
+          -
+            AttributeName: FULL_JOB_NAME
+            AttributeType: S
+        KeySchema:
+          -
+            AttributeName: FULL_JOB_NAME
+            KeyType: HASH
+        BillingMode: PAY_PER_REQUEST
+        TableName: ${self:provider.environment.DYNAMODB_TABLE_NAME}
+
+custom:
+  pythonRequirements:
+    dockerizePip: true
diff --git a/services/jenkins-run-statistics/statistics.py 
b/services/jenkins-run-statistics/statistics.py
new file mode 100644
index 0000000..f916aac
--- /dev/null
+++ b/services/jenkins-run-statistics/statistics.py
@@ -0,0 +1,300 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""This script records statistics about Jenkins runs"""
+from pprint import pprint
+
+__author__ = 'Marco de Abreu'
+__version__ = '0.1'
+
+import json
+import logging
+import os
+import ast
+import re
+import ssl
+import sys
+import time
+import urllib.request
+from typing import Optional
+from datetime import datetime, timedelta, timezone
+from typing import Dict, List
+import dateutil
+
+import boto3
+from botocore.exceptions import ClientError
+import botocore
+
+import dateutil
+import dateutil.parser
+import dateutil.tz
+
+import requests
+from requests_xml import XMLSession
+
+import jenkins_utils
+import aws_utils
+
+REGION_NAME = os.environ['REGION']
+JENKINS_URL = os.environ['JENKINS_URL']
+DYNAMODB_TABLE = os.environ['DYNAMODB_TABLE_NAME']
+CLOUDWATCH_METRIC_NAMESPACE = os.environ['CLOUDWATCH_METRIC_NAMESPACE']
+
+MAXIMUM_LOOKBACK_TIMEFRAME_SECONDS = 60 * 60 * 24 * 7 * 2
+
+DYNAMO_KEY_FULL_JOB_NAME = 'FULL_JOB_NAME'
+DYNAMO_VALUE_LAST_SCANNED_RUN_ID = 'LAST_SCANNED_RUN_ID'
+
+
+def record_jenkins_run_durations(dynamo_db, cloudwatch):
+    """
+    Entry point for recording the durations of Jenkins runs
+    :param dynamo_db: Handle to Boto DynamoDB
+    :param cloudwatch: Handle to Boto CloudWatch
+    :return: Nothing
+    """
+    # Step 1: Ask Jenkins for all jobs together with the id of their last run.
+    # Step 2: Hand them over for processing. There, the last run id is compared with our database to find jobs
+    # that have not been scanned entirely or got new data in the meantime. The new runs are then retrieved, their
+    # metrics recorded and the database updated.
+    all_jobs = jenkins_utils._retrieve_jenkins_jobs(jenkins_url=JENKINS_URL)
+    _process_jenkins_jobs(
+        dynamo_db=dynamo_db,
+        cloudwatch=cloudwatch,
+        jenkins_jobs=all_jobs)
+
+
+def _process_jenkins_jobs(dynamo_db, cloudwatch, jenkins_jobs):
+    """
+    Walk over the passed Jenkins Jobs and record metrics for all of their runs that have not been processed yet
+    :param dynamo_db: Handle to Boto DynamoDB
+    :param cloudwatch: Handle to Boto CloudWatch
+    :param jenkins_jobs: List of Jenkins Jobs
+    :return: Nothing
+    """
+    def dimensions_for(job):
+        # Build the CloudWatch dimensions shared by all metrics of the given job
+        hierarchy = job.get_job_hierarchy()
+        branch = hierarchy['branch_name']
+        if branch and 'PR-' in branch:
+            # We don't want to track PR branches individually, so replace their names with a generalized one
+            branch = 'Pull Request'
+
+        dimensions = {'Job': hierarchy['job_name']}
+        if branch:
+            dimensions['Branch'] = branch
+
+        return dimensions
+
+    table = dynamo_db.Table(DYNAMODB_TABLE)
+
+    for jenkins_job in jenkins_jobs:
+        # Jobs whose last build is too old can not have any run that is still within the lookback timeframe
+        age = datetime.now(tz=timezone.utc) - jenkins_job.last_build_time
+        if age.total_seconds() >= MAXIMUM_LOOKBACK_TIMEFRAME_SECONDS:
+            logging.debug('%s has last been run %d days ago, skipping since its more than two weeks', jenkins_job,
+                          age.days)
+            continue
+
+        # Resume after the run that was recorded in the database, if any
+        stored_run_id = _dynamo_get_last_processed_jenkins_run_id(dynamo_table=table,
+                                                                  jenkins_job_name=jenkins_job.full_job_name)
+        if stored_run_id:
+            jenkins_job.update_last_scanned_run_id(stored_run_id)
+
+        pending_runs = jenkins_job.get_outstanding_jenkins_runs()
+        if not pending_runs:
+            logging.debug('%s has no outstanding runs', jenkins_job)
+            continue
+
+        job_dimensions = dimensions_for(jenkins_job)
+
+        for jenkins_run in pending_runs:
+            # Every run gets its own copy of the dimensions
+            keep_scanning = _process_jenkins_run(cloudwatch=cloudwatch, jenkins_run=jenkins_run,
+                                                 metric_dimensions=dict(job_dimensions))
+            if not keep_scanning:
+                logging.info('%s requested to not be processed further, aborting scan of job', jenkins_run)
+                break
+
+            logging.info('%s has been processed, saving state in database', jenkins_run)
+            _dynamo_set_last_processed_jenkins_run_id(dynamo_db=dynamo_db, jenkins_run=jenkins_run)
+
+
+def _process_jenkins_run(cloudwatch, jenkins_run, metric_dimensions):
+    """
+    Process a single Jenkins run and record metrics accordingly
+    :param jenkins_run:
+    :return: True if we should continue or False if job should no longer be 
crawled, e.g. due to running jobs
+    """
+    def process_stage(jenkins_node):
+        """
+        Process the Jenkins node that is being considered a stage
+        :param jenkins_node: Jenkins node
+        :return: New stage name
+        """
+        # The nodes are always in the correct order, so we can use that fact 
to preserve the
+        # information about the stage we are currently in during parallel 
steps.
+        current_stage = jenkins_node.display_name
+        stage_metric_dimensions = dict(node_metric_dimensions)
+        stage_metric_dimensions['Stage'] = current_stage
+        aws_utils.publish_cloudwatch_metric(
+            cloudwatch=cloudwatch, metric_name='Stage Duration',
+            metric_namespace=CLOUDWATCH_METRIC_NAMESPACE, 
value=jenkins_node.duration_ms / 1000,
+            unix_timestamp=unix_timestamp, dimensions=stage_metric_dimensions, 
unit='Seconds')
+        logging.info('= STAGE %s took %s',
+                     current_stage, 
str(timedelta(milliseconds=jenkins_node.duration_ms)))
+        return current_stage
+
+    def process_parallel(jenkins_node):
+        """
+        Process the Jenkins node that is being considered a parallel node
+        :param jenkins_node:
+        :return:
+        """
+        # Determine duration of each parallel-entry by making the sum of all 
steps. This is
+        # necessary because durationInMillis contains garbage for these nodes. 
Thanks, Jenkins!
+        steps = jenkins_node.get_steps()
+        if not steps:
+            logging.error('No steps available')
+            return
+
+        parallel_duration_ms = 0
+        for step in steps:
+            parallel_duration_ms += step.duration_ms
+
+        step_metric_dimensions = dict(node_metric_dimensions)
+        step_metric_dimensions['Stage'] = current_stage
+        step_metric_dimensions['Step'] = jenkins_node.display_name
+        aws_utils.publish_cloudwatch_metric(
+            cloudwatch=cloudwatch, metric_name='Step Duration', unit='Seconds',
+            value=int(parallel_duration_ms / 1000), 
unix_timestamp=unix_timestamp,
+            metric_namespace=CLOUDWATCH_METRIC_NAMESPACE, 
dimensions=step_metric_dimensions)
+
+        logging.info('== STEP %s ran for %s',
+                     jenkins_node.display_name, 
str(timedelta(milliseconds=parallel_duration_ms)))
+
+    metadata = 
jenkins_run.retrieve_metadata(tree_filter_string='duration,building,timestamp,result')
+
+    if metadata and metadata['building']:
+        logging.info('%s is still running, skipping...', jenkins_run)
+        return False
+
+    # Make sure to not return eagerly because the DynamoDB entry creation has 
to happen to mark the run as processed
+
+    if not metadata:
+        logging.debug('Run %s does not exist, skipping...', jenkins_run)
+    else:
+        total_duration_ms = metadata['duration']
+        unix_timestamp = metadata['timestamp'] / 1000
+
+        time_diff = time.time() - unix_timestamp
+        if time_diff >= MAXIMUM_LOOKBACK_TIMEFRAME_SECONDS:
+            logging.info('Run %s is from %d days ago, skipping since its more 
than two weeks',
+                         jenkins_run, int(time_diff/60/60/24))
+        else:
+            run_metric_dimensions = dict(metric_dimensions)
+            run_metric_dimensions['Result'] = metadata['result']
+            aws_utils.publish_cloudwatch_metric(cloudwatch=cloudwatch, 
metric_namespace=CLOUDWATCH_METRIC_NAMESPACE,
+                                                metric_name='Total Run 
Duration', unix_timestamp=unix_timestamp,
+                                                
dimensions=run_metric_dimensions, unit='Seconds',
+                                                value=total_duration_ms/1000)
+            logging.info('Run %s has been running for %s', jenkins_run, 
str(timedelta(milliseconds=total_duration_ms)))
+
+            nodes = jenkins_run.retrieve_nodes()
+
+            if not nodes:
+                logging.debug('Run %s has no child stages', jenkins_run)
+            else:
+                current_stage = 'Unknown stage'
+                for jenkins_node in nodes:
+                    node_metric_dimensions = dict(metric_dimensions)
+                    if jenkins_node.result:  # This is none if the stage has 
not been reached
+                        # Make sure to differentiate metrics by whether the 
step was successful or not. Otherwise,
+                        # time measurements would be off since some jobs did 
not run until the end.
+                        node_metric_dimensions['Result'] = jenkins_node.result
+                        unix_timestamp = jenkins_node.start_timestamp
+
+                        if jenkins_node.type == 'STAGE':
+                            current_stage = process_stage(jenkins_node)
+                        elif jenkins_node.type == 'PARALLEL':
+                            process_parallel(jenkins_node)
+                        else:
+                            logging.error('Unknown stage: %s for %s', 
jenkins_node.type, jenkins_node)
+
+    return True
+
+
+def _dynamo_set_last_processed_jenkins_run_id(dynamo_db, jenkins_run):
+    """
+    Record the passed Jenkins run as processed in the database so that it
+    is not processed a second time in future.
+
+    Runs have to be processed from oldest to latest (and not in parallel)
+    because the 'last scanned run id' is expected to only ever increase.
+    :param dynamo_db: Boto DynamoDB handle
+    :param jenkins_run: Jenkins run
+    :return: Nothing
+    """
+    run_table = dynamo_db.Table(DYNAMODB_TABLE)
+    # The entry is keyed by the full name of the job that owns the run
+    job_key = {
+        DYNAMO_KEY_FULL_JOB_NAME: jenkins_run.parent_job.full_job_name,
+    }
+    update_expression = f"set {DYNAMO_VALUE_LAST_SCANNED_RUN_ID} = :id"
+    expression_values = {':id': jenkins_run.run_id}
+    run_table.update_item(
+        Key=job_key,
+        UpdateExpression=update_expression,
+        ExpressionAttributeValues=expression_values,
+    )
+
+
+def _dynamo_get_last_processed_jenkins_run_id(dynamo_table, jenkins_job_name):
+    """
+    Look up the 'last scanned run id' stored for a Jenkins job.
+    :param dynamo_table: Table object providing get_item (presumably a
+                         Boto DynamoDB table - confirm against the caller)
+    :param jenkins_job_name: Job name used as key of the entry
+    :return: The stored run id as int, or None if nothing is recorded
+    """
+    lookup_key = {DYNAMO_KEY_FULL_JOB_NAME: jenkins_job_name}
+    result = dynamo_table.get_item(Key=lookup_key)
+
+    has_run_id = ('Item' in result
+                  and DYNAMO_VALUE_LAST_SCANNED_RUN_ID in result['Item'])
+    if not has_run_id:
+        logging.debug('%s has not been recorded yet', jenkins_job_name)
+        return None
+
+    return int(result['Item'][DYNAMO_VALUE_LAST_SCANNED_RUN_ID])
+
+
+def _configure_logging():
+    """
+    Set up basic logging and assign the level of the root logger as well
+    as of the loggers of the libraries in use.
+    :return: Nothing
+    """
+    logging.basicConfig()
+    logger_levels = (
+        (None, logging.INFO),  # None addresses the root logger
+        ('botocore', logging.INFO),
+        ('boto3', logging.INFO),
+        ('urllib3', logging.INFO),
+        ('requests', logging.ERROR),
+        ('botocore.vendored.requests.packages.urllib3.connectionpool',
+         logging.ERROR),
+    )
+    for logger_name, level in logger_levels:
+        logging.getLogger(logger_name).setLevel(level)
+
+
+def main():
+    """
+    Configure logging, create the AWS service objects for REGION_NAME and
+    record the Jenkins run durations with them.
+    :return: Nothing
+    """
+    _configure_logging()
+    services = aws_utils.generate_aws_service_objects(
+        region_name=REGION_NAME)
+    logging.info('Starting gathering statistics')
+    record_jenkins_run_durations(
+        dynamo_db=services.dynamo_db,
+        cloudwatch=services.cloudwatch,
+    )
+    logging.info('Statistics recorded')
+
+
+def lambda_handler(event, context):
+    """
+    Lambda entry point that runs main() and reports the outcome.
+
+    Catching all exceptions here is important. Otherwise, the exceptions
+    bubble up to lambda and the service retries executing multiple times,
+    while exactly one execution per request is wanted.
+    :param event: Not used
+    :param context: Not used
+    :return: 'Done' on success, 'Error' if an exception was raised
+    """
+    try:
+        main()
+    except Exception:  # pylint: disable=broad-except
+        logging.exception('Unexpected exception')
+        logging.fatal('Unexpected exception')
+        return 'Error'
+    return 'Done'
+
+
+
+# Run the statistics collection when the file is executed as a script.
+# main() has no return statement, so sys.exit() receives None.
+if __name__ == '__main__':
+    sys.exit(main())

[incubator-mxnet-ci] branch master updated: Add jenkins run statistics (#4)

Reply via email to