[
https://issues.apache.org/jira/browse/BEAM-5836?focusedWorklogId=160373&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-160373
]
ASF GitHub Bot logged work on BEAM-5836:
----------------------------------------
Author: ASF GitHub Bot
Created on: 30/Oct/18 03:09
Start Date: 30/Oct/18 03:09
Worklog Time Spent: 10m
Work Description: swegner commented on a change in pull request #6880:
[BEAM-5836] Add script to sync data for Beam GitHub community metrics.
URL: https://github.com/apache/beam/pull/6880#discussion_r229162971
##########
File path: .test-infra/metrics/sync/github/sync.py
##########
@@ -0,0 +1,398 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+
+'''This module queries GitHub to collect Beam-related metrics and put them
+in PostgreSQL.'''
+import itertools
+import os
+import socket
+import sys
+import time
+import traceback
+import re
+from datetime import datetime
+
+import requests
+import psycopg2
+
+import queries
+import ghutilities
+
+# Kept as a reference for localhost debugging.
+# Fetches the Docker host machine IP for testing purposes.
+# The actual host should be used in production.
+# def findDockerNetworkIP():
+#   '''Utilizes the ip tool to find the Docker network IP.'''
+#   import subprocess
+#   cmd_out = subprocess.check_output(["ip", "route", "show"]).decode("utf-8")
+#   return cmd_out.split(" ")[2]
+# DB_HOST = findDockerNetworkIP()
+
+DB_HOST = os.environ['DB_HOST']
+DB_PORT = os.environ['DB_PORT']
+DB_NAME = os.environ['DB_DBNAME']
+DB_USER_NAME = os.environ['DB_DBUSERNAME']
+DB_PASSWORD = os.environ['DB_DBPWD']
+
+GH_ACCESS_TOKEN = os.environ['GH_ACCESS_TOKEN']
+
+GH_PRS_TABLE_NAME = 'gh_pull_requests'
+GH_PRS_CREATE_TABLE_QUERY = f"""
+  create table {GH_PRS_TABLE_NAME} (
+    pr_id integer NOT NULL PRIMARY KEY,
+    author varchar NOT NULL,
+    created_ts timestamp NOT NULL,
+    first_non_author_activity_ts timestamp NULL,
+    first_non_author_activity_author varchar NULL,
+    closed_ts timestamp NULL,
+    updated_ts timestamp NOT NULL,
+    is_merged boolean NOT NULL,
+    requested_reviewers varchar[] NOT NULL,
+    beam_reviewers varchar[] NOT NULL,
+    mentioned varchar[] NOT NULL,
+    reviewed_by varchar[] NOT NULL
+  )
+  """
+
+GH_SYNC_METADATA_TABLE_NAME = 'gh_sync_metadata'
+GH_SYNC_METADATA_TABLE_CREATE_QUERY = f"""
+  create table {GH_SYNC_METADATA_TABLE_NAME} (
+    name varchar NOT NULL PRIMARY KEY,
+    timestamp timestamp NOT NULL
+  )
+  """
+
+
+def initDBConnection():
+  '''Opens a connection to the PostgreSQL DB, as configured via global
+  variables.'''
+  conn = None
+  while not conn:
+    try:
+      conn = psycopg2.connect(f"dbname='{DB_NAME}' user='{DB_USER_NAME}'"
+                              f" host='{DB_HOST}' port='{DB_PORT}'"
+                              f" password='{DB_PASSWORD}'")
+    except:
+      print('Failed to connect to DB; retrying in 1 minute')
+      time.sleep(60)
+  return conn
+
+
+def tableExists(cursor, tableName):
+  '''Checks the existence of a table.'''
+  cursor.execute(f"select * from information_schema.tables"
+                 f" where table_name='{tableName}';")
+  return bool(cursor.rowcount)
+
+
+def initDbTablesIfNeeded():
+  '''Creates and initializes DB tables required for the script to work.'''
+  connection = initDBConnection()
+  cursor = connection.cursor()
+
+  buildsTableExists = tableExists(cursor, GH_PRS_TABLE_NAME)
+  print('PRs table exists', buildsTableExists)
+  if not buildsTableExists:
+    cursor.execute(GH_PRS_CREATE_TABLE_QUERY)
+    if not bool(cursor.rowcount):
+      raise Exception(f"Failed to create table {GH_PRS_TABLE_NAME}")
+
+  metadataTableExists = tableExists(cursor, GH_SYNC_METADATA_TABLE_NAME)
+  print('Metadata table exists', metadataTableExists)
+  if not metadataTableExists:
+    cursor.execute(GH_SYNC_METADATA_TABLE_CREATE_QUERY)
+    if not bool(cursor.rowcount):
+      raise Exception(f"Failed to create table {GH_SYNC_METADATA_TABLE_NAME}")
+
+  cursor.close()
+  connection.commit()
+
+  connection.close()
+
+
+def fetchLastSyncTimestamp(cursor):
+  '''Fetches the last sync timestamp from the metadata DB table.'''
+  fetchQuery = f'''
+    SELECT timestamp
+    FROM {GH_SYNC_METADATA_TABLE_NAME}
+    WHERE name LIKE 'gh_sync'
+    '''
+
+  cursor.execute(fetchQuery)
+  queryResult = cursor.fetchone()
+
+  defaultResult = datetime(year=1980, month=1, day=1)
+  return defaultResult if queryResult is None else queryResult[0]
+
+
+def updateLastSyncTimestamp(timestamp):
+  '''Updates the last sync timestamp in the metadata DB table.'''
+  connection = initDBConnection()
+  cursor = connection.cursor()
+
+  insertTimestampSqlQuery = f'''
+    INSERT INTO {GH_SYNC_METADATA_TABLE_NAME} (name, timestamp)
+    VALUES ('gh_sync', %s)
+    ON CONFLICT (name) DO UPDATE
+      SET timestamp = excluded.timestamp
+    '''
+  cursor.execute(insertTimestampSqlQuery, [timestamp])
+
+  cursor.close()
+  connection.commit()
+  connection.close()
+
+
+def executeGHGraphqlQuery(query):
+  '''Runs a GraphQL query against the GitHub API.'''
+  url = 'https://api.github.com/graphql'
+  headers = {'Authorization': f'Bearer {GH_ACCESS_TOKEN}'}
+  r = requests.post(url=url, json={'query': query}, headers=headers)
+  return r.json()
+
+
+def fetchGHData(timestamp):
+  '''Fetches GitHub data required for reporting Beam metrics.'''
+  tsString = ghutilities.datetimeToGHTimeStr(timestamp)
+  query = queries.MAIN_PR_QUERY.replace('<TemstampSubstitueLocation>',
+                                        tsString)
+  return executeGHGraphqlQuery(query)
+
+
+def extractRequestedReviewers(pr):
+  '''Extracts logins of reviewers requested on the pull request.'''
+  reviewEdges = pr["reviewRequests"]["edges"]
+  return list(map(lambda x: x["node"]["requestedReviewer"]["login"],
+                  reviewEdges))
+
+
+def extractMentions(pr):
+  '''Extracts logins of users mentioned in the PR body, comments or
+  reviews.'''
+  body = pr["body"]
+  commentEdges = pr["comments"]["edges"]
+  reviewEdges = pr["reviews"]["edges"]
+
+  bodyMentions = ghutilities.findMentions(body)
+  commentMentionsLists = map(
+      lambda x: ghutilities.findMentions(x["node"]["body"]), commentEdges)
+  reviewMentionsLists = map(
+      lambda x: ghutilities.findMentions(x["node"]["body"]), reviewEdges)
+  commentMentions = [item for sublist in commentMentionsLists
+                     for item in sublist]
+  reviewMentions = [item for sublist in reviewMentionsLists
+                    for item in sublist]
+
+  mentionsSet = set(bodyMentions) | set(commentMentions) | set(reviewMentions)
+  return list(mentionsSet)
+
+
+def extractFirstNAActivity(pr):
+  '''Returns the timestamp and login of the first non-author activity on the
+  pull request.'''
+  author = pr["author"]["login"]
+  commentEdges = [edge for edge in pr["comments"]["edges"]
+                  if edge["node"]["author"]["login"] != author]
+  reviewEdges = [edge for edge in pr["reviews"]["edges"]
+                 if edge["node"]["author"]["login"] != author]
+  merged = pr["merged"]
+  mergedAt = pr["mergedAt"]
+  mergedBy = None if not merged else pr["mergedBy"]["login"]
+  commentTimestamps = list(map(
+      lambda x: (x["node"]["createdAt"], x["node"]["author"]["login"]),
+      commentEdges))
+  reviewTimestamps = list(map(
+      lambda x: (x["node"]["createdAt"], x["node"]["author"]["login"]),
+      reviewEdges))
+  allTimestamps = commentTimestamps + reviewTimestamps
+  if merged:
+    allTimestamps.append((mergedAt, mergedBy))
+  return ((None, None) if not allTimestamps
+          else min(allTimestamps, key=lambda t: t[0]))
+
+
+def extractBeamReviewers(pr):
+  '''Extracts logins of users defined by Beam as reviewers.'''
+  author = pr['author']['login']
+
+  # All the direct GitHub indicators of reviewers.
+  reviewers = []
+  for r in pr['assignees']['edges']:
+    reviewers.append(r['node']['login'])
+  for r in pr['reviewRequests']['edges']:
+    reviewers.append(r['node']['requestedReviewer']['login'])
+
+  # GitHub users that have performed reviews.
+  for r in pr['reviews']['edges']:
+    reviewers.append(r['node']['author']['login'])
+
+  ## Hi, @r1, can you .. look?
+  beamReviewerRegex = r'(Hi.*look?)'
+  ## R= @r1 @r2 @R3
+  g3ReviewerRegex = r'(?:^|\W)[Rr]\s*[=:.]((?:[\s,;.]*-?@\w+)+)'
Review comment:
`g3` sounds like a reference to Google internal infrastructure
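
For context, a quick hypothetical check (not part of the PR) of what this
pattern extracts; the sample strings below are made up:

  import re

  # Same pattern as g3ReviewerRegex above.
  G3_REVIEWER_REGEX = r'(?:^|\W)[Rr]\s*[=:.]((?:[\s,;.]*-?@\w+)+)'

  # Group 1 captures the run of @handles following the "R=" marker.
  for text in ['R= @alice @bob', 'r: @carol', 'PTAL R. @dave']:
    match = re.search(G3_REVIEWER_REGEX, text)
    handles = re.findall(r'@\w+', match.group(1)) if match else []
    print(text, '->', handles)  # "R= @alice @bob -> ['@alice', '@bob']"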
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
Issue Time Tracking
-------------------
Worklog Id: (was: 160373)
Time Spent: 50m (was: 40m)
> Build Code Velocity metrics pipeline
> ------------------------------------
>
> Key: BEAM-5836
> URL: https://issues.apache.org/jira/browse/BEAM-5836
> Project: Beam
> Issue Type: Sub-task
> Components: project-management
> Reporter: Scott Wegner
> Assignee: Mikhail Gryzykhin
> Priority: Major
> Time Spent: 50m
> Remaining Estimate: 0h
>
> Based on discussion from [Improving Beam Code
> Review|https://s.apache.org/beam-code-review], we want to build two new
> metric dashboards:
> * Open Pull Requests (over time, aggregate and per-reviewer), to track
> reviewer load
> * Pull Request Time-to-first-comment (over time), to track review latency
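
As a rough illustration only (not part of the pipeline), a dashboard backend
could compute time-to-first-comment from the gh_pull_requests table that
sync.py populates, e.g.:

  import os

  import psycopg2

  # Average time from PR creation to first non-author activity, per week.
  TIME_TO_FIRST_RESPONSE_QUERY = '''
    SELECT date_trunc('week', created_ts) AS week,
           avg(first_non_author_activity_ts - created_ts) AS avg_response
    FROM gh_pull_requests
    WHERE first_non_author_activity_ts IS NOT NULL
    GROUP BY week
    ORDER BY week
  '''

  connection = psycopg2.connect(
      dbname=os.environ['DB_DBNAME'], user=os.environ['DB_DBUSERNAME'],
      password=os.environ['DB_DBPWD'], host=os.environ['DB_HOST'],
      port=os.environ['DB_PORT'])
  cursor = connection.cursor()
  cursor.execute(TIME_TO_FIRST_RESPONSE_QUERY)
  for week, avg_response in cursor.fetchall():
    print(week, avg_response)
  connection.close()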
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)