tvalentyn commented on code in PR #23931: URL: https://github.com/apache/beam/pull/23931#discussion_r1034169011
########## sdks/python/apache_beam/testing/analyzers/perf_regression_analysis.py: ########## @@ -0,0 +1,414 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# This script is used to run Change Point Analysis using a config file. +# config file holds the parameters required to fetch data, and to run the +# change point analysis. Change Point Analysis is used to find Performance +# regressions for Benchmark/load/performance test. 
+ +import argparse +import logging +import os +import time +import uuid +from typing import Any +from typing import Dict +from typing import List +from typing import Optional +from typing import Tuple +from typing import Union + +import numpy as np +import pandas as pd +import yaml +from google.api_core import exceptions + +from apache_beam.testing.analyzers.github_issues_utils import create_or_comment_issue +from apache_beam.testing.analyzers.github_issues_utils import get_issue_description +from apache_beam.testing.load_tests import load_test_metrics_utils +from apache_beam.testing.load_tests.load_test_metrics_utils import BigQueryMetricsPublisher +from apache_beam.testing.load_tests.load_test_metrics_utils import FetchMetrics +from signal_processing_algorithms.energy_statistics.energy_statistics import e_divisive + +_BQ_PROJECT_NAME = 'apache-beam-testing' +_BQ_DATASET = 'beam_perf_storage' + +UNIQUE_ID = 'test_id' +ISSUE_CREATION_TIMESTAMP_LABEL = 'issue_timestamp' +CHANGEPOINT_TIMESTAMP_LABEL = 'change_point_timestamp' +CHANGE_POINT_LABEL = 'change_point' +TEST_NAME = 'test_name' +METRIC_NAME = 'metric_name' +ISSUE_NUMBER = 'issue_number' +ISSUE_URL = 'issue_url' +# number of results to display on the issue description +# from change point index in both directions. 
+NUM_RESULTS_TO_DISPLAY_ON_ISSUE_DESCRIPTION = 10 + +SCHEMA = [{ + 'name': UNIQUE_ID, 'field_type': 'STRING', 'mode': 'REQUIRED' +}, + { + 'name': ISSUE_CREATION_TIMESTAMP_LABEL, + 'field_type': 'TIMESTAMP', + 'mode': 'REQUIRED' + }, + { + 'name': CHANGEPOINT_TIMESTAMP_LABEL, + 'field_type': 'TIMESTAMP', + 'mode': 'REQUIRED' + }, + { + 'name': CHANGE_POINT_LABEL, + 'field_type': 'FLOAT64', + 'mode': 'REQUIRED' + }, { + 'name': METRIC_NAME, 'field_type': 'STRING', 'mode': 'REQUIRED' + }, { + 'name': TEST_NAME, 'field_type': 'STRING', 'mode': 'REQUIRED' + }, { + 'name': ISSUE_NUMBER, 'field_type': 'INT64', 'mode': 'REQUIRED' + }, { + 'name': ISSUE_URL, 'field_type': 'STRING', 'mode': 'REQUIRED' + }] + +TITLE_TEMPLATE = """ + Performance Regression: {}:{} +""" +# TODO: Add mean value before and mean value after. +_METRIC_DESCRIPTION = """ + Affected metric: `{}` +""" +_METRIC_INFO = "timestamp: {}, metric_value: `{}`" +ISSUE_LABELS = ['perf-alerts'] + + +class GitHubIssueMetaData: + """ + This class holds metadata that needs to be published to the + BigQuery when a GitHub issue is created on a performance + alert. 
+ """ + def __init__( + self, + issue_creation_timestamp, + change_point_timestamp, + test_name, + metric_name, + issue_number, + issue_url, + test_id, + change_point): + self.issue_creation_timestamp = issue_creation_timestamp + self.change_point_timestamp = change_point_timestamp + self.test_name = test_name + self.metric_name = metric_name + self.issue_number = issue_number + self.issue_url = issue_url + self.test_id = test_id + self.change_point = change_point + + def as_dict(self) -> Dict: + return { + ISSUE_CREATION_TIMESTAMP_LABEL: self.issue_creation_timestamp, + CHANGEPOINT_TIMESTAMP_LABEL: self.change_point_timestamp, + TEST_NAME: self.test_name, + METRIC_NAME: self.metric_name, + ISSUE_NUMBER: self.issue_number, + UNIQUE_ID: self.test_id, + CHANGE_POINT_LABEL: self.change_point, + ISSUE_URL: self.issue_url + } + + +class ChangePointAnalysis: + def __init__( + self, + data: Union[List[float], List[List[float]], np.ndarray], + metric_name: str, + ): + self.data = data + self.metric_name = metric_name + + def edivisive_means(self, + pvalue: float = 0.05, + permutations: int = 100) -> List[int]: + """ + Args: + pvalue: p value for the permutation test. + permutations: Number of permutations for the permutation test. + + Performs edivisive means on the data and returns the indices of the + Change points. + + Returns: + The indices of change points. + """ + return e_divisive(self.data, pvalue, permutations) + + +def is_change_point_in_valid_window( + change_point_to_recent_run_window: int, change_point_index: int) -> bool: + # If the change point is more than N runs behind the most recent run, + # Ignore the change point and don't raise an alert for it. Review Comment: Every unique changepoint should receive an alert. But, when looking for a changepoint we limit ourselves to some number of recent runs (N). If we identified the changepoint in this search, and it is unique, we should alert. -- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected]. For queries about this service, please contact Infrastructure at: [email protected].
