This is an automated email from the ASF dual-hosted git repository.
ahmedabualsaud pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/beam.git
The following commit(s) were added to refs/heads/master by this push:
new ea8596f2df0 Add ability to run performance regression checks on Beam
IO Load tests. (#29226)
ea8596f2df0 is described below
commit ea8596f2df0e3e4b9da9f215ae6745c2ddfb6612
Author: Pranav Bhandari <[email protected]>
AuthorDate: Wed Nov 1 12:50:01 2023 -0400
Add ability to run performance regression checks on Beam IO Load tests.
(#29226)
---
.../testing/analyzers/io_tests_config.yaml | 256 +++++++++++++++++++++
.../testing/analyzers/load_test_perf_analysis.py | 98 ++++++++
.../testing/analyzers/perf_analysis_utils.py | 11 +-
3 files changed, 364 insertions(+), 1 deletion(-)
diff --git a/sdks/python/apache_beam/testing/analyzers/io_tests_config.yaml
b/sdks/python/apache_beam/testing/analyzers/io_tests_config.yaml
new file mode 100644
index 00000000000..2a33ae31797
--- /dev/null
+++ b/sdks/python/apache_beam/testing/analyzers/io_tests_config.yaml
@@ -0,0 +1,256 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+spanner_io_read:
+ test_description: |
+ SpannerIO Read test 100 GB.
+ project: apache-beam-testing
+ metrics_dataset: performance_tests
+ metrics_table: io_performance_metrics
+ # test_name is in the format testName,pipelineName
+ test_name: testSpannerWriteAndRead,read-spanner
+ metric_name:
+ - RunTime
+ - EstimatedCost
+
+spanner_io_read_runnerV2:
+ test_description: |
+ SpannerIO RunnerV2 Read test 100 GB.
+ project: apache-beam-testing
+ metrics_dataset: performance_tests
+ metrics_table: io_performance_metrics
+ # test_name is in the format testName,pipelineName
+ test_name: testSpannerWriteAndRead,read_spanner_v2
+ metric_name:
+ - RunTime
+ - EstimatedCost
+
+spanner_io_write:
+ test_description: |
+ SpannerIO write test 100 GB.
+ project: apache-beam-testing
+ metrics_dataset: performance_tests
+ metrics_table: io_performance_metrics
+ # test_name is in the format testName,pipelineName
+ test_name: testSpannerWriteAndRead,write-spanner
+ metric_name:
+ - RunTime
+ - EstimatedCost
+
+spanner_io_write_runnerV2:
+ test_description: |
+ SpannerIO RunnerV2 write test 100 GB.
+ project: apache-beam-testing
+ metrics_dataset: performance_tests
+ metrics_table: io_performance_metrics
+ # test_name is in the format testName,pipelineName
+ test_name: testSpannerWriteAndRead,write_spanner_v2
+ metric_name:
+ - RunTime
+ - EstimatedCost
+
+bigquery_io_storage_api_read:
+ test_description: |
+ BigQueryIO Storage write API read test 100 GB.
+ project: apache-beam-testing
+ metrics_dataset: performance_tests
+ metrics_table: io_performance_metrics
+ # test_name is in the format testName,pipelineName
+ test_name: testStorageAPIWriteThenRead,read-bigquery
+ metric_name:
+ - RunTime
+ - EstimatedCost
+
+bigquery_io_storage_api_read_runnerV2:
+ test_description: |
+ BigQueryIO RunnerV2 Storage write API read test 100 GB.
+ project: apache-beam-testing
+ metrics_dataset: performance_tests
+ metrics_table: io_performance_metrics
+ # test_name is in the format testName,pipelineName
+ test_name: testStorageAPIWriteThenRead,read_bigquery_v2
+ metric_name:
+ - RunTime
+ - EstimatedCost
+
+bigquery_io_storage_api_write:
+ test_description: |
+ BigQueryIO Storage write API write test 100 GB.
+ project: apache-beam-testing
+ metrics_dataset: performance_tests
+ metrics_table: io_performance_metrics
+ # test_name is in the format testName,pipelineName
+ test_name: testStorageAPIWriteThenRead,write-bigquery
+ metric_name:
+ - RunTime
+ - EstimatedCost
+
+bigquery_io_storage_api_write_runnerV2:
+ test_description: |
+ BigQueryIO RunnerV2 Storage write API write test 100 GB.
+ project: apache-beam-testing
+ metrics_dataset: performance_tests
+ metrics_table: io_performance_metrics
+ # test_name is in the format testName,pipelineName
+ test_name: testStorageAPIWriteThenRead,write_bigquery_v2
+ metric_name:
+ - RunTime
+ - EstimatedCost
+
+bigquery_io_avro_file_loads_read:
+ test_description: |
+ BigQueryIO Avro file loads read test 100 GB.
+ project: apache-beam-testing
+ metrics_dataset: performance_tests
+ metrics_table: io_performance_metrics
+ # test_name is in the format testName,pipelineName
+ test_name: testAvroFileLoadsWriteThenRead,read-bigquery
+ metric_name:
+ - RunTime
+ - EstimatedCost
+
+bigquery_io_avro_file_loads_read_runnerV2:
+ test_description: |
+ BigQueryIO RunnerV2 Avro file loads read test 100 GB.
+ project: apache-beam-testing
+ metrics_dataset: performance_tests
+ metrics_table: io_performance_metrics
+ # test_name is in the format testName,pipelineName
+ test_name: testAvroFileLoadsWriteThenRead,read_bigquery_v2
+ metric_name:
+ - RunTime
+ - EstimatedCost
+
+bigquery_io_avro_file_loads_write:
+ test_description: |
+ BigQueryIO Avro file loads write test 100 GB.
+ project: apache-beam-testing
+ metrics_dataset: performance_tests
+ metrics_table: io_performance_metrics
+ # test_name is in the format testName,pipelineName
+ test_name: testAvroFileLoadsWriteThenRead,write-bigquery
+ metric_name:
+ - RunTime
+ - EstimatedCost
+
+bigquery_io_avro_file_loads_write_runnerV2:
+ test_description: |
+ BigQueryIO RunnerV2 Avro file loads write test 100 GB.
+ project: apache-beam-testing
+ metrics_dataset: performance_tests
+ metrics_table: io_performance_metrics
+ # test_name is in the format testName,pipelineName
+ test_name: testAvroFileLoadsWriteThenRead,write_bigquery_v2
+ metric_name:
+ - RunTime
+ - EstimatedCost
+
+bigquery_io_json_file_loads_read:
+ test_description: |
+ BigQueryIO Json file loads read test 100 GB.
+ project: apache-beam-testing
+ metrics_dataset: performance_tests
+ metrics_table: io_performance_metrics
+ # test_name is in the format testName,pipelineName
+ test_name: testJsonFileLoadsWriteThenRead,read-bigquery
+ metric_name:
+ - RunTime
+ - EstimatedCost
+
+bigquery_io_json_file_loads_write:
+ test_description: |
+ BigQueryIO Json file loads write test 100 GB.
+ project: apache-beam-testing
+ metrics_dataset: performance_tests
+ metrics_table: io_performance_metrics
+ # test_name is in the format testName,pipelineName
+ test_name: testJsonFileLoadsWriteThenRead,write-bigquery
+ metric_name:
+ - RunTime
+ - EstimatedCost
+
+bigtable_io_read:
+ test_description: |
+ BigTableIO read test 100 GB.
+ project: apache-beam-testing
+ metrics_dataset: performance_tests
+ metrics_table: io_performance_metrics
+ # test_name is in the format testName,pipelineName
+ test_name: testBigtableWriteAndRead,read-bigtable
+ metric_name:
+ - RunTime
+ - EstimatedCost
+
+bigtable_io_write:
+ test_description: |
+ BigTableIO write test 100 GB.
+ project: apache-beam-testing
+ metrics_dataset: performance_tests
+ metrics_table: io_performance_metrics
+ # test_name is in the format testName,pipelineName
+ test_name: testBigtableWriteAndRead,write-bigtable
+ metric_name:
+ - RunTime
+ - EstimatedCost
+
+text_io_read:
+ test_description: |
+ TextIO read test 100 GB.
+ project: apache-beam-testing
+ metrics_dataset: performance_tests
+ metrics_table: io_performance_metrics
+ # test_name is in the format testName,pipelineName
+ test_name: testTextIOWriteThenRead,read-textio
+ metric_name:
+ - RunTime
+ - EstimatedCost
+
+text_io_read_runnerV2:
+ test_description: |
+ TextIO RunnerV2 read test 100 GB.
+ project: apache-beam-testing
+ metrics_dataset: performance_tests
+ metrics_table: io_performance_metrics
+ # test_name is in the format testName,pipelineName
+ test_name: testTextIOWriteThenRead,read_textio_v2
+ metric_name:
+ - RunTime
+ - EstimatedCost
+
+text_io_write:
+ test_description: |
+ TextIO write test 100 GB.
+ project: apache-beam-testing
+ metrics_dataset: performance_tests
+ metrics_table: io_performance_metrics
+ # test_name is in the format testName,pipelineName
+ test_name: testTextIOWriteThenRead,write-textio
+ metric_name:
+ - RunTime
+ - EstimatedCost
+
+text_io_write_runnerV2:
+ test_description: |
+ TextIO RunnerV2 write test 100 GB.
+ project: apache-beam-testing
+ metrics_dataset: performance_tests
+ metrics_table: io_performance_metrics
+ # test_name is in the format testName,pipelineName
+ test_name: testTextIOWriteThenRead,write_textio_v2
+ metric_name:
+ - RunTime
+ - EstimatedCost
diff --git
a/sdks/python/apache_beam/testing/analyzers/load_test_perf_analysis.py
b/sdks/python/apache_beam/testing/analyzers/load_test_perf_analysis.py
new file mode 100644
index 00000000000..ee9d04e6260
--- /dev/null
+++ b/sdks/python/apache_beam/testing/analyzers/load_test_perf_analysis.py
@@ -0,0 +1,98 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import argparse
+import logging
+
+from apache_beam.testing.analyzers import constants
+from apache_beam.testing.analyzers import perf_analysis
+from apache_beam.testing.analyzers import perf_analysis_utils
+from apache_beam.testing.analyzers.perf_analysis_utils import MetricContainer
+from apache_beam.testing.analyzers.perf_analysis_utils import
TestConfigContainer
+
+try:
+ from google.cloud import bigquery
+except ImportError:
+ bigquery = None # type: ignore
+
+
+class LoadTestMetricsFetcher(perf_analysis_utils.MetricsFetcher):
+ """
+ Metrics fetcher used to get metric data from a BigQuery table. The metrics
+ are fetched and returned as a dataclass containing lists of timestamps and
+ metric_values.
+ """
+ def fetch_metric_data(
+ self, *, test_config: TestConfigContainer) -> MetricContainer:
+ if test_config.test_name:
+ test_name, pipeline_name = test_config.test_name.split(',')
+ else:
+ raise Exception("test_name not provided in config.")
+
+ query = f"""
+ SELECT timestamp, metric.value
+ FROM
{test_config.project}.{test_config.metrics_dataset}.{test_config.metrics_table}
+ CROSS JOIN UNNEST(metrics) AS metric
+ WHERE test_name = "{test_name}" AND pipeline_name = "{pipeline_name}"
AND metric.name = "{test_config.metric_name}"
+ ORDER BY timestamp DESC
+ LIMIT {constants._NUM_DATA_POINTS_TO_RUN_CHANGE_POINT_ANALYSIS}
+ """
+ logging.debug("Running query: %s" % query)
+ if bigquery is None:
+ raise ImportError('Bigquery dependencies are not installed.')
+ client = bigquery.Client()
+ query_job = client.query(query=query)
+ metric_data = query_job.result().to_dataframe()
+ if metric_data.empty:
+ logging.error(
+ "No results returned from BigQuery. Please check the query.")
+ return MetricContainer(
+ values=metric_data['value'].tolist(),
+ timestamps=metric_data['timestamp'].tolist(),
+ )
+
+
+if __name__ == '__main__':
+ logging.basicConfig(level=logging.INFO)
+ load_test_metrics_fetcher = LoadTestMetricsFetcher()
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--config_file_path',
+ required=True,
+ type=str,
+ help='Path to the config file that contains data to run the Change Point
'
+ 'Analysis. The default file used will be '
+ 'apache_beam/testing/analyzers/tests.config.yml. '
+ 'If you would like to use the Change Point Analysis for finding '
+ 'performance regression in the tests, '
+ 'please provide a .yml file in the same structure as the above '
+ 'mentioned file. ')
+ parser.add_argument(
+ '--save_alert_metadata',
+ action='store_true',
+ default=False,
+ help='Save perf alert/ GH Issue metadata to BigQuery table.')
+ known_args, unknown_args = parser.parse_known_args()
+
+ if unknown_args:
+ logging.warning('Discarding unknown arguments : %s ' % unknown_args)
+
+ perf_analysis.run(
+ big_query_metrics_fetcher=load_test_metrics_fetcher,
+ config_file_path=known_args.config_file_path,
+ # Set this to true while running in production.
+ save_alert_metadata=known_args.save_alert_metadata)
diff --git a/sdks/python/apache_beam/testing/analyzers/perf_analysis_utils.py
b/sdks/python/apache_beam/testing/analyzers/perf_analysis_utils.py
index 11b1cc18ca5..a9015d715e9 100644
--- a/sdks/python/apache_beam/testing/analyzers/perf_analysis_utils.py
+++ b/sdks/python/apache_beam/testing/analyzers/perf_analysis_utils.py
@@ -28,13 +28,18 @@ from typing import Union
import pandas as pd
import yaml
from google.api_core import exceptions
-from google.cloud import bigquery
from apache_beam.testing.analyzers import constants
from apache_beam.testing.load_tests import load_test_metrics_utils
from apache_beam.testing.load_tests.load_test_metrics_utils import
BigQueryMetricsPublisher
from signal_processing_algorithms.energy_statistics.energy_statistics import
e_divisive
+# pylint: disable=ungrouped-imports
+try:
+ from google.cloud import bigquery
+except ImportError:
+ bigquery = None # type: ignore
+
@dataclass(frozen=True)
class GitHubIssueMetaData:
@@ -118,6 +123,8 @@ def get_existing_issues_data(table_name: str) ->
Optional[pd.DataFrame]:
LIMIT 10
"""
try:
+ if bigquery is None:
+ raise ImportError('Bigquery dependencies are not installed.')
client = bigquery.Client()
query_job = client.query(query=query)
existing_issue_data = query_job.result().to_dataframe()
@@ -354,6 +361,8 @@ class BigQueryMetricsFetcher(MetricsFetcher):
ORDER BY {load_test_metrics_utils.SUBMIT_TIMESTAMP_LABEL} DESC
LIMIT {constants._NUM_DATA_POINTS_TO_RUN_CHANGE_POINT_ANALYSIS}
"""
+ if bigquery is None:
+ raise ImportError('Bigquery dependencies are not installed.')
client = bigquery.Client()
query_job = client.query(query=query)
metric_data = query_job.result().to_dataframe()