[GitHub] [beam] tysonjh commented on a change in pull request #12612: [BEAM-10675] Add Python GBK Load Tests for streaming on Dataflow

GitBox Fri, 21 Aug 2020 10:38:00 -0700


tysonjh commented on a change in pull request #12612:
URL: https://github.com/apache/beam/pull/12612#discussion_r474830190




##########
File path: .test-infra/jenkins/job_LoadTests_GBK_Python.groovy
##########
@@ -22,119 +22,138 @@ import InfluxDBCredentialsHelper
 
 def now = new Date().format("MMddHHmmss", TimeZone.getTimeZone('UTC'))
 
-def loadTestConfigurations = { datasetName ->
+// TODO(BEAM-10774): Skipping some cases because they are too slow.
+def STREAMING_TESTS_TO_SKIP = [1, 2, 4, 5]
+
+def loadTestConfigurations = { mode, datasetName ->
   [
     [
       title          : 'GroupByKey Python Load test: 2GB of 10B records',
       test           : 'apache_beam.testing.load_tests.group_by_key_test',
       runner         : CommonTestProperties.Runner.DATAFLOW,
       pipelineOptions: [
-        job_name             : 'load-tests-python-dataflow-batch-gbk-1-' + now,
+        job_name             : 
"load-tests-python-dataflow-${mode}-gbk-1-${now}",
         project              : 'apache-beam-testing',
         region               : 'us-central1',
         temp_location        : 'gs://temp-storage-for-perf-tests/loadtests',
         publish_to_big_query : true,
         metrics_dataset      : datasetName,
-        metrics_table        : 'python_dataflow_batch_gbk_1',
-        influx_measurement   : 'python_batch_gbk_1',
+        metrics_table        : "python_dataflow_${mode}_gbk_1",
+        influx_measurement   : "python_${mode}_gbk_1",
         input_options        : '\'{"num_records": 200000000,' +
         '"key_size": 1,' +
         '"value_size": 9}\'',
         iterations           : 1,
         fanout               : 1,
         num_workers          : 5,
-        autoscaling_algorithm: "NONE"
+        autoscaling_algorithm: 'NONE',
       ]
     ],
     [
       title          : 'GroupByKey Python Load test: 2GB of 100B records',
       test           : 'apache_beam.testing.load_tests.group_by_key_test',
       runner         : CommonTestProperties.Runner.DATAFLOW,
       pipelineOptions: [
-        job_name             : 'load-tests-python-dataflow-batch-gbk-2-' + now,
+        job_name             : 'load-tests-python-dataflow-${mode}-gbk-2-' + 
now,

Review comment:
       ```suggestion
               job_name             : "load-tests-python-dataflow-${mode}-gbk-2-${now}",
       ```

##########
File path: .test-infra/jenkins/job_LoadTests_GBK_Python.groovy
##########
@@ -22,119 +22,138 @@ import InfluxDBCredentialsHelper
 
 def now = new Date().format("MMddHHmmss", TimeZone.getTimeZone('UTC'))
 
-def loadTestConfigurations = { datasetName ->
+// TODO(BEAM-10774): Skipping some cases because they are too slow.
+def STREAMING_TESTS_TO_SKIP = [1, 2, 4, 5]
+
+def loadTestConfigurations = { mode, datasetName ->
   [
     [
       title          : 'GroupByKey Python Load test: 2GB of 10B records',
       test           : 'apache_beam.testing.load_tests.group_by_key_test',
       runner         : CommonTestProperties.Runner.DATAFLOW,
       pipelineOptions: [
-        job_name             : 'load-tests-python-dataflow-batch-gbk-1-' + now,
+        job_name             : 
"load-tests-python-dataflow-${mode}-gbk-1-${now}",
         project              : 'apache-beam-testing',
         region               : 'us-central1',
         temp_location        : 'gs://temp-storage-for-perf-tests/loadtests',
         publish_to_big_query : true,
         metrics_dataset      : datasetName,
-        metrics_table        : 'python_dataflow_batch_gbk_1',
-        influx_measurement   : 'python_batch_gbk_1',
+        metrics_table        : "python_dataflow_${mode}_gbk_1",
+        influx_measurement   : "python_${mode}_gbk_1",
         input_options        : '\'{"num_records": 200000000,' +
         '"key_size": 1,' +
         '"value_size": 9}\'',
         iterations           : 1,
         fanout               : 1,
         num_workers          : 5,
-        autoscaling_algorithm: "NONE"
+        autoscaling_algorithm: 'NONE',
       ]
     ],
     [
       title          : 'GroupByKey Python Load test: 2GB of 100B records',
       test           : 'apache_beam.testing.load_tests.group_by_key_test',
       runner         : CommonTestProperties.Runner.DATAFLOW,
       pipelineOptions: [
-        job_name             : 'load-tests-python-dataflow-batch-gbk-2-' + now,
+        job_name             : 'load-tests-python-dataflow-${mode}-gbk-2-' + 
now,
         project              : 'apache-beam-testing',
         region               : 'us-central1',
         temp_location        : 'gs://temp-storage-for-perf-tests/loadtests',
         publish_to_big_query : true,
         metrics_dataset      : datasetName,
-        metrics_table        : 'python_dataflow_batch_gbk_2',
-        influx_measurement   : 'python_batch_gbk_2',
+        metrics_table        : 'python_dataflow_${mode}_gbk_2',
+        influx_measurement   : 'python_${mode}_gbk_2',
         input_options        : '\'{"num_records": 20000000,' +
         '"key_size": 10,' +
         '"value_size": 90}\'',
         iterations           : 1,
         fanout               : 1,
         num_workers          : 5,
-        autoscaling_algorithm: "NONE"
+        autoscaling_algorithm: 'NONE',
       ]
     ],
     [
       title          : 'GroupByKey Python Load test: 2GB of 100kB records',
       test           : 'apache_beam.testing.load_tests.group_by_key_test',
       runner         : CommonTestProperties.Runner.DATAFLOW,
       pipelineOptions: [
-        job_name             : 'load-tests-python-dataflow-batch-gbk-3-' + now,
+        job_name             : 
"load-tests-python-dataflow-${mode}-gbk-3-${now}",
         project              : 'apache-beam-testing',
         region               : 'us-central1',
         temp_location        : 'gs://temp-storage-for-perf-tests/loadtests',
         publish_to_big_query : true,
         metrics_dataset      : datasetName,
-        metrics_table        : 'python_dataflow_batch_gbk_3',
-        influx_measurement   : 'python_batch_gbk_3',
+        metrics_table        : "python_dataflow_${mode}_gbk_3",
+        influx_measurement   : "python_${mode}_gbk_3",
         input_options        : '\'{"num_records": 20000,' +
         '"key_size": 10000,' +
         '"value_size": 90000}\'',
         iterations           : 1,
         fanout               : 1,
         num_workers          : 5,
-        autoscaling_algorithm: "NONE"
+        autoscaling_algorithm: 'NONE',
       ]
     ],
     [
       title          : 'GroupByKey Python Load test: fanout 4 times with 2GB 
10-byte records total',
       test           : 'apache_beam.testing.load_tests.group_by_key_test',
       runner         : CommonTestProperties.Runner.DATAFLOW,
       pipelineOptions: [
-        job_name             : 'load-tests-python-dataflow-batch-gbk-4-' + now,
+        job_name             : 
"load-tests-python-dataflow-${mode}-gbk-4-${now}",
         project              : 'apache-beam-testing',
         region               : 'us-central1',
         temp_location        : 'gs://temp-storage-for-perf-tests/loadtests',
         publish_to_big_query : true,
         metrics_dataset      : datasetName,
-        metrics_table        : 'python_dataflow_batch_gbk_4',
-        influx_measurement   : 'python_batch_gbk_4',
+        metrics_table        : "python_dataflow_${mode}_gbk_4",
+        influx_measurement   : "python_${mode}_gbk_4",
         input_options        : '\'{"num_records": 5000000,' +
         '"key_size": 10,' +
         '"value_size": 90}\'',
         iterations           : 1,
         fanout               : 4,
-        num_workers          : 5,
-        autoscaling_algorithm: "NONE"
+        num_workers          : 16,
+        autoscaling_algorithm: 'NONE',
       ]
     ],
     [
       title          : 'GroupByKey Python Load test: fanout 8 times with 2GB 
10-byte records total',
       test           : 'apache_beam.testing.load_tests.group_by_key_test',
       runner         : CommonTestProperties.Runner.DATAFLOW,
       pipelineOptions: [
-        job_name             : 'load-tests-python-dataflow-batch-gbk-5-' + now,
+        job_name             : 
"load-tests-python-dataflow-${mode}-gbk-5-${now}",
         project              : 'apache-beam-testing',
         region               : 'us-central1',
         temp_location        : 'gs://temp-storage-for-perf-tests/loadtests',
         publish_to_big_query : true,
         metrics_dataset      : datasetName,
-        metrics_table        : 'python_dataflow_batch_gbk_5',
-        influx_measurement   : 'python_batch_gbk_5',
+        metrics_table        : "python_dataflow_${mode}_gbk_5",
+        influx_measurement   : "python_${mode}_gbk_5",
         input_options        : '\'{"num_records": 2500000,' +
         '"key_size": 10,' +
         '"value_size": 90}\'',
         iterations           : 1,
         fanout               : 8,
-        num_workers          : 5,
-        autoscaling_algorithm: "NONE"
+        num_workers          : 16,
+        autoscaling_algorithm: 'NONE',
       ]
     ],
-  ].each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) }
+  ]
+  .each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) }
+  .each { test -> (mode != 'streaming') ?: addStreamingOptions(test) }
+  .withIndex().collectMany { test, i ->
+    mode == 'streaming' && STREAMING_TESTS_TO_SKIP.contains(i + 1) ? []: [test]
+  }
+}
+
+def addStreamingOptions(test) {
+  test.pipelineOptions << [streaming: null, experiments: 'use_runner_v2',
+    enable_streaming_engine: null ]
+}

Review comment:
       Can you add a comment explaining what these settings do? It isn't 
obvious why 'streaming: null' or 'enable_streaming_engine: null' would 
enable streaming, or why 'use_runner_v2' is required.

##########
File path: .test-infra/jenkins/job_LoadTests_GBK_Python_reiterate.groovy
##########
@@ -58,43 +58,47 @@ def loadTestConfigurations = { datasetName ->
       pipelineOptions: [
         project              : 'apache-beam-testing',
         region               : 'us-central1',
-        job_name             : 'load-tests-python-dataflow-batch-gbk-7-' + now,
+        job_name             : 
"load-tests-python-dataflow-${mode}-gbk-7-${now}",
         temp_location        : 'gs://temp-storage-for-perf-tests/loadtests',
         publish_to_big_query : true,
         metrics_dataset      : datasetName,
-        metrics_table        : 'python_dataflow_batch_gbk_7',
-        influx_measurement   : 'python_batch_gbk_7',
+        metrics_table        : "python_dataflow_${mode}_gbk_7",
+        influx_measurement   : "python_${mode}_gbk_7",
         input_options        : '\'{"num_records": 20000000,' +
         '"key_size": 10,' +
         '"value_size": 90,' +
         '"num_hot_keys": 10,' +
         '"hot_key_fraction": 1}\'',
-        fanout               : 1,
         iterations           : 4,
+        fanout               : 1,
         num_workers          : 5,
-        autoscaling_algorithm: 'NONE'
+        autoscaling_algorithm: 'NONE',
       ]
     ]
-  ].each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) }
+  ]
+  .each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) }
+  .each { test -> (mode != 'streaming') ?: addStreamingOptions(test) }
 }
 
-def batchLoadTestJob = { scope, triggeringContext ->
-  scope.description('Runs Python GBK reiterate load tests on Dataflow runner 
in batch mode')
-  commonJobProperties.setTopLevelMainJobProperties(scope, 'master', 240)
+def addStreamingOptions(test) {

Review comment:
       Same here, please add a comment.

##########
File path: .test-infra/jenkins/job_LoadTests_GBK_Python_reiterate.groovy
##########
@@ -58,43 +58,47 @@ def loadTestConfigurations = { datasetName ->
       pipelineOptions: [
         project              : 'apache-beam-testing',
         region               : 'us-central1',
-        job_name             : 'load-tests-python-dataflow-batch-gbk-7-' + now,
+        job_name             : 
"load-tests-python-dataflow-${mode}-gbk-7-${now}",
         temp_location        : 'gs://temp-storage-for-perf-tests/loadtests',
         publish_to_big_query : true,
         metrics_dataset      : datasetName,
-        metrics_table        : 'python_dataflow_batch_gbk_7',
-        influx_measurement   : 'python_batch_gbk_7',
+        metrics_table        : "python_dataflow_${mode}_gbk_7",
+        influx_measurement   : "python_${mode}_gbk_7",
         input_options        : '\'{"num_records": 20000000,' +
         '"key_size": 10,' +
         '"value_size": 90,' +
         '"num_hot_keys": 10,' +
         '"hot_key_fraction": 1}\'',
-        fanout               : 1,
         iterations           : 4,
+        fanout               : 1,
         num_workers          : 5,
-        autoscaling_algorithm: 'NONE'
+        autoscaling_algorithm: 'NONE',
       ]
     ]
-  ].each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) }
+  ]
+  .each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) }
+  .each { test -> (mode != 'streaming') ?: addStreamingOptions(test) }
 }
 
-def batchLoadTestJob = { scope, triggeringContext ->
-  scope.description('Runs Python GBK reiterate load tests on Dataflow runner 
in batch mode')
-  commonJobProperties.setTopLevelMainJobProperties(scope, 'master', 240)
+def addStreamingOptions(test) {
+  test.pipelineOptions << [streaming: null, experiments: 'use_runner_v2',
+    enable_streaming_engine: null ]
+}
 
+def loadTestJob = { scope, triggeringContext, mode ->
   def datasetName = loadTestsBuilder.getBigQueryDataset('load_test', 
triggeringContext)
-  for (testConfiguration in loadTestConfigurations(datasetName)) {
-    loadTestsBuilder.loadTest(scope, testConfiguration.title, 
testConfiguration.runner, CommonTestProperties.SDK.PYTHON_37, 
testConfiguration.pipelineOptions, testConfiguration.test)
-  }
+  loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.PYTHON_37,
+      loadTestConfigurations(mode, datasetName), 'GBK reiterate', mode)
 }
 
-CronJobBuilder.cronJob('beam_LoadTests_Python_GBK_reiterate_Dataflow_Batch', 
'H 14 * * *', this) {
-  additionalPipelineArgs = [
-    influx_db_name: InfluxDBCredentialsHelper.InfluxDBDatabaseName,
-    influx_hostname: InfluxDBCredentialsHelper.InfluxDBHostname,
-  ]
-  batchLoadTestJob(delegate, 
CommonTestProperties.TriggeringContext.POST_COMMIT)
-}
+CronJobBuilder.cronJob('beam_LoadTests_Python_GBK_reiterate_Dataflow_Batch',
+    'H 14 * * *', this) {

Review comment:
       What's the methodology for picking the time to trigger these? Is it 
documented anywhere?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [beam] tysonjh commented on a change in pull request #12612: [BEAM-10675] Add Python GBK Load Tests for streaming on Dataflow

Reply via email to