VictorPlusC commented on a change in pull request #16741:
URL: https://github.com/apache/beam/pull/16741#discussion_r809380267
##########
File path: sdks/python/apache_beam/runners/interactive/interactive_runner.py
##########
@@ -209,6 +218,70 @@ def visit_transform(self, transform_node):
return main_job_result
+ # TODO(victorhc): Move this method somewhere else if performance is impacted
+ # by generating a cluster during runtime.
+ def _create_dataproc_cluster_if_applicable(self, user_pipeline):
+ """ Creates a Dataproc cluster if the provided user_pipeline is running
+ FlinkRunner and no flink_master_url was provided as an option. A cluster
+ is not created when a flink_master_url is detected.
+
+ Example pipeline options to enable automatic Dataproc cluster creation:
+ options = PipelineOptions([
+ '--runner=FlinkRunner',
+ '--project=my-project',
+ '--region=my-region',
+ '--environment_type=DOCKER'
+ ])
+
+ Example pipeline options to skip automatic Dataproc cluster creation:
+ options = PipelineOptions([
+ '--runner=FlinkRunner',
+ '--flink_master=example.internal:41979',
+ '--environment_type=DOCKER'
+ ])
+ """
+ from apache_beam.runners.portability.flink_runner import FlinkRunner
+ from apache_beam.options.pipeline_options import FlinkRunnerOptions
+ flink_master = user_pipeline.options.view_as(
+ FlinkRunnerOptions).flink_master
+ clusters = ie.current_env().clusters
+ # Only consider this logic when both below 2 conditions apply.
+ if isinstance(self._underlying_runner,
+ FlinkRunner) and clusters.dataproc_cluster_managers.get(
+ str(id(user_pipeline)), None) is None:
+ if flink_master == '[auto]':
+ # The above condition is True when the user has not provided a
+ # flink_master.
+ if ie.current_env()._is_in_ipython:
+ warnings.filterwarnings(
+ 'ignore',
+ 'options is deprecated since First stable release. References to '
+ '<pipeline>.options will not be supported',
+ category=DeprecationWarning)
+ project_id = (user_pipeline.options.view_as(GoogleCloudOptions).project)
+ region = (user_pipeline.options.view_as(GoogleCloudOptions).region)
+ cluster_name = ie.current_env().clusters.default_cluster_name
+ cluster_metadata = MasterURLIdentifier(
+ project_id=project_id, region=region, cluster_name=cluster_name)
+ elif flink_master in clusters.master_urls:
+ cluster_metadata = clusters.master_urls.inverse.get(flink_master, None)
+ # else noop, no need to log anything because we allow a master_url
+ # (not managed by us) provided by the user.
+ if cluster_metadata:
+ # create the cluster_manager and populate dicts in the clusters
+ # instance if the pipeline is not already mapped to an existing
+ # cluster_manager.
+ cluster_manager = DataprocClusterManager(cluster_metadata)
+ cluster_manager.create_flink_cluster()
+ clusters.master_urls[cluster_manager.master_url] = cluster_metadata
+ clusters.dataproc_cluster_managers[str(
+ id(user_pipeline))] = cluster_manager
+ clusters.master_urls_to_pipelines[cluster_manager.master_url].append(
+ str(id(user_pipeline)))
+ _LOGGER.info(
Review comment:
I have removed the logging now.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]