chamikaramj commented on a change in pull request #14189:
URL: https://github.com/apache/beam/pull/14189#discussion_r600960548
##########
File path: sdks/python/apache_beam/runners/dataflow/internal/apiclient.py
##########
@@ -282,45 +283,26 @@ def __init__(
# Dataflow workers.
environments_to_use = self._get_environments_from_tranforms()
if _use_unified_worker(options):
- # Adding a SDK container image for the pipeline SDKs
- container_image = dataflow.SdkHarnessContainerImage()
- pipeline_sdk_container_image = get_container_image_from_options(options)
- container_image.containerImage = pipeline_sdk_container_image
- container_image.useSingleCorePerContainer = True # True for Python SDK.
- pool.sdkHarnessContainerImages.append(container_image)
-
- already_added_containers = [pipeline_sdk_container_image]
+ python_sdk_container_image = get_container_image_from_options(options)
# Adding container images for other SDKs that may be needed for
# cross-language pipelines.
- for environment in environments_to_use:
+ for id, environment in environments_to_use:
if environment.urn != common_urns.environments.DOCKER.urn:
raise Exception(
'Dataflow can only execute pipeline steps in Docker environments.'
' Received %r.' % environment)
environment_payload = proto_utils.parse_Bytes(
environment.payload, beam_runner_api_pb2.DockerPayload)
container_image_url = environment_payload.container_image
- if container_image_url in already_added_containers:
- # Do not add the pipeline environment again.
-
- # Currently, Dataflow uses Docker container images to uniquely
- # identify execution environments. Hence Dataflow executes all
- # transforms that specify the same Docker container image in a
- # single container instance. Dependencies of all environments that
- # specify a given container image will be staged in the container
- # instance for that particular container image.
- # TODO(BEAM-9455): loosen this restriction to support multiple
- # environments with the same container image when Dataflow supports
- # environment specific artifact provisioning.
- continue
- already_added_containers.append(container_image_url)
container_image = dataflow.SdkHarnessContainerImage()
container_image.containerImage = container_image_url
# Currently we only set following to True for Python SDK.
# TODO: set this correctly for remote environments that might be Python.
- container_image.useSingleCorePerContainer = False
+ container_image.useSingleCorePerContainer = (
+ container_image_url == python_sdk_container_image)
Review comment:
Thanks.
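For context, here is a minimal self-contained sketch of the logic this hunk introduces: one SdkHarnessContainerImage entry is built per Docker environment, and useSingleCorePerContainer is True only for the environment that uses the pipeline's Python SDK image. The SdkHarnessContainerImage class and the environment list below are simplified stand-ins, not Beam's actual Dataflow API message types.

```python
# Simplified stand-in for the Dataflow API message (an assumption for
# illustration, not the real dataflow.SdkHarnessContainerImage class).
class SdkHarnessContainerImage(object):
  def __init__(self, container_image, use_single_core_per_container):
    self.containerImage = container_image
    self.useSingleCorePerContainer = use_single_core_per_container


def build_sdk_harness_images(python_sdk_container_image, docker_environments):
  """Builds one SdkHarnessContainerImage per (id, image) Docker environment.

  Mirrors the condition introduced above: useSingleCorePerContainer is True
  only when the environment uses the pipeline's Python SDK image.
  """
  images = []
  for _env_id, container_image_url in docker_environments:
    images.append(
        SdkHarnessContainerImage(
            container_image_url,
            container_image_url == python_sdk_container_image))
  return images


if __name__ == '__main__':
  envs = [('env1', 'apache/beam_python3.8_sdk:2.29.0'),
          ('env2', 'apache/beam_java11_sdk:2.29.0')]
  for img in build_sdk_harness_images('apache/beam_python3.8_sdk:2.29.0',
                                      envs):
    print(img.containerImage, img.useSingleCorePerContainer)
```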
##########
File path: sdks/python/apache_beam/runners/dataflow/internal/apiclient.py
##########
@@ -740,6 +720,28 @@ def _apply_sdk_environment_overrides(
new_payload.container_image = new_container_image
environment.payload = new_payload.SerializeToString()
+ # De-dup environments that use the Java SDK by Docker container image,
+ # since currently running multiple Java SDK harnesses with Dataflow could
+ # result in dependency conflicts.
+ # TODO(BEAM-9455): remove the following restriction when Dataflow supports
+ # environment-specific dependency provisioning.
+ container_url_to_env_map = dict()
+ container_url_to_env_id_map = dict()
+ for transform in proto_pipeline.components.transforms.values():
+ environment_id = transform.environment_id
+ if not environment_id:
+ continue
+ environment = proto_pipeline.components.environments[environment_id]
+ docker_payload = proto_utils.parse_Bytes(
+ environment.payload, beam_runner_api_pb2.DockerPayload)
+ image = docker_payload.container_image
+ if is_beam_java_container_name(image):
Review comment:
Thanks. We had an offline discussion, and it seems this PR does not add
any additional restrictions related to the resource hints implementation,
given that Dataflow cannot support multiple Java environments (with
multiple Java SDK harnesses) until we fix the Dataflow dependencies issue
anyway (https://issues.apache.org/jira/browse/BEAM-9455).
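Since the hunk above is truncated at the is_beam_java_container_name check, here is a rough sketch, under stated assumptions, of how de-duping Java environments by container image could proceed: each transform whose environment uses an already-seen Java image is remapped to the first environment id recorded for that image. The is_beam_java_container_name predicate below is a hypothetical stand-in for the helper referenced in the diff.

```python
def is_beam_java_container_name(image):
  # Hypothetical stand-in: assume Beam Java SDK images carry 'beam_java'
  # in their name (e.g. 'apache/beam_java11_sdk:2.29.0').
  return 'beam_java' in image


def dedup_java_environments(transforms, environments):
  """De-dups Java SDK environments by Docker container image.

  transforms: dict of transform id -> environment id (may be empty).
  environments: dict of environment id -> Docker container image URL.

  Rewrites each transform's environment id to the first environment id
  seen for the same Java container image, so Dataflow starts a single
  Java SDK harness per image.
  """
  container_url_to_env_id_map = {}
  for transform_id, environment_id in transforms.items():
    if not environment_id:
      continue
    image = environments[environment_id]
    if not is_beam_java_container_name(image):
      continue
    canonical_id = container_url_to_env_id_map.setdefault(
        image, environment_id)
    transforms[transform_id] = canonical_id
  return transforms
```

With this remapping in place, Java environments that become unreferenced could later be pruned from the pipeline proto.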