[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=117109=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-117109 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 28/Jun/18 22:58 Start Date: 28/Jun/18 22:58 Worklog Time Spent: 10m Work Description: robertwb closed pull request #5817: [BEAM-3883] Stage files in the portability runner. URL: https://github.com/apache/beam/pull/5817 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/sdks/python/apache_beam/runners/dataflow/internal/apiclient.py b/sdks/python/apache_beam/runners/dataflow/internal/apiclient.py index 784166cafda..ccce9a99ef7 100644 --- a/sdks/python/apache_beam/runners/dataflow/internal/apiclient.py +++ b/sdks/python/apache_beam/runners/dataflow/internal/apiclient.py @@ -442,10 +442,11 @@ def _stage_resources(self, options): raise RuntimeError('The --temp_location option must be specified.') resource_stager = _LegacyDataflowStager(self) -return resource_stager.stage_job_resources( +_, resources = resource_stager.stage_job_resources( options, temp_dir=tempfile.mkdtemp(), staging_location=google_cloud_options.staging_location) +return resources def stage_file(self, gcs_or_local_path, file_name, stream, mime_type='application/octet-stream'): diff --git a/sdks/python/apache_beam/runners/portability/portable_runner.py b/sdks/python/apache_beam/runners/portability/portable_runner.py index 0da33fa7f4a..26376c93337 100644 --- a/sdks/python/apache_beam/runners/portability/portable_runner.py +++ b/sdks/python/apache_beam/runners/portability/portable_runner.py @@ -24,12 +24,11 @@ from apache_beam import metrics from apache_beam.options.pipeline_options import PortableOptions from apache_beam.portability import common_urns 
-from apache_beam.portability.api import beam_artifact_api_pb2 -from apache_beam.portability.api import beam_artifact_api_pb2_grpc from apache_beam.portability.api import beam_job_api_pb2 from apache_beam.portability.api import beam_job_api_pb2_grpc from apache_beam.runners import pipeline_context from apache_beam.runners import runner +from apache_beam.runners.portability import portable_stager __all__ = ['PortableRunner'] @@ -92,16 +91,12 @@ def run_pipeline(self, pipeline): beam_job_api_pb2.PrepareJobRequest( job_name='job', pipeline=proto_pipeline)) if prepare_response.artifact_staging_endpoint.url: - # Must commit something to get a retrieval token, - # committing empty manifest for now. - # TODO(BEAM-3883): Actually stage required files. - artifact_service = beam_artifact_api_pb2_grpc.ArtifactStagingServiceStub( - grpc.insecure_channel(prepare_response.artifact_staging_endpoint.url)) - commit_manifest = artifact_service.CommitManifest( - beam_artifact_api_pb2.CommitManifestRequest( - manifest=beam_artifact_api_pb2.Manifest(), - staging_session_token=prepare_response.staging_session_token)) - retrieval_token = commit_manifest.retrieval_token + stager = portable_stager.PortableStager( + grpc.insecure_channel(prepare_response.artifact_staging_endpoint.url), + prepare_response.staging_session_token) + retrieval_token, _ = stager.stage_job_resources( + pipeline._options, + staging_location='') else: retrieval_token = None run_response = job_service.Run( diff --git a/sdks/python/apache_beam/runners/portability/portable_stager.py b/sdks/python/apache_beam/runners/portability/portable_stager.py index f556811425b..3761373fb42 100644 --- a/sdks/python/apache_beam/runners/portability/portable_stager.py +++ b/sdks/python/apache_beam/runners/portability/portable_stager.py @@ -20,6 +20,8 @@ from __future__ import division from __future__ import print_function +import base64 +import hashlib import os from apache_beam.portability.api import beam_artifact_api_pb2 @@ -69,7 
+71,8 @@ def stage_artifact(self, local_path_to_artifact, artifact_name): def artifact_request_generator(): artifact_metadata = beam_artifact_api_pb2.ArtifactMetadata( - name=artifact_name) + name=artifact_name, + md5=_get_file_hash(local_path_to_artifact)) metadata = beam_artifact_api_pb2.PutArtifactMetadata( staging_session_token=self._staging_session_token, metadata=artifact_metadata) @@ -90,7 +93,18 @@ def artifact_request_generator(): def commit_manifest(self): manifest = beam_artifact_api_pb2.Manifest(artifact=self._artifacts)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=117088=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-117088 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 28/Jun/18 22:29 Start Date: 28/Jun/18 22:29 Worklog Time Spent: 10m Work Description: jkff commented on a change in pull request #5817: [BEAM-3883] Stage files in the portability runner. URL: https://github.com/apache/beam/pull/5817#discussion_r199006382 ## File path: sdks/python/apache_beam/runners/portability/portable_runner.py ## @@ -107,16 +108,12 @@ def run_pipeline(self, pipeline): beam_job_api_pb2.PrepareJobRequest( job_name='job', pipeline=proto_pipeline)) if prepare_response.artifact_staging_endpoint.url: - # Must commit something to get a retrieval token, - # committing empty manifest for now. - # TODO(BEAM-3883): Actually stage required files. - artifact_service = beam_artifact_api_pb2_grpc.ArtifactStagingServiceStub( Review comment: The imports _grpc and _pb2 above are no longer used. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 117088) Time Spent: 19.5h (was: 19h 20m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Fix For: 2.6.0 > > Time Spent: 19.5h > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. 
Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52]. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=116973=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-116973 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 28/Jun/18 19:28 Start Date: 28/Jun/18 19:28 Worklog Time Spent: 10m Work Description: robertwb opened a new pull request #5817: [BEAM-3883] Stage files in the portability runner. URL: https://github.com/apache/beam/pull/5817 Follow this checklist to help us incorporate your contribution quickly and easily: - [ ] Format the pull request title like `[BEAM-XXX] Fixes bug in ApproximateQuantiles`, where you replace `BEAM-XXX` with the appropriate JIRA issue, if applicable. This will automatically link the pull request to the issue. - [ ] If this contribution is large, please file an Apache [Individual Contributor License Agreement](https://www.apache.org/licenses/icla.pdf). It will help us expedite review of your Pull Request if you tag someone (e.g. `@username`) to look at it. 
Post-Commit Tests Status (on master branch) Lang | SDK | Apex | Dataflow | Flink | Gearpump | Spark --- | --- | --- | --- | --- | --- | --- Go | [![Build Status](https://builds.apache.org/job/beam_PostCommit_Go_GradleBuild/lastCompletedBuild/badge/icon)](https://builds.apache.org/job/beam_PostCommit_Go_GradleBuild/lastCompletedBuild/) | --- | --- | --- | --- | --- Java | [![Build Status](https://builds.apache.org/job/beam_PostCommit_Java_GradleBuild/lastCompletedBuild/badge/icon)](https://builds.apache.org/job/beam_PostCommit_Java_GradleBuild/lastCompletedBuild/) | [![Build Status](https://builds.apache.org/job/beam_PostCommit_Java_ValidatesRunner_Apex_Gradle/lastCompletedBuild/badge/icon)](https://builds.apache.org/job/beam_PostCommit_Java_ValidatesRunner_Apex_Gradle/lastCompletedBuild/) | [![Build Status](https://builds.apache.org/job/beam_PostCommit_Java_ValidatesRunner_Dataflow_Gradle/lastCompletedBuild/badge/icon)](https://builds.apache.org/job/beam_PostCommit_Java_ValidatesRunner_Dataflow_Gradle/lastCompletedBuild/) | [![Build Status](https://builds.apache.org/job/beam_PostCommit_Java_ValidatesRunner_Flink_Gradle/lastCompletedBuild/badge/icon)](https://builds.apache.org/job/beam_PostCommit_Java_ValidatesRunner_Flink_Gradle/lastCompletedBuild/) | [![Build Status](https://builds.apache.org/job/beam_PostCommit_Java_ValidatesRunner_Gearpump_Gradle/lastCompletedBuild/badge/icon)](https://builds.apache.org/job/beam_PostCommit_Java_ValidatesRunner_Gearpump_Gradle/lastCompletedBuild/) | [![Build Status](https://builds.apache.org/job/beam_PostCommit_Java_ValidatesRunner_Spark_Gradle/lastCompletedBuild/badge/icon)](https://builds.apache.org/job/beam_PostCommit_Java_ValidatesRunner_Spark_Gradle/lastCompletedBuild/) Python | [![Build Status](https://builds.apache.org/job/beam_PostCommit_Python_Verify/lastCompletedBuild/badge/icon)](https://builds.apache.org/job/beam_PostCommit_Python_Verify/lastCompletedBuild/) | --- | [![Build 
Status](https://builds.apache.org/job/beam_PostCommit_Py_VR_Dataflow/lastCompletedBuild/badge/icon)](https://builds.apache.org/job/beam_PostCommit_Py_VR_Dataflow/lastCompletedBuild/) [![Build Status](https://builds.apache.org/job/beam_PostCommit_Py_ValCont/lastCompletedBuild/badge/icon)](https://builds.apache.org/job/beam_PostCommit_Py_ValCont/lastCompletedBuild/) | --- | --- | --- This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 116973) Time Spent: 19h 20m (was: 19h 10m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Fix For: 2.6.0 > > Time Spent: 19h 20m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104863=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104863 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 22/May/18 23:41 Start Date: 22/May/18 23:41 Worklog Time Spent: 10m Work Description: jkff closed pull request #5273: [BEAM-3883] Adding Client to push artifacts to artifact staging service URL: https://github.com/apache/beam/pull/5273 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/sdks/python/apache_beam/runners/portability/portable_stager.py b/sdks/python/apache_beam/runners/portability/portable_stager.py new file mode 100644 index 000..7113a251f24 --- /dev/null +++ b/sdks/python/apache_beam/runners/portability/portable_stager.py @@ -0,0 +1,86 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""A :class:`FileHandler` to work with :class:`ArtifactStagingServiceStub`. 
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from apache_beam.portability.api import beam_artifact_api_pb2 +from apache_beam.portability.api import beam_artifact_api_pb2_grpc +from apache_beam.runners.portability.stager import Stager + + +class PortableStager(Stager): + """An implementation of :class:`Stager` to stage files on + ArtifactStagingService. + + The class keeps track of pushed files and commit manifest once all files are + uploaded. + + Note: This class is not thread safe and user of this class should ensure + thread safety. + """ + + def __init__(self, artifact_service_channel): +"""Creates a new Stager to stage file to ArtifactStagingService. + +Args: + artifact_service_channel: Channel used to interact with +ArtifactStagingService.User owns the channel and should close it when +finished. +""" +super(PortableStager, self).__init__() +self._artifact_staging_stub = beam_artifact_api_pb2_grpc.\ +ArtifactStagingServiceStub(channel=artifact_service_channel) +self._artifacts = [] + + def stage_artifact(self, local_path_to_artifact, artifact_name): +"""Stage a file to ArtifactStagingService. + +Args: + local_path_to_artifact: Path of file to be uploaded. + artifact_name: File name on the artifact server. +""" +if not os.path.isfile(local_path_to_artifact): + raise ValueError( + 'Cannot stage {0} to artifact server. Only local files can be staged.' 
+ .format(local_path_to_artifact)) + +def artifact_request_generator(): + metadata = beam_artifact_api_pb2.ArtifactMetadata(name=artifact_name) + request = beam_artifact_api_pb2.PutArtifactRequest(metadata=metadata) + yield request + with open(local_path_to_artifact, 'rb') as f: +while True: + chunk = f.read(1 << 21) # 2MB + if not chunk: +break + request = beam_artifact_api_pb2.PutArtifactRequest( + data=beam_artifact_api_pb2.ArtifactChunk(data=chunk)) + yield request + self._artifacts.append(metadata) + +self._artifact_staging_stub.PutArtifact(artifact_request_generator()) + + def commit_manifest(self): +manifest = beam_artifact_api_pb2.Manifest(artifact=self._artifacts) +self._artifacts = [] +self._artifact_staging_stub.CommitManifest( +beam_artifact_api_pb2.CommitManifestRequest(manifest=manifest)) diff --git a/sdks/python/apache_beam/runners/portability/portable_stager_test.py b/sdks/python/apache_beam/runners/portability/portable_stager_test.py new file mode 100644 index 000..181007de5f0 --- /dev/null +++ b/sdks/python/apache_beam/runners/portability/portable_stager_test.py @@ -0,0 +1,162 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104784=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104784 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 22/May/18 21:05 Start Date: 22/May/18 21:05 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5273: [BEAM-3883] Adding Client to push artifacts to artifact staging service URL: https://github.com/apache/beam/pull/5273#discussion_r190052740 ## File path: sdks/python/apache_beam/runners/portability/portable_stager_test.py ## @@ -48,7 +48,10 @@ def tearDown(self): if self._remote_dir: shutil.rmtree(self._remote_dir) - def stage_files(self, files): + def _stage_files(self, files): +""" Review comment: Made the change This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 104784) Time Spent: 18h 50m (was: 18h 40m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 18h 50m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104779=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104779 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 22/May/18 20:46 Start Date: 22/May/18 20:46 Worklog Time Spent: 10m Work Description: tvalentyn commented on a change in pull request #5273: [BEAM-3883] Adding Client to push artifacts to artifact staging service URL: https://github.com/apache/beam/pull/5273#discussion_r190047124 ## File path: sdks/python/apache_beam/runners/portability/portable_stager_test.py ## @@ -48,7 +48,10 @@ def tearDown(self): if self._remote_dir: shutil.rmtree(self._remote_dir) - def stage_files(self, files): + def _stage_files(self, files): +""" Review comment: The reason I asked for docstring is that it would be useful to describe the structure of `files` since it is not obvious without reading the method. How about: ``` Utility method to stage files. Args: files: a list of tuples of the form [(local_name, remote_name),...] describing the name of the artifacts in local temp folder and desired name in staging location. ``` This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 104779) Time Spent: 18h 40m (was: 18.5h) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 18h 40m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. 
Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52]. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104761=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104761 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 22/May/18 19:40 Start Date: 22/May/18 19:40 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5273: [BEAM-3883] Adding Client to push artifacts to artifact staging service URL: https://github.com/apache/beam/pull/5273#discussion_r190024906 ## File path: sdks/python/apache_beam/runners/portability/portable_stager_test.py ## @@ -0,0 +1,148 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +"""Test cases for :module:`artifact_service_client`.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import filecmp +import logging +import os +import random +import shutil +import string +import tempfile +import unittest +from concurrent import futures + +import grpc + +from apache_beam.portability.api import beam_artifact_api_pb2 +from apache_beam.portability.api import beam_artifact_api_pb2_grpc +from apache_beam.runners.portability import portable_stager + + +class PortableStagerTest(unittest.TestCase): + + def setUp(self): +self._temp_dir = tempfile.mkdtemp() +self._remote_dir = tempfile.mkdtemp() + + def tearDown(self): +if self._temp_dir: + shutil.rmtree(self._temp_dir) +if self._remote_dir: + shutil.rmtree(self._remote_dir) + + def stage_files(self, files): Review comment: Added. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 104761) Time Spent: 18h 20m (was: 18h 10m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 18h 20m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. 
> > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52]. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104763=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104763 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 22/May/18 19:40 Start Date: 22/May/18 19:40 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5273: [BEAM-3883] Adding Client to push artifacts to artifact staging service URL: https://github.com/apache/beam/pull/5273#discussion_r190023720 ## File path: sdks/python/apache_beam/runners/portability/portable_stager.py ## @@ -0,0 +1,86 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""A :class:`FileHandler` to work with :class:`ArtifactStagingServiceStub`. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from apache_beam.portability.api import beam_artifact_api_pb2 +from apache_beam.portability.api import beam_artifact_api_pb2_grpc +from apache_beam.runners.portability.stager import Stager + + +class PortableStager(Stager): + """An implementation of :class:`Stager` to stage files on + ArtifactStagingService. 
+ + The class keeps track of pushed files and commit manifest once all files are + uploaded. Review comment: GRPC does not retry on connection errors. I am not planning to add retry as of now as user can simply resubmit the pipeline if it fails. We can revisit it if it becomes an issue. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 104763) Time Spent: 18.5h (was: 18h 20m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 18.5h > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104764=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104764 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 22/May/18 19:40 Start Date: 22/May/18 19:40 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5273: [BEAM-3883] Adding Client to push artifacts to artifact staging service URL: https://github.com/apache/beam/pull/5273#discussion_r190026010 ## File path: sdks/python/apache_beam/runners/portability/portable_stager_test.py ## @@ -0,0 +1,148 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +"""Test cases for :module:`artifact_service_client`.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import filecmp +import logging +import os +import random +import shutil +import string +import tempfile +import unittest +from concurrent import futures + +import grpc + +from apache_beam.portability.api import beam_artifact_api_pb2 +from apache_beam.portability.api import beam_artifact_api_pb2_grpc +from apache_beam.runners.portability import portable_stager + + +class PortableStagerTest(unittest.TestCase): + + def setUp(self): +self._temp_dir = tempfile.mkdtemp() +self._remote_dir = tempfile.mkdtemp() + + def tearDown(self): +if self._temp_dir: + shutil.rmtree(self._temp_dir) +if self._remote_dir: + shutil.rmtree(self._remote_dir) + + def stage_files(self, files): +server = grpc.server(futures.ThreadPoolExecutor(max_workers=10)) +beam_artifact_api_pb2_grpc.add_ArtifactStagingServiceServicer_to_server( +TestLocalFileSystemArtifactStagingServiceServicer(self._remote_dir), +server) +test_port = server.add_insecure_port('[::]:0') +server.start() +stager = portable_stager.PortableStager( +grpc.insecure_channel('localhost:%s' % test_port)) +for from_file, to_file in files: + stager.stage_artifact( + local_path_to_artifact=os.path.join(self._temp_dir, from_file), + artifact_name=to_file) + +return stager._artifacts + + def test_stage_single_file(self): +from_file = 'test_local.txt' +to_file = 'test_remote.txt' + +with open(os.path.join(self._temp_dir, from_file), 'wb') as f: + f.write(b'abc') + +copied_files = self.stage_files([('test_local.txt', 'test_remote.txt')]) +self.assertTrue( +filecmp.cmp( +os.path.join(self._temp_dir, from_file), +os.path.join(self._remote_dir, to_file))) +self.assertEqual([to_file], [manifest.name for manifest in copied_files]) Review comment: Renamed to staged_file_metadata. as mentioned above, commit_manifest is exercised automatically. 
This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 104764) Time Spent: 18.5h (was: 18h 20m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 18.5h > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: >
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104760=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104760 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 22/May/18 19:40 Start Date: 22/May/18 19:40 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5273: [BEAM-3883] Adding Client to push artifacts to artifact staging service URL: https://github.com/apache/beam/pull/5273#discussion_r190027339 ## File path: sdks/python/apache_beam/runners/portability/portable_stager_test.py ## @@ -0,0 +1,148 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +"""Test cases for :module:`artifact_service_client`.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import filecmp +import logging +import os +import random +import shutil +import string +import tempfile +import unittest +from concurrent import futures + +import grpc + +from apache_beam.portability.api import beam_artifact_api_pb2 +from apache_beam.portability.api import beam_artifact_api_pb2_grpc +from apache_beam.runners.portability import portable_stager + + +class PortableStagerTest(unittest.TestCase): + + def setUp(self): +self._temp_dir = tempfile.mkdtemp() +self._remote_dir = tempfile.mkdtemp() + + def tearDown(self): +if self._temp_dir: + shutil.rmtree(self._temp_dir) +if self._remote_dir: + shutil.rmtree(self._remote_dir) + + def stage_files(self, files): +server = grpc.server(futures.ThreadPoolExecutor(max_workers=10)) +beam_artifact_api_pb2_grpc.add_ArtifactStagingServiceServicer_to_server( +TestLocalFileSystemArtifactStagingServiceServicer(self._remote_dir), +server) +test_port = server.add_insecure_port('[::]:0') +server.start() +stager = portable_stager.PortableStager( +grpc.insecure_channel('localhost:%s' % test_port)) +for from_file, to_file in files: + stager.stage_artifact( + local_path_to_artifact=os.path.join(self._temp_dir, from_file), + artifact_name=to_file) + +return stager._artifacts + + def test_stage_single_file(self): +from_file = 'test_local.txt' +to_file = 'test_remote.txt' + +with open(os.path.join(self._temp_dir, from_file), 'wb') as f: + f.write(b'abc') + +copied_files = self.stage_files([('test_local.txt', 'test_remote.txt')]) +self.assertTrue( +filecmp.cmp( +os.path.join(self._temp_dir, from_file), +os.path.join(self._remote_dir, to_file))) +self.assertEqual([to_file], [manifest.name for manifest in copied_files]) + + def test_stage_multiple_files(self): + +files = [ +('test_local_100.txt', 'test_remote_100.txt', 100, 's'), # +('test_local_100.binary', 
'test_remote_100.binary', 100, 'b'), # +('test_local_1k.txt', 'test_remote_1k.txt', 1 << 10, 's'), # +('test_local_1k.binary', 'test_remote_1k.binary', 1 << 10, 'b'), # +('test_local_1m.txt', 'test_remote_1m.txt', 1 << 20, 's'), +('test_local_1m.binary', 'test_remote_1m.binary', 1 << 20, 'b'), +('test_local_10m.txt', 'test_remote_10m.txt', 10 * (1 << 20), 's'), +('test_local_10m.binary', 'test_remote_10m.binary', 10 * (1 << 20), 'b') +] + +for (from_file, _, size, type) in files: + chars = list(string.printable) + random.shuffle(chars) + chars = list(int(size / len(chars)) * chars + chars[0:size % len(chars)]) + if type == 's': +with open( +os.path.join(self._temp_dir, from_file), 'w', +buffering=2 << 22) as f: + f.write(''.join(chars)) + if type == 'b': +with open( +os.path.join(self._temp_dir, from_file), 'wb', +buffering=2 << 22) as f: + f.write(''.join(chars)) + +copied_files = self.stage_files( +[(from_file, to_file) for (from_file, to_file, _, _) in files]) + +for from_file, to_file, _, _ in files: + ff = os.path.join(self._temp_dir, from_file) + rf =
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104762=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104762 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 22/May/18 19:40 Start Date: 22/May/18 19:40 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5273: [BEAM-3883] Adding Client to push artifacts to artifact staging service URL: https://github.com/apache/beam/pull/5273#discussion_r190026690 ## File path: sdks/python/apache_beam/runners/portability/portable_stager_test.py ## @@ -0,0 +1,148 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +"""Test cases for :module:`artifact_service_client`.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import filecmp +import logging +import os +import random +import shutil +import string +import tempfile +import unittest +from concurrent import futures + +import grpc + +from apache_beam.portability.api import beam_artifact_api_pb2 +from apache_beam.portability.api import beam_artifact_api_pb2_grpc +from apache_beam.runners.portability import portable_stager + + +class PortableStagerTest(unittest.TestCase): + + def setUp(self): +self._temp_dir = tempfile.mkdtemp() +self._remote_dir = tempfile.mkdtemp() + + def tearDown(self): +if self._temp_dir: + shutil.rmtree(self._temp_dir) +if self._remote_dir: + shutil.rmtree(self._remote_dir) + + def stage_files(self, files): +server = grpc.server(futures.ThreadPoolExecutor(max_workers=10)) +beam_artifact_api_pb2_grpc.add_ArtifactStagingServiceServicer_to_server( +TestLocalFileSystemArtifactStagingServiceServicer(self._remote_dir), +server) +test_port = server.add_insecure_port('[::]:0') +server.start() +stager = portable_stager.PortableStager( +grpc.insecure_channel('localhost:%s' % test_port)) +for from_file, to_file in files: + stager.stage_artifact( + local_path_to_artifact=os.path.join(self._temp_dir, from_file), + artifact_name=to_file) + +return stager._artifacts + + def test_stage_single_file(self): +from_file = 'test_local.txt' +to_file = 'test_remote.txt' + +with open(os.path.join(self._temp_dir, from_file), 'wb') as f: + f.write(b'abc') + +copied_files = self.stage_files([('test_local.txt', 'test_remote.txt')]) +self.assertTrue( +filecmp.cmp( +os.path.join(self._temp_dir, from_file), +os.path.join(self._remote_dir, to_file))) +self.assertEqual([to_file], [manifest.name for manifest in copied_files]) + + def test_stage_multiple_files(self): + +files = [ +('test_local_100.txt', 'test_remote_100.txt', 100, 's'), # +('test_local_100.binary', 
'test_remote_100.binary', 100, 'b'), # +('test_local_1k.txt', 'test_remote_1k.txt', 1 << 10, 's'), # +('test_local_1k.binary', 'test_remote_1k.binary', 1 << 10, 'b'), # +('test_local_1m.txt', 'test_remote_1m.txt', 1 << 20, 's'), +('test_local_1m.binary', 'test_remote_1m.binary', 1 << 20, 'b'), +('test_local_10m.txt', 'test_remote_10m.txt', 10 * (1 << 20), 's'), +('test_local_10m.binary', 'test_remote_10m.binary', 10 * (1 << 20), 'b') +] + +for (from_file, _, size, type) in files: + chars = list(string.printable) + random.shuffle(chars) + chars = list(int(size / len(chars)) * chars + chars[0:size % len(chars)]) + if type == 's': +with open( +os.path.join(self._temp_dir, from_file), 'w', +buffering=2 << 22) as f: + f.write(''.join(chars)) + if type == 'b': +with open( +os.path.join(self._temp_dir, from_file), 'wb', +buffering=2 << 22) as f: + f.write(''.join(chars)) + +copied_files = self.stage_files( +[(from_file, to_file) for (from_file, to_file, _, _) in files]) + +for from_file, to_file, _, _ in files: + ff = os.path.join(self._temp_dir, from_file) Review comment:
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104765=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104765 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 22/May/18 19:40 Start Date: 22/May/18 19:40 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5273: [BEAM-3883] Adding Client to push artifacts to artifact staging service URL: https://github.com/apache/beam/pull/5273#discussion_r190023922 ## File path: sdks/python/apache_beam/runners/portability/portable_stager.py ## @@ -0,0 +1,86 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""A :class:`FileHandler` to work with :class:`ArtifactStagingServiceStub`. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from apache_beam.portability.api import beam_artifact_api_pb2 +from apache_beam.portability.api import beam_artifact_api_pb2_grpc +from apache_beam.runners.portability.stager import Stager + + +class PortableStager(Stager): + """An implementation of :class:`Stager` to stage files on + ArtifactStagingService. 
+ + The class keeps track of pushed files and commit manifest once all files are + uploaded. + + Note: This class is not thread safe and user of this class should ensure + thread safety. + """ + + def __init__(self, artifact_service_channel): +"""Creates a new Stager to stage file to ArtifactStagingService. + +Args: + artifact_service_channel: Channel used to interact with +ArtifactStagingService.User owns the channel and should close it when +finished. +""" +super(PortableStager, self).__init__() +self._artifact_staging_stub = beam_artifact_api_pb2_grpc.\ +ArtifactStagingServiceStub(channel=artifact_service_channel) +self._artifacts = [] + + def stage_artifact(self, local_path_to_artifact, artifact_name): +"""Stage a file to ArtifactStagingService. + +Args: + local_path_to_artifact: Path of file to be uploaded. + artifact_name: File name on the artifact server. +""" +if not os.path.isfile(local_path_to_artifact): + raise ValueError('Can only stage file to artifact server. from_path: {0} ' Review comment: Done This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 104765) Time Spent: 18.5h (was: 18h 20m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 18.5h > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. 
> > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52]. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104759=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104759 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 22/May/18 19:40 Start Date: 22/May/18 19:40 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5273: [BEAM-3883] Adding Client to push artifacts to artifact staging service URL: https://github.com/apache/beam/pull/5273#discussion_r190022847 ## File path: sdks/python/apache_beam/runners/portability/portable_stager.py ## @@ -0,0 +1,86 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""A :class:`FileHandler` to work with :class:`ArtifactStagingServiceStub`. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from apache_beam.portability.api import beam_artifact_api_pb2 +from apache_beam.portability.api import beam_artifact_api_pb2_grpc +from apache_beam.runners.portability.stager import Stager + + +class PortableStager(Stager): + """An implementation of :class:`Stager` to stage files on + ArtifactStagingService. 
+ + The class keeps track of pushed files and commit manifest once all files are Review comment: Stager calls commit to commit the changes automatically https://github.com/apache/beam/blob/master/sdks/python/apache_beam/runners/portability/stager.py#L255 This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 104759) Time Spent: 18h (was: 17h 50m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 18h > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104708=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104708 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 22/May/18 17:21 Start Date: 22/May/18 17:21 Worklog Time Spent: 10m Work Description: tvalentyn commented on a change in pull request #5273: [BEAM-3883] Adding Client to push artifacts to artifact staging service URL: https://github.com/apache/beam/pull/5273#discussion_r189982515 ## File path: sdks/python/apache_beam/runners/portability/portable_stager_test.py ## @@ -0,0 +1,148 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +"""Test cases for :module:`artifact_service_client`.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import filecmp +import logging +import os +import random +import shutil +import string +import tempfile +import unittest +from concurrent import futures + +import grpc + +from apache_beam.portability.api import beam_artifact_api_pb2 +from apache_beam.portability.api import beam_artifact_api_pb2_grpc +from apache_beam.runners.portability import portable_stager + + +class PortableStagerTest(unittest.TestCase): + + def setUp(self): +self._temp_dir = tempfile.mkdtemp() +self._remote_dir = tempfile.mkdtemp() + + def tearDown(self): +if self._temp_dir: + shutil.rmtree(self._temp_dir) +if self._remote_dir: + shutil.rmtree(self._remote_dir) + + def stage_files(self, files): +server = grpc.server(futures.ThreadPoolExecutor(max_workers=10)) +beam_artifact_api_pb2_grpc.add_ArtifactStagingServiceServicer_to_server( +TestLocalFileSystemArtifactStagingServiceServicer(self._remote_dir), +server) +test_port = server.add_insecure_port('[::]:0') +server.start() +stager = portable_stager.PortableStager( +grpc.insecure_channel('localhost:%s' % test_port)) +for from_file, to_file in files: + stager.stage_artifact( + local_path_to_artifact=os.path.join(self._temp_dir, from_file), + artifact_name=to_file) + +return stager._artifacts + + def test_stage_single_file(self): +from_file = 'test_local.txt' +to_file = 'test_remote.txt' + +with open(os.path.join(self._temp_dir, from_file), 'wb') as f: + f.write(b'abc') + +copied_files = self.stage_files([('test_local.txt', 'test_remote.txt')]) +self.assertTrue( +filecmp.cmp( +os.path.join(self._temp_dir, from_file), +os.path.join(self._remote_dir, to_file))) +self.assertEqual([to_file], [manifest.name for manifest in copied_files]) + + def test_stage_multiple_files(self): + +files = [ +('test_local_100.txt', 'test_remote_100.txt', 100, 's'), # +('test_local_100.binary', 
'test_remote_100.binary', 100, 'b'), # +('test_local_1k.txt', 'test_remote_1k.txt', 1 << 10, 's'), # +('test_local_1k.binary', 'test_remote_1k.binary', 1 << 10, 'b'), # +('test_local_1m.txt', 'test_remote_1m.txt', 1 << 20, 's'), +('test_local_1m.binary', 'test_remote_1m.binary', 1 << 20, 'b'), +('test_local_10m.txt', 'test_remote_10m.txt', 10 * (1 << 20), 's'), +('test_local_10m.binary', 'test_remote_10m.binary', 10 * (1 << 20), 'b') +] + +for (from_file, _, size, type) in files: + chars = list(string.printable) + random.shuffle(chars) + chars = list(int(size / len(chars)) * chars + chars[0:size % len(chars)]) + if type == 's': +with open( +os.path.join(self._temp_dir, from_file), 'w', +buffering=2 << 22) as f: + f.write(''.join(chars)) + if type == 'b': +with open( +os.path.join(self._temp_dir, from_file), 'wb', +buffering=2 << 22) as f: + f.write(''.join(chars)) + +copied_files = self.stage_files( +[(from_file, to_file) for (from_file, to_file, _, _) in files]) + +for from_file, to_file, _, _ in files: + ff = os.path.join(self._temp_dir, from_file) + rf =
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104705=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104705 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 22/May/18 17:21 Start Date: 22/May/18 17:21 Worklog Time Spent: 10m Work Description: tvalentyn commented on a change in pull request #5273: [BEAM-3883] Adding Client to push artifacts to artifact staging service URL: https://github.com/apache/beam/pull/5273#discussion_r189981700 ## File path: sdks/python/apache_beam/runners/portability/portable_stager_test.py ## @@ -0,0 +1,148 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +"""Test cases for :module:`artifact_service_client`.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import filecmp +import logging +import os +import random +import shutil +import string +import tempfile +import unittest +from concurrent import futures + +import grpc + +from apache_beam.portability.api import beam_artifact_api_pb2 +from apache_beam.portability.api import beam_artifact_api_pb2_grpc +from apache_beam.runners.portability import portable_stager + + +class PortableStagerTest(unittest.TestCase): + + def setUp(self): +self._temp_dir = tempfile.mkdtemp() +self._remote_dir = tempfile.mkdtemp() + + def tearDown(self): +if self._temp_dir: + shutil.rmtree(self._temp_dir) +if self._remote_dir: + shutil.rmtree(self._remote_dir) + + def stage_files(self, files): +server = grpc.server(futures.ThreadPoolExecutor(max_workers=10)) +beam_artifact_api_pb2_grpc.add_ArtifactStagingServiceServicer_to_server( +TestLocalFileSystemArtifactStagingServiceServicer(self._remote_dir), +server) +test_port = server.add_insecure_port('[::]:0') +server.start() +stager = portable_stager.PortableStager( +grpc.insecure_channel('localhost:%s' % test_port)) +for from_file, to_file in files: + stager.stage_artifact( + local_path_to_artifact=os.path.join(self._temp_dir, from_file), + artifact_name=to_file) + +return stager._artifacts + + def test_stage_single_file(self): +from_file = 'test_local.txt' +to_file = 'test_remote.txt' + +with open(os.path.join(self._temp_dir, from_file), 'wb') as f: + f.write(b'abc') + +copied_files = self.stage_files([('test_local.txt', 'test_remote.txt')]) +self.assertTrue( +filecmp.cmp( +os.path.join(self._temp_dir, from_file), +os.path.join(self._remote_dir, to_file))) +self.assertEqual([to_file], [manifest.name for manifest in copied_files]) + + def test_stage_multiple_files(self): + +files = [ +('test_local_100.txt', 'test_remote_100.txt', 100, 's'), # +('test_local_100.binary', 
'test_remote_100.binary', 100, 'b'), # +('test_local_1k.txt', 'test_remote_1k.txt', 1 << 10, 's'), # +('test_local_1k.binary', 'test_remote_1k.binary', 1 << 10, 'b'), # +('test_local_1m.txt', 'test_remote_1m.txt', 1 << 20, 's'), +('test_local_1m.binary', 'test_remote_1m.binary', 1 << 20, 'b'), +('test_local_10m.txt', 'test_remote_10m.txt', 10 * (1 << 20), 's'), +('test_local_10m.binary', 'test_remote_10m.binary', 10 * (1 << 20), 'b') +] + +for (from_file, _, size, type) in files: + chars = list(string.printable) + random.shuffle(chars) + chars = list(int(size / len(chars)) * chars + chars[0:size % len(chars)]) + if type == 's': +with open( +os.path.join(self._temp_dir, from_file), 'w', +buffering=2 << 22) as f: + f.write(''.join(chars)) + if type == 'b': +with open( +os.path.join(self._temp_dir, from_file), 'wb', +buffering=2 << 22) as f: + f.write(''.join(chars)) + +copied_files = self.stage_files( +[(from_file, to_file) for (from_file, to_file, _, _) in files]) + +for from_file, to_file, _, _ in files: + ff = os.path.join(self._temp_dir, from_file) Review comment:
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104707=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104707 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 22/May/18 17:21 Start Date: 22/May/18 17:21 Worklog Time Spent: 10m Work Description: tvalentyn commented on a change in pull request #5273: [BEAM-3883] Adding Client to push artifacts to artifact staging service URL: https://github.com/apache/beam/pull/5273#discussion_r189984251 ## File path: sdks/python/apache_beam/runners/portability/portable_stager.py ## @@ -0,0 +1,86 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""A :class:`FileHandler` to work with :class:`ArtifactStagingServiceStub`. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from apache_beam.portability.api import beam_artifact_api_pb2 +from apache_beam.portability.api import beam_artifact_api_pb2_grpc +from apache_beam.runners.portability.stager import Stager + + +class PortableStager(Stager): + """An implementation of :class:`Stager` to stage files on + ArtifactStagingService. 
+ + The class keeps track of pushed files and commit manifest once all files are + uploaded. Review comment: I am curious, do we plan to add retry logic here (perhaps in later PRs) if some part of staging falls through? Or does GPRC handle retries automatically? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 104707) Time Spent: 17h 40m (was: 17.5h) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 17h 40m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104702=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104702 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 22/May/18 17:21 Start Date: 22/May/18 17:21 Worklog Time Spent: 10m Work Description: tvalentyn commented on a change in pull request #5273: [BEAM-3883] Adding Client to push artifacts to artifact staging service URL: https://github.com/apache/beam/pull/5273#discussion_r189984592 ## File path: sdks/python/apache_beam/runners/portability/portable_stager_test.py ## @@ -0,0 +1,148 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +"""Test cases for :module:`artifact_service_client`.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import filecmp +import logging +import os +import random +import shutil +import string +import tempfile +import unittest +from concurrent import futures + +import grpc + +from apache_beam.portability.api import beam_artifact_api_pb2 +from apache_beam.portability.api import beam_artifact_api_pb2_grpc +from apache_beam.runners.portability import portable_stager + + +class PortableStagerTest(unittest.TestCase): + + def setUp(self): +self._temp_dir = tempfile.mkdtemp() +self._remote_dir = tempfile.mkdtemp() + + def tearDown(self): +if self._temp_dir: + shutil.rmtree(self._temp_dir) +if self._remote_dir: + shutil.rmtree(self._remote_dir) + + def stage_files(self, files): +server = grpc.server(futures.ThreadPoolExecutor(max_workers=10)) +beam_artifact_api_pb2_grpc.add_ArtifactStagingServiceServicer_to_server( +TestLocalFileSystemArtifactStagingServiceServicer(self._remote_dir), +server) +test_port = server.add_insecure_port('[::]:0') +server.start() +stager = portable_stager.PortableStager( +grpc.insecure_channel('localhost:%s' % test_port)) +for from_file, to_file in files: + stager.stage_artifact( + local_path_to_artifact=os.path.join(self._temp_dir, from_file), + artifact_name=to_file) + +return stager._artifacts + + def test_stage_single_file(self): +from_file = 'test_local.txt' +to_file = 'test_remote.txt' + +with open(os.path.join(self._temp_dir, from_file), 'wb') as f: + f.write(b'abc') + +copied_files = self.stage_files([('test_local.txt', 'test_remote.txt')]) +self.assertTrue( +filecmp.cmp( +os.path.join(self._temp_dir, from_file), +os.path.join(self._remote_dir, to_file))) +self.assertEqual([to_file], [manifest.name for manifest in copied_files]) Review comment: Manifest is a **list** of staged files, so it's not the best name for the loop variable here. 
If we want to check that manifest is created and staged, I would actually exercise commit_manifest behavior. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 104702) Time Spent: 17h (was: 16h 50m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 17h > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime.
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104706=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104706 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 22/May/18 17:21 Start Date: 22/May/18 17:21 Worklog Time Spent: 10m Work Description: tvalentyn commented on a change in pull request #5273: [BEAM-3883] Adding Client to push artifacts to artifact staging service URL: https://github.com/apache/beam/pull/5273#discussion_r189951783 ## File path: sdks/python/apache_beam/runners/portability/portable_stager_test.py ## @@ -0,0 +1,148 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +"""Test cases for :module:`artifact_service_client`.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import filecmp +import logging +import os +import random +import shutil +import string +import tempfile +import unittest +from concurrent import futures + +import grpc + +from apache_beam.portability.api import beam_artifact_api_pb2 +from apache_beam.portability.api import beam_artifact_api_pb2_grpc +from apache_beam.runners.portability import portable_stager + + +class PortableStagerTest(unittest.TestCase): + + def setUp(self): +self._temp_dir = tempfile.mkdtemp() +self._remote_dir = tempfile.mkdtemp() + + def tearDown(self): +if self._temp_dir: + shutil.rmtree(self._temp_dir) +if self._remote_dir: + shutil.rmtree(self._remote_dir) + + def stage_files(self, files): Review comment: Please add a docstring for this method and clarify that `files` is a list of tuples, This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 104706) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 17.5h > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. 
> > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104703=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104703 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 22/May/18 17:21 Start Date: 22/May/18 17:21 Worklog Time Spent: 10m Work Description: tvalentyn commented on a change in pull request #5273: [BEAM-3883] Adding Client to push artifacts to artifact staging service URL: https://github.com/apache/beam/pull/5273#discussion_r189931243 ## File path: sdks/python/apache_beam/runners/portability/portable_stager.py ## @@ -0,0 +1,86 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""A :class:`FileHandler` to work with :class:`ArtifactStagingServiceStub`. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from apache_beam.portability.api import beam_artifact_api_pb2 +from apache_beam.portability.api import beam_artifact_api_pb2_grpc +from apache_beam.runners.portability.stager import Stager + + +class PortableStager(Stager): + """An implementation of :class:`Stager` to stage files on + ArtifactStagingService. 
+ + The class keeps track of pushed files and commit manifest once all files are + uploaded. + + Note: This class is not thread safe and user of this class should ensure + thread safety. + """ + + def __init__(self, artifact_service_channel): +"""Creates a new Stager to stage file to ArtifactStagingService. + +Args: + artifact_service_channel: Channel used to interact with +ArtifactStagingService.User owns the channel and should close it when +finished. +""" +super(PortableStager, self).__init__() +self._artifact_staging_stub = beam_artifact_api_pb2_grpc.\ +ArtifactStagingServiceStub(channel=artifact_service_channel) +self._artifacts = [] + + def stage_artifact(self, local_path_to_artifact, artifact_name): +"""Stage a file to ArtifactStagingService. + +Args: + local_path_to_artifact: Path of file to be uploaded. + artifact_name: File name on the artifact server. +""" +if not os.path.isfile(local_path_to_artifact): + raise ValueError('Can only stage file to artifact server. from_path: {0} ' Review comment: Let's rewrite the message so that the root cause is emphasized. How about: `Cannot stage {0} to artifact server. Only local files can be staged. ` This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 104703) Time Spent: 17h 10m (was: 17h) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 17h 10m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. 
Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104704=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104704 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 22/May/18 17:21 Start Date: 22/May/18 17:21 Worklog Time Spent: 10m Work Description: tvalentyn commented on a change in pull request #5273: [BEAM-3883] Adding Client to push artifacts to artifact staging service URL: https://github.com/apache/beam/pull/5273#discussion_r189929366 ## File path: sdks/python/apache_beam/runners/portability/portable_stager.py ## @@ -0,0 +1,86 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""A :class:`FileHandler` to work with :class:`ArtifactStagingServiceStub`. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from apache_beam.portability.api import beam_artifact_api_pb2 +from apache_beam.portability.api import beam_artifact_api_pb2_grpc +from apache_beam.runners.portability.stager import Stager + + +class PortableStager(Stager): + """An implementation of :class:`Stager` to stage files on + ArtifactStagingService. 
+ + The class keeps track of pushed files and commit manifest once all files are Review comment: Committing manifest is a responsibility of the superclass. Perhaps we could say `The class keeps track of staged files and can commit a manifest of all files that were staged.` This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 104704) Time Spent: 17h 20m (was: 17h 10m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 17h 20m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104331=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104331 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 21/May/18 23:05 Start Date: 21/May/18 23:05 Worklog Time Spent: 10m Work Description: jkff closed pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/sdks/python/apache_beam/runners/dataflow/internal/apiclient.py b/sdks/python/apache_beam/runners/dataflow/internal/apiclient.py index 54eba06abb8..72c54a40cfe 100644 --- a/sdks/python/apache_beam/runners/dataflow/internal/apiclient.py +++ b/sdks/python/apache_beam/runners/dataflow/internal/apiclient.py @@ -25,14 +25,17 @@ import logging import os import re +import tempfile import time from datetime import datetime from StringIO import StringIO +import pkg_resources from apitools.base.py import encoding from apitools.base.py import exceptions import six +from apache_beam import version as beam_version from apache_beam.internal.gcp.auth import get_service_credentials from apache_beam.internal.gcp.json_value import to_json_value from apache_beam.io.filesystems import FileSystems @@ -41,11 +44,10 @@ from apache_beam.options.pipeline_options import GoogleCloudOptions from apache_beam.options.pipeline_options import StandardOptions from apache_beam.options.pipeline_options import WorkerOptions -from apache_beam.runners.dataflow.internal import dependency from apache_beam.runners.dataflow.internal import names from apache_beam.runners.dataflow.internal.clients import dataflow -from 
apache_beam.runners.dataflow.internal.dependency import get_sdk_name_and_version from apache_beam.runners.dataflow.internal.names import PropertyNames +from apache_beam.runners.portability.stager import Stager from apache_beam.transforms import cy_combiners from apache_beam.transforms import DataflowDistributionCounter from apache_beam.transforms.display import DisplayData @@ -169,7 +171,7 @@ def __init__(self, packages, options, environment_version, pipeline_url): # TODO: Use enumerated type instead of strings for job types. if job_type.startswith('FNAPI_'): runner_harness_override = ( - dependency.get_runner_harness_container_image()) + get_runner_harness_container_image()) self.debug_options.experiments = self.debug_options.experiments or [] if runner_harness_override: self.debug_options.experiments.append( @@ -234,7 +236,7 @@ def __init__(self, packages, options, environment_version, pipeline_url): self.worker_options.worker_harness_container_image) else: pool.workerHarnessContainerImage = ( - dependency.get_default_container_image_for_current_sdk(job_type)) + get_default_container_image_for_current_sdk(job_type)) if self.worker_options.use_public_ips is not None: if self.worker_options.use_public_ips: pool.ipConfiguration = ( @@ -432,6 +434,19 @@ def _gcs_file_copy(self, from_path, to_path): with open(from_path, 'rb') as f: self.stage_file(to_folder, to_name, f) + def _stage_resources(self, options): +google_cloud_options = options.view_as(GoogleCloudOptions) +if google_cloud_options.staging_location is None: + raise RuntimeError('The --staging_location option must be specified.') +if google_cloud_options.temp_location is None: + raise RuntimeError('The --temp_location option must be specified.') + +resource_stager = _LegacyDataflowStager(self) +return resource_stager.stage_job_resources( +options, +temp_dir=tempfile.mkdtemp(), +staging_location=google_cloud_options.staging_location) + def stage_file(self, gcs_or_local_path, file_name, stream, 
mime_type='application/octet-stream'): """Stages a file at a GCS or local path with stream-supplied contents.""" @@ -496,8 +511,7 @@ def create_job_description(self, job): StringIO(job.proto_pipeline.SerializeToString())) # Stage other resources for the SDK harness -resources = dependency.stage_job_resources( -job.options, file_copy=self._gcs_file_copy) +resources = self._stage_resources(job.options) job.proto.environment = Environment( pipeline_url=FileSystems.join(job.google_cloud_options.staging_location, @@ -731,6 +745,31 @@ def translate_scalar_counter_float(accumulator, metric_update_proto):
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104292=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104292 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 21/May/18 21:53 Start Date: 21/May/18 21:53 Worklog Time Spent: 10m Work Description: tvalentyn commented on issue #5273: [BEAM-3883] Adding Client to push artifacts to artifact staging service URL: https://github.com/apache/beam/pull/5273#issuecomment-390794532 @angoenka Please let me know once this is ready for review. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 104292) Time Spent: 16.5h (was: 16h 20m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 16.5h > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104285=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104285 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 21/May/18 21:47 Start Date: 21/May/18 21:47 Worklog Time Spent: 10m Work Description: tvalentyn commented on issue #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#issuecomment-390793029 LGTM cc: @aaltay This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 104285) Time Spent: 16h 10m (was: 16h) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 16h 10m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104287=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104287 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 21/May/18 21:47 Start Date: 21/May/18 21:47 Worklog Time Spent: 10m Work Description: tvalentyn commented on issue #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#issuecomment-390793029 LGTM. Thanks, @angoenka. cc: @aaltay This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 104287) Time Spent: 16h 20m (was: 16h 10m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 16h 20m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104277=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104277 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 21/May/18 21:15 Start Date: 21/May/18 21:15 Worklog Time Spent: 10m Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#issuecomment-390785167 Run Python Dataflow ValidatesRunner This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 104277) Time Spent: 16h (was: 15h 50m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 16h > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104208=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104208 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 21/May/18 19:48 Start Date: 21/May/18 19:48 Worklog Time Spent: 10m Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#issuecomment-390762120 Run Python Dataflow ValidatesRunner This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 104208) Time Spent: 15h 50m (was: 15h 40m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 15h 50m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=103528=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-103528 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 18/May/18 18:40 Start Date: 18/May/18 18:40 Worklog Time Spent: 10m Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#issuecomment-390296544 Run Python Dataflow ValidatesRunner This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 103528) Time Spent: 15h 40m (was: 15.5h) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 15h 40m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=103171=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-103171 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 17/May/18 23:35 Start Date: 17/May/18 23:35 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r189130230 ## File path: sdks/python/apache_beam/runners/portability/stager.py ## @@ -0,0 +1,551 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Support for installing custom code and required dependencies. + +Workflows, with the exception of very simple ones, are organized in multiple +modules and packages. Typically, these modules and packages have +dependencies on other standard libraries. Beam relies on the Python +setuptools package to handle these scenarios. For further details please read: +https://pythonhosted.org/an_example_pypi_project/setuptools.html + +When a runner tries to run a pipeline it will check for a --requirements_file +and a --setup_file option. 
+ +If --setup_file is present then it is assumed that the folder containing the +file specified by the option has the typical layout required by setuptools and +it will run 'python setup.py sdist' to produce a source distribution. The +resulting tarball (a .tar or .tar.gz file) will be staged at the staging +location specified as job option. When a worker starts it will check for the +presence of this file and will run 'easy_install tarball' to install the +package in the worker. + +If --requirements_file is present then the file specified by the option will be +staged in the staging location. When a worker starts it will check for the +presence of this file and will run 'pip install -r requirements.txt'. A +requirements file can be easily generated by running 'pip freeze -r +requirements.txt'. The reason a runner does not run this automatically is +because quite often only a small fraction of the dependencies present in a +requirements.txt file are actually needed for remote execution and therefore a +one-time manual trimming is desirable. + +TODO(silviuc): Should we allow several setup packages? +TODO(silviuc): We should allow customizing the exact command for setup build. +""" +import glob +import logging +import os +import shutil +import subprocess +import sys +import tempfile + +import pkg_resources + +from apache_beam.internal import pickler +from apache_beam.io.filesystems import FileSystems +from apache_beam.options.pipeline_options import SetupOptions +# TODO(angoenka): Remove reference to dataflow internal names +from apache_beam.runners.dataflow.internal import names +from apache_beam.utils import processes + +# All constants are for internal use only; no backwards-compatibility +# guarantees. + +# Standard file names used for staging files. 
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz' +REQUIREMENTS_FILE = 'requirements.txt' +EXTRA_PACKAGES_FILE = 'extra_packages.txt' + +# Package names for distributions +BEAM_PACKAGE_NAME = 'apache-beam' + + +class Stager(object): + """Stager identifies and copies the appropriate artifacts to the staging + location.""" + + @staticmethod + def _download_file(from_url, to_path): +"""Downloads a file over http/https from a url or copy it from a remote +path to local path.""" +if from_url.startswith('http://') or from_url.startswith('https://'): + # TODO(silviuc): We should cache downloads so we do not do it for every + # job. + try: +# We check if the file is actually there because wget returns a file +# even for a 404 response (file will contain the contents of the 404 +# response). +# TODO(angoenka): Extract and use the filename when downloading file. +response, content = __import__('httplib2').Http().request(from_url) +if int(response['status']) >= 400: + raise RuntimeError( + 'Artifact not found at %s (response: %s)' %
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=103172&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-103172 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 17/May/18 23:35 Start Date: 17/May/18 23:35 Worklog Time Spent: 10m Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#issuecomment-390046389 Run Python Dataflow ValidatesRunner This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 103172) Time Spent: 15.5h (was: 15h 20m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 15.5h > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=103163=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-103163 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 17/May/18 23:07 Start Date: 17/May/18 23:07 Worklog Time Spent: 10m Work Description: pabloem commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r189126287 ## File path: sdks/python/apache_beam/runners/portability/stager.py ## @@ -0,0 +1,551 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Support for installing custom code and required dependencies. + +Workflows, with the exception of very simple ones, are organized in multiple +modules and packages. Typically, these modules and packages have +dependencies on other standard libraries. Beam relies on the Python +setuptools package to handle these scenarios. For further details please read: +https://pythonhosted.org/an_example_pypi_project/setuptools.html + +When a runner tries to run a pipeline it will check for a --requirements_file +and a --setup_file option. 
+ +If --setup_file is present then it is assumed that the folder containing the +file specified by the option has the typical layout required by setuptools and +it will run 'python setup.py sdist' to produce a source distribution. The +resulting tarball (a .tar or .tar.gz file) will be staged at the staging +location specified as job option. When a worker starts it will check for the +presence of this file and will run 'easy_install tarball' to install the +package in the worker. + +If --requirements_file is present then the file specified by the option will be +staged in the staging location. When a worker starts it will check for the +presence of this file and will run 'pip install -r requirements.txt'. A +requirements file can be easily generated by running 'pip freeze -r +requirements.txt'. The reason a runner does not run this automatically is +because quite often only a small fraction of the dependencies present in a +requirements.txt file are actually needed for remote execution and therefore a +one-time manual trimming is desirable. + +TODO(silviuc): Should we allow several setup packages? +TODO(silviuc): We should allow customizing the exact command for setup build. +""" +import glob +import logging +import os +import shutil +import subprocess +import sys +import tempfile + +import pkg_resources + +from apache_beam.internal import pickler +from apache_beam.io.filesystems import FileSystems +from apache_beam.options.pipeline_options import SetupOptions +# TODO(angoenka): Remove reference to dataflow internal names +from apache_beam.runners.dataflow.internal import names +from apache_beam.utils import processes + +# All constants are for internal use only; no backwards-compatibility +# guarantees. + +# Standard file names used for staging files. 
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz' +REQUIREMENTS_FILE = 'requirements.txt' +EXTRA_PACKAGES_FILE = 'extra_packages.txt' + +# Package names for distributions +BEAM_PACKAGE_NAME = 'apache-beam' + + +class Stager(object): + """Stager identifies and copies the appropriate artifacts to the staging + location.""" + + @staticmethod + def _download_file(from_url, to_path): +"""Downloads a file over http/https from a url or copy it from a remote +path to local path.""" +if from_url.startswith('http://') or from_url.startswith('https://'): + # TODO(silviuc): We should cache downloads so we do not do it for every + # job. + try: +# We check if the file is actually there because wget returns a file +# even for a 404 response (file will contain the contents of the 404 +# response). +# TODO(angoenka): Extract and use the filename when downloading file. +response, content = __import__('httplib2').Http().request(from_url) +if int(response['status']) >= 400: + raise RuntimeError( + 'Artifact not found at %s (response: %s)' %
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=103157&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-103157 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 17/May/18 22:57 Start Date: 17/May/18 22:57 Worklog Time Spent: 10m Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#issuecomment-390039131 Run Python Dataflow ValidatesRunner This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 103157) Time Spent: 15h (was: 14h 50m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 15h > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=103088&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-103088 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 17/May/18 20:12 Start Date: 17/May/18 20:12 Worklog Time Spent: 10m Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#issuecomment-389995031 Run Python Dataflow ValidatesRunner This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 103088) Time Spent: 14h 50m (was: 14h 40m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 14h 50m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102749&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102749 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 17/May/18 01:38 Start Date: 17/May/18 01:38 Worklog Time Spent: 10m Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#issuecomment-389717316 Run Python Dataflow ValidatesRunner This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 102749) Time Spent: 14h 40m (was: 14.5h) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 14h 40m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102748&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102748 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 17/May/18 01:38 Start Date: 17/May/18 01:38 Worklog Time Spent: 10m Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#issuecomment-389717242 Updated the PR based on our discussion. PTAL This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 102748) Time Spent: 14.5h (was: 14h 20m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 14.5h > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102741&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102741 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 17/May/18 00:37 Start Date: 17/May/18 00:37 Worklog Time Spent: 10m Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#issuecomment-389708022 @valentyn I am converting all the staticmethods to module methods as mocking static methods is even more difficult. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 102741) Time Spent: 14h 20m (was: 14h 10m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 14h 20m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102735=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102735 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 17/May/18 00:10 Start Date: 17/May/18 00:10 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188807218 ## File path: sdks/python/apache_beam/runners/portability/stager_test.py ## @@ -420,134 +375,167 @@ def test_sdk_location_local_directory_not_present(self): sdk_location = 'nosuchdir' with self.assertRaises(RuntimeError) as cm: options = PipelineOptions() - options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location - dependency.stage_job_resources(options) + self.stager.stage_job_resources(options, staging_location=staging_dir) self.assertEqual( 'The file "%s" cannot be found. Its ' 'location was specified by the --sdk_location command-line option.' % -sdk_location, -cm.exception.args[0]) +sdk_location, cm.exception.args[0]) - def test_sdk_location_gcs_source_file(self): + def test_sdk_location_remote_source_file(self): staging_dir = self.make_temp_dir() sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz' options = PipelineOptions() -options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location -with mock.patch('apache_beam.runners.dataflow.internal.' 
-'dependency._dependency_file_copy'): - self.assertEqual( - [names.DATAFLOW_SDK_TARBALL_FILE], - dependency.stage_job_resources(options)) - - def test_sdk_location_gcs_wheel_file(self): +with mock.patch('.'.join([ +self.__module__, TestStager.__name__, TestStager.stage_artifact.__name__ +])): + with mock.patch('.'.join([ + self.__module__, TestStager.__name__, + TestStager._download_file.__name__ + ])): +self.assertEqual([names.DATAFLOW_SDK_TARBALL_FILE], + self.stager.stage_job_resources( + options, staging_location=staging_dir)) + + def test_sdk_location_remote_wheel_file(self): staging_dir = self.make_temp_dir() sdk_filename = 'apache_beam-1.0.0-cp27-cp27mu-manylinux1_x86_64.whl' -sdk_location = 'gs://my-gcs-bucket/' + sdk_filename +sdk_location = '/tmp/remote/my-bucket/' + sdk_filename options = PipelineOptions() -options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location -with mock.patch('apache_beam.runners.dataflow.internal.' -'dependency._dependency_file_copy'): - self.assertEqual( - [sdk_filename], - dependency.stage_job_resources(options)) +# We can not rely on actual remote file systems paths hence making +# '/tmp/remote/' a new remote path. 
+def is_remote_path(dummy_self, path): + return path.startswith('/tmp/remote/') + +with mock.patch('.'.join([ +self.__module__, TestStager.__name__, TestStager.stage_artifact.__name__ +])): + with mock.patch('.'.join([ + self.__module__, TestStager.__name__, + TestStager._download_file.__name__ + ])): +with mock.patch( +'.'.join([ +self.__module__, TestStager.__name__, +TestStager._is_remote_path.__name__ +]), is_remote_path): + self.assertEqual([sdk_filename], + self.stager.stage_job_resources( + options, staging_location=staging_dir)) def test_sdk_location_http(self): staging_dir = self.make_temp_dir() sdk_location = 'http://storage.googleapis.com/my-gcs-bucket/tarball.tar.gz' options = PipelineOptions() -options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location -def file_download(_, to_folder): - tarball_path = os.path.join(to_folder, 'sdk-tarball') - with open(tarball_path, 'w') as f: +def file_download(dummy_self, _, to_path): Review comment: Marking _download_file static has made it impossible to extend it and we will not be able to provide its implementation in the subclass without monkey patching it. I will remove the _download_file from TestStager as its not getting
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102734=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102734 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 17/May/18 00:09 Start Date: 17/May/18 00:09 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188807218 ## File path: sdks/python/apache_beam/runners/portability/stager_test.py ## @@ -420,134 +375,167 @@ def test_sdk_location_local_directory_not_present(self): sdk_location = 'nosuchdir' with self.assertRaises(RuntimeError) as cm: options = PipelineOptions() - options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location - dependency.stage_job_resources(options) + self.stager.stage_job_resources(options, staging_location=staging_dir) self.assertEqual( 'The file "%s" cannot be found. Its ' 'location was specified by the --sdk_location command-line option.' % -sdk_location, -cm.exception.args[0]) +sdk_location, cm.exception.args[0]) - def test_sdk_location_gcs_source_file(self): + def test_sdk_location_remote_source_file(self): staging_dir = self.make_temp_dir() sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz' options = PipelineOptions() -options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location -with mock.patch('apache_beam.runners.dataflow.internal.' 
-'dependency._dependency_file_copy'): - self.assertEqual( - [names.DATAFLOW_SDK_TARBALL_FILE], - dependency.stage_job_resources(options)) - - def test_sdk_location_gcs_wheel_file(self): +with mock.patch('.'.join([ +self.__module__, TestStager.__name__, TestStager.stage_artifact.__name__ +])): + with mock.patch('.'.join([ + self.__module__, TestStager.__name__, + TestStager._download_file.__name__ + ])): +self.assertEqual([names.DATAFLOW_SDK_TARBALL_FILE], + self.stager.stage_job_resources( + options, staging_location=staging_dir)) + + def test_sdk_location_remote_wheel_file(self): staging_dir = self.make_temp_dir() sdk_filename = 'apache_beam-1.0.0-cp27-cp27mu-manylinux1_x86_64.whl' -sdk_location = 'gs://my-gcs-bucket/' + sdk_filename +sdk_location = '/tmp/remote/my-bucket/' + sdk_filename options = PipelineOptions() -options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location -with mock.patch('apache_beam.runners.dataflow.internal.' -'dependency._dependency_file_copy'): - self.assertEqual( - [sdk_filename], - dependency.stage_job_resources(options)) +# We can not rely on actual remote file systems paths hence making +# '/tmp/remote/' a new remote path. 
+def is_remote_path(dummy_self, path): + return path.startswith('/tmp/remote/') + +with mock.patch('.'.join([ +self.__module__, TestStager.__name__, TestStager.stage_artifact.__name__ +])): + with mock.patch('.'.join([ + self.__module__, TestStager.__name__, + TestStager._download_file.__name__ + ])): +with mock.patch( +'.'.join([ +self.__module__, TestStager.__name__, +TestStager._is_remote_path.__name__ +]), is_remote_path): + self.assertEqual([sdk_filename], + self.stager.stage_job_resources( + options, staging_location=staging_dir)) def test_sdk_location_http(self): staging_dir = self.make_temp_dir() sdk_location = 'http://storage.googleapis.com/my-gcs-bucket/tarball.tar.gz' options = PipelineOptions() -options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location -def file_download(_, to_folder): - tarball_path = os.path.join(to_folder, 'sdk-tarball') - with open(tarball_path, 'w') as f: +def file_download(dummy_self, _, to_path): Review comment: Marking _download_file static has made it impossible to extend it and we will not be able to provide its implementation in the subclass without monkey patching it. This
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102730=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102730 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 16/May/18 23:58 Start Date: 16/May/18 23:58 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188805559 ## File path: sdks/python/apache_beam/runners/portability/stager_test.py ## @@ -420,134 +375,167 @@ def test_sdk_location_local_directory_not_present(self): sdk_location = 'nosuchdir' with self.assertRaises(RuntimeError) as cm: options = PipelineOptions() - options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location - dependency.stage_job_resources(options) + self.stager.stage_job_resources(options, staging_location=staging_dir) self.assertEqual( 'The file "%s" cannot be found. Its ' 'location was specified by the --sdk_location command-line option.' % -sdk_location, -cm.exception.args[0]) +sdk_location, cm.exception.args[0]) - def test_sdk_location_gcs_source_file(self): + def test_sdk_location_remote_source_file(self): staging_dir = self.make_temp_dir() sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz' options = PipelineOptions() -options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location -with mock.patch('apache_beam.runners.dataflow.internal.' 
-'dependency._dependency_file_copy'): - self.assertEqual( - [names.DATAFLOW_SDK_TARBALL_FILE], - dependency.stage_job_resources(options)) - - def test_sdk_location_gcs_wheel_file(self): +with mock.patch('.'.join([ +self.__module__, TestStager.__name__, TestStager.stage_artifact.__name__ +])): + with mock.patch('.'.join([ + self.__module__, TestStager.__name__, + TestStager._download_file.__name__ + ])): +self.assertEqual([names.DATAFLOW_SDK_TARBALL_FILE], + self.stager.stage_job_resources( + options, staging_location=staging_dir)) + + def test_sdk_location_remote_wheel_file(self): staging_dir = self.make_temp_dir() sdk_filename = 'apache_beam-1.0.0-cp27-cp27mu-manylinux1_x86_64.whl' -sdk_location = 'gs://my-gcs-bucket/' + sdk_filename +sdk_location = '/tmp/remote/my-bucket/' + sdk_filename options = PipelineOptions() -options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location -with mock.patch('apache_beam.runners.dataflow.internal.' -'dependency._dependency_file_copy'): - self.assertEqual( - [sdk_filename], - dependency.stage_job_resources(options)) +# We can not rely on actual remote file systems paths hence making +# '/tmp/remote/' a new remote path. +def is_remote_path(dummy_self, path): Review comment: yes, as I don't see other ways to spoof remote path. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 102730) Time Spent: 13h 50m (was: 13h 40m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 13h 50m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102728=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102728 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 16/May/18 23:57 Start Date: 16/May/18 23:57 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188805479 ## File path: sdks/python/apache_beam/runners/portability/stager_test.py ## @@ -420,134 +375,167 @@ def test_sdk_location_local_directory_not_present(self): sdk_location = 'nosuchdir' with self.assertRaises(RuntimeError) as cm: options = PipelineOptions() - options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location - dependency.stage_job_resources(options) + self.stager.stage_job_resources(options, staging_location=staging_dir) self.assertEqual( 'The file "%s" cannot be found. Its ' 'location was specified by the --sdk_location command-line option.' % -sdk_location, -cm.exception.args[0]) +sdk_location, cm.exception.args[0]) - def test_sdk_location_gcs_source_file(self): + def test_sdk_location_remote_source_file(self): staging_dir = self.make_temp_dir() sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz' options = PipelineOptions() -options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location -with mock.patch('apache_beam.runners.dataflow.internal.' 
-'dependency._dependency_file_copy'): - self.assertEqual( - [names.DATAFLOW_SDK_TARBALL_FILE], - dependency.stage_job_resources(options)) - - def test_sdk_location_gcs_wheel_file(self): +with mock.patch('.'.join([ +self.__module__, TestStager.__name__, TestStager.stage_artifact.__name__ +])): + with mock.patch('.'.join([ + self.__module__, TestStager.__name__, + TestStager._download_file.__name__ + ])): +self.assertEqual([names.DATAFLOW_SDK_TARBALL_FILE], + self.stager.stage_job_resources( + options, staging_location=staging_dir)) + + def test_sdk_location_remote_wheel_file(self): staging_dir = self.make_temp_dir() sdk_filename = 'apache_beam-1.0.0-cp27-cp27mu-manylinux1_x86_64.whl' -sdk_location = 'gs://my-gcs-bucket/' + sdk_filename +sdk_location = '/tmp/remote/my-bucket/' + sdk_filename options = PipelineOptions() -options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location -with mock.patch('apache_beam.runners.dataflow.internal.' -'dependency._dependency_file_copy'): - self.assertEqual( - [sdk_filename], - dependency.stage_job_resources(options)) +# We can not rely on actual remote file systems paths hence making +# '/tmp/remote/' a new remote path. 
+def is_remote_path(dummy_self, path): + return path.startswith('/tmp/remote/') + +with mock.patch('.'.join([ +self.__module__, TestStager.__name__, TestStager.stage_artifact.__name__ +])): + with mock.patch('.'.join([ + self.__module__, TestStager.__name__, + TestStager._download_file.__name__ + ])): +with mock.patch( +'.'.join([ +self.__module__, TestStager.__name__, +TestStager._is_remote_path.__name__ +]), is_remote_path): + self.assertEqual([sdk_filename], + self.stager.stage_job_resources( + options, staging_location=staging_dir)) def test_sdk_location_http(self): staging_dir = self.make_temp_dir() sdk_location = 'http://storage.googleapis.com/my-gcs-bucket/tarball.tar.gz' options = PipelineOptions() -options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location -def file_download(_, to_folder): - tarball_path = os.path.join(to_folder, 'sdk-tarball') - with open(tarball_path, 'w') as f: +def file_download(dummy_self, _, to_path): + with open(to_path, 'w') as f: f.write('Package content.') - return tarball_path + return to_path -with mock.patch('apache_beam.runners.dataflow.internal.' -'dependency._dependency_file_download',
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102724=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102724 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 16/May/18 23:54 Start Date: 16/May/18 23:54 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188805008 ## File path: sdks/python/apache_beam/runners/portability/stager_test.py ## @@ -420,134 +375,167 @@ def test_sdk_location_local_directory_not_present(self): sdk_location = 'nosuchdir' with self.assertRaises(RuntimeError) as cm: options = PipelineOptions() - options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location - dependency.stage_job_resources(options) + self.stager.stage_job_resources(options, staging_location=staging_dir) self.assertEqual( 'The file "%s" cannot be found. Its ' 'location was specified by the --sdk_location command-line option.' % -sdk_location, -cm.exception.args[0]) +sdk_location, cm.exception.args[0]) - def test_sdk_location_gcs_source_file(self): + def test_sdk_location_remote_source_file(self): staging_dir = self.make_temp_dir() sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz' options = PipelineOptions() -options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location -with mock.patch('apache_beam.runners.dataflow.internal.' 
-'dependency._dependency_file_copy'): - self.assertEqual( - [names.DATAFLOW_SDK_TARBALL_FILE], - dependency.stage_job_resources(options)) - - def test_sdk_location_gcs_wheel_file(self): +with mock.patch('.'.join([ +self.__module__, TestStager.__name__, TestStager.stage_artifact.__name__ +])): + with mock.patch('.'.join([ + self.__module__, TestStager.__name__, + TestStager._download_file.__name__ + ])): +self.assertEqual([names.DATAFLOW_SDK_TARBALL_FILE], + self.stager.stage_job_resources( + options, staging_location=staging_dir)) + + def test_sdk_location_remote_wheel_file(self): staging_dir = self.make_temp_dir() sdk_filename = 'apache_beam-1.0.0-cp27-cp27mu-manylinux1_x86_64.whl' -sdk_location = 'gs://my-gcs-bucket/' + sdk_filename +sdk_location = '/tmp/remote/my-bucket/' + sdk_filename options = PipelineOptions() -options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location -with mock.patch('apache_beam.runners.dataflow.internal.' -'dependency._dependency_file_copy'): - self.assertEqual( - [sdk_filename], - dependency.stage_job_resources(options)) +# We can not rely on actual remote file systems paths hence making +# '/tmp/remote/' a new remote path. 
+def is_remote_path(dummy_self, path): + return path.startswith('/tmp/remote/') + +with mock.patch('.'.join([ +self.__module__, TestStager.__name__, TestStager.stage_artifact.__name__ +])): + with mock.patch('.'.join([ + self.__module__, TestStager.__name__, + TestStager._download_file.__name__ + ])): +with mock.patch( +'.'.join([ +self.__module__, TestStager.__name__, +TestStager._is_remote_path.__name__ +]), is_remote_path): + self.assertEqual([sdk_filename], + self.stager.stage_job_resources( + options, staging_location=staging_dir)) def test_sdk_location_http(self): staging_dir = self.make_temp_dir() sdk_location = 'http://storage.googleapis.com/my-gcs-bucket/tarball.tar.gz' options = PipelineOptions() -options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location -def file_download(_, to_folder): - tarball_path = os.path.join(to_folder, 'sdk-tarball') - with open(tarball_path, 'w') as f: +def file_download(dummy_self, _, to_path): + with open(to_path, 'w') as f: f.write('Package content.') - return tarball_path + return to_path -with mock.patch('apache_beam.runners.dataflow.internal.' -'dependency._dependency_file_download',
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102723=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102723 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 16/May/18 23:53 Start Date: 16/May/18 23:53 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188804835 ## File path: sdks/python/apache_beam/runners/portability/stager.py ## @@ -130,19 +99,43 @@ def _download_file(self, from_url, to_path): except Exception: logging.info('Failed to download Artifact from %s', from_url) raise +elif from_url.startswith('gs://') or to_path.startswith('gs://'): + from apache_beam.io.gcp import gcsio + if from_url.startswith('gs://') and to_path.startswith('gs://'): +# Both files are GCS files so copy. +gcsio.GcsIO().copy(from_url, to_path) + elif to_path.startswith('gs://'): Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 102723) Time Spent: 13h 20m (was: 13h 10m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 13h 20m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. 
Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52]. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102722=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102722 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 16/May/18 23:52 Start Date: 16/May/18 23:52 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188804700 ## File path: sdks/python/apache_beam/runners/portability/stager.py ## @@ -78,39 +78,8 @@ class Stager(object): """Stager identifies and copies the appropriate artifacts to the staging location.""" - def _copy_file(self, from_path, to_path): -"""Copies a local file to a GCS file or vice versa.""" -logging.info('file copy from %s to %s.', from_path, to_path) -if from_path.startswith('gs://') or to_path.startswith('gs://'): - from apache_beam.io.gcp import gcsio - if from_path.startswith('gs://') and to_path.startswith('gs://'): -# Both files are GCS files so copy. -gcsio.GcsIO().copy(from_path, to_path) - elif to_path.startswith('gs://'): -# Only target is a GCS file, read local file and upload. -with open(from_path, 'rb') as f: - with gcsio.GcsIO().open(to_path, mode='wb') as g: -pfun = functools.partial(f.read, gcsio.WRITE_CHUNK_SIZE) -for chunk in iter(pfun, ''): - g.write(chunk) - else: -# Source is a GCS file but target is local file. -with gcsio.GcsIO().open(from_path, mode='rb') as g: - with open(to_path, 'wb') as f: -pfun = functools.partial(g.read, gcsio.DEFAULT_READ_BUFFER_SIZE) -for chunk in iter(pfun, ''): - f.write(chunk) -else: - # Branch used only for unit tests and integration tests. - # In such environments GCS support is not available. 
- if not os.path.isdir(os.path.dirname(to_path)): -logging.info( -'Created folder (since we have not done yet, and any errors ' -'will follow): %s ', os.path.dirname(to_path)) -os.mkdir(os.path.dirname(to_path)) - shutil.copyfile(from_path, to_path) - - def _download_file(self, from_url, to_path): + @staticmethod + def _download_file(from_url, to_path): Review comment: As it also support http:// and gs:// I would prefer calling it from_url. Note: from can not be used. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 102722) Time Spent: 13h 10m (was: 13h) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 13h 10m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102721=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102721 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 16/May/18 23:50 Start Date: 16/May/18 23:50 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188804467 ## File path: sdks/python/apache_beam/runners/dataflow/internal/apiclient.py ## @@ -746,17 +745,32 @@ def translate_scalar_counter_float(accumulator, metric_update_proto): metric_update_proto.floatingPoint = accumulator.value -class _ParameterizedStager(Stager): - def __init__(self, stage_artifact): -super(_ParameterizedStager, self).__init__() -self.stage_artifact_method = stage_artifact +class _LegacyDataflowStager(Stager): + # TODO(silviuc): Staged files should have a job specific prefix. Review comment: Removed it This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 102721) Time Spent: 13h (was: 12h 50m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 13h > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. 
> > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52]. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102720=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102720 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 16/May/18 23:50 Start Date: 16/May/18 23:50 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188804353 ## File path: sdks/python/apache_beam/runners/dataflow/internal/apiclient.py ## @@ -746,17 +745,32 @@ def translate_scalar_counter_float(accumulator, metric_update_proto): metric_update_proto.floatingPoint = accumulator.value -class _ParameterizedStager(Stager): - def __init__(self, stage_artifact): -super(_ParameterizedStager, self).__init__() -self.stage_artifact_method = stage_artifact +class _LegacyDataflowStager(Stager): + # TODO(silviuc): Staged files should have a job specific prefix. + # To prevent several jobs in the same project stomping on each other due to a + # shared staging location. + def __init__(self, dataflow_application_client): +super(_LegacyDataflowStager, self).__init__() +self.stage_artifact_method = dataflow_application_client._gcs_file_copy Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 102720) Time Spent: 12h 50m (was: 12h 40m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 12h 50m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102716=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102716 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 16/May/18 23:46 Start Date: 16/May/18 23:46 Worklog Time Spent: 10m Work Description: tvalentyn commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188798067 ## File path: sdks/python/apache_beam/runners/portability/stager_test.py ## @@ -420,134 +375,167 @@ def test_sdk_location_local_directory_not_present(self): sdk_location = 'nosuchdir' with self.assertRaises(RuntimeError) as cm: options = PipelineOptions() - options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location - dependency.stage_job_resources(options) + self.stager.stage_job_resources(options, staging_location=staging_dir) self.assertEqual( 'The file "%s" cannot be found. Its ' 'location was specified by the --sdk_location command-line option.' % -sdk_location, -cm.exception.args[0]) +sdk_location, cm.exception.args[0]) - def test_sdk_location_gcs_source_file(self): + def test_sdk_location_remote_source_file(self): staging_dir = self.make_temp_dir() sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz' options = PipelineOptions() -options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location -with mock.patch('apache_beam.runners.dataflow.internal.' 
-'dependency._dependency_file_copy'): - self.assertEqual( - [names.DATAFLOW_SDK_TARBALL_FILE], - dependency.stage_job_resources(options)) - - def test_sdk_location_gcs_wheel_file(self): +with mock.patch('.'.join([ +self.__module__, TestStager.__name__, TestStager.stage_artifact.__name__ +])): + with mock.patch('.'.join([ + self.__module__, TestStager.__name__, + TestStager._download_file.__name__ + ])): +self.assertEqual([names.DATAFLOW_SDK_TARBALL_FILE], + self.stager.stage_job_resources( + options, staging_location=staging_dir)) + + def test_sdk_location_remote_wheel_file(self): staging_dir = self.make_temp_dir() sdk_filename = 'apache_beam-1.0.0-cp27-cp27mu-manylinux1_x86_64.whl' -sdk_location = 'gs://my-gcs-bucket/' + sdk_filename +sdk_location = '/tmp/remote/my-bucket/' + sdk_filename options = PipelineOptions() -options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location -with mock.patch('apache_beam.runners.dataflow.internal.' -'dependency._dependency_file_copy'): - self.assertEqual( - [sdk_filename], - dependency.stage_job_resources(options)) +# We can not rely on actual remote file systems paths hence making +# '/tmp/remote/' a new remote path. 
+def is_remote_path(dummy_self, path): + return path.startswith('/tmp/remote/') + +with mock.patch('.'.join([ +self.__module__, TestStager.__name__, TestStager.stage_artifact.__name__ +])): + with mock.patch('.'.join([ + self.__module__, TestStager.__name__, + TestStager._download_file.__name__ + ])): +with mock.patch( +'.'.join([ +self.__module__, TestStager.__name__, +TestStager._is_remote_path.__name__ +]), is_remote_path): + self.assertEqual([sdk_filename], + self.stager.stage_job_resources( + options, staging_location=staging_dir)) def test_sdk_location_http(self): staging_dir = self.make_temp_dir() sdk_location = 'http://storage.googleapis.com/my-gcs-bucket/tarball.tar.gz' options = PipelineOptions() -options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location -def file_download(_, to_folder): - tarball_path = os.path.join(to_folder, 'sdk-tarball') - with open(tarball_path, 'w') as f: +def file_download(dummy_self, _, to_path): + with open(to_path, 'w') as f: f.write('Package content.') - return tarball_path + return to_path -with mock.patch('apache_beam.runners.dataflow.internal.' -'dependency._dependency_file_download',
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102719=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102719 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 16/May/18 23:46 Start Date: 16/May/18 23:46 Worklog Time Spent: 10m Work Description: tvalentyn commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188799057 ## File path: sdks/python/apache_beam/runners/portability/stager_test.py ## @@ -420,134 +375,167 @@ def test_sdk_location_local_directory_not_present(self): sdk_location = 'nosuchdir' with self.assertRaises(RuntimeError) as cm: options = PipelineOptions() - options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location - dependency.stage_job_resources(options) + self.stager.stage_job_resources(options, staging_location=staging_dir) self.assertEqual( 'The file "%s" cannot be found. Its ' 'location was specified by the --sdk_location command-line option.' % -sdk_location, -cm.exception.args[0]) +sdk_location, cm.exception.args[0]) - def test_sdk_location_gcs_source_file(self): + def test_sdk_location_remote_source_file(self): staging_dir = self.make_temp_dir() sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz' options = PipelineOptions() -options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location -with mock.patch('apache_beam.runners.dataflow.internal.' 
-'dependency._dependency_file_copy'): - self.assertEqual( - [names.DATAFLOW_SDK_TARBALL_FILE], - dependency.stage_job_resources(options)) - - def test_sdk_location_gcs_wheel_file(self): +with mock.patch('.'.join([ +self.__module__, TestStager.__name__, TestStager.stage_artifact.__name__ +])): + with mock.patch('.'.join([ + self.__module__, TestStager.__name__, + TestStager._download_file.__name__ + ])): +self.assertEqual([names.DATAFLOW_SDK_TARBALL_FILE], + self.stager.stage_job_resources( + options, staging_location=staging_dir)) + + def test_sdk_location_remote_wheel_file(self): staging_dir = self.make_temp_dir() sdk_filename = 'apache_beam-1.0.0-cp27-cp27mu-manylinux1_x86_64.whl' -sdk_location = 'gs://my-gcs-bucket/' + sdk_filename +sdk_location = '/tmp/remote/my-bucket/' + sdk_filename options = PipelineOptions() -options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location -with mock.patch('apache_beam.runners.dataflow.internal.' -'dependency._dependency_file_copy'): - self.assertEqual( - [sdk_filename], - dependency.stage_job_resources(options)) +# We can not rely on actual remote file systems paths hence making +# '/tmp/remote/' a new remote path. 
+def is_remote_path(dummy_self, path): + return path.startswith('/tmp/remote/') + +with mock.patch('.'.join([ +self.__module__, TestStager.__name__, TestStager.stage_artifact.__name__ +])): + with mock.patch('.'.join([ + self.__module__, TestStager.__name__, + TestStager._download_file.__name__ + ])): +with mock.patch( +'.'.join([ +self.__module__, TestStager.__name__, +TestStager._is_remote_path.__name__ +]), is_remote_path): + self.assertEqual([sdk_filename], + self.stager.stage_job_resources( + options, staging_location=staging_dir)) def test_sdk_location_http(self): staging_dir = self.make_temp_dir() sdk_location = 'http://storage.googleapis.com/my-gcs-bucket/tarball.tar.gz' options = PipelineOptions() -options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location -def file_download(_, to_folder): - tarball_path = os.path.join(to_folder, 'sdk-tarball') - with open(tarball_path, 'w') as f: +def file_download(dummy_self, _, to_path): + with open(to_path, 'w') as f: f.write('Package content.') - return tarball_path + return to_path -with mock.patch('apache_beam.runners.dataflow.internal.' -'dependency._dependency_file_download',
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102718=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102718 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 16/May/18 23:46 Start Date: 16/May/18 23:46 Worklog Time Spent: 10m Work Description: tvalentyn commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188802535 ## File path: sdks/python/apache_beam/runners/portability/stager_test.py ## @@ -420,134 +375,167 @@ def test_sdk_location_local_directory_not_present(self): sdk_location = 'nosuchdir' with self.assertRaises(RuntimeError) as cm: options = PipelineOptions() - options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location - dependency.stage_job_resources(options) + self.stager.stage_job_resources(options, staging_location=staging_dir) self.assertEqual( 'The file "%s" cannot be found. Its ' 'location was specified by the --sdk_location command-line option.' % -sdk_location, -cm.exception.args[0]) +sdk_location, cm.exception.args[0]) - def test_sdk_location_gcs_source_file(self): + def test_sdk_location_remote_source_file(self): staging_dir = self.make_temp_dir() sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz' options = PipelineOptions() -options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location -with mock.patch('apache_beam.runners.dataflow.internal.' 
-'dependency._dependency_file_copy'): - self.assertEqual( - [names.DATAFLOW_SDK_TARBALL_FILE], - dependency.stage_job_resources(options)) - - def test_sdk_location_gcs_wheel_file(self): +with mock.patch('.'.join([ +self.__module__, TestStager.__name__, TestStager.stage_artifact.__name__ +])): + with mock.patch('.'.join([ + self.__module__, TestStager.__name__, + TestStager._download_file.__name__ + ])): +self.assertEqual([names.DATAFLOW_SDK_TARBALL_FILE], + self.stager.stage_job_resources( + options, staging_location=staging_dir)) + + def test_sdk_location_remote_wheel_file(self): staging_dir = self.make_temp_dir() sdk_filename = 'apache_beam-1.0.0-cp27-cp27mu-manylinux1_x86_64.whl' -sdk_location = 'gs://my-gcs-bucket/' + sdk_filename +sdk_location = '/tmp/remote/my-bucket/' + sdk_filename options = PipelineOptions() -options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location -with mock.patch('apache_beam.runners.dataflow.internal.' -'dependency._dependency_file_copy'): - self.assertEqual( - [sdk_filename], - dependency.stage_job_resources(options)) +# We can not rely on actual remote file systems paths hence making +# '/tmp/remote/' a new remote path. 
+def is_remote_path(dummy_self, path): + return path.startswith('/tmp/remote/') + +with mock.patch('.'.join([ +self.__module__, TestStager.__name__, TestStager.stage_artifact.__name__ +])): + with mock.patch('.'.join([ + self.__module__, TestStager.__name__, + TestStager._download_file.__name__ + ])): +with mock.patch( +'.'.join([ +self.__module__, TestStager.__name__, +TestStager._is_remote_path.__name__ +]), is_remote_path): + self.assertEqual([sdk_filename], + self.stager.stage_job_resources( + options, staging_location=staging_dir)) def test_sdk_location_http(self): staging_dir = self.make_temp_dir() sdk_location = 'http://storage.googleapis.com/my-gcs-bucket/tarball.tar.gz' options = PipelineOptions() -options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location -def file_download(_, to_folder): - tarball_path = os.path.join(to_folder, 'sdk-tarball') - with open(tarball_path, 'w') as f: +def file_download(dummy_self, _, to_path): Review comment: Would it be simpler to implement desirable default behavior in TestStager()? Also why do we have dummy_self in the picture here? This is an automated message from the
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102714=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102714 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 16/May/18 23:46 Start Date: 16/May/18 23:46 Worklog Time Spent: 10m Work Description: tvalentyn commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188795976 ## File path: sdks/python/apache_beam/runners/dataflow/internal/apiclient.py ## @@ -746,17 +745,32 @@ def translate_scalar_counter_float(accumulator, metric_update_proto): metric_update_proto.floatingPoint = accumulator.value -class _ParameterizedStager(Stager): - def __init__(self, stage_artifact): -super(_ParameterizedStager, self).__init__() -self.stage_artifact_method = stage_artifact +class _LegacyDataflowStager(Stager): + # TODO(silviuc): Staged files should have a job specific prefix. Review comment: Can you please check if this TODO is still relevant and remove it if not? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 102714) Time Spent: 12h 10m (was: 12h) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 12h 10m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. 
Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52]. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102713=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102713 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 16/May/18 23:46 Start Date: 16/May/18 23:46 Worklog Time Spent: 10m Work Description: tvalentyn commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188801882 ## File path: sdks/python/apache_beam/runners/portability/stager_test.py ## @@ -420,134 +375,167 @@ def test_sdk_location_local_directory_not_present(self): sdk_location = 'nosuchdir' with self.assertRaises(RuntimeError) as cm: options = PipelineOptions() - options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location - dependency.stage_job_resources(options) + self.stager.stage_job_resources(options, staging_location=staging_dir) self.assertEqual( 'The file "%s" cannot be found. Its ' 'location was specified by the --sdk_location command-line option.' % -sdk_location, -cm.exception.args[0]) +sdk_location, cm.exception.args[0]) - def test_sdk_location_gcs_source_file(self): + def test_sdk_location_remote_source_file(self): staging_dir = self.make_temp_dir() sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz' options = PipelineOptions() -options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location -with mock.patch('apache_beam.runners.dataflow.internal.' 
-'dependency._dependency_file_copy'): - self.assertEqual( - [names.DATAFLOW_SDK_TARBALL_FILE], - dependency.stage_job_resources(options)) - - def test_sdk_location_gcs_wheel_file(self): +with mock.patch('.'.join([ +self.__module__, TestStager.__name__, TestStager.stage_artifact.__name__ +])): + with mock.patch('.'.join([ + self.__module__, TestStager.__name__, + TestStager._download_file.__name__ + ])): +self.assertEqual([names.DATAFLOW_SDK_TARBALL_FILE], + self.stager.stage_job_resources( + options, staging_location=staging_dir)) + + def test_sdk_location_remote_wheel_file(self): staging_dir = self.make_temp_dir() sdk_filename = 'apache_beam-1.0.0-cp27-cp27mu-manylinux1_x86_64.whl' -sdk_location = 'gs://my-gcs-bucket/' + sdk_filename +sdk_location = '/tmp/remote/my-bucket/' + sdk_filename options = PipelineOptions() -options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location -with mock.patch('apache_beam.runners.dataflow.internal.' -'dependency._dependency_file_copy'): - self.assertEqual( - [sdk_filename], - dependency.stage_job_resources(options)) +# We can not rely on actual remote file systems paths hence making +# '/tmp/remote/' a new remote path. +def is_remote_path(dummy_self, path): Review comment: Are you sure we need to do this? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 102713) Time Spent: 12h 10m (was: 12h) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 12h 10m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102717=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102717 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 16/May/18 23:46 Start Date: 16/May/18 23:46 Worklog Time Spent: 10m Work Description: tvalentyn commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188796273 ## File path: sdks/python/apache_beam/runners/portability/stager.py ## @@ -78,39 +78,8 @@ class Stager(object): """Stager identifies and copies the appropriate artifacts to the staging location.""" - def _copy_file(self, from_path, to_path): -"""Copies a local file to a GCS file or vice versa.""" -logging.info('file copy from %s to %s.', from_path, to_path) -if from_path.startswith('gs://') or to_path.startswith('gs://'): - from apache_beam.io.gcp import gcsio - if from_path.startswith('gs://') and to_path.startswith('gs://'): -# Both files are GCS files so copy. -gcsio.GcsIO().copy(from_path, to_path) - elif to_path.startswith('gs://'): -# Only target is a GCS file, read local file and upload. -with open(from_path, 'rb') as f: - with gcsio.GcsIO().open(to_path, mode='wb') as g: -pfun = functools.partial(f.read, gcsio.WRITE_CHUNK_SIZE) -for chunk in iter(pfun, ''): - g.write(chunk) - else: -# Source is a GCS file but target is local file. -with gcsio.GcsIO().open(from_path, mode='rb') as g: - with open(to_path, 'wb') as f: -pfun = functools.partial(g.read, gcsio.DEFAULT_READ_BUFFER_SIZE) -for chunk in iter(pfun, ''): - f.write(chunk) -else: - # Branch used only for unit tests and integration tests. - # In such environments GCS support is not available. 
- if not os.path.isdir(os.path.dirname(to_path)): -logging.info( -'Created folder (since we have not done yet, and any errors ' -'will follow): %s ', os.path.dirname(to_path)) -os.mkdir(os.path.dirname(to_path)) - shutil.copyfile(from_path, to_path) - - def _download_file(self, from_url, to_path): + @staticmethod + def _download_file(from_url, to_path): Review comment: Consider calling first argument `from_path` or `from`. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 102717) Time Spent: 12.5h (was: 12h 20m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 12.5h > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102715=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102715 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 16/May/18 23:46 Start Date: 16/May/18 23:46 Worklog Time Spent: 10m Work Description: tvalentyn commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188796621 ## File path: sdks/python/apache_beam/runners/portability/stager.py ## @@ -130,19 +99,43 @@ def _download_file(self, from_url, to_path): except Exception: logging.info('Failed to download Artifact from %s', from_url) raise +elif from_url.startswith('gs://') or to_path.startswith('gs://'): + from apache_beam.io.gcp import gcsio + if from_url.startswith('gs://') and to_path.startswith('gs://'): +# Both files are GCS files so copy. +gcsio.GcsIO().copy(from_url, to_path) + elif to_path.startswith('gs://'): Review comment: Echoing my comment from previous interation: we only use this method for downloads into local folder. So the part that uploads to GCS would be dead code, let's remove it. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 102715) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 12h 10m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. 
Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52]. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102712=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102712 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 16/May/18 23:46 Start Date: 16/May/18 23:46 Worklog Time Spent: 10m Work Description: tvalentyn commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188794959 ## File path: sdks/python/apache_beam/runners/dataflow/internal/apiclient.py ## @@ -746,17 +745,32 @@ def translate_scalar_counter_float(accumulator, metric_update_proto): metric_update_proto.floatingPoint = accumulator.value -class _ParameterizedStager(Stager): - def __init__(self, stage_artifact): -super(_ParameterizedStager, self).__init__() -self.stage_artifact_method = stage_artifact +class _LegacyDataflowStager(Stager): + # TODO(silviuc): Staged files should have a job specific prefix. + # To prevent several jobs in the same project stomping on each other due to a + # shared staging location. + def __init__(self, dataflow_application_client): +super(_LegacyDataflowStager, self).__init__() +self.stage_artifact_method = dataflow_application_client._gcs_file_copy Review comment: The composition would be more transparent if we store the reference to the dataflow client in the constructor, and call `self.dataflow_application_client._gcs_file_copy(...)` in stage_artifact. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 102712) Time Spent: 12h (was: 11h 50m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 12h > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102680=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102680 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 16/May/18 21:50 Start Date: 16/May/18 21:50 Worklog Time Spent: 10m Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#issuecomment-389677258 Run Python Dataflow ValidatesRunner This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 102680) Time Spent: 11h 50m (was: 11h 40m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 11h 50m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102669&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102669 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 16/May/18 21:28 Start Date: 16/May/18 21:28 Worklog Time Spent: 10m Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#issuecomment-389671701 Applied the review comments based on our discussion. PTAL This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 102669) Time Spent: 11h 40m (was: 11.5h) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 11h 40m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52]. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102651=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102651 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 16/May/18 21:18 Start Date: 16/May/18 21:18 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188746923 ## File path: sdks/python/apache_beam/runners/portability/stager.py ## @@ -0,0 +1,573 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Support for installing custom code and required dependencies. + +Workflows, with the exception of very simple ones, are organized in multiple +modules and packages. Typically, these modules and packages have +dependencies on other standard libraries. Beam relies on the Python +setuptools package to handle these scenarios. For further details please read: +https://pythonhosted.org/an_example_pypi_project/setuptools.html + +When a runner tries to run a pipeline it will check for a --requirements_file +and a --setup_file option. 
+ +If --setup_file is present then it is assumed that the folder containing the +file specified by the option has the typical layout required by setuptools and +it will run 'python setup.py sdist' to produce a source distribution. The +resulting tarball (a .tar or .tar.gz file) will be staged at the staging +location specified as job option. When a worker starts it will check for the +presence of this file and will run 'easy_install tarball' to install the +package in the worker. + +If --requirements_file is present then the file specified by the option will be +staged in the staging location. When a worker starts it will check for the +presence of this file and will run 'pip install -r requirements.txt'. A +requirements file can be easily generated by running 'pip freeze -r +requirements.txt'. The reason a runner does not run this automatically is +because quite often only a small fraction of the dependencies present in a +requirements.txt file are actually needed for remote execution and therefore a +one-time manual trimming is desirable. + +TODO(silviuc): Should we allow several setup packages? +TODO(silviuc): We should allow customizing the exact command for setup build. +""" +import functools +import glob +import logging +import os +import shutil +import subprocess +import sys +import tempfile + +import pkg_resources + +from apache_beam.internal import pickler +from apache_beam.io.filesystems import FileSystems +from apache_beam.options.pipeline_options import SetupOptions +# TODO(angoenka): Remove reference to dataflow internal names +from apache_beam.runners.dataflow.internal import names +from apache_beam.utils import processes + +# All constants are for internal use only; no backwards-compatibility +# guarantees. + +# Standard file names used for staging files. 
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz' +REQUIREMENTS_FILE = 'requirements.txt' +EXTRA_PACKAGES_FILE = 'extra_packages.txt' + +# Package names for distributions +BEAM_PACKAGE_NAME = 'apache-beam' + + +class Stager(object): + """Stager identifies and copies the appropriate artifacts to the staging + location.""" + + def _copy_file(self, from_path, to_path): +"""Copies a local file to a GCS file or vice versa.""" +logging.info('file copy from %s to %s.', from_path, to_path) +if from_path.startswith('gs://') or to_path.startswith('gs://'): + from apache_beam.io.gcp import gcsio + if from_path.startswith('gs://') and to_path.startswith('gs://'): +# Both files are GCS files so copy. +gcsio.GcsIO().copy(from_path, to_path) + elif to_path.startswith('gs://'): +# Only target is a GCS file, read local file and upload. +with open(from_path, 'rb') as f: + with gcsio.GcsIO().open(to_path, mode='wb') as g: +pfun = functools.partial(f.read, gcsio.WRITE_CHUNK_SIZE) +for chunk in iter(pfun, ''): +
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102653=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102653 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 16/May/18 21:18 Start Date: 16/May/18 21:18 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188729612 ## File path: sdks/python/apache_beam/runners/dataflow/internal/apiclient.py ## @@ -731,6 +746,18 @@ def translate_scalar_counter_float(accumulator, metric_update_proto): metric_update_proto.floatingPoint = accumulator.value +class _ParameterizedStager(Stager): Review comment: Done This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 102653) Time Spent: 10.5h (was: 10h 20m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 10.5h > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] 
-- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102657=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102657 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 16/May/18 21:18 Start Date: 16/May/18 21:18 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188750502 ## File path: sdks/python/apache_beam/runners/portability/stager.py ## @@ -0,0 +1,573 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Support for installing custom code and required dependencies. + +Workflows, with the exception of very simple ones, are organized in multiple +modules and packages. Typically, these modules and packages have +dependencies on other standard libraries. Beam relies on the Python +setuptools package to handle these scenarios. For further details please read: +https://pythonhosted.org/an_example_pypi_project/setuptools.html + +When a runner tries to run a pipeline it will check for a --requirements_file +and a --setup_file option. 
+ +If --setup_file is present then it is assumed that the folder containing the +file specified by the option has the typical layout required by setuptools and +it will run 'python setup.py sdist' to produce a source distribution. The +resulting tarball (a .tar or .tar.gz file) will be staged at the staging +location specified as job option. When a worker starts it will check for the +presence of this file and will run 'easy_install tarball' to install the +package in the worker. + +If --requirements_file is present then the file specified by the option will be +staged in the staging location. When a worker starts it will check for the +presence of this file and will run 'pip install -r requirements.txt'. A +requirements file can be easily generated by running 'pip freeze -r +requirements.txt'. The reason a runner does not run this automatically is +because quite often only a small fraction of the dependencies present in a +requirements.txt file are actually needed for remote execution and therefore a +one-time manual trimming is desirable. + +TODO(silviuc): Should we allow several setup packages? +TODO(silviuc): We should allow customizing the exact command for setup build. +""" +import functools +import glob +import logging +import os +import shutil +import subprocess +import sys +import tempfile + +import pkg_resources + +from apache_beam.internal import pickler +from apache_beam.io.filesystems import FileSystems +from apache_beam.options.pipeline_options import SetupOptions +# TODO(angoenka): Remove reference to dataflow internal names +from apache_beam.runners.dataflow.internal import names +from apache_beam.utils import processes + +# All constants are for internal use only; no backwards-compatibility +# guarantees. + +# Standard file names used for staging files. 
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz' +REQUIREMENTS_FILE = 'requirements.txt' +EXTRA_PACKAGES_FILE = 'extra_packages.txt' + +# Package names for distributions +BEAM_PACKAGE_NAME = 'apache-beam' + + +class Stager(object): + """Stager identifies and copies the appropriate artifacts to the staging + location.""" + + def _copy_file(self, from_path, to_path): +"""Copies a local file to a GCS file or vice versa.""" +logging.info('file copy from %s to %s.', from_path, to_path) +if from_path.startswith('gs://') or to_path.startswith('gs://'): + from apache_beam.io.gcp import gcsio + if from_path.startswith('gs://') and to_path.startswith('gs://'): +# Both files are GCS files so copy. +gcsio.GcsIO().copy(from_path, to_path) + elif to_path.startswith('gs://'): +# Only target is a GCS file, read local file and upload. +with open(from_path, 'rb') as f: + with gcsio.GcsIO().open(to_path, mode='wb') as g: +pfun = functools.partial(f.read, gcsio.WRITE_CHUNK_SIZE) +for chunk in iter(pfun, ''): +
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102658=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102658 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 16/May/18 21:18 Start Date: 16/May/18 21:18 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188746725 ## File path: sdks/python/apache_beam/runners/portability/stager.py ## @@ -0,0 +1,573 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Support for installing custom code and required dependencies. + +Workflows, with the exception of very simple ones, are organized in multiple +modules and packages. Typically, these modules and packages have +dependencies on other standard libraries. Beam relies on the Python +setuptools package to handle these scenarios. For further details please read: +https://pythonhosted.org/an_example_pypi_project/setuptools.html + +When a runner tries to run a pipeline it will check for a --requirements_file +and a --setup_file option. 
+ +If --setup_file is present then it is assumed that the folder containing the +file specified by the option has the typical layout required by setuptools and +it will run 'python setup.py sdist' to produce a source distribution. The +resulting tarball (a .tar or .tar.gz file) will be staged at the staging +location specified as job option. When a worker starts it will check for the +presence of this file and will run 'easy_install tarball' to install the +package in the worker. + +If --requirements_file is present then the file specified by the option will be +staged in the staging location. When a worker starts it will check for the +presence of this file and will run 'pip install -r requirements.txt'. A +requirements file can be easily generated by running 'pip freeze -r +requirements.txt'. The reason a runner does not run this automatically is +because quite often only a small fraction of the dependencies present in a +requirements.txt file are actually needed for remote execution and therefore a +one-time manual trimming is desirable. + +TODO(silviuc): Should we allow several setup packages? +TODO(silviuc): We should allow customizing the exact command for setup build. +""" +import functools +import glob +import logging +import os +import shutil +import subprocess +import sys +import tempfile + +import pkg_resources + +from apache_beam.internal import pickler +from apache_beam.io.filesystems import FileSystems +from apache_beam.options.pipeline_options import SetupOptions +# TODO(angoenka): Remove reference to dataflow internal names +from apache_beam.runners.dataflow.internal import names +from apache_beam.utils import processes + +# All constants are for internal use only; no backwards-compatibility +# guarantees. + +# Standard file names used for staging files. 
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz' +REQUIREMENTS_FILE = 'requirements.txt' +EXTRA_PACKAGES_FILE = 'extra_packages.txt' + +# Package names for distributions +BEAM_PACKAGE_NAME = 'apache-beam' + + +class Stager(object): + """Stager identifies and copies the appropriate artifacts to the staging + location.""" + + def _copy_file(self, from_path, to_path): +"""Copies a local file to a GCS file or vice versa.""" +logging.info('file copy from %s to %s.', from_path, to_path) +if from_path.startswith('gs://') or to_path.startswith('gs://'): + from apache_beam.io.gcp import gcsio + if from_path.startswith('gs://') and to_path.startswith('gs://'): +# Both files are GCS files so copy. +gcsio.GcsIO().copy(from_path, to_path) + elif to_path.startswith('gs://'): +# Only target is a GCS file, read local file and upload. +with open(from_path, 'rb') as f: + with gcsio.GcsIO().open(to_path, mode='wb') as g: +pfun = functools.partial(f.read, gcsio.WRITE_CHUNK_SIZE) +for chunk in iter(pfun, ''): +
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102656=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102656 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 16/May/18 21:18 Start Date: 16/May/18 21:18 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188743575 ## File path: sdks/python/apache_beam/runners/dataflow/internal/dependency.py ## @@ -607,65 +151,9 @@ def get_sdk_name_and_version(): def get_sdk_package_name(): Review comment: Making this a module function of apiclient This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 102656) Time Spent: 11h (was: 10h 50m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 11h > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102655=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102655 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 16/May/18 21:18 Start Date: 16/May/18 21:18 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188746649 ## File path: sdks/python/apache_beam/runners/portability/stager.py ## @@ -0,0 +1,573 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Support for installing custom code and required dependencies. + +Workflows, with the exception of very simple ones, are organized in multiple +modules and packages. Typically, these modules and packages have +dependencies on other standard libraries. Beam relies on the Python +setuptools package to handle these scenarios. For further details please read: +https://pythonhosted.org/an_example_pypi_project/setuptools.html + +When a runner tries to run a pipeline it will check for a --requirements_file +and a --setup_file option. 
+ +If --setup_file is present then it is assumed that the folder containing the +file specified by the option has the typical layout required by setuptools and +it will run 'python setup.py sdist' to produce a source distribution. The +resulting tarball (a .tar or .tar.gz file) will be staged at the staging +location specified as job option. When a worker starts it will check for the +presence of this file and will run 'easy_install tarball' to install the +package in the worker. + +If --requirements_file is present then the file specified by the option will be +staged in the staging location. When a worker starts it will check for the +presence of this file and will run 'pip install -r requirements.txt'. A +requirements file can be easily generated by running 'pip freeze -r +requirements.txt'. The reason a runner does not run this automatically is +because quite often only a small fraction of the dependencies present in a +requirements.txt file are actually needed for remote execution and therefore a +one-time manual trimming is desirable. + +TODO(silviuc): Should we allow several setup packages? +TODO(silviuc): We should allow customizing the exact command for setup build. +""" +import functools +import glob +import logging +import os +import shutil +import subprocess +import sys +import tempfile + +import pkg_resources + +from apache_beam.internal import pickler +from apache_beam.io.filesystems import FileSystems +from apache_beam.options.pipeline_options import SetupOptions +# TODO(angoenka): Remove reference to dataflow internal names +from apache_beam.runners.dataflow.internal import names +from apache_beam.utils import processes + +# All constants are for internal use only; no backwards-compatibility +# guarantees. + +# Standard file names used for staging files. 
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz' +REQUIREMENTS_FILE = 'requirements.txt' +EXTRA_PACKAGES_FILE = 'extra_packages.txt' + +# Package names for distributions +BEAM_PACKAGE_NAME = 'apache-beam' + + +class Stager(object): + """Stager identifies and copies the appropriate artifacts to the staging + location.""" + + def _copy_file(self, from_path, to_path): +"""Copies a local file to a GCS file or vice versa.""" +logging.info('file copy from %s to %s.', from_path, to_path) +if from_path.startswith('gs://') or to_path.startswith('gs://'): + from apache_beam.io.gcp import gcsio + if from_path.startswith('gs://') and to_path.startswith('gs://'): +# Both files are GCS files so copy. +gcsio.GcsIO().copy(from_path, to_path) + elif to_path.startswith('gs://'): +# Only target is a GCS file, read local file and upload. +with open(from_path, 'rb') as f: + with gcsio.GcsIO().open(to_path, mode='wb') as g: +pfun = functools.partial(f.read, gcsio.WRITE_CHUNK_SIZE) +for chunk in iter(pfun, ''): +
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102659=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102659 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 16/May/18 21:18 Start Date: 16/May/18 21:18 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188775399 ## File path: sdks/python/apache_beam/runners/portability/stager.py ## @@ -0,0 +1,573 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Support for installing custom code and required dependencies. + +Workflows, with the exception of very simple ones, are organized in multiple +modules and packages. Typically, these modules and packages have +dependencies on other standard libraries. Beam relies on the Python +setuptools package to handle these scenarios. For further details please read: +https://pythonhosted.org/an_example_pypi_project/setuptools.html + +When a runner tries to run a pipeline it will check for a --requirements_file +and a --setup_file option. 
+ +If --setup_file is present then it is assumed that the folder containing the +file specified by the option has the typical layout required by setuptools and +it will run 'python setup.py sdist' to produce a source distribution. The +resulting tarball (a .tar or .tar.gz file) will be staged at the staging +location specified as job option. When a worker starts it will check for the +presence of this file and will run 'easy_install tarball' to install the +package in the worker. + +If --requirements_file is present then the file specified by the option will be +staged in the staging location. When a worker starts it will check for the +presence of this file and will run 'pip install -r requirements.txt'. A +requirements file can be easily generated by running 'pip freeze -r +requirements.txt'. The reason a runner does not run this automatically is +because quite often only a small fraction of the dependencies present in a +requirements.txt file are actually needed for remote execution and therefore a +one-time manual trimming is desirable. + +TODO(silviuc): Should we allow several setup packages? +TODO(silviuc): We should allow customizing the exact command for setup build. +""" +import functools +import glob +import logging +import os +import shutil +import subprocess +import sys +import tempfile + +import pkg_resources + +from apache_beam.internal import pickler +from apache_beam.io.filesystems import FileSystems +from apache_beam.options.pipeline_options import SetupOptions +# TODO(angoenka): Remove reference to dataflow internal names +from apache_beam.runners.dataflow.internal import names +from apache_beam.utils import processes + +# All constants are for internal use only; no backwards-compatibility +# guarantees. + +# Standard file names used for staging files. 
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz' +REQUIREMENTS_FILE = 'requirements.txt' +EXTRA_PACKAGES_FILE = 'extra_packages.txt' + +# Package names for distributions +BEAM_PACKAGE_NAME = 'apache-beam' + + +class Stager(object): + """Stager identifies and copies the appropriate artifacts to the staging + location.""" + + def _copy_file(self, from_path, to_path): +"""Copies a local file to a GCS file or vice versa.""" Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 102659) Time Spent: 11.5h (was: 11h 20m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 >
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102652=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102652 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 16/May/18 21:18 Start Date: 16/May/18 21:18 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188746406 ## File path: sdks/python/apache_beam/runners/portability/stager.py ## @@ -0,0 +1,573 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Support for installing custom code and required dependencies. + +Workflows, with the exception of very simple ones, are organized in multiple +modules and packages. Typically, these modules and packages have +dependencies on other standard libraries. Beam relies on the Python +setuptools package to handle these scenarios. For further details please read: +https://pythonhosted.org/an_example_pypi_project/setuptools.html + +When a runner tries to run a pipeline it will check for a --requirements_file +and a --setup_file option. 
+ +If --setup_file is present then it is assumed that the folder containing the +file specified by the option has the typical layout required by setuptools and +it will run 'python setup.py sdist' to produce a source distribution. The +resulting tarball (a .tar or .tar.gz file) will be staged at the staging +location specified as job option. When a worker starts it will check for the +presence of this file and will run 'easy_install tarball' to install the +package in the worker. + +If --requirements_file is present then the file specified by the option will be +staged in the staging location. When a worker starts it will check for the +presence of this file and will run 'pip install -r requirements.txt'. A +requirements file can be easily generated by running 'pip freeze -r +requirements.txt'. The reason a runner does not run this automatically is +because quite often only a small fraction of the dependencies present in a +requirements.txt file are actually needed for remote execution and therefore a +one-time manual trimming is desirable. + +TODO(silviuc): Should we allow several setup packages? +TODO(silviuc): We should allow customizing the exact command for setup build. +""" +import functools +import glob +import logging +import os +import shutil +import subprocess +import sys +import tempfile + +import pkg_resources + +from apache_beam.internal import pickler +from apache_beam.io.filesystems import FileSystems +from apache_beam.options.pipeline_options import SetupOptions +# TODO(angoenka): Remove reference to dataflow internal names +from apache_beam.runners.dataflow.internal import names +from apache_beam.utils import processes + +# All constants are for internal use only; no backwards-compatibility +# guarantees. + +# Standard file names used for staging files. 
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz' +REQUIREMENTS_FILE = 'requirements.txt' +EXTRA_PACKAGES_FILE = 'extra_packages.txt' + +# Package names for distributions +BEAM_PACKAGE_NAME = 'apache-beam' + + +class Stager(object): + """Stager identifies and copies the appropriate artifacts to the staging + location.""" + + def _copy_file(self, from_path, to_path): +"""Copies a local file to a GCS file or vice versa.""" +logging.info('file copy from %s to %s.', from_path, to_path) +if from_path.startswith('gs://') or to_path.startswith('gs://'): + from apache_beam.io.gcp import gcsio + if from_path.startswith('gs://') and to_path.startswith('gs://'): +# Both files are GCS files so copy. +gcsio.GcsIO().copy(from_path, to_path) + elif to_path.startswith('gs://'): +# Only target is a GCS file, read local file and upload. +with open(from_path, 'rb') as f: + with gcsio.GcsIO().open(to_path, mode='wb') as g: +pfun = functools.partial(f.read, gcsio.WRITE_CHUNK_SIZE) +for chunk in iter(pfun, ''): +
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102654=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102654 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 16/May/18 21:18 Start Date: 16/May/18 21:18 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188746609 ## File path: sdks/python/apache_beam/runners/portability/stager.py ## @@ -0,0 +1,573 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Support for installing custom code and required dependencies. + +Workflows, with the exception of very simple ones, are organized in multiple +modules and packages. Typically, these modules and packages have +dependencies on other standard libraries. Beam relies on the Python +setuptools package to handle these scenarios. For further details please read: +https://pythonhosted.org/an_example_pypi_project/setuptools.html + +When a runner tries to run a pipeline it will check for a --requirements_file +and a --setup_file option. 
+ +If --setup_file is present then it is assumed that the folder containing the +file specified by the option has the typical layout required by setuptools and +it will run 'python setup.py sdist' to produce a source distribution. The +resulting tarball (a .tar or .tar.gz file) will be staged at the staging +location specified as job option. When a worker starts it will check for the +presence of this file and will run 'easy_install tarball' to install the +package in the worker. + +If --requirements_file is present then the file specified by the option will be +staged in the staging location. When a worker starts it will check for the +presence of this file and will run 'pip install -r requirements.txt'. A +requirements file can be easily generated by running 'pip freeze -r +requirements.txt'. The reason a runner does not run this automatically is +because quite often only a small fraction of the dependencies present in a +requirements.txt file are actually needed for remote execution and therefore a +one-time manual trimming is desirable. + +TODO(silviuc): Should we allow several setup packages? +TODO(silviuc): We should allow customizing the exact command for setup build. +""" +import functools +import glob +import logging +import os +import shutil +import subprocess +import sys +import tempfile + +import pkg_resources + +from apache_beam.internal import pickler +from apache_beam.io.filesystems import FileSystems +from apache_beam.options.pipeline_options import SetupOptions +# TODO(angoenka): Remove reference to dataflow internal names +from apache_beam.runners.dataflow.internal import names +from apache_beam.utils import processes + +# All constants are for internal use only; no backwards-compatibility +# guarantees. + +# Standard file names used for staging files. 
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz' +REQUIREMENTS_FILE = 'requirements.txt' +EXTRA_PACKAGES_FILE = 'extra_packages.txt' + +# Package names for distributions +BEAM_PACKAGE_NAME = 'apache-beam' + + +class Stager(object): + """Stager identifies and copies the appropriate artifacts to the staging + location.""" + + def _copy_file(self, from_path, to_path): +"""Copies a local file to a GCS file or vice versa.""" +logging.info('file copy from %s to %s.', from_path, to_path) +if from_path.startswith('gs://') or to_path.startswith('gs://'): + from apache_beam.io.gcp import gcsio + if from_path.startswith('gs://') and to_path.startswith('gs://'): +# Both files are GCS files so copy. +gcsio.GcsIO().copy(from_path, to_path) + elif to_path.startswith('gs://'): +# Only target is a GCS file, read local file and upload. +with open(from_path, 'rb') as f: + with gcsio.GcsIO().open(to_path, mode='wb') as g: +pfun = functools.partial(f.read, gcsio.WRITE_CHUNK_SIZE) +for chunk in iter(pfun, ''): +
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102592=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102592 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 16/May/18 18:34 Start Date: 16/May/18 18:34 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188729509 ## File path: sdks/python/apache_beam/runners/dataflow/internal/apiclient.py ## @@ -731,6 +746,18 @@ def translate_scalar_counter_float(accumulator, metric_update_proto): metric_update_proto.floatingPoint = accumulator.value +class _ParameterizedStager(Stager): Review comment: Done This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 102592) Time Spent: 10h (was: 9h 50m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 10h > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] 
-- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102593=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102593 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 16/May/18 18:34 Start Date: 16/May/18 18:34 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188729509 ## File path: sdks/python/apache_beam/runners/dataflow/internal/apiclient.py ## @@ -731,6 +746,18 @@ def translate_scalar_counter_float(accumulator, metric_update_proto): metric_update_proto.floatingPoint = accumulator.value +class _ParameterizedStager(Stager): Review comment: Done This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 102593) Time Spent: 10h 10m (was: 10h) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 10h 10m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] 
-- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102342=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102342 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 15/May/18 23:07 Start Date: 15/May/18 23:07 Worklog Time Spent: 10m Work Description: tvalentyn commented on issue #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#issuecomment-389342407 In my opinion, will be easier to follow and maintain the helper functions if we explicitly remove the access to the state of the objects (`self`) when it is not needed. I would argue, we should at least make them `@staticmethods`. From that point on, the difference between a helper function in the module or a private static method that does not access the class, is rather syntactic. I think it's a little more overhead to have them as static methods, but I don't mind that if you feel strongly about associating the helpers with the `Stager` class. You could also declare helper functions as inner functions of class methods that need them, if they are only used in that particular method. I would consider that for short helper functions. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 102342) Time Spent: 9h 50m (was: 9h 40m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 9h 50m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102299=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102299 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 15/May/18 22:01 Start Date: 15/May/18 22:01 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188450008 ## File path: sdks/python/apache_beam/runners/portability/stager.py ## @@ -0,0 +1,573 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Support for installing custom code and required dependencies. + +Workflows, with the exception of very simple ones, are organized in multiple +modules and packages. Typically, these modules and packages have +dependencies on other standard libraries. Beam relies on the Python +setuptools package to handle these scenarios. For further details please read: +https://pythonhosted.org/an_example_pypi_project/setuptools.html + +When a runner tries to run a pipeline it will check for a --requirements_file +and a --setup_file option. 
+ +If --setup_file is present then it is assumed that the folder containing the +file specified by the option has the typical layout required by setuptools and +it will run 'python setup.py sdist' to produce a source distribution. The +resulting tarball (a .tar or .tar.gz file) will be staged at the staging +location specified as job option. When a worker starts it will check for the +presence of this file and will run 'easy_install tarball' to install the +package in the worker. + +If --requirements_file is present then the file specified by the option will be +staged in the staging location. When a worker starts it will check for the +presence of this file and will run 'pip install -r requirements.txt'. A +requirements file can be easily generated by running 'pip freeze -r +requirements.txt'. The reason a runner does not run this automatically is +because quite often only a small fraction of the dependencies present in a +requirements.txt file are actually needed for remote execution and therefore a +one-time manual trimming is desirable. + +TODO(silviuc): Should we allow several setup packages? +TODO(silviuc): We should allow customizing the exact command for setup build. +""" +import functools +import glob +import logging +import os +import shutil +import subprocess +import sys +import tempfile + +import pkg_resources + +from apache_beam.internal import pickler +from apache_beam.io.filesystems import FileSystems +from apache_beam.options.pipeline_options import SetupOptions +# TODO(angoenka): Remove reference to dataflow internal names +from apache_beam.runners.dataflow.internal import names +from apache_beam.utils import processes + +# All constants are for internal use only; no backwards-compatibility +# guarantees. + +# Standard file names used for staging files. 
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz' +REQUIREMENTS_FILE = 'requirements.txt' +EXTRA_PACKAGES_FILE = 'extra_packages.txt' + +# Package names for distributions +BEAM_PACKAGE_NAME = 'apache-beam' + + +class Stager(object): + """Stager identifies and copies the appropriate artifacts to the staging + location.""" + + def _copy_file(self, from_path, to_path): +"""Copies a local file to a GCS file or vice versa.""" +logging.info('file copy from %s to %s.', from_path, to_path) +if from_path.startswith('gs://') or to_path.startswith('gs://'): + from apache_beam.io.gcp import gcsio + if from_path.startswith('gs://') and to_path.startswith('gs://'): +# Both files are GCS files so copy. +gcsio.GcsIO().copy(from_path, to_path) + elif to_path.startswith('gs://'): +# Only target is a GCS file, read local file and upload. +with open(from_path, 'rb') as f: + with gcsio.GcsIO().open(to_path, mode='wb') as g: +pfun = functools.partial(f.read, gcsio.WRITE_CHUNK_SIZE) +for chunk in iter(pfun, ''): +
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102301=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102301 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 15/May/18 22:01 Start Date: 15/May/18 22:01 Worklog Time Spent: 10m Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#issuecomment-389327626 I would like to keep all the functionality of Stager within the class instead of fragmenting it between Stager class and module. As none of these functions are reusable and are not intended to be reused in anyway, splitting things in module and class will only make code harder to follow and maintain. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 102301) Time Spent: 9h 40m (was: 9.5h) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 9h 40m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] 
-- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=101959=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-101959 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 15/May/18 01:11 Start Date: 15/May/18 01:11 Worklog Time Spent: 10m Work Description: tvalentyn commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188141417 ## File path: sdks/python/apache_beam/runners/portability/stager.py ## @@ -0,0 +1,573 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Support for installing custom code and required dependencies. + +Workflows, with the exception of very simple ones, are organized in multiple +modules and packages. Typically, these modules and packages have +dependencies on other standard libraries. Beam relies on the Python +setuptools package to handle these scenarios. For further details please read: +https://pythonhosted.org/an_example_pypi_project/setuptools.html + +When a runner tries to run a pipeline it will check for a --requirements_file +and a --setup_file option. 
+ +If --setup_file is present then it is assumed that the folder containing the +file specified by the option has the typical layout required by setuptools and +it will run 'python setup.py sdist' to produce a source distribution. The +resulting tarball (a .tar or .tar.gz file) will be staged at the staging +location specified as job option. When a worker starts it will check for the +presence of this file and will run 'easy_install tarball' to install the +package in the worker. + +If --requirements_file is present then the file specified by the option will be +staged in the staging location. When a worker starts it will check for the +presence of this file and will run 'pip install -r requirements.txt'. A +requirements file can be easily generated by running 'pip freeze -r +requirements.txt'. The reason a runner does not run this automatically is +because quite often only a small fraction of the dependencies present in a +requirements.txt file are actually needed for remote execution and therefore a +one-time manual trimming is desirable. + +TODO(silviuc): Should we allow several setup packages? +TODO(silviuc): We should allow customizing the exact command for setup build. +""" +import functools +import glob +import logging +import os +import shutil +import subprocess +import sys +import tempfile + +import pkg_resources + +from apache_beam.internal import pickler +from apache_beam.io.filesystems import FileSystems +from apache_beam.options.pipeline_options import SetupOptions +# TODO(angoenka): Remove reference to dataflow internal names +from apache_beam.runners.dataflow.internal import names +from apache_beam.utils import processes + +# All constants are for internal use only; no backwards-compatibility +# guarantees. + +# Standard file names used for staging files. 
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz' +REQUIREMENTS_FILE = 'requirements.txt' +EXTRA_PACKAGES_FILE = 'extra_packages.txt' + +# Package names for distributions +BEAM_PACKAGE_NAME = 'apache-beam' + + +class Stager(object): + """Stager identifies and copies the appropriate artifacts to the staging + location.""" + + def _copy_file(self, from_path, to_path): +"""Copies a local file to a GCS file or vice versa.""" Review comment: This method will only be used to download files from GCS to local folder, the part that handles uploads to GCS would be a dead code, let's remove it. Uploads will be handled by stage_artifact. I think it would be easier to reason about the code if we have `_download_file` method and GCS / HTTP helper methods as follows: ``` def _download_file(from_path, to_path): if from_path starts.with('gcs://'): _download_from_gcs(from_path, to_path) ... else if from_path starts.with('http'): _download_from_http(from_path, to_path) ... else: # used in tests ... ``` Also since
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=101967=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-101967 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 15/May/18 01:11 Start Date: 15/May/18 01:11 Worklog Time Spent: 10m Work Description: tvalentyn commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188141414 ## File path: sdks/python/apache_beam/runners/portability/stager.py ## @@ -0,0 +1,573 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Support for installing custom code and required dependencies. + +Workflows, with the exception of very simple ones, are organized in multiple +modules and packages. Typically, these modules and packages have +dependencies on other standard libraries. Beam relies on the Python +setuptools package to handle these scenarios. For further details please read: +https://pythonhosted.org/an_example_pypi_project/setuptools.html + +When a runner tries to run a pipeline it will check for a --requirements_file +and a --setup_file option. 
+ +If --setup_file is present then it is assumed that the folder containing the +file specified by the option has the typical layout required by setuptools and +it will run 'python setup.py sdist' to produce a source distribution. The +resulting tarball (a .tar or .tar.gz file) will be staged at the staging +location specified as job option. When a worker starts it will check for the +presence of this file and will run 'easy_install tarball' to install the +package in the worker. + +If --requirements_file is present then the file specified by the option will be +staged in the staging location. When a worker starts it will check for the +presence of this file and will run 'pip install -r requirements.txt'. A +requirements file can be easily generated by running 'pip freeze -r +requirements.txt'. The reason a runner does not run this automatically is +because quite often only a small fraction of the dependencies present in a +requirements.txt file are actually needed for remote execution and therefore a +one-time manual trimming is desirable. + +TODO(silviuc): Should we allow several setup packages? +TODO(silviuc): We should allow customizing the exact command for setup build. +""" +import functools +import glob +import logging +import os +import shutil +import subprocess +import sys +import tempfile + +import pkg_resources + +from apache_beam.internal import pickler +from apache_beam.io.filesystems import FileSystems +from apache_beam.options.pipeline_options import SetupOptions +# TODO(angoenka): Remove reference to dataflow internal names +from apache_beam.runners.dataflow.internal import names +from apache_beam.utils import processes + +# All constants are for internal use only; no backwards-compatibility +# guarantees. + +# Standard file names used for staging files. 
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz' +REQUIREMENTS_FILE = 'requirements.txt' +EXTRA_PACKAGES_FILE = 'extra_packages.txt' + +# Package names for distributions +BEAM_PACKAGE_NAME = 'apache-beam' + + +class Stager(object): + """Stager identifies and copies the appropriate artifacts to the staging + location.""" + + def _copy_file(self, from_path, to_path): +"""Copies a local file to a GCS file or vice versa.""" +logging.info('file copy from %s to %s.', from_path, to_path) +if from_path.startswith('gs://') or to_path.startswith('gs://'): + from apache_beam.io.gcp import gcsio + if from_path.startswith('gs://') and to_path.startswith('gs://'): +# Both files are GCS files so copy. +gcsio.GcsIO().copy(from_path, to_path) + elif to_path.startswith('gs://'): +# Only target is a GCS file, read local file and upload. +with open(from_path, 'rb') as f: + with gcsio.GcsIO().open(to_path, mode='wb') as g: +pfun = functools.partial(f.read, gcsio.WRITE_CHUNK_SIZE) +for chunk in iter(pfun, ''): +
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=101960=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-101960 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 15/May/18 01:11 Start Date: 15/May/18 01:11 Worklog Time Spent: 10m Work Description: tvalentyn commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188141423 ## File path: sdks/python/apache_beam/runners/portability/stager.py ## @@ -0,0 +1,573 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Support for installing custom code and required dependencies. + +Workflows, with the exception of very simple ones, are organized in multiple +modules and packages. Typically, these modules and packages have +dependencies on other standard libraries. Beam relies on the Python +setuptools package to handle these scenarios. For further details please read: +https://pythonhosted.org/an_example_pypi_project/setuptools.html + +When a runner tries to run a pipeline it will check for a --requirements_file +and a --setup_file option. 
+ +If --setup_file is present then it is assumed that the folder containing the +file specified by the option has the typical layout required by setuptools and +it will run 'python setup.py sdist' to produce a source distribution. The +resulting tarball (a .tar or .tar.gz file) will be staged at the staging +location specified as job option. When a worker starts it will check for the +presence of this file and will run 'easy_install tarball' to install the +package in the worker. + +If --requirements_file is present then the file specified by the option will be +staged in the staging location. When a worker starts it will check for the +presence of this file and will run 'pip install -r requirements.txt'. A +requirements file can be easily generated by running 'pip freeze -r +requirements.txt'. The reason a runner does not run this automatically is +because quite often only a small fraction of the dependencies present in a +requirements.txt file are actually needed for remote execution and therefore a +one-time manual trimming is desirable. + +TODO(silviuc): Should we allow several setup packages? +TODO(silviuc): We should allow customizing the exact command for setup build. +""" +import functools +import glob +import logging +import os +import shutil +import subprocess +import sys +import tempfile + +import pkg_resources + +from apache_beam.internal import pickler +from apache_beam.io.filesystems import FileSystems +from apache_beam.options.pipeline_options import SetupOptions +# TODO(angoenka): Remove reference to dataflow internal names +from apache_beam.runners.dataflow.internal import names +from apache_beam.utils import processes + +# All constants are for internal use only; no backwards-compatibility +# guarantees. + +# Standard file names used for staging files. 
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz' +REQUIREMENTS_FILE = 'requirements.txt' +EXTRA_PACKAGES_FILE = 'extra_packages.txt' + +# Package names for distributions +BEAM_PACKAGE_NAME = 'apache-beam' + + +class Stager(object): + """Stager identifies and copies the appropriate artifacts to the staging + location.""" + + def _copy_file(self, from_path, to_path): +"""Copies a local file to a GCS file or vice versa.""" +logging.info('file copy from %s to %s.', from_path, to_path) +if from_path.startswith('gs://') or to_path.startswith('gs://'): + from apache_beam.io.gcp import gcsio + if from_path.startswith('gs://') and to_path.startswith('gs://'): +# Both files are GCS files so copy. +gcsio.GcsIO().copy(from_path, to_path) + elif to_path.startswith('gs://'): +# Only target is a GCS file, read local file and upload. +with open(from_path, 'rb') as f: + with gcsio.GcsIO().open(to_path, mode='wb') as g: +pfun = functools.partial(f.read, gcsio.WRITE_CHUNK_SIZE) +for chunk in iter(pfun, ''): +
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=101966&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-101966 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 15/May/18 01:11 Start Date: 15/May/18 01:11 Worklog Time Spent: 10m Work Description: tvalentyn commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188141429 ## File path: sdks/python/apache_beam/runners/dataflow/internal/dependency.py ## @@ -607,65 +151,9 @@ def get_sdk_name_and_version(): def get_sdk_package_name(): Review comment: Let's implement this in LegacyDataflowStager (see another comment). This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 101966) Time Spent: 9h 10m (was: 9h) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 9h 10m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=101963=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-101963 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 15/May/18 01:11 Start Date: 15/May/18 01:11 Worklog Time Spent: 10m Work Description: tvalentyn commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188141413 ## File path: sdks/python/apache_beam/runners/portability/stager.py ## @@ -0,0 +1,573 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Support for installing custom code and required dependencies. + +Workflows, with the exception of very simple ones, are organized in multiple +modules and packages. Typically, these modules and packages have +dependencies on other standard libraries. Beam relies on the Python +setuptools package to handle these scenarios. For further details please read: +https://pythonhosted.org/an_example_pypi_project/setuptools.html + +When a runner tries to run a pipeline it will check for a --requirements_file +and a --setup_file option. 
+ +If --setup_file is present then it is assumed that the folder containing the +file specified by the option has the typical layout required by setuptools and +it will run 'python setup.py sdist' to produce a source distribution. The +resulting tarball (a .tar or .tar.gz file) will be staged at the staging +location specified as job option. When a worker starts it will check for the +presence of this file and will run 'easy_install tarball' to install the +package in the worker. + +If --requirements_file is present then the file specified by the option will be +staged in the staging location. When a worker starts it will check for the +presence of this file and will run 'pip install -r requirements.txt'. A +requirements file can be easily generated by running 'pip freeze -r +requirements.txt'. The reason a runner does not run this automatically is +because quite often only a small fraction of the dependencies present in a +requirements.txt file are actually needed for remote execution and therefore a +one-time manual trimming is desirable. + +TODO(silviuc): Should we allow several setup packages? +TODO(silviuc): We should allow customizing the exact command for setup build. +""" +import functools +import glob +import logging +import os +import shutil +import subprocess +import sys +import tempfile + +import pkg_resources + +from apache_beam.internal import pickler +from apache_beam.io.filesystems import FileSystems +from apache_beam.options.pipeline_options import SetupOptions +# TODO(angoenka): Remove reference to dataflow internal names +from apache_beam.runners.dataflow.internal import names +from apache_beam.utils import processes + +# All constants are for internal use only; no backwards-compatibility +# guarantees. + +# Standard file names used for staging files. 
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz' +REQUIREMENTS_FILE = 'requirements.txt' +EXTRA_PACKAGES_FILE = 'extra_packages.txt' + +# Package names for distributions +BEAM_PACKAGE_NAME = 'apache-beam' + + +class Stager(object): + """Stager identifies and copies the appropriate artifacts to the staging + location.""" + + def _copy_file(self, from_path, to_path): +"""Copies a local file to a GCS file or vice versa.""" +logging.info('file copy from %s to %s.', from_path, to_path) +if from_path.startswith('gs://') or to_path.startswith('gs://'): + from apache_beam.io.gcp import gcsio + if from_path.startswith('gs://') and to_path.startswith('gs://'): +# Both files are GCS files so copy. +gcsio.GcsIO().copy(from_path, to_path) + elif to_path.startswith('gs://'): +# Only target is a GCS file, read local file and upload. +with open(from_path, 'rb') as f: + with gcsio.GcsIO().open(to_path, mode='wb') as g: +pfun = functools.partial(f.read, gcsio.WRITE_CHUNK_SIZE) +for chunk in iter(pfun, ''): +
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=101961=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-101961 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 15/May/18 01:11 Start Date: 15/May/18 01:11 Worklog Time Spent: 10m Work Description: tvalentyn commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188141416 ## File path: sdks/python/apache_beam/runners/portability/stager.py ## @@ -0,0 +1,573 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Support for installing custom code and required dependencies. + +Workflows, with the exception of very simple ones, are organized in multiple +modules and packages. Typically, these modules and packages have +dependencies on other standard libraries. Beam relies on the Python +setuptools package to handle these scenarios. For further details please read: +https://pythonhosted.org/an_example_pypi_project/setuptools.html + +When a runner tries to run a pipeline it will check for a --requirements_file +and a --setup_file option. 
+ +If --setup_file is present then it is assumed that the folder containing the +file specified by the option has the typical layout required by setuptools and +it will run 'python setup.py sdist' to produce a source distribution. The +resulting tarball (a .tar or .tar.gz file) will be staged at the staging +location specified as job option. When a worker starts it will check for the +presence of this file and will run 'easy_install tarball' to install the +package in the worker. + +If --requirements_file is present then the file specified by the option will be +staged in the staging location. When a worker starts it will check for the +presence of this file and will run 'pip install -r requirements.txt'. A +requirements file can be easily generated by running 'pip freeze -r +requirements.txt'. The reason a runner does not run this automatically is +because quite often only a small fraction of the dependencies present in a +requirements.txt file are actually needed for remote execution and therefore a +one-time manual trimming is desirable. + +TODO(silviuc): Should we allow several setup packages? +TODO(silviuc): We should allow customizing the exact command for setup build. +""" +import functools +import glob +import logging +import os +import shutil +import subprocess +import sys +import tempfile + +import pkg_resources + +from apache_beam.internal import pickler +from apache_beam.io.filesystems import FileSystems +from apache_beam.options.pipeline_options import SetupOptions +# TODO(angoenka): Remove reference to dataflow internal names +from apache_beam.runners.dataflow.internal import names +from apache_beam.utils import processes + +# All constants are for internal use only; no backwards-compatibility +# guarantees. + +# Standard file names used for staging files. 
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz' +REQUIREMENTS_FILE = 'requirements.txt' +EXTRA_PACKAGES_FILE = 'extra_packages.txt' + +# Package names for distributions +BEAM_PACKAGE_NAME = 'apache-beam' + + +class Stager(object): + """Stager identifies and copies the appropriate artifacts to the staging + location.""" + + def _copy_file(self, from_path, to_path): +"""Copies a local file to a GCS file or vice versa.""" +logging.info('file copy from %s to %s.', from_path, to_path) +if from_path.startswith('gs://') or to_path.startswith('gs://'): + from apache_beam.io.gcp import gcsio + if from_path.startswith('gs://') and to_path.startswith('gs://'): +# Both files are GCS files so copy. +gcsio.GcsIO().copy(from_path, to_path) + elif to_path.startswith('gs://'): +# Only target is a GCS file, read local file and upload. +with open(from_path, 'rb') as f: + with gcsio.GcsIO().open(to_path, mode='wb') as g: +pfun = functools.partial(f.read, gcsio.WRITE_CHUNK_SIZE) +for chunk in iter(pfun, ''): +
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=101964=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-101964 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 15/May/18 01:11 Start Date: 15/May/18 01:11 Worklog Time Spent: 10m Work Description: tvalentyn commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188141422 ## File path: sdks/python/apache_beam/runners/portability/stager.py ## @@ -0,0 +1,573 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Support for installing custom code and required dependencies. + +Workflows, with the exception of very simple ones, are organized in multiple +modules and packages. Typically, these modules and packages have +dependencies on other standard libraries. Beam relies on the Python +setuptools package to handle these scenarios. For further details please read: +https://pythonhosted.org/an_example_pypi_project/setuptools.html + +When a runner tries to run a pipeline it will check for a --requirements_file +and a --setup_file option. 
+ +If --setup_file is present then it is assumed that the folder containing the +file specified by the option has the typical layout required by setuptools and +it will run 'python setup.py sdist' to produce a source distribution. The +resulting tarball (a .tar or .tar.gz file) will be staged at the staging +location specified as job option. When a worker starts it will check for the +presence of this file and will run 'easy_install tarball' to install the +package in the worker. + +If --requirements_file is present then the file specified by the option will be +staged in the staging location. When a worker starts it will check for the +presence of this file and will run 'pip install -r requirements.txt'. A +requirements file can be easily generated by running 'pip freeze -r +requirements.txt'. The reason a runner does not run this automatically is +because quite often only a small fraction of the dependencies present in a +requirements.txt file are actually needed for remote execution and therefore a +one-time manual trimming is desirable. + +TODO(silviuc): Should we allow several setup packages? +TODO(silviuc): We should allow customizing the exact command for setup build. +""" +import functools +import glob +import logging +import os +import shutil +import subprocess +import sys +import tempfile + +import pkg_resources + +from apache_beam.internal import pickler +from apache_beam.io.filesystems import FileSystems +from apache_beam.options.pipeline_options import SetupOptions +# TODO(angoenka): Remove reference to dataflow internal names +from apache_beam.runners.dataflow.internal import names +from apache_beam.utils import processes + +# All constants are for internal use only; no backwards-compatibility +# guarantees. + +# Standard file names used for staging files. 
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz' +REQUIREMENTS_FILE = 'requirements.txt' +EXTRA_PACKAGES_FILE = 'extra_packages.txt' + +# Package names for distributions +BEAM_PACKAGE_NAME = 'apache-beam' + + +class Stager(object): + """Stager identifies and copies the appropriate artifacts to the staging + location.""" + + def _copy_file(self, from_path, to_path): +"""Copies a local file to a GCS file or vice versa.""" +logging.info('file copy from %s to %s.', from_path, to_path) +if from_path.startswith('gs://') or to_path.startswith('gs://'): + from apache_beam.io.gcp import gcsio + if from_path.startswith('gs://') and to_path.startswith('gs://'): +# Both files are GCS files so copy. +gcsio.GcsIO().copy(from_path, to_path) + elif to_path.startswith('gs://'): +# Only target is a GCS file, read local file and upload. +with open(from_path, 'rb') as f: + with gcsio.GcsIO().open(to_path, mode='wb') as g: +pfun = functools.partial(f.read, gcsio.WRITE_CHUNK_SIZE) +for chunk in iter(pfun, ''): +
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=101962=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-101962 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 15/May/18 01:11 Start Date: 15/May/18 01:11 Worklog Time Spent: 10m Work Description: tvalentyn commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188141415 ## File path: sdks/python/apache_beam/runners/portability/stager.py ## @@ -0,0 +1,573 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Support for installing custom code and required dependencies. + +Workflows, with the exception of very simple ones, are organized in multiple +modules and packages. Typically, these modules and packages have +dependencies on other standard libraries. Beam relies on the Python +setuptools package to handle these scenarios. For further details please read: +https://pythonhosted.org/an_example_pypi_project/setuptools.html + +When a runner tries to run a pipeline it will check for a --requirements_file +and a --setup_file option. 
+ +If --setup_file is present then it is assumed that the folder containing the +file specified by the option has the typical layout required by setuptools and +it will run 'python setup.py sdist' to produce a source distribution. The +resulting tarball (a .tar or .tar.gz file) will be staged at the staging +location specified as job option. When a worker starts it will check for the +presence of this file and will run 'easy_install tarball' to install the +package in the worker. + +If --requirements_file is present then the file specified by the option will be +staged in the staging location. When a worker starts it will check for the +presence of this file and will run 'pip install -r requirements.txt'. A +requirements file can be easily generated by running 'pip freeze -r +requirements.txt'. The reason a runner does not run this automatically is +because quite often only a small fraction of the dependencies present in a +requirements.txt file are actually needed for remote execution and therefore a +one-time manual trimming is desirable. + +TODO(silviuc): Should we allow several setup packages? +TODO(silviuc): We should allow customizing the exact command for setup build. +""" +import functools +import glob +import logging +import os +import shutil +import subprocess +import sys +import tempfile + +import pkg_resources + +from apache_beam.internal import pickler +from apache_beam.io.filesystems import FileSystems +from apache_beam.options.pipeline_options import SetupOptions +# TODO(angoenka): Remove reference to dataflow internal names +from apache_beam.runners.dataflow.internal import names +from apache_beam.utils import processes + +# All constants are for internal use only; no backwards-compatibility +# guarantees. + +# Standard file names used for staging files. 
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz' +REQUIREMENTS_FILE = 'requirements.txt' +EXTRA_PACKAGES_FILE = 'extra_packages.txt' + +# Package names for distributions +BEAM_PACKAGE_NAME = 'apache-beam' + + +class Stager(object): + """Stager identifies and copies the appropriate artifacts to the staging + location.""" + + def _copy_file(self, from_path, to_path): +"""Copies a local file to a GCS file or vice versa.""" +logging.info('file copy from %s to %s.', from_path, to_path) +if from_path.startswith('gs://') or to_path.startswith('gs://'): + from apache_beam.io.gcp import gcsio + if from_path.startswith('gs://') and to_path.startswith('gs://'): +# Both files are GCS files so copy. +gcsio.GcsIO().copy(from_path, to_path) + elif to_path.startswith('gs://'): +# Only target is a GCS file, read local file and upload. +with open(from_path, 'rb') as f: + with gcsio.GcsIO().open(to_path, mode='wb') as g: +pfun = functools.partial(f.read, gcsio.WRITE_CHUNK_SIZE) +for chunk in iter(pfun, ''): +
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=101965=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-101965 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 15/May/18 01:11 Start Date: 15/May/18 01:11 Worklog Time Spent: 10m Work Description: tvalentyn commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188141420 ## File path: sdks/python/apache_beam/runners/portability/stager.py ## @@ -0,0 +1,573 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Support for installing custom code and required dependencies. + +Workflows, with the exception of very simple ones, are organized in multiple +modules and packages. Typically, these modules and packages have +dependencies on other standard libraries. Beam relies on the Python +setuptools package to handle these scenarios. For further details please read: +https://pythonhosted.org/an_example_pypi_project/setuptools.html + +When a runner tries to run a pipeline it will check for a --requirements_file +and a --setup_file option. 
+ +If --setup_file is present then it is assumed that the folder containing the +file specified by the option has the typical layout required by setuptools and +it will run 'python setup.py sdist' to produce a source distribution. The +resulting tarball (a .tar or .tar.gz file) will be staged at the staging +location specified as job option. When a worker starts it will check for the +presence of this file and will run 'easy_install tarball' to install the +package in the worker. + +If --requirements_file is present then the file specified by the option will be +staged in the staging location. When a worker starts it will check for the +presence of this file and will run 'pip install -r requirements.txt'. A +requirements file can be easily generated by running 'pip freeze -r +requirements.txt'. The reason a runner does not run this automatically is +because quite often only a small fraction of the dependencies present in a +requirements.txt file are actually needed for remote execution and therefore a +one-time manual trimming is desirable. + +TODO(silviuc): Should we allow several setup packages? +TODO(silviuc): We should allow customizing the exact command for setup build. +""" +import functools +import glob +import logging +import os +import shutil +import subprocess +import sys +import tempfile + +import pkg_resources + +from apache_beam.internal import pickler +from apache_beam.io.filesystems import FileSystems +from apache_beam.options.pipeline_options import SetupOptions +# TODO(angoenka): Remove reference to dataflow internal names +from apache_beam.runners.dataflow.internal import names +from apache_beam.utils import processes + +# All constants are for internal use only; no backwards-compatibility +# guarantees. + +# Standard file names used for staging files. 
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz' +REQUIREMENTS_FILE = 'requirements.txt' +EXTRA_PACKAGES_FILE = 'extra_packages.txt' + +# Package names for distributions +BEAM_PACKAGE_NAME = 'apache-beam' + + +class Stager(object): + """Stager identifies and copies the appropriate artifacts to the staging + location.""" + + def _copy_file(self, from_path, to_path): +"""Copies a local file to a GCS file or vice versa.""" +logging.info('file copy from %s to %s.', from_path, to_path) +if from_path.startswith('gs://') or to_path.startswith('gs://'): + from apache_beam.io.gcp import gcsio + if from_path.startswith('gs://') and to_path.startswith('gs://'): +# Both files are GCS files so copy. +gcsio.GcsIO().copy(from_path, to_path) + elif to_path.startswith('gs://'): +# Only target is a GCS file, read local file and upload. +with open(from_path, 'rb') as f: + with gcsio.GcsIO().open(to_path, mode='wb') as g: +pfun = functools.partial(f.read, gcsio.WRITE_CHUNK_SIZE) +for chunk in iter(pfun, ''): +
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=101958=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-101958 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 15/May/18 01:11 Start Date: 15/May/18 01:11 Worklog Time Spent: 10m Work Description: tvalentyn commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188141421 ## File path: sdks/python/apache_beam/runners/portability/stager.py ## @@ -0,0 +1,573 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Support for installing custom code and required dependencies. + +Workflows, with the exception of very simple ones, are organized in multiple +modules and packages. Typically, these modules and packages have +dependencies on other standard libraries. Beam relies on the Python +setuptools package to handle these scenarios. For further details please read: +https://pythonhosted.org/an_example_pypi_project/setuptools.html + +When a runner tries to run a pipeline it will check for a --requirements_file +and a --setup_file option. 
+ +If --setup_file is present then it is assumed that the folder containing the +file specified by the option has the typical layout required by setuptools and +it will run 'python setup.py sdist' to produce a source distribution. The +resulting tarball (a .tar or .tar.gz file) will be staged at the staging +location specified as job option. When a worker starts it will check for the +presence of this file and will run 'easy_install tarball' to install the +package in the worker. + +If --requirements_file is present then the file specified by the option will be +staged in the staging location. When a worker starts it will check for the +presence of this file and will run 'pip install -r requirements.txt'. A +requirements file can be easily generated by running 'pip freeze -r +requirements.txt'. The reason a runner does not run this automatically is +because quite often only a small fraction of the dependencies present in a +requirements.txt file are actually needed for remote execution and therefore a +one-time manual trimming is desirable. + +TODO(silviuc): Should we allow several setup packages? +TODO(silviuc): We should allow customizing the exact command for setup build. +""" +import functools +import glob +import logging +import os +import shutil +import subprocess +import sys +import tempfile + +import pkg_resources + +from apache_beam.internal import pickler +from apache_beam.io.filesystems import FileSystems +from apache_beam.options.pipeline_options import SetupOptions +# TODO(angoenka): Remove reference to dataflow internal names +from apache_beam.runners.dataflow.internal import names +from apache_beam.utils import processes + +# All constants are for internal use only; no backwards-compatibility +# guarantees. + +# Standard file names used for staging files. 
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz' +REQUIREMENTS_FILE = 'requirements.txt' +EXTRA_PACKAGES_FILE = 'extra_packages.txt' + +# Package names for distributions +BEAM_PACKAGE_NAME = 'apache-beam' + + +class Stager(object): + """Stager identifies and copies the appropriate artifacts to the staging + location.""" + + def _copy_file(self, from_path, to_path): +"""Copies a local file to a GCS file or vice versa.""" +logging.info('file copy from %s to %s.', from_path, to_path) +if from_path.startswith('gs://') or to_path.startswith('gs://'): + from apache_beam.io.gcp import gcsio + if from_path.startswith('gs://') and to_path.startswith('gs://'): +# Both files are GCS files so copy. +gcsio.GcsIO().copy(from_path, to_path) + elif to_path.startswith('gs://'): +# Only target is a GCS file, read local file and upload. +with open(from_path, 'rb') as f: + with gcsio.GcsIO().open(to_path, mode='wb') as g: +pfun = functools.partial(f.read, gcsio.WRITE_CHUNK_SIZE) +for chunk in iter(pfun, ''): +
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=101957=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-101957 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 15/May/18 01:11 Start Date: 15/May/18 01:11 Worklog Time Spent: 10m Work Description: tvalentyn commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r188141411 ## File path: sdks/python/apache_beam/runners/dataflow/internal/apiclient.py ## @@ -731,6 +746,18 @@ def translate_scalar_counter_float(accumulator, metric_update_proto): metric_update_proto.floatingPoint = accumulator.value +class _ParameterizedStager(Stager): Review comment: Let's call this `_LegacyDataflowStager`, and take an instance of `DataflowApplicationClient` in the constructor, then let's call `dataflow_application_client._gcs_file_copy()` in stage_artifact. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 101957) Time Spent: 7h 50m (was: 7h 40m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 7h 50m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. 
> > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52]. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=101843=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-101843 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 14/May/18 19:19 Start Date: 14/May/18 19:19 Worklog Time Spent: 10m Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#issuecomment-388931882 Run Python Dataflow ValidatesRunner This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 101843) Time Spent: 7h 40m (was: 7.5h) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 7h 40m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=101288=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-101288 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 11/May/18 20:05 Start Date: 11/May/18 20:05 Worklog Time Spent: 10m Work Description: angoenka commented on issue #5273: [BEAM-3883] Adding Client to push artifacts to artifact staging service URL: https://github.com/apache/beam/pull/5273#issuecomment-388471936 Replied on the comment in PR #5251 This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 101288) Time Spent: 7.5h (was: 7h 20m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 7.5h > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=101285=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-101285 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 11/May/18 19:58 Start Date: 11/May/18 19:58 Worklog Time Spent: 10m Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#issuecomment-388470371 From the perspective of fixing the entanglement in the code, I do not want to handle it in this PR as it is unrelated to streamlining dependency.py. The current entanglement is in the apiclient.py which does file staging and reuse some of the same state. I think this should be handled separately as its not in the scope of this PR. Regarding the naming of GCSStager, its because its not intended to be used outside apiclient and is only meant to be used for dataflow runner. I would be happy to rename it as long as the name signifies that its not meant for use elsewhere. I can also make it a private class if it makes sense. "ParameterizedStager" seems to be a good name and just to make it private, I will make it "_ParameterizedStager". This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 101285) Time Spent: 7h 20m (was: 7h 10m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 7h 20m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100939=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100939 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 11/May/18 01:15 Start Date: 11/May/18 01:15 Worklog Time Spent: 10m Work Description: angoenka commented on issue #5273: [BEAM-3883] Adding Client to push artifacts to artifact staging service URL: https://github.com/apache/beam/pull/5273#issuecomment-388230845 I have updated the PR based on the discussion https://github.com/apache/beam/pull/5251#issuecomment-388163831 PTAL This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 100939) Time Spent: 7h 10m (was: 7h) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 7h 10m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100931=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100931 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 11/May/18 00:47 Start Date: 11/May/18 00:47 Worklog Time Spent: 10m Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#issuecomment-388226972 @tvalentyn I have merged filehandler in stager based on your suggestion. PTAL This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 100931) Time Spent: 7h (was: 6h 50m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 7h > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100807&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100807 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 10/May/18 20:19 Start Date: 10/May/18 20:19 Worklog Time Spent: 10m Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#issuecomment-388173088 I agree with the general structure you mentioned. And just to rephrase it I would call them 1. Collect artifacts 2. Commit artifacts The key difference in the current code is that it commits individual artifacts when it finds them while collecting. This certainly makes the code messy. However there are some challenges as how we download artifacts which can depend upon the setup. To reuse collection code we will need to pass a download_file method which can download remote file to local file system from a variety of sources. And then we will have an upload_file which will be used in commit artifacts. Which brings us very close to the current implementation and makes me less inclined towards this change. FileHandler has just 2 methods download_file and upload_file which clearly outline the expectations from a runner. This makes stager completely reusable with different FileHandlers. Please let me know if it makes sense. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 100807) Time Spent: 6h 50m (was: 6h 40m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 6h 50m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100789=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100789 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 10/May/18 19:44 Start Date: 10/May/18 19:44 Worklog Time Spent: 10m Work Description: tvalentyn commented on issue #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#issuecomment-388163831 I think we can simplify the logic here. During resource staging for Python SDK we need to stage pipeline artifacts (SDK, workflow main session, requirements.txt, cached version of the packages in requirements.txt, maybe workflow tarball). Staging happens in two steps: 1. If the artifact is not local, download the artifact from artifact source location to a local temp folder. 2. Stage the artifact from local folder into the staging location. Depending on the execution environment, we can have different staging locations: - All portable runners, including ULR should stage artifacts to an Artifact Server over GRPC. - Dataflow Runner for non-portable pipelines need to stage artifacts to a GCS bucket. Step 2 needs to be different for different stager implementations, however the first step does not. I don't see a reason to implement prestaging the dependencies separately for each stager. Being able to stage SDK from GCS, HTTP, or other location is a capability of SDK, regardless of the runner, so I think it should be common. We can support other locations as well when the need arises. This said, I suggest the following sketch for the abstractions: ``` class Stager(object): def stage_job_resources(options, temp_dir=None, ...): ''' Materializes all resources to be stages in a local folder, stages artifacts one by one using _stage_artifact(), and calls _commit_manifest() at the end. SDK: will be staged from PyPI if sdk_location=default, otherwise from sdk_location. 
sdk_location can be a path to local directory, GCS path or HTTP URL. Extra packages are staged from local directory. Packages from requirements.txt are downloaded from PyPI into temporary folder, then staged to a staging location. ... ''' # move existing functionality from dependency.stage_job_resources() here. def stage_artifact(local_path_to_artifact, artefact_name): """ Stages the artifact to self._staging_location, if successful returns True and adds artifact_name to the manifest of artifacts that have been staged. """ raise NotImplementedError def commit_manifest(): """Commits manifest through Artifact API.""" raise NotImplementedError class GcsStager(Stager): # Stager for legacy Dataflow pipelines def stage_artifact(local_path_to_artifact, artefact_name): # check that self.staging_location is a GCS bucket # copy the artifact to gs://self.staging_location/artefact_name+some_suffix def commit_manifest(): pass # No need to do anything here for legacy pipelines. class ArtifactServerStager(Stager): def stage_artifact(local_path_to_artifact, artefact_name): # Implementation that talks to Artifact Server via Fn Artifact API. def commit_manifest(local_path_to_artifact, artefact_name): # Implementation that talks to Artifact Server via Fn Artifact API. ``` What do you think? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 100789) Time Spent: 6h 40m (was: 6.5h) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 6h 40m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100779=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100779 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 10/May/18 19:15 Start Date: 10/May/18 19:15 Worklog Time Spent: 10m Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#issuecomment-388156058 Run Python Dataflow ValidatesContainer This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 100779) Time Spent: 6.5h (was: 6h 20m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 6.5h > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100382=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100382 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 10/May/18 02:38 Start Date: 10/May/18 02:38 Worklog Time Spent: 10m Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#issuecomment-387934312 Run Python Dataflow ValidatesRunner This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 100382) Time Spent: 6h 20m (was: 6h 10m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 6h 20m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100381=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100381 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 10/May/18 02:37 Start Date: 10/May/18 02:37 Worklog Time Spent: 10m Work Description: angoenka commented on issue #5273: [BEAM-3883] Adding Client to push artifacts to artifact staging service URL: https://github.com/apache/beam/pull/5273#issuecomment-387934066 Run Python Dataflow ValidatesRunner This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 100381) Time Spent: 6h 10m (was: 6h) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 6h 10m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100380=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100380 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 10/May/18 02:35 Start Date: 10/May/18 02:35 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5273: [BEAM-3883] Adding Client to push artifacts to artifact staging service URL: https://github.com/apache/beam/pull/5273#discussion_r187226786 ## File path: sdks/python/apache_beam/runners/portability/artifact_service_client.py ## @@ -0,0 +1,104 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""A :class:`FileHandler` to work with :class:`ArtifactStagingServiceStub`. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from apache_beam.portability.api import beam_artifact_api_pb2 +from apache_beam.portability.api import beam_artifact_api_pb2_grpc +from apache_beam.runners.portability.stager import FileHandler + + +class ArtifactStagingFileHandler(FileHandler): + """:class:`FileHandler` to push files to ArtifactStagingService. 
+ + The class keeps track of pushed files and user is expected to call + :fun:`commit_manifest` once all files are uploaded. + Once :fun:`commit_manifest` is called, no further operations can be performed + on the class. + + Note: This class is not thread safe and user of this class should ensure + thread safety. + """ + + def __init__(self, artifact_service_channel): +"""Creates a new FileHandler to upload file to ArtifactStagingService. + +Args: + artifact_service_channel: Channel used to interact with +ArtifactStagingService.User owns the channel and should close it when +finished. +""" +super(ArtifactStagingFileHandler, self).__init__() +self._artifact_staging_stub = beam_artifact_api_pb2_grpc.\ +ArtifactStagingServiceStub(channel=artifact_service_channel) +self._artifacts = [] +self.closed = False + + def file_copy(self, from_path, to_path): Review comment: Sure. Made this change in the parent PR This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 100380) Time Spent: 6h (was: 5h 50m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 6h > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. 
> > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52]. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100377=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100377 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 10/May/18 02:35 Start Date: 10/May/18 02:35 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5273: [BEAM-3883] Adding Client to push artifacts to artifact staging service URL: https://github.com/apache/beam/pull/5273#discussion_r187199182 ## File path: sdks/python/apache_beam/runners/portability/artifact_service_client.py ## @@ -0,0 +1,104 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""A :class:`FileHandler` to work with :class:`ArtifactStagingServiceStub`. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from apache_beam.portability.api import beam_artifact_api_pb2 +from apache_beam.portability.api import beam_artifact_api_pb2_grpc +from apache_beam.runners.portability.stager import FileHandler + + +class ArtifactStagingFileHandler(FileHandler): + """:class:`FileHandler` to push files to ArtifactStagingService. 
+ + The class keeps track of pushed files and user is expected to call + :fun:`commit_manifest` once all files are uploaded. + Once :fun:`commit_manifest` is called, no further operations can be performed + on the class. + + Note: This class is not thread safe and user of this class should ensure + thread safety. + """ + + def __init__(self, artifact_service_channel): +"""Creates a new FileHandler to upload file to ArtifactStagingService. + +Args: + artifact_service_channel: Channel used to interact with +ArtifactStagingService.User owns the channel and should close it when +finished. +""" +super(ArtifactStagingFileHandler, self).__init__() +self._artifact_staging_stub = beam_artifact_api_pb2_grpc.\ +ArtifactStagingServiceStub(channel=artifact_service_channel) +self._artifacts = [] +self.closed = False + + def file_copy(self, from_path, to_path): +"""Uploads a file to ArtifactStagingService. + +Note: Downloading/copying file from remote server is not supported. +Args: + from_path: Path of file to be uploaded. + to_path: File name on the artifact server. +""" +self._check_closed() +if not os.path.isfile(from_path): + raise ValueError( + 'Can only copy local file to artifact server. 
from_path: {0} ' + 'to_path: {1}'.format(from_path, to_path)) + +def artifact_request_generator(): + metadata = beam_artifact_api_pb2.ArtifactMetadata(name=to_path) + request = beam_artifact_api_pb2.PutArtifactRequest(metadata=metadata) + yield request + with open(from_path, 'rb') as f: +while True: + chunk = f.read(2 << 12) # 4kb + if not chunk: +break + request = beam_artifact_api_pb2.PutArtifactRequest( + data=beam_artifact_api_pb2.ArtifactChunk(data=chunk)) + yield request + self._artifacts.append(metadata) + +response = self._artifact_staging_stub.PutArtifact( +artifact_request_generator()) +print(response) + + def file_download(self, from_url, to_path): +self._check_closed() +return super(ArtifactStagingFileHandler, self).file_download( +from_url, to_path) + + def commit_manifest(self): Review comment: Makes sense. Will not commit in case of exception or error. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 100377) Time Spent: 5h 40m (was: 5.5h) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 >
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100378=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100378 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 10/May/18 02:35 Start Date: 10/May/18 02:35 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5273: [BEAM-3883] Adding Client to push artifacts to artifact staging service URL: https://github.com/apache/beam/pull/5273#discussion_r187190033 ## File path: sdks/python/apache_beam/runners/portability/artifact_service_client.py ## @@ -0,0 +1,104 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""A :class:`FileHandler` to work with :class:`ArtifactStagingServiceStub`. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from apache_beam.portability.api import beam_artifact_api_pb2 +from apache_beam.portability.api import beam_artifact_api_pb2_grpc +from apache_beam.runners.portability.stager import FileHandler + + +class ArtifactStagingFileHandler(FileHandler): + """:class:`FileHandler` to push files to ArtifactStagingService. 
+ + The class keeps track of pushed files and user is expected to call + :fun:`commit_manifest` once all files are uploaded. + Once :fun:`commit_manifest` is called, no further operations can be performed + on the class. + + Note: This class is not thread safe and user of this class should ensure + thread safety. + """ + + def __init__(self, artifact_service_channel): +"""Creates a new FileHandler to upload file to ArtifactStagingService. + +Args: + artifact_service_channel: Channel used to interact with +ArtifactStagingService.User owns the channel and should close it when +finished. +""" +super(ArtifactStagingFileHandler, self).__init__() +self._artifact_staging_stub = beam_artifact_api_pb2_grpc.\ +ArtifactStagingServiceStub(channel=artifact_service_channel) +self._artifacts = [] +self.closed = False + + def file_copy(self, from_path, to_path): +"""Uploads a file to ArtifactStagingService. + +Note: Downloading/copying file from remote server is not supported. +Args: + from_path: Path of file to be uploaded. + to_path: File name on the artifact server. +""" +self._check_closed() +if not os.path.isfile(from_path): + raise ValueError( + 'Can only copy local file to artifact server. from_path: {0} ' + 'to_path: {1}'.format(from_path, to_path)) + +def artifact_request_generator(): + metadata = beam_artifact_api_pb2.ArtifactMetadata(name=to_path) + request = beam_artifact_api_pb2.PutArtifactRequest(metadata=metadata) + yield request + with open(from_path, 'rb') as f: +while True: + chunk = f.read(2 << 12) # 4kb + if not chunk: +break + request = beam_artifact_api_pb2.PutArtifactRequest( + data=beam_artifact_api_pb2.ArtifactChunk(data=chunk)) + yield request + self._artifacts.append(metadata) + +response = self._artifact_staging_stub.PutArtifact( +artifact_request_generator()) +print(response) Review comment: Removed, This is an automated message from the Apache Git Service. 
To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 100378) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 5h 50m >
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100376=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100376 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 10/May/18 02:35 Start Date: 10/May/18 02:35 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5273: [BEAM-3883] Adding Client to push artifacts to artifact staging service URL: https://github.com/apache/beam/pull/5273#discussion_r187189738 ## File path: sdks/python/apache_beam/runners/portability/artifact_service_client.py ## @@ -0,0 +1,104 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""A :class:`FileHandler` to work with :class:`ArtifactStagingServiceStub`. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from apache_beam.portability.api import beam_artifact_api_pb2 +from apache_beam.portability.api import beam_artifact_api_pb2_grpc +from apache_beam.runners.portability.stager import FileHandler + + +class ArtifactStagingFileHandler(FileHandler): + """:class:`FileHandler` to push files to ArtifactStagingService. 
+ + The class keeps track of pushed files and user is expected to call + :fun:`commit_manifest` once all files are uploaded. + Once :fun:`commit_manifest` is called, no further operations can be performed + on the class. + + Note: This class is not thread safe and user of this class should ensure + thread safety. + """ + + def __init__(self, artifact_service_channel): +"""Creates a new FileHandler to upload file to ArtifactStagingService. + +Args: + artifact_service_channel: Channel used to interact with +ArtifactStagingService.User owns the channel and should close it when +finished. +""" +super(ArtifactStagingFileHandler, self).__init__() +self._artifact_staging_stub = beam_artifact_api_pb2_grpc.\ +ArtifactStagingServiceStub(channel=artifact_service_channel) +self._artifacts = [] +self.closed = False + + def file_copy(self, from_path, to_path): +"""Uploads a file to ArtifactStagingService. + +Note: Downloading/copying file from remote server is not supported. +Args: + from_path: Path of file to be uploaded. + to_path: File name on the artifact server. +""" +self._check_closed() +if not os.path.isfile(from_path): + raise ValueError( + 'Can only copy local file to artifact server. from_path: {0} ' + 'to_path: {1}'.format(from_path, to_path)) + +def artifact_request_generator(): + metadata = beam_artifact_api_pb2.ArtifactMetadata(name=to_path) + request = beam_artifact_api_pb2.PutArtifactRequest(metadata=metadata) + yield request + with open(from_path, 'rb') as f: +while True: + chunk = f.read(2 << 12) # 4kb Review comment: Sure, we can go till 2MB. GRPC has a payload size limit of 4MB so 2MB should be ok. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 100376) Time Spent: 5.5h (was: 5h 20m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 5.5h > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100379=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100379 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 10/May/18 02:35 Start Date: 10/May/18 02:35 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5273: [BEAM-3883] Adding Client to push artifacts to artifact staging service URL: https://github.com/apache/beam/pull/5273#discussion_r187191092 ## File path: sdks/python/apache_beam/runners/portability/artifact_service_client.py ## @@ -0,0 +1,104 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""A :class:`FileHandler` to work with :class:`ArtifactStagingServiceStub`. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from apache_beam.portability.api import beam_artifact_api_pb2 +from apache_beam.portability.api import beam_artifact_api_pb2_grpc +from apache_beam.runners.portability.stager import FileHandler + + +class ArtifactStagingFileHandler(FileHandler): + """:class:`FileHandler` to push files to ArtifactStagingService. 
+ + The class keeps track of pushed files and user is expected to call + :fun:`commit_manifest` once all files are uploaded. + Once :fun:`commit_manifest` is called, no further operations can be performed + on the class. + + Note: This class is not thread safe and user of this class should ensure + thread safety. + """ + + def __init__(self, artifact_service_channel): +"""Creates a new FileHandler to upload file to ArtifactStagingService. + +Args: + artifact_service_channel: Channel used to interact with +ArtifactStagingService.User owns the channel and should close it when +finished. +""" +super(ArtifactStagingFileHandler, self).__init__() +self._artifact_staging_stub = beam_artifact_api_pb2_grpc.\ +ArtifactStagingServiceStub(channel=artifact_service_channel) +self._artifacts = [] +self.closed = False + + def file_copy(self, from_path, to_path): +"""Uploads a file to ArtifactStagingService. + +Note: Downloading/copying file from remote server is not supported. +Args: + from_path: Path of file to be uploaded. + to_path: File name on the artifact server. +""" +self._check_closed() +if not os.path.isfile(from_path): + raise ValueError( + 'Can only copy local file to artifact server. from_path: {0} ' + 'to_path: {1}'.format(from_path, to_path)) + +def artifact_request_generator(): + metadata = beam_artifact_api_pb2.ArtifactMetadata(name=to_path) + request = beam_artifact_api_pb2.PutArtifactRequest(metadata=metadata) + yield request + with open(from_path, 'rb') as f: +while True: + chunk = f.read(2 << 12) # 4kb + if not chunk: +break + request = beam_artifact_api_pb2.PutArtifactRequest( + data=beam_artifact_api_pb2.ArtifactChunk(data=chunk)) + yield request + self._artifacts.append(metadata) + +response = self._artifact_staging_stub.PutArtifact( +artifact_request_generator()) +print(response) + + def file_download(self, from_url, to_path): Review comment: - download_file? Sure. 
Made this change in the parent PR - Is from_url in the same format as to_path in upload_file? If so they should use consistent terminology. Not really. In case of upload_file, to_path is just the name of the path to the file in staging location while in download_file from_url can be any url from where we can potentially download some file like http:// gs:// etc - why do we need this file at all? I thought the artifact stager only uploads files. Is this a test helper method? Stager prep the file to be uploaded and uploads them. A case where download is required is when a package tarball is provided as http url or on GCS. This is an automated message from the Apache Git
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100308=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100308 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 09/May/18 22:44 Start Date: 09/May/18 22:44 Worklog Time Spent: 10m Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#issuecomment-387897316 Run Python Dataflow ValidatesRunner This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 100308) Time Spent: 5h 20m (was: 5h 10m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 5h 20m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100282=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100282 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 09/May/18 21:55 Start Date: 09/May/18 21:55 Worklog Time Spent: 10m Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#issuecomment-387887629 Run Python Dataflow ValidatesRunner This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 100282) Time Spent: 5h 10m (was: 5h) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 5h 10m > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100281=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100281 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 09/May/18 21:53 Start Date: 09/May/18 21:53 Worklog Time Spent: 10m Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#issuecomment-387887066 Run Python Dataflow ValidatesRunner This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 100281) Time Spent: 5h (was: 4h 50m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core >Reporter: Ben Sidhom >Assignee: Ankur Goenka >Priority: Major > Time Spent: 5h > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100274=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100274 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 09/May/18 21:49 Start Date: 09/May/18 21:49 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r187133776 ## File path: sdks/python/apache_beam/runners/portability/stager.py ## @@ -0,0 +1,556 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Support for installing custom code and required dependencies. + +Workflows, with the exception of very simple ones, are organized in multiple +modules and packages. Typically, these modules and packages have +dependencies on other standard libraries. Beam relies on the Python +setuptools package to handle these scenarios. For further details please read: +https://pythonhosted.org/an_example_pypi_project/setuptools.html + +When a runner tries to run a pipeline it will check for a --requirements_file +and a --setup_file option. 
+ +If --setup_file is present then it is assumed that the folder containing the +file specified by the option has the typical layout required by setuptools and +it will run 'python setup.py sdist' to produce a source distribution. The +resulting tarball (a .tar or .tar.gz file) will be staged at the staging +location specified as job option. When a worker starts it will check for the +presence of this file and will run 'easy_install tarball' to install the +package in the worker. + +If --requirements_file is present then the file specified by the option will be +staged in the staging location. When a worker starts it will check for the +presence of this file and will run 'pip install -r requirements.txt'. A +requirements file can be easily generated by running 'pip freeze -r +requirements.txt'. The reason a runner does not run this automatically is +because quite often only a small fraction of the dependencies present in a +requirements.txt file are actually needed for remote execution and therefore a +one-time manual trimming is desirable. + +TODO(silviuc): Staged files should have a job specific prefix. +To prevent several jobs in the same project stomping on each other due to a Review comment: Makes sense. Also removing the todo as different jobs in a project should have different staging directory. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 100274) Time Spent: 4.5h (was: 4h 20m) > Python SDK stages artifacts when talking to job server > -- > > Key: BEAM-3883 > URL: https://issues.apache.org/jira/browse/BEAM-3883 > Project: Beam > Issue Type: Sub-task > Components: sdk-py-core > Reporter: Ben Sidhom > Assignee: Ankur Goenka > Priority: Major > Time Spent: 4.5h > Remaining Estimate: 0h > > The Python SDK does not currently stage its user-defined functions or > dependencies when talking to the job API. Artifacts that need to be staged > include the user code itself, any SDK components not included in the > container image, and the list of Python packages that must be installed at > runtime. > > Artifacts that are currently expected can be found in the harness boot code: > [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.] -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100273=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100273 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 09/May/18 21:49 Start Date: 09/May/18 21:49 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r187161722 ## File path: sdks/python/apache_beam/runners/portability/stager.py ## @@ -0,0 +1,556 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Support for installing custom code and required dependencies. + +Workflows, with the exception of very simple ones, are organized in multiple +modules and packages. Typically, these modules and packages have +dependencies on other standard libraries. Beam relies on the Python +setuptools package to handle these scenarios. For further details please read: +https://pythonhosted.org/an_example_pypi_project/setuptools.html + +When a runner tries to run a pipeline it will check for a --requirements_file +and a --setup_file option. 
+ +If --setup_file is present then it is assumed that the folder containing the +file specified by the option has the typical layout required by setuptools and +it will run 'python setup.py sdist' to produce a source distribution. The +resulting tarball (a .tar or .tar.gz file) will be staged at the staging +location specified as job option. When a worker starts it will check for the +presence of this file and will run 'easy_install tarball' to install the +package in the worker. + +If --requirements_file is present then the file specified by the option will be +staged in the staging location. When a worker starts it will check for the +presence of this file and will run 'pip install -r requirements.txt'. A +requirements file can be easily generated by running 'pip freeze -r +requirements.txt'. The reason a runner does not run this automatically is +because quite often only a small fraction of the dependencies present in a +requirements.txt file are actually needed for remote execution and therefore a +one-time manual trimming is desirable. + +TODO(silviuc): Staged files should have a job specific prefix. +To prevent several jobs in the same project stomping on each other due to a +shared staging location. + +TODO(silviuc): Should we allow several setup packages? +TODO(silviuc): We should allow customizing the exact command for setup build. +""" + +import glob +import logging +import os +import shutil +import subprocess +import sys +import tempfile + +import pkg_resources + +from apache_beam.internal import pickler +from apache_beam.io.filesystems import FileSystems +from apache_beam.options.pipeline_options import SetupOptions +# TODO(angoenka): Remove reference to dataflow internal names +from apache_beam.runners.dataflow.internal import names +from apache_beam.utils import processes + +# All constants are for internal use only; no backwards-compatibility +# guarantees. + +# Standard file names used for staging files. 
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz' +REQUIREMENTS_FILE = 'requirements.txt' +EXTRA_PACKAGES_FILE = 'extra_packages.txt' + +# Package names for distributions +BEAM_PACKAGE_NAME = 'apache-beam' + + +class FileHandler(object): Review comment: The class is needed to group the functionality of copy/download/check_remote as these are the 3 things which are used by stager. Also file_download needs file_copy as it copies the file from a remote location if it's not an http url. FileHandler is not intended to be implemented for every runner, as the sdk will not directly interact with the runner; instead it will only submit the job to a job_service and artifacts to an artifact service, for both of which we have a well defined contract. However, by providing FileHandler as an argument, we keep the possibility open for adding other means of interacting with files and also support the old runners. I will update the docstring to highlight the usage
[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server
[ https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100275=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100275 ] ASF GitHub Bot logged work on BEAM-3883: Author: ASF GitHub Bot Created on: 09/May/18 21:49 Start Date: 09/May/18 21:49 Worklog Time Spent: 10m Work Description: angoenka commented on a change in pull request #5251: [BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact service URL: https://github.com/apache/beam/pull/5251#discussion_r187170162 ## File path: sdks/python/apache_beam/runners/portability/stager.py ## @@ -0,0 +1,556 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Support for installing custom code and required dependencies. + +Workflows, with the exception of very simple ones, are organized in multiple +modules and packages. Typically, these modules and packages have +dependencies on other standard libraries. Beam relies on the Python +setuptools package to handle these scenarios. For further details please read: +https://pythonhosted.org/an_example_pypi_project/setuptools.html + +When a runner tries to run a pipeline it will check for a --requirements_file +and a --setup_file option. 
+ +If --setup_file is present then it is assumed that the folder containing the +file specified by the option has the typical layout required by setuptools and +it will run 'python setup.py sdist' to produce a source distribution. The +resulting tarball (a .tar or .tar.gz file) will be staged at the staging +location specified as job option. When a worker starts it will check for the +presence of this file and will run 'easy_install tarball' to install the +package in the worker. + +If --requirements_file is present then the file specified by the option will be +staged in the staging location. When a worker starts it will check for the +presence of this file and will run 'pip install -r requirements.txt'. A +requirements file can be easily generated by running 'pip freeze -r +requirements.txt'. The reason a runner does not run this automatically is +because quite often only a small fraction of the dependencies present in a +requirements.txt file are actually needed for remote execution and therefore a +one-time manual trimming is desirable. + +TODO(silviuc): Staged files should have a job specific prefix. +To prevent several jobs in the same project stomping on each other due to a +shared staging location. + +TODO(silviuc): Should we allow several setup packages? +TODO(silviuc): We should allow customizing the exact command for setup build. +""" + +import glob +import logging +import os +import shutil +import subprocess +import sys +import tempfile + +import pkg_resources + +from apache_beam.internal import pickler +from apache_beam.io.filesystems import FileSystems +from apache_beam.options.pipeline_options import SetupOptions +# TODO(angoenka): Remove reference to dataflow internal names +from apache_beam.runners.dataflow.internal import names +from apache_beam.utils import processes + +# All constants are for internal use only; no backwards-compatibility +# guarantees. + +# Standard file names used for staging files. 
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz' +REQUIREMENTS_FILE = 'requirements.txt' +EXTRA_PACKAGES_FILE = 'extra_packages.txt' + +# Package names for distributions +BEAM_PACKAGE_NAME = 'apache-beam' + + +class FileHandler(object): + + def file_copy(self, from_path, to_path): +"""Copies a local file to a remote location or vice versa.""" +logging.info('File copy from %s to %s.', from_path, to_path) + +if not os.path.isdir(os.path.dirname(to_path)): + logging.info( + 'Created folder (since we have not done yet, and any errors ' + 'will follow): %s ', os.path.dirname(to_path)) + os.mkdir(os.path.dirname(to_path)) +shutil.copyfile(from_path, to_path) + + def file_download(self, from_url, to_path): +"""Downloads a file over http/https from a or copy them from a remote +location.""" +if from_url.startswith('http://') or from_url.startswith('https://'): + # TODO(silviuc): We should cache