[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-06-28 Thread ASF GitHub Bot (JIRA)


 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=117109&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-117109
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 28/Jun/18 22:58
Start Date: 28/Jun/18 22:58
Worklog Time Spent: 10m 
  Work Description: robertwb closed pull request #5817: [BEAM-3883] Stage 
files in the portability runner.
URL: https://github.com/apache/beam/pull/5817
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/sdks/python/apache_beam/runners/dataflow/internal/apiclient.py 
b/sdks/python/apache_beam/runners/dataflow/internal/apiclient.py
index 784166cafda..ccce9a99ef7 100644
--- a/sdks/python/apache_beam/runners/dataflow/internal/apiclient.py
+++ b/sdks/python/apache_beam/runners/dataflow/internal/apiclient.py
@@ -442,10 +442,11 @@ def _stage_resources(self, options):
   raise RuntimeError('The --temp_location option must be specified.')
 
 resource_stager = _LegacyDataflowStager(self)
-return resource_stager.stage_job_resources(
+_, resources = resource_stager.stage_job_resources(
 options,
 temp_dir=tempfile.mkdtemp(),
 staging_location=google_cloud_options.staging_location)
+return resources
 
   def stage_file(self, gcs_or_local_path, file_name, stream,
  mime_type='application/octet-stream'):
diff --git a/sdks/python/apache_beam/runners/portability/portable_runner.py 
b/sdks/python/apache_beam/runners/portability/portable_runner.py
index 0da33fa7f4a..26376c93337 100644
--- a/sdks/python/apache_beam/runners/portability/portable_runner.py
+++ b/sdks/python/apache_beam/runners/portability/portable_runner.py
@@ -24,12 +24,11 @@
 from apache_beam import metrics
 from apache_beam.options.pipeline_options import PortableOptions
 from apache_beam.portability import common_urns
-from apache_beam.portability.api import beam_artifact_api_pb2
-from apache_beam.portability.api import beam_artifact_api_pb2_grpc
 from apache_beam.portability.api import beam_job_api_pb2
 from apache_beam.portability.api import beam_job_api_pb2_grpc
 from apache_beam.runners import pipeline_context
 from apache_beam.runners import runner
+from apache_beam.runners.portability import portable_stager
 
 __all__ = ['PortableRunner']
 
@@ -92,16 +91,12 @@ def run_pipeline(self, pipeline):
 beam_job_api_pb2.PrepareJobRequest(
 job_name='job', pipeline=proto_pipeline))
 if prepare_response.artifact_staging_endpoint.url:
-  # Must commit something to get a retrieval token,
-  # committing empty manifest for now.
-  # TODO(BEAM-3883): Actually stage required files.
-  artifact_service = beam_artifact_api_pb2_grpc.ArtifactStagingServiceStub(
-  
grpc.insecure_channel(prepare_response.artifact_staging_endpoint.url))
-  commit_manifest = artifact_service.CommitManifest(
-  beam_artifact_api_pb2.CommitManifestRequest(
-  manifest=beam_artifact_api_pb2.Manifest(),
-  staging_session_token=prepare_response.staging_session_token))
-  retrieval_token = commit_manifest.retrieval_token
+  stager = portable_stager.PortableStager(
+  
grpc.insecure_channel(prepare_response.artifact_staging_endpoint.url),
+  prepare_response.staging_session_token)
+  retrieval_token, _ = stager.stage_job_resources(
+  pipeline._options,
+  staging_location='')
 else:
   retrieval_token = None
 run_response = job_service.Run(
diff --git a/sdks/python/apache_beam/runners/portability/portable_stager.py 
b/sdks/python/apache_beam/runners/portability/portable_stager.py
index f556811425b..3761373fb42 100644
--- a/sdks/python/apache_beam/runners/portability/portable_stager.py
+++ b/sdks/python/apache_beam/runners/portability/portable_stager.py
@@ -20,6 +20,8 @@
 from __future__ import division
 from __future__ import print_function
 
+import base64
+import hashlib
 import os
 
 from apache_beam.portability.api import beam_artifact_api_pb2
@@ -69,7 +71,8 @@ def stage_artifact(self, local_path_to_artifact, 
artifact_name):
 
 def artifact_request_generator():
   artifact_metadata = beam_artifact_api_pb2.ArtifactMetadata(
-  name=artifact_name)
+  name=artifact_name,
+  md5=_get_file_hash(local_path_to_artifact))
   metadata = beam_artifact_api_pb2.PutArtifactMetadata(
   staging_session_token=self._staging_session_token,
   metadata=artifact_metadata)
@@ -90,7 +93,18 @@ def artifact_request_generator():
   def commit_manifest(self):
 manifest = beam_artifact_api_pb2.Manifest(artifact=self._artifacts)
 

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-06-28 Thread ASF GitHub Bot (JIRA)


 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=117088&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-117088
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 28/Jun/18 22:29
Start Date: 28/Jun/18 22:29
Worklog Time Spent: 10m 
  Work Description: jkff commented on a change in pull request #5817: 
[BEAM-3883] Stage files in the portability runner.
URL: https://github.com/apache/beam/pull/5817#discussion_r199006382
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/portable_runner.py
 ##
 @@ -107,16 +108,12 @@ def run_pipeline(self, pipeline):
 beam_job_api_pb2.PrepareJobRequest(
 job_name='job', pipeline=proto_pipeline))
 if prepare_response.artifact_staging_endpoint.url:
-  # Must commit something to get a retrieval token,
-  # committing empty manifest for now.
-  # TODO(BEAM-3883): Actually stage required files.
-  artifact_service = beam_artifact_api_pb2_grpc.ArtifactStagingServiceStub(
 
 Review comment:
   The imports _grpc and _pb2 above are no longer used.


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 117088)
Time Spent: 19.5h  (was: 19h 20m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
> Fix For: 2.6.0
>
>  Time Spent: 19.5h
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-06-28 Thread ASF GitHub Bot (JIRA)


 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=116973&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-116973
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 28/Jun/18 19:28
Start Date: 28/Jun/18 19:28
Worklog Time Spent: 10m 
  Work Description: robertwb opened a new pull request #5817: [BEAM-3883] 
Stage files in the portability runner.
URL: https://github.com/apache/beam/pull/5817
 
 
   
   
   
   Follow this checklist to help us incorporate your contribution quickly and 
easily:
   
- [ ] Format the pull request title like `[BEAM-XXX] Fixes bug in 
ApproximateQuantiles`, where you replace `BEAM-XXX` with the appropriate JIRA 
issue, if applicable. This will automatically link the pull request to the 
issue.
- [ ] If this contribution is large, please file an Apache [Individual 
Contributor License Agreement](https://www.apache.org/licenses/icla.pdf).
   
   It will help us expedite review of your Pull Request if you tag someone 
(e.g. `@username`) to look at it.
   
   Post-Commit Tests Status (on master branch)
   

   
   Lang | SDK | Apex | Dataflow | Flink | Gearpump | Spark
   --- | --- | --- | --- | --- | --- | ---
   Go | [![Build 
Status](https://builds.apache.org/job/beam_PostCommit_Go_GradleBuild/lastCompletedBuild/badge/icon)](https://builds.apache.org/job/beam_PostCommit_Go_GradleBuild/lastCompletedBuild/)
 | --- | --- | --- | --- | ---
   Java | [![Build 
Status](https://builds.apache.org/job/beam_PostCommit_Java_GradleBuild/lastCompletedBuild/badge/icon)](https://builds.apache.org/job/beam_PostCommit_Java_GradleBuild/lastCompletedBuild/)
 | [![Build 
Status](https://builds.apache.org/job/beam_PostCommit_Java_ValidatesRunner_Apex_Gradle/lastCompletedBuild/badge/icon)](https://builds.apache.org/job/beam_PostCommit_Java_ValidatesRunner_Apex_Gradle/lastCompletedBuild/)
 | [![Build 
Status](https://builds.apache.org/job/beam_PostCommit_Java_ValidatesRunner_Dataflow_Gradle/lastCompletedBuild/badge/icon)](https://builds.apache.org/job/beam_PostCommit_Java_ValidatesRunner_Dataflow_Gradle/lastCompletedBuild/)
 | [![Build 
Status](https://builds.apache.org/job/beam_PostCommit_Java_ValidatesRunner_Flink_Gradle/lastCompletedBuild/badge/icon)](https://builds.apache.org/job/beam_PostCommit_Java_ValidatesRunner_Flink_Gradle/lastCompletedBuild/)
 | [![Build 
Status](https://builds.apache.org/job/beam_PostCommit_Java_ValidatesRunner_Gearpump_Gradle/lastCompletedBuild/badge/icon)](https://builds.apache.org/job/beam_PostCommit_Java_ValidatesRunner_Gearpump_Gradle/lastCompletedBuild/)
 | [![Build 
Status](https://builds.apache.org/job/beam_PostCommit_Java_ValidatesRunner_Spark_Gradle/lastCompletedBuild/badge/icon)](https://builds.apache.org/job/beam_PostCommit_Java_ValidatesRunner_Spark_Gradle/lastCompletedBuild/)
   Python | [![Build 
Status](https://builds.apache.org/job/beam_PostCommit_Python_Verify/lastCompletedBuild/badge/icon)](https://builds.apache.org/job/beam_PostCommit_Python_Verify/lastCompletedBuild/)
 | --- | [![Build 
Status](https://builds.apache.org/job/beam_PostCommit_Py_VR_Dataflow/lastCompletedBuild/badge/icon)](https://builds.apache.org/job/beam_PostCommit_Py_VR_Dataflow/lastCompletedBuild/)
  [![Build 
Status](https://builds.apache.org/job/beam_PostCommit_Py_ValCont/lastCompletedBuild/badge/icon)](https://builds.apache.org/job/beam_PostCommit_Py_ValCont/lastCompletedBuild/)
 | --- | --- | ---
   
   
   
   
   


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 116973)
Time Spent: 19h 20m  (was: 19h 10m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
> Fix For: 2.6.0
>
>  Time Spent: 19h 20m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be 

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-22 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104863&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104863
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 22/May/18 23:41
Start Date: 22/May/18 23:41
Worklog Time Spent: 10m 
  Work Description: jkff closed pull request #5273: [BEAM-3883] Adding 
Client to push artifacts to artifact staging service
URL: https://github.com/apache/beam/pull/5273
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/sdks/python/apache_beam/runners/portability/portable_stager.py 
b/sdks/python/apache_beam/runners/portability/portable_stager.py
new file mode 100644
index 000..7113a251f24
--- /dev/null
+++ b/sdks/python/apache_beam/runners/portability/portable_stager.py
@@ -0,0 +1,86 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""A :class:`FileHandler` to work with :class:`ArtifactStagingServiceStub`.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from apache_beam.portability.api import beam_artifact_api_pb2
+from apache_beam.portability.api import beam_artifact_api_pb2_grpc
+from apache_beam.runners.portability.stager import Stager
+
+
+class PortableStager(Stager):
+  """An implementation of :class:`Stager` to stage files on
+  ArtifactStagingService.
+
+  The class keeps track of pushed files and commit manifest once all files are
+  uploaded.
+
+  Note: This class is not thread safe and user of this class should ensure
+  thread safety.
+  """
+
+  def __init__(self, artifact_service_channel):
+"""Creates a new Stager to stage file to ArtifactStagingService.
+
+Args:
+  artifact_service_channel: Channel used to interact with
+ArtifactStagingService.User owns the channel and should close it when
+finished.
+"""
+super(PortableStager, self).__init__()
+self._artifact_staging_stub = beam_artifact_api_pb2_grpc.\
+ArtifactStagingServiceStub(channel=artifact_service_channel)
+self._artifacts = []
+
+  def stage_artifact(self, local_path_to_artifact, artifact_name):
+"""Stage a file to ArtifactStagingService.
+
+Args:
+  local_path_to_artifact: Path of file to be uploaded.
+  artifact_name: File name on the artifact server.
+"""
+if not os.path.isfile(local_path_to_artifact):
+  raise ValueError(
+  'Cannot stage {0} to artifact server. Only local files can be 
staged.'
+  .format(local_path_to_artifact))
+
+def artifact_request_generator():
+  metadata = beam_artifact_api_pb2.ArtifactMetadata(name=artifact_name)
+  request = beam_artifact_api_pb2.PutArtifactRequest(metadata=metadata)
+  yield request
+  with open(local_path_to_artifact, 'rb') as f:
+while True:
+  chunk = f.read(1 << 21)  # 2MB
+  if not chunk:
+break
+  request = beam_artifact_api_pb2.PutArtifactRequest(
+  data=beam_artifact_api_pb2.ArtifactChunk(data=chunk))
+  yield request
+  self._artifacts.append(metadata)
+
+self._artifact_staging_stub.PutArtifact(artifact_request_generator())
+
+  def commit_manifest(self):
+manifest = beam_artifact_api_pb2.Manifest(artifact=self._artifacts)
+self._artifacts = []
+self._artifact_staging_stub.CommitManifest(
+beam_artifact_api_pb2.CommitManifestRequest(manifest=manifest))
diff --git 
a/sdks/python/apache_beam/runners/portability/portable_stager_test.py 
b/sdks/python/apache_beam/runners/portability/portable_stager_test.py
new file mode 100644
index 000..181007de5f0
--- /dev/null
+++ b/sdks/python/apache_beam/runners/portability/portable_stager_test.py
@@ -0,0 +1,162 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work 

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-22 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104784&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104784
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 22/May/18 21:05
Start Date: 22/May/18 21:05
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5273: 
[BEAM-3883] Adding Client to push artifacts to artifact staging service
URL: https://github.com/apache/beam/pull/5273#discussion_r190052740
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/portable_stager_test.py
 ##
 @@ -48,7 +48,10 @@ def tearDown(self):
 if self._remote_dir:
   shutil.rmtree(self._remote_dir)
 
-  def stage_files(self, files):
+  def _stage_files(self, files):
+"""
 
 Review comment:
   Made the change


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 104784)
Time Spent: 18h 50m  (was: 18h 40m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 18h 50m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-22 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104779&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104779
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 22/May/18 20:46
Start Date: 22/May/18 20:46
Worklog Time Spent: 10m 
  Work Description: tvalentyn commented on a change in pull request #5273: 
[BEAM-3883] Adding Client to push artifacts to artifact staging service
URL: https://github.com/apache/beam/pull/5273#discussion_r190047124
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/portable_stager_test.py
 ##
 @@ -48,7 +48,10 @@ def tearDown(self):
 if self._remote_dir:
   shutil.rmtree(self._remote_dir)
 
-  def stage_files(self, files):
+  def _stage_files(self, files):
+"""
 
 Review comment:
   The reason I asked for docstring is that it would be useful to describe the 
structure of `files` since it is not obvious without reading the method. How 
about:
   
   ```
   Utility method to stage files.
   
   Args: 
   
 files: a list of tuples of the form [(local_name, remote_name),...] 
describing the name of the artifacts in local temp folder and desired name in 
staging location. 
   ```


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 104779)
Time Spent: 18h 40m  (was: 18.5h)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 18h 40m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-22 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104761&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104761
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 22/May/18 19:40
Start Date: 22/May/18 19:40
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5273: 
[BEAM-3883] Adding Client to push artifacts to artifact staging service
URL: https://github.com/apache/beam/pull/5273#discussion_r190024906
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/portable_stager_test.py
 ##
 @@ -0,0 +1,148 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Test cases for :module:`artifact_service_client`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import filecmp
+import logging
+import os
+import random
+import shutil
+import string
+import tempfile
+import unittest
+from concurrent import futures
+
+import grpc
+
+from apache_beam.portability.api import beam_artifact_api_pb2
+from apache_beam.portability.api import beam_artifact_api_pb2_grpc
+from apache_beam.runners.portability import portable_stager
+
+
+class PortableStagerTest(unittest.TestCase):
+
+  def setUp(self):
+self._temp_dir = tempfile.mkdtemp()
+self._remote_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+if self._temp_dir:
+  shutil.rmtree(self._temp_dir)
+if self._remote_dir:
+  shutil.rmtree(self._remote_dir)
+
+  def stage_files(self, files):
 
 Review comment:
   Added.


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 104761)
Time Spent: 18h 20m  (was: 18h 10m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 18h 20m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-22 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104763&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104763
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 22/May/18 19:40
Start Date: 22/May/18 19:40
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5273: 
[BEAM-3883] Adding Client to push artifacts to artifact staging service
URL: https://github.com/apache/beam/pull/5273#discussion_r190023720
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/portable_stager.py
 ##
 @@ -0,0 +1,86 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""A :class:`FileHandler` to work with :class:`ArtifactStagingServiceStub`.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from apache_beam.portability.api import beam_artifact_api_pb2
+from apache_beam.portability.api import beam_artifact_api_pb2_grpc
+from apache_beam.runners.portability.stager import Stager
+
+
+class PortableStager(Stager):
+  """An implementation of :class:`Stager` to stage files on
+  ArtifactStagingService.
+
+  The class keeps track of pushed files and commit manifest once all files are
+  uploaded.
 
 Review comment:
   GRPC does not retry on connection errors. 
   I am not planning to add retry as of now as user can simply resubmit the 
pipeline if it fails.
   We can revisit it if it becomes an issue.


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 104763)
Time Spent: 18.5h  (was: 18h 20m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 18.5h
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-22 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104764&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104764
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 22/May/18 19:40
Start Date: 22/May/18 19:40
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5273: 
[BEAM-3883] Adding Client to push artifacts to artifact staging service
URL: https://github.com/apache/beam/pull/5273#discussion_r190026010
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/portable_stager_test.py
 ##
 @@ -0,0 +1,148 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Test cases for :module:`artifact_service_client`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import filecmp
+import logging
+import os
+import random
+import shutil
+import string
+import tempfile
+import unittest
+from concurrent import futures
+
+import grpc
+
+from apache_beam.portability.api import beam_artifact_api_pb2
+from apache_beam.portability.api import beam_artifact_api_pb2_grpc
+from apache_beam.runners.portability import portable_stager
+
+
+class PortableStagerTest(unittest.TestCase):
+
+  def setUp(self):
+self._temp_dir = tempfile.mkdtemp()
+self._remote_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+if self._temp_dir:
+  shutil.rmtree(self._temp_dir)
+if self._remote_dir:
+  shutil.rmtree(self._remote_dir)
+
+  def stage_files(self, files):
+server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
+beam_artifact_api_pb2_grpc.add_ArtifactStagingServiceServicer_to_server(
+TestLocalFileSystemArtifactStagingServiceServicer(self._remote_dir),
+server)
+test_port = server.add_insecure_port('[::]:0')
+server.start()
+stager = portable_stager.PortableStager(
+grpc.insecure_channel('localhost:%s' % test_port))
+for from_file, to_file in files:
+  stager.stage_artifact(
+  local_path_to_artifact=os.path.join(self._temp_dir, from_file),
+  artifact_name=to_file)
+
+return stager._artifacts
+
+  def test_stage_single_file(self):
+from_file = 'test_local.txt'
+to_file = 'test_remote.txt'
+
+with open(os.path.join(self._temp_dir, from_file), 'wb') as f:
+  f.write(b'abc')
+
+copied_files = self.stage_files([('test_local.txt', 'test_remote.txt')])
+self.assertTrue(
+filecmp.cmp(
+os.path.join(self._temp_dir, from_file),
+os.path.join(self._remote_dir, to_file)))
+self.assertEqual([to_file], [manifest.name for manifest in copied_files])
 
 Review comment:
   Renamed to staged_file_metadata.
   
   as mentioned above, commit_manifest is exercised automatically.
   
   


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 104764)
Time Spent: 18.5h  (was: 18h 20m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 18.5h
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> 

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-22 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104760&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104760
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 22/May/18 19:40
Start Date: 22/May/18 19:40
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5273: 
[BEAM-3883] Adding Client to push artifacts to artifact staging service
URL: https://github.com/apache/beam/pull/5273#discussion_r190027339
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/portable_stager_test.py
 ##
 @@ -0,0 +1,148 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Test cases for :module:`artifact_service_client`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import filecmp
+import logging
+import os
+import random
+import shutil
+import string
+import tempfile
+import unittest
+from concurrent import futures
+
+import grpc
+
+from apache_beam.portability.api import beam_artifact_api_pb2
+from apache_beam.portability.api import beam_artifact_api_pb2_grpc
+from apache_beam.runners.portability import portable_stager
+
+
+class PortableStagerTest(unittest.TestCase):
+
+  def setUp(self):
+self._temp_dir = tempfile.mkdtemp()
+self._remote_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+if self._temp_dir:
+  shutil.rmtree(self._temp_dir)
+if self._remote_dir:
+  shutil.rmtree(self._remote_dir)
+
+  def stage_files(self, files):
+server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
+beam_artifact_api_pb2_grpc.add_ArtifactStagingServiceServicer_to_server(
+TestLocalFileSystemArtifactStagingServiceServicer(self._remote_dir),
+server)
+test_port = server.add_insecure_port('[::]:0')
+server.start()
+stager = portable_stager.PortableStager(
+grpc.insecure_channel('localhost:%s' % test_port))
+for from_file, to_file in files:
+  stager.stage_artifact(
+  local_path_to_artifact=os.path.join(self._temp_dir, from_file),
+  artifact_name=to_file)
+
+return stager._artifacts
+
+  def test_stage_single_file(self):
+from_file = 'test_local.txt'
+to_file = 'test_remote.txt'
+
+with open(os.path.join(self._temp_dir, from_file), 'wb') as f:
+  f.write(b'abc')
+
+copied_files = self.stage_files([('test_local.txt', 'test_remote.txt')])
+self.assertTrue(
+filecmp.cmp(
+os.path.join(self._temp_dir, from_file),
+os.path.join(self._remote_dir, to_file)))
+self.assertEqual([to_file], [manifest.name for manifest in copied_files])
+
+  def test_stage_multiple_files(self):
+
+files = [
+('test_local_100.txt', 'test_remote_100.txt', 100, 's'),  #
+('test_local_100.binary', 'test_remote_100.binary', 100, 'b'),  #
+('test_local_1k.txt', 'test_remote_1k.txt', 1 << 10, 's'),  #
+('test_local_1k.binary', 'test_remote_1k.binary', 1 << 10, 'b'),  #
+('test_local_1m.txt', 'test_remote_1m.txt', 1 << 20, 's'),
+('test_local_1m.binary', 'test_remote_1m.binary', 1 << 20, 'b'),
+('test_local_10m.txt', 'test_remote_10m.txt', 10 * (1 << 20), 's'),
+('test_local_10m.binary', 'test_remote_10m.binary', 10 * (1 << 20), 
'b')
+]
+
+for (from_file, _, size, type) in files:
+  chars = list(string.printable)
+  random.shuffle(chars)
+  chars = list(int(size / len(chars)) * chars + chars[0:size % len(chars)])
+  if type == 's':
+with open(
+os.path.join(self._temp_dir, from_file), 'w',
+buffering=2 << 22) as f:
+  f.write(''.join(chars))
+  if type == 'b':
+with open(
+os.path.join(self._temp_dir, from_file), 'wb',
+buffering=2 << 22) as f:
+  f.write(''.join(chars))
+
+copied_files = self.stage_files(
+[(from_file, to_file) for (from_file, to_file, _, _) in files])
+
+for from_file, to_file, _, _ in files:
+  ff = os.path.join(self._temp_dir, from_file)
+  rf = 

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-22 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104762&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104762
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 22/May/18 19:40
Start Date: 22/May/18 19:40
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5273: 
[BEAM-3883] Adding Client to push artifacts to artifact staging service
URL: https://github.com/apache/beam/pull/5273#discussion_r190026690
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/portable_stager_test.py
 ##
 @@ -0,0 +1,148 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Test cases for :module:`artifact_service_client`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import filecmp
+import logging
+import os
+import random
+import shutil
+import string
+import tempfile
+import unittest
+from concurrent import futures
+
+import grpc
+
+from apache_beam.portability.api import beam_artifact_api_pb2
+from apache_beam.portability.api import beam_artifact_api_pb2_grpc
+from apache_beam.runners.portability import portable_stager
+
+
+class PortableStagerTest(unittest.TestCase):
+
+  def setUp(self):
+self._temp_dir = tempfile.mkdtemp()
+self._remote_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+if self._temp_dir:
+  shutil.rmtree(self._temp_dir)
+if self._remote_dir:
+  shutil.rmtree(self._remote_dir)
+
+  def stage_files(self, files):
+server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
+beam_artifact_api_pb2_grpc.add_ArtifactStagingServiceServicer_to_server(
+TestLocalFileSystemArtifactStagingServiceServicer(self._remote_dir),
+server)
+test_port = server.add_insecure_port('[::]:0')
+server.start()
+stager = portable_stager.PortableStager(
+grpc.insecure_channel('localhost:%s' % test_port))
+for from_file, to_file in files:
+  stager.stage_artifact(
+  local_path_to_artifact=os.path.join(self._temp_dir, from_file),
+  artifact_name=to_file)
+
+return stager._artifacts
+
+  def test_stage_single_file(self):
+from_file = 'test_local.txt'
+to_file = 'test_remote.txt'
+
+with open(os.path.join(self._temp_dir, from_file), 'wb') as f:
+  f.write(b'abc')
+
+copied_files = self.stage_files([('test_local.txt', 'test_remote.txt')])
+self.assertTrue(
+filecmp.cmp(
+os.path.join(self._temp_dir, from_file),
+os.path.join(self._remote_dir, to_file)))
+self.assertEqual([to_file], [manifest.name for manifest in copied_files])
+
+  def test_stage_multiple_files(self):
+
+files = [
+('test_local_100.txt', 'test_remote_100.txt', 100, 's'),  #
+('test_local_100.binary', 'test_remote_100.binary', 100, 'b'),  #
+('test_local_1k.txt', 'test_remote_1k.txt', 1 << 10, 's'),  #
+('test_local_1k.binary', 'test_remote_1k.binary', 1 << 10, 'b'),  #
+('test_local_1m.txt', 'test_remote_1m.txt', 1 << 20, 's'),
+('test_local_1m.binary', 'test_remote_1m.binary', 1 << 20, 'b'),
+('test_local_10m.txt', 'test_remote_10m.txt', 10 * (1 << 20), 's'),
+('test_local_10m.binary', 'test_remote_10m.binary', 10 * (1 << 20), 
'b')
+]
+
+for (from_file, _, size, type) in files:
+  chars = list(string.printable)
+  random.shuffle(chars)
+  chars = list(int(size / len(chars)) * chars + chars[0:size % len(chars)])
+  if type == 's':
+with open(
+os.path.join(self._temp_dir, from_file), 'w',
+buffering=2 << 22) as f:
+  f.write(''.join(chars))
+  if type == 'b':
+with open(
+os.path.join(self._temp_dir, from_file), 'wb',
+buffering=2 << 22) as f:
+  f.write(''.join(chars))
+
+copied_files = self.stage_files(
+[(from_file, to_file) for (from_file, to_file, _, _) in files])
+
+for from_file, to_file, _, _ in files:
+  ff = os.path.join(self._temp_dir, from_file)
 
 Review comment:
   

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-22 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104765&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104765
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 22/May/18 19:40
Start Date: 22/May/18 19:40
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5273: 
[BEAM-3883] Adding Client to push artifacts to artifact staging service
URL: https://github.com/apache/beam/pull/5273#discussion_r190023922
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/portable_stager.py
 ##
 @@ -0,0 +1,86 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""A :class:`FileHandler` to work with :class:`ArtifactStagingServiceStub`.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from apache_beam.portability.api import beam_artifact_api_pb2
+from apache_beam.portability.api import beam_artifact_api_pb2_grpc
+from apache_beam.runners.portability.stager import Stager
+
+
+class PortableStager(Stager):
+  """An implementation of :class:`Stager` to stage files on
+  ArtifactStagingService.
+
+  The class keeps track of pushed files and commit manifest once all files are
+  uploaded.
+
+  Note: This class is not thread safe and user of this class should ensure
+  thread safety.
+  """
+
+  def __init__(self, artifact_service_channel):
+"""Creates a new Stager to stage file to ArtifactStagingService.
+
+Args:
+  artifact_service_channel: Channel used to interact with
+ArtifactStagingService.User owns the channel and should close it when
+finished.
+"""
+super(PortableStager, self).__init__()
+self._artifact_staging_stub = beam_artifact_api_pb2_grpc.\
+ArtifactStagingServiceStub(channel=artifact_service_channel)
+self._artifacts = []
+
+  def stage_artifact(self, local_path_to_artifact, artifact_name):
+"""Stage a file to ArtifactStagingService.
+
+Args:
+  local_path_to_artifact: Path of file to be uploaded.
+  artifact_name: File name on the artifact server.
+"""
+if not os.path.isfile(local_path_to_artifact):
+  raise ValueError('Can only stage file to artifact server. from_path: {0} 
'
 
 Review comment:
   Done


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 104765)
Time Spent: 18.5h  (was: 18h 20m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 18.5h
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-22 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104759&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104759
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 22/May/18 19:40
Start Date: 22/May/18 19:40
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5273: 
[BEAM-3883] Adding Client to push artifacts to artifact staging service
URL: https://github.com/apache/beam/pull/5273#discussion_r190022847
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/portable_stager.py
 ##
 @@ -0,0 +1,86 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""A :class:`FileHandler` to work with :class:`ArtifactStagingServiceStub`.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from apache_beam.portability.api import beam_artifact_api_pb2
+from apache_beam.portability.api import beam_artifact_api_pb2_grpc
+from apache_beam.runners.portability.stager import Stager
+
+
+class PortableStager(Stager):
+  """An implementation of :class:`Stager` to stage files on
+  ArtifactStagingService.
+
+  The class keeps track of pushed files and commit manifest once all files are
 
 Review comment:
   Stager calls commit to commit the changes automatically 
https://github.com/apache/beam/blob/master/sdks/python/apache_beam/runners/portability/stager.py#L255


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 104759)
Time Spent: 18h  (was: 17h 50m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 18h
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-22 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104708&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104708
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 22/May/18 17:21
Start Date: 22/May/18 17:21
Worklog Time Spent: 10m 
  Work Description: tvalentyn commented on a change in pull request #5273: 
[BEAM-3883] Adding Client to push artifacts to artifact staging service
URL: https://github.com/apache/beam/pull/5273#discussion_r189982515
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/portable_stager_test.py
 ##
 @@ -0,0 +1,148 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Test cases for :module:`artifact_service_client`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import filecmp
+import logging
+import os
+import random
+import shutil
+import string
+import tempfile
+import unittest
+from concurrent import futures
+
+import grpc
+
+from apache_beam.portability.api import beam_artifact_api_pb2
+from apache_beam.portability.api import beam_artifact_api_pb2_grpc
+from apache_beam.runners.portability import portable_stager
+
+
+class PortableStagerTest(unittest.TestCase):
+
+  def setUp(self):
+self._temp_dir = tempfile.mkdtemp()
+self._remote_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+if self._temp_dir:
+  shutil.rmtree(self._temp_dir)
+if self._remote_dir:
+  shutil.rmtree(self._remote_dir)
+
+  def stage_files(self, files):
+server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
+beam_artifact_api_pb2_grpc.add_ArtifactStagingServiceServicer_to_server(
+TestLocalFileSystemArtifactStagingServiceServicer(self._remote_dir),
+server)
+test_port = server.add_insecure_port('[::]:0')
+server.start()
+stager = portable_stager.PortableStager(
+grpc.insecure_channel('localhost:%s' % test_port))
+for from_file, to_file in files:
+  stager.stage_artifact(
+  local_path_to_artifact=os.path.join(self._temp_dir, from_file),
+  artifact_name=to_file)
+
+return stager._artifacts
+
+  def test_stage_single_file(self):
+from_file = 'test_local.txt'
+to_file = 'test_remote.txt'
+
+with open(os.path.join(self._temp_dir, from_file), 'wb') as f:
+  f.write(b'abc')
+
+copied_files = self.stage_files([('test_local.txt', 'test_remote.txt')])
+self.assertTrue(
+filecmp.cmp(
+os.path.join(self._temp_dir, from_file),
+os.path.join(self._remote_dir, to_file)))
+self.assertEqual([to_file], [manifest.name for manifest in copied_files])
+
+  def test_stage_multiple_files(self):
+
+files = [
+('test_local_100.txt', 'test_remote_100.txt', 100, 's'),  #
+('test_local_100.binary', 'test_remote_100.binary', 100, 'b'),  #
+('test_local_1k.txt', 'test_remote_1k.txt', 1 << 10, 's'),  #
+('test_local_1k.binary', 'test_remote_1k.binary', 1 << 10, 'b'),  #
+('test_local_1m.txt', 'test_remote_1m.txt', 1 << 20, 's'),
+('test_local_1m.binary', 'test_remote_1m.binary', 1 << 20, 'b'),
+('test_local_10m.txt', 'test_remote_10m.txt', 10 * (1 << 20), 's'),
+('test_local_10m.binary', 'test_remote_10m.binary', 10 * (1 << 20), 
'b')
+]
+
+for (from_file, _, size, type) in files:
+  chars = list(string.printable)
+  random.shuffle(chars)
+  chars = list(int(size / len(chars)) * chars + chars[0:size % len(chars)])
+  if type == 's':
+with open(
+os.path.join(self._temp_dir, from_file), 'w',
+buffering=2 << 22) as f:
+  f.write(''.join(chars))
+  if type == 'b':
+with open(
+os.path.join(self._temp_dir, from_file), 'wb',
+buffering=2 << 22) as f:
+  f.write(''.join(chars))
+
+copied_files = self.stage_files(
+[(from_file, to_file) for (from_file, to_file, _, _) in files])
+
+for from_file, to_file, _, _ in files:
+  ff = os.path.join(self._temp_dir, from_file)
+  rf = 

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-22 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104705&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104705
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 22/May/18 17:21
Start Date: 22/May/18 17:21
Worklog Time Spent: 10m 
  Work Description: tvalentyn commented on a change in pull request #5273: 
[BEAM-3883] Adding Client to push artifacts to artifact staging service
URL: https://github.com/apache/beam/pull/5273#discussion_r189981700
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/portable_stager_test.py
 ##
 @@ -0,0 +1,148 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Test cases for :module:`artifact_service_client`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import filecmp
+import logging
+import os
+import random
+import shutil
+import string
+import tempfile
+import unittest
+from concurrent import futures
+
+import grpc
+
+from apache_beam.portability.api import beam_artifact_api_pb2
+from apache_beam.portability.api import beam_artifact_api_pb2_grpc
+from apache_beam.runners.portability import portable_stager
+
+
+class PortableStagerTest(unittest.TestCase):
+
+  def setUp(self):
+self._temp_dir = tempfile.mkdtemp()
+self._remote_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+if self._temp_dir:
+  shutil.rmtree(self._temp_dir)
+if self._remote_dir:
+  shutil.rmtree(self._remote_dir)
+
+  def stage_files(self, files):
+server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
+beam_artifact_api_pb2_grpc.add_ArtifactStagingServiceServicer_to_server(
+TestLocalFileSystemArtifactStagingServiceServicer(self._remote_dir),
+server)
+test_port = server.add_insecure_port('[::]:0')
+server.start()
+stager = portable_stager.PortableStager(
+grpc.insecure_channel('localhost:%s' % test_port))
+for from_file, to_file in files:
+  stager.stage_artifact(
+  local_path_to_artifact=os.path.join(self._temp_dir, from_file),
+  artifact_name=to_file)
+
+return stager._artifacts
+
+  def test_stage_single_file(self):
+from_file = 'test_local.txt'
+to_file = 'test_remote.txt'
+
+with open(os.path.join(self._temp_dir, from_file), 'wb') as f:
+  f.write(b'abc')
+
+copied_files = self.stage_files([('test_local.txt', 'test_remote.txt')])
+self.assertTrue(
+filecmp.cmp(
+os.path.join(self._temp_dir, from_file),
+os.path.join(self._remote_dir, to_file)))
+self.assertEqual([to_file], [manifest.name for manifest in copied_files])
+
+  def test_stage_multiple_files(self):
+
+files = [
+('test_local_100.txt', 'test_remote_100.txt', 100, 's'),  #
+('test_local_100.binary', 'test_remote_100.binary', 100, 'b'),  #
+('test_local_1k.txt', 'test_remote_1k.txt', 1 << 10, 's'),  #
+('test_local_1k.binary', 'test_remote_1k.binary', 1 << 10, 'b'),  #
+('test_local_1m.txt', 'test_remote_1m.txt', 1 << 20, 's'),
+('test_local_1m.binary', 'test_remote_1m.binary', 1 << 20, 'b'),
+('test_local_10m.txt', 'test_remote_10m.txt', 10 * (1 << 20), 's'),
+('test_local_10m.binary', 'test_remote_10m.binary', 10 * (1 << 20), 
'b')
+]
+
+for (from_file, _, size, type) in files:
+  chars = list(string.printable)
+  random.shuffle(chars)
+  chars = list(int(size / len(chars)) * chars + chars[0:size % len(chars)])
+  if type == 's':
+with open(
+os.path.join(self._temp_dir, from_file), 'w',
+buffering=2 << 22) as f:
+  f.write(''.join(chars))
+  if type == 'b':
+with open(
+os.path.join(self._temp_dir, from_file), 'wb',
+buffering=2 << 22) as f:
+  f.write(''.join(chars))
+
+copied_files = self.stage_files(
+[(from_file, to_file) for (from_file, to_file, _, _) in files])
+
+for from_file, to_file, _, _ in files:
+  ff = os.path.join(self._temp_dir, from_file)
 
 Review comment:
  

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-22 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104707&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104707
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 22/May/18 17:21
Start Date: 22/May/18 17:21
Worklog Time Spent: 10m 
  Work Description: tvalentyn commented on a change in pull request #5273: 
[BEAM-3883] Adding Client to push artifacts to artifact staging service
URL: https://github.com/apache/beam/pull/5273#discussion_r189984251
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/portable_stager.py
 ##
 @@ -0,0 +1,86 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""A :class:`FileHandler` to work with :class:`ArtifactStagingServiceStub`.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from apache_beam.portability.api import beam_artifact_api_pb2
+from apache_beam.portability.api import beam_artifact_api_pb2_grpc
+from apache_beam.runners.portability.stager import Stager
+
+
+class PortableStager(Stager):
+  """An implementation of :class:`Stager` to stage files on
+  ArtifactStagingService.
+
+  The class keeps track of pushed files and commit manifest once all files are
+  uploaded.
 
 Review comment:
   I am curious, do we plan to add retry logic here (perhaps in later PRs) if 
some part of staging falls through? Or does gRPC handle retries automatically?


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 104707)
Time Spent: 17h 40m  (was: 17.5h)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 17h 40m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-22 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104702&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104702
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 22/May/18 17:21
Start Date: 22/May/18 17:21
Worklog Time Spent: 10m 
  Work Description: tvalentyn commented on a change in pull request #5273: 
[BEAM-3883] Adding Client to push artifacts to artifact staging service
URL: https://github.com/apache/beam/pull/5273#discussion_r189984592
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/portable_stager_test.py
 ##
 @@ -0,0 +1,148 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Test cases for :module:`artifact_service_client`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import filecmp
+import logging
+import os
+import random
+import shutil
+import string
+import tempfile
+import unittest
+from concurrent import futures
+
+import grpc
+
+from apache_beam.portability.api import beam_artifact_api_pb2
+from apache_beam.portability.api import beam_artifact_api_pb2_grpc
+from apache_beam.runners.portability import portable_stager
+
+
+class PortableStagerTest(unittest.TestCase):
+
+  def setUp(self):
+self._temp_dir = tempfile.mkdtemp()
+self._remote_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+if self._temp_dir:
+  shutil.rmtree(self._temp_dir)
+if self._remote_dir:
+  shutil.rmtree(self._remote_dir)
+
+  def stage_files(self, files):
+server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
+beam_artifact_api_pb2_grpc.add_ArtifactStagingServiceServicer_to_server(
+TestLocalFileSystemArtifactStagingServiceServicer(self._remote_dir),
+server)
+test_port = server.add_insecure_port('[::]:0')
+server.start()
+stager = portable_stager.PortableStager(
+grpc.insecure_channel('localhost:%s' % test_port))
+for from_file, to_file in files:
+  stager.stage_artifact(
+  local_path_to_artifact=os.path.join(self._temp_dir, from_file),
+  artifact_name=to_file)
+
+return stager._artifacts
+
+  def test_stage_single_file(self):
+from_file = 'test_local.txt'
+to_file = 'test_remote.txt'
+
+with open(os.path.join(self._temp_dir, from_file), 'wb') as f:
+  f.write(b'abc')
+
+copied_files = self.stage_files([('test_local.txt', 'test_remote.txt')])
+self.assertTrue(
+filecmp.cmp(
+os.path.join(self._temp_dir, from_file),
+os.path.join(self._remote_dir, to_file)))
+self.assertEqual([to_file], [manifest.name for manifest in copied_files])
 
 Review comment:
   Manifest is a **list** of staged files, so it's not the best name for the 
loop variable here. If we want to check that manifest is created and staged, I 
would actually exercise commit_manifest behavior. 


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 104702)
Time Spent: 17h  (was: 16h 50m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 17h
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-22 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104706=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104706
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 22/May/18 17:21
Start Date: 22/May/18 17:21
Worklog Time Spent: 10m 
  Work Description: tvalentyn commented on a change in pull request #5273: 
[BEAM-3883] Adding Client to push artifacts to artifact staging service
URL: https://github.com/apache/beam/pull/5273#discussion_r189951783
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/portable_stager_test.py
 ##
 @@ -0,0 +1,148 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Test cases for :module:`artifact_service_client`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import filecmp
+import logging
+import os
+import random
+import shutil
+import string
+import tempfile
+import unittest
+from concurrent import futures
+
+import grpc
+
+from apache_beam.portability.api import beam_artifact_api_pb2
+from apache_beam.portability.api import beam_artifact_api_pb2_grpc
+from apache_beam.runners.portability import portable_stager
+
+
+class PortableStagerTest(unittest.TestCase):
+
+  def setUp(self):
+self._temp_dir = tempfile.mkdtemp()
+self._remote_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+if self._temp_dir:
+  shutil.rmtree(self._temp_dir)
+if self._remote_dir:
+  shutil.rmtree(self._remote_dir)
+
+  def stage_files(self, files):
 
 Review comment:
   Please add a docstring for this method and clarify that `files` is a list of 
tuples,


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 104706)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 17.5h
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-22 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104703=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104703
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 22/May/18 17:21
Start Date: 22/May/18 17:21
Worklog Time Spent: 10m 
  Work Description: tvalentyn commented on a change in pull request #5273: 
[BEAM-3883] Adding Client to push artifacts to artifact staging service
URL: https://github.com/apache/beam/pull/5273#discussion_r189931243
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/portable_stager.py
 ##
 @@ -0,0 +1,86 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""A :class:`FileHandler` to work with :class:`ArtifactStagingServiceStub`.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from apache_beam.portability.api import beam_artifact_api_pb2
+from apache_beam.portability.api import beam_artifact_api_pb2_grpc
+from apache_beam.runners.portability.stager import Stager
+
+
+class PortableStager(Stager):
+  """An implementation of :class:`Stager` to stage files on
+  ArtifactStagingService.
+
+  The class keeps track of pushed files and commit manifest once all files are
+  uploaded.
+
+  Note: This class is not thread safe and user of this class should ensure
+  thread safety.
+  """
+
+  def __init__(self, artifact_service_channel):
+"""Creates a new Stager to stage file to ArtifactStagingService.
+
+Args:
+  artifact_service_channel: Channel used to interact with
+ArtifactStagingService.User owns the channel and should close it when
+finished.
+"""
+super(PortableStager, self).__init__()
+self._artifact_staging_stub = beam_artifact_api_pb2_grpc.\
+ArtifactStagingServiceStub(channel=artifact_service_channel)
+self._artifacts = []
+
+  def stage_artifact(self, local_path_to_artifact, artifact_name):
+"""Stage a file to ArtifactStagingService.
+
+Args:
+  local_path_to_artifact: Path of file to be uploaded.
+  artifact_name: File name on the artifact server.
+"""
+if not os.path.isfile(local_path_to_artifact):
+  raise ValueError('Can only stage file to artifact server. from_path: {0} 
'
 
 Review comment:
   Let's rewrite the message so that the root cause is emphasized. How about: 
`Cannot stage {0} to artifact server. Only local files can be staged. `


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 104703)
Time Spent: 17h 10m  (was: 17h)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 17h 10m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-22 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104704=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104704
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 22/May/18 17:21
Start Date: 22/May/18 17:21
Worklog Time Spent: 10m 
  Work Description: tvalentyn commented on a change in pull request #5273: 
[BEAM-3883] Adding Client to push artifacts to artifact staging service
URL: https://github.com/apache/beam/pull/5273#discussion_r189929366
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/portable_stager.py
 ##
 @@ -0,0 +1,86 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""A :class:`FileHandler` to work with :class:`ArtifactStagingServiceStub`.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from apache_beam.portability.api import beam_artifact_api_pb2
+from apache_beam.portability.api import beam_artifact_api_pb2_grpc
+from apache_beam.runners.portability.stager import Stager
+
+
+class PortableStager(Stager):
+  """An implementation of :class:`Stager` to stage files on
+  ArtifactStagingService.
+
+  The class keeps track of pushed files and commit manifest once all files are
 
 Review comment:
   Committing manifest is a responsibility of the superclass. Perhaps we could 
say 
   `The class keeps track of staged files and can commit a manifest of all 
files that were staged.`


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 104704)
Time Spent: 17h 20m  (was: 17h 10m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 17h 20m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-21 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104331=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104331
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 21/May/18 23:05
Start Date: 21/May/18 23:05
Worklog Time Spent: 10m 
  Work Description: jkff closed pull request #5251: [BEAM-3883] Refactor 
and clean dependency.py to make it reusable with artifact service
URL: https://github.com/apache/beam/pull/5251
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/sdks/python/apache_beam/runners/dataflow/internal/apiclient.py 
b/sdks/python/apache_beam/runners/dataflow/internal/apiclient.py
index 54eba06abb8..72c54a40cfe 100644
--- a/sdks/python/apache_beam/runners/dataflow/internal/apiclient.py
+++ b/sdks/python/apache_beam/runners/dataflow/internal/apiclient.py
@@ -25,14 +25,17 @@
 import logging
 import os
 import re
+import tempfile
 import time
 from datetime import datetime
 from StringIO import StringIO
 
+import pkg_resources
 from apitools.base.py import encoding
 from apitools.base.py import exceptions
 import six
 
+from apache_beam import version as beam_version
 from apache_beam.internal.gcp.auth import get_service_credentials
 from apache_beam.internal.gcp.json_value import to_json_value
 from apache_beam.io.filesystems import FileSystems
@@ -41,11 +44,10 @@
 from apache_beam.options.pipeline_options import GoogleCloudOptions
 from apache_beam.options.pipeline_options import StandardOptions
 from apache_beam.options.pipeline_options import WorkerOptions
-from apache_beam.runners.dataflow.internal import dependency
 from apache_beam.runners.dataflow.internal import names
 from apache_beam.runners.dataflow.internal.clients import dataflow
-from apache_beam.runners.dataflow.internal.dependency import 
get_sdk_name_and_version
 from apache_beam.runners.dataflow.internal.names import PropertyNames
+from apache_beam.runners.portability.stager import Stager
 from apache_beam.transforms import cy_combiners
 from apache_beam.transforms import DataflowDistributionCounter
 from apache_beam.transforms.display import DisplayData
@@ -169,7 +171,7 @@ def __init__(self, packages, options, environment_version, 
pipeline_url):
 # TODO: Use enumerated type instead of strings for job types.
 if job_type.startswith('FNAPI_'):
   runner_harness_override = (
-  dependency.get_runner_harness_container_image())
+  get_runner_harness_container_image())
   self.debug_options.experiments = self.debug_options.experiments or []
   if runner_harness_override:
 self.debug_options.experiments.append(
@@ -234,7 +236,7 @@ def __init__(self, packages, options, environment_version, 
pipeline_url):
   self.worker_options.worker_harness_container_image)
 else:
   pool.workerHarnessContainerImage = (
-  dependency.get_default_container_image_for_current_sdk(job_type))
+  get_default_container_image_for_current_sdk(job_type))
 if self.worker_options.use_public_ips is not None:
   if self.worker_options.use_public_ips:
 pool.ipConfiguration = (
@@ -432,6 +434,19 @@ def _gcs_file_copy(self, from_path, to_path):
 with open(from_path, 'rb') as f:
   self.stage_file(to_folder, to_name, f)
 
+  def _stage_resources(self, options):
+google_cloud_options = options.view_as(GoogleCloudOptions)
+if google_cloud_options.staging_location is None:
+  raise RuntimeError('The --staging_location option must be specified.')
+if google_cloud_options.temp_location is None:
+  raise RuntimeError('The --temp_location option must be specified.')
+
+resource_stager = _LegacyDataflowStager(self)
+return resource_stager.stage_job_resources(
+options,
+temp_dir=tempfile.mkdtemp(),
+staging_location=google_cloud_options.staging_location)
+
   def stage_file(self, gcs_or_local_path, file_name, stream,
  mime_type='application/octet-stream'):
 """Stages a file at a GCS or local path with stream-supplied contents."""
@@ -496,8 +511,7 @@ def create_job_description(self, job):
 StringIO(job.proto_pipeline.SerializeToString()))
 
 # Stage other resources for the SDK harness
-resources = dependency.stage_job_resources(
-job.options, file_copy=self._gcs_file_copy)
+resources = self._stage_resources(job.options)
 
 job.proto.environment = Environment(
 
pipeline_url=FileSystems.join(job.google_cloud_options.staging_location,
@@ -731,6 +745,31 @@ def translate_scalar_counter_float(accumulator, 
metric_update_proto):
 

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-21 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104292=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104292
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 21/May/18 21:53
Start Date: 21/May/18 21:53
Worklog Time Spent: 10m 
  Work Description: tvalentyn commented on issue #5273: [BEAM-3883] Adding 
Client to push artifacts to artifact staging service
URL: https://github.com/apache/beam/pull/5273#issuecomment-390794532
 
 
   @angoenka Please let me know once this is ready for review.


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 104292)
Time Spent: 16.5h  (was: 16h 20m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 16.5h
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-21 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104285=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104285
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 21/May/18 21:47
Start Date: 21/May/18 21:47
Worklog Time Spent: 10m 
  Work Description: tvalentyn commented on issue #5251: [BEAM-3883] 
Refactor and clean dependency.py to make it reusable with artifact service
URL: https://github.com/apache/beam/pull/5251#issuecomment-390793029
 
 
   LGTM
   cc: @aaltay 


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 104285)
Time Spent: 16h 10m  (was: 16h)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 16h 10m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-21 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104287=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104287
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 21/May/18 21:47
Start Date: 21/May/18 21:47
Worklog Time Spent: 10m 
  Work Description: tvalentyn commented on issue #5251: [BEAM-3883] 
Refactor and clean dependency.py to make it reusable with artifact service
URL: https://github.com/apache/beam/pull/5251#issuecomment-390793029
 
 
   LGTM. Thanks, @angoenka.
   cc: @aaltay 


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 104287)
Time Spent: 16h 20m  (was: 16h 10m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 16h 20m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-21 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104277=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104277
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 21/May/18 21:15
Start Date: 21/May/18 21:15
Worklog Time Spent: 10m 
  Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor 
and clean dependency.py to make it reusable with artifact service
URL: https://github.com/apache/beam/pull/5251#issuecomment-390785167
 
 
   Run Python Dataflow ValidatesRunner


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 104277)
Time Spent: 16h  (was: 15h 50m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 16h
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-21 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=104208=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-104208
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 21/May/18 19:48
Start Date: 21/May/18 19:48
Worklog Time Spent: 10m 
  Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor 
and clean dependency.py to make it reusable with artifact service
URL: https://github.com/apache/beam/pull/5251#issuecomment-390762120
 
 
   Run Python Dataflow ValidatesRunner


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 104208)
Time Spent: 15h 50m  (was: 15h 40m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 15h 50m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-18 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=103528=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-103528
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 18/May/18 18:40
Start Date: 18/May/18 18:40
Worklog Time Spent: 10m 
  Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor 
and clean dependency.py to make it reusable with artifact service
URL: https://github.com/apache/beam/pull/5251#issuecomment-390296544
 
 
   Run Python Dataflow ValidatesRunner


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 103528)
Time Spent: 15h 40m  (was: 15.5h)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 15h 40m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-17 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=103171&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-103171
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 17/May/18 23:35
Start Date: 17/May/18 23:35
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r189130230
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager.py
 ##
 @@ -0,0 +1,551 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Support for installing custom code and required dependencies.
+
+Workflows, with the exception of very simple ones, are organized in multiple
+modules and packages. Typically, these modules and packages have
+dependencies on other standard libraries. Beam relies on the Python
+setuptools package to handle these scenarios. For further details please read:
+https://pythonhosted.org/an_example_pypi_project/setuptools.html
+
+When a runner tries to run a pipeline it will check for a --requirements_file
+and a --setup_file option.
+
+If --setup_file is present then it is assumed that the folder containing the
+file specified by the option has the typical layout required by setuptools and
+it will run 'python setup.py sdist' to produce a source distribution. The
+resulting tarball (a .tar or .tar.gz file) will be staged at the staging
+location specified as job option. When a worker starts it will check for the
+presence of this file and will run 'easy_install tarball' to install the
+package in the worker.
+
+If --requirements_file is present then the file specified by the option will be
+staged in the staging location.  When a worker starts it will check for the
+presence of this file and will run 'pip install -r requirements.txt'. A
+requirements file can be easily generated by running 'pip freeze -r
+requirements.txt'. The reason a runner does not run this automatically is
+because quite often only a small fraction of the dependencies present in a
+requirements.txt file are actually needed for remote execution and therefore a
+one-time manual trimming is desirable.
+
+TODO(silviuc): Should we allow several setup packages?
+TODO(silviuc): We should allow customizing the exact command for setup build.
+"""
+import glob
+import logging
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+
+import pkg_resources
+
+from apache_beam.internal import pickler
+from apache_beam.io.filesystems import FileSystems
+from apache_beam.options.pipeline_options import SetupOptions
+# TODO(angoenka): Remove reference to dataflow internal names
+from apache_beam.runners.dataflow.internal import names
+from apache_beam.utils import processes
+
+# All constants are for internal use only; no backwards-compatibility
+# guarantees.
+
+# Standard file names used for staging files.
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz'
+REQUIREMENTS_FILE = 'requirements.txt'
+EXTRA_PACKAGES_FILE = 'extra_packages.txt'
+
+# Package names for distributions
+BEAM_PACKAGE_NAME = 'apache-beam'
+
+
+class Stager(object):
+  """Stager identifies and copies the appropriate artifacts to the staging
+  location."""
+
+  @staticmethod
+  def _download_file(from_url, to_path):
+"""Downloads a file over http/https from a url or copy it from a remote
+path to local path."""
+if from_url.startswith('http://') or from_url.startswith('https://'):
+  # TODO(silviuc): We should cache downloads so we do not do it for every
+  # job.
+  try:
+# We check if the file is actually there because wget returns a file
+# even for a 404 response (file will contain the contents of the 404
+# response).
+# TODO(angoenka): Extract and use the filename when downloading file.
+response, content = __import__('httplib2').Http().request(from_url)
+if int(response['status']) >= 400:
+  raise RuntimeError(
+  'Artifact not found at %s (response: %s)' % 

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-17 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=103172&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-103172
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 17/May/18 23:35
Start Date: 17/May/18 23:35
Worklog Time Spent: 10m 
  Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor 
and clean dependency.py to make it reusable with artifact service
URL: https://github.com/apache/beam/pull/5251#issuecomment-390046389
 
 
   Run Python Dataflow ValidatesRunner


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 103172)
Time Spent: 15.5h  (was: 15h 20m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 15.5h
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-17 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=103163&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-103163
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 17/May/18 23:07
Start Date: 17/May/18 23:07
Worklog Time Spent: 10m 
  Work Description: pabloem commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r189126287
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager.py
 ##
 @@ -0,0 +1,551 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Support for installing custom code and required dependencies.
+
+Workflows, with the exception of very simple ones, are organized in multiple
+modules and packages. Typically, these modules and packages have
+dependencies on other standard libraries. Beam relies on the Python
+setuptools package to handle these scenarios. For further details please read:
+https://pythonhosted.org/an_example_pypi_project/setuptools.html
+
+When a runner tries to run a pipeline it will check for a --requirements_file
+and a --setup_file option.
+
+If --setup_file is present then it is assumed that the folder containing the
+file specified by the option has the typical layout required by setuptools and
+it will run 'python setup.py sdist' to produce a source distribution. The
+resulting tarball (a .tar or .tar.gz file) will be staged at the staging
+location specified as job option. When a worker starts it will check for the
+presence of this file and will run 'easy_install tarball' to install the
+package in the worker.
+
+If --requirements_file is present then the file specified by the option will be
+staged in the staging location.  When a worker starts it will check for the
+presence of this file and will run 'pip install -r requirements.txt'. A
+requirements file can be easily generated by running 'pip freeze -r
+requirements.txt'. The reason a runner does not run this automatically is
+because quite often only a small fraction of the dependencies present in a
+requirements.txt file are actually needed for remote execution and therefore a
+one-time manual trimming is desirable.
+
+TODO(silviuc): Should we allow several setup packages?
+TODO(silviuc): We should allow customizing the exact command for setup build.
+"""
+import glob
+import logging
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+
+import pkg_resources
+
+from apache_beam.internal import pickler
+from apache_beam.io.filesystems import FileSystems
+from apache_beam.options.pipeline_options import SetupOptions
+# TODO(angoenka): Remove reference to dataflow internal names
+from apache_beam.runners.dataflow.internal import names
+from apache_beam.utils import processes
+
+# All constants are for internal use only; no backwards-compatibility
+# guarantees.
+
+# Standard file names used for staging files.
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz'
+REQUIREMENTS_FILE = 'requirements.txt'
+EXTRA_PACKAGES_FILE = 'extra_packages.txt'
+
+# Package names for distributions
+BEAM_PACKAGE_NAME = 'apache-beam'
+
+
+class Stager(object):
+  """Stager identifies and copies the appropriate artifacts to the staging
+  location."""
+
+  @staticmethod
+  def _download_file(from_url, to_path):
+"""Downloads a file over http/https from a url or copy it from a remote
+path to local path."""
+if from_url.startswith('http://') or from_url.startswith('https://'):
+  # TODO(silviuc): We should cache downloads so we do not do it for every
+  # job.
+  try:
+# We check if the file is actually there because wget returns a file
+# even for a 404 response (file will contain the contents of the 404
+# response).
+# TODO(angoenka): Extract and use the filename when downloading file.
+response, content = __import__('httplib2').Http().request(from_url)
+if int(response['status']) >= 400:
+  raise RuntimeError(
+  'Artifact not found at %s (response: %s)' % 

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-17 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=103157&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-103157
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 17/May/18 22:57
Start Date: 17/May/18 22:57
Worklog Time Spent: 10m 
  Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor 
and clean dependency.py to make it reusable with artifact service
URL: https://github.com/apache/beam/pull/5251#issuecomment-390039131
 
 
   Run Python Dataflow ValidatesRunner


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 103157)
Time Spent: 15h  (was: 14h 50m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 15h
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-17 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=103088&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-103088
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 17/May/18 20:12
Start Date: 17/May/18 20:12
Worklog Time Spent: 10m 
  Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor 
and clean dependency.py to make it reusable with artifact service
URL: https://github.com/apache/beam/pull/5251#issuecomment-389995031
 
 
   Run Python Dataflow ValidatesRunner


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 103088)
Time Spent: 14h 50m  (was: 14h 40m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 14h 50m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-16 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102749&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102749
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 17/May/18 01:38
Start Date: 17/May/18 01:38
Worklog Time Spent: 10m 
  Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor 
and clean dependency.py to make it reusable with artifact service
URL: https://github.com/apache/beam/pull/5251#issuecomment-389717316
 
 
   Run Python Dataflow ValidatesRunner


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 102749)
Time Spent: 14h 40m  (was: 14.5h)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 14h 40m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-16 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102748&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102748
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 17/May/18 01:38
Start Date: 17/May/18 01:38
Worklog Time Spent: 10m 
  Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor 
and clean dependency.py to make it reusable with artifact service
URL: https://github.com/apache/beam/pull/5251#issuecomment-389717242
 
 
   Updated the PR based on our discussion.
   PTAL


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 102748)
Time Spent: 14.5h  (was: 14h 20m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 14.5h
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-16 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102741&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102741
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 17/May/18 00:37
Start Date: 17/May/18 00:37
Worklog Time Spent: 10m 
  Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor 
and clean dependency.py to make it reusable with artifact service
URL: https://github.com/apache/beam/pull/5251#issuecomment-389708022
 
 
   @valentyn I am converting all the staticmethods to module methods as mocking 
static methods is even more difficult.
   


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 102741)
Time Spent: 14h 20m  (was: 14h 10m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 14h 20m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-16 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102735&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102735
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 17/May/18 00:10
Start Date: 17/May/18 00:10
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188807218
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager_test.py
 ##
 @@ -420,134 +375,167 @@ def 
test_sdk_location_local_directory_not_present(self):
 sdk_location = 'nosuchdir'
 with self.assertRaises(RuntimeError) as cm:
   options = PipelineOptions()
-  options.view_as(GoogleCloudOptions).staging_location = staging_dir
   self.update_options(options)
   options.view_as(SetupOptions).sdk_location = sdk_location
 
-  dependency.stage_job_resources(options)
+  self.stager.stage_job_resources(options, staging_location=staging_dir)
 self.assertEqual(
 'The file "%s" cannot be found. Its '
 'location was specified by the --sdk_location command-line option.' %
-sdk_location,
-cm.exception.args[0])
+sdk_location, cm.exception.args[0])
 
-  def test_sdk_location_gcs_source_file(self):
+  def test_sdk_location_remote_source_file(self):
 staging_dir = self.make_temp_dir()
 sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz'
 
 options = PipelineOptions()
-options.view_as(GoogleCloudOptions).staging_location = staging_dir
 self.update_options(options)
 options.view_as(SetupOptions).sdk_location = sdk_location
 
-with mock.patch('apache_beam.runners.dataflow.internal.'
-'dependency._dependency_file_copy'):
-  self.assertEqual(
-  [names.DATAFLOW_SDK_TARBALL_FILE],
-  dependency.stage_job_resources(options))
-
-  def test_sdk_location_gcs_wheel_file(self):
+with mock.patch('.'.join([
+self.__module__, TestStager.__name__, 
TestStager.stage_artifact.__name__
+])):
+  with mock.patch('.'.join([
+  self.__module__, TestStager.__name__,
+  TestStager._download_file.__name__
+  ])):
+self.assertEqual([names.DATAFLOW_SDK_TARBALL_FILE],
+ self.stager.stage_job_resources(
+ options, staging_location=staging_dir))
+
+  def test_sdk_location_remote_wheel_file(self):
 staging_dir = self.make_temp_dir()
 sdk_filename = 'apache_beam-1.0.0-cp27-cp27mu-manylinux1_x86_64.whl'
-sdk_location = 'gs://my-gcs-bucket/' + sdk_filename
+sdk_location = '/tmp/remote/my-bucket/' + sdk_filename
 
 options = PipelineOptions()
-options.view_as(GoogleCloudOptions).staging_location = staging_dir
 self.update_options(options)
 options.view_as(SetupOptions).sdk_location = sdk_location
 
-with mock.patch('apache_beam.runners.dataflow.internal.'
-'dependency._dependency_file_copy'):
-  self.assertEqual(
-  [sdk_filename],
-  dependency.stage_job_resources(options))
+# We can not rely on actual remote file systems paths hence making
+# '/tmp/remote/' a new remote path.
+def is_remote_path(dummy_self, path):
+  return path.startswith('/tmp/remote/')
+
+with mock.patch('.'.join([
+self.__module__, TestStager.__name__, 
TestStager.stage_artifact.__name__
+])):
+  with mock.patch('.'.join([
+  self.__module__, TestStager.__name__,
+  TestStager._download_file.__name__
+  ])):
+with mock.patch(
+'.'.join([
+self.__module__, TestStager.__name__,
+TestStager._is_remote_path.__name__
+]), is_remote_path):
+  self.assertEqual([sdk_filename],
+   self.stager.stage_job_resources(
+   options, staging_location=staging_dir))
 
   def test_sdk_location_http(self):
 staging_dir = self.make_temp_dir()
 sdk_location = 'http://storage.googleapis.com/my-gcs-bucket/tarball.tar.gz'
 
 options = PipelineOptions()
-options.view_as(GoogleCloudOptions).staging_location = staging_dir
 self.update_options(options)
 options.view_as(SetupOptions).sdk_location = sdk_location
 
-def file_download(_, to_folder):
-  tarball_path = os.path.join(to_folder, 'sdk-tarball')
-  with open(tarball_path, 'w') as f:
+def file_download(dummy_self, _, to_path):
 
 Review comment:
   Marking _download_file static has made it impossible to extend it, and we 
will not be able to provide its implementation in the subclass without monkey 
patching it. 
   I will remove the _download_file from TestStager as it's not getting 

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-16 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102734&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102734
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 17/May/18 00:09
Start Date: 17/May/18 00:09
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188807218
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager_test.py
 ##
 @@ -420,134 +375,167 @@ def 
test_sdk_location_local_directory_not_present(self):
 sdk_location = 'nosuchdir'
 with self.assertRaises(RuntimeError) as cm:
   options = PipelineOptions()
-  options.view_as(GoogleCloudOptions).staging_location = staging_dir
   self.update_options(options)
   options.view_as(SetupOptions).sdk_location = sdk_location
 
-  dependency.stage_job_resources(options)
+  self.stager.stage_job_resources(options, staging_location=staging_dir)
 self.assertEqual(
 'The file "%s" cannot be found. Its '
 'location was specified by the --sdk_location command-line option.' %
-sdk_location,
-cm.exception.args[0])
+sdk_location, cm.exception.args[0])
 
-  def test_sdk_location_gcs_source_file(self):
+  def test_sdk_location_remote_source_file(self):
 staging_dir = self.make_temp_dir()
 sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz'
 
 options = PipelineOptions()
-options.view_as(GoogleCloudOptions).staging_location = staging_dir
 self.update_options(options)
 options.view_as(SetupOptions).sdk_location = sdk_location
 
-with mock.patch('apache_beam.runners.dataflow.internal.'
-'dependency._dependency_file_copy'):
-  self.assertEqual(
-  [names.DATAFLOW_SDK_TARBALL_FILE],
-  dependency.stage_job_resources(options))
-
-  def test_sdk_location_gcs_wheel_file(self):
+with mock.patch('.'.join([
+self.__module__, TestStager.__name__, 
TestStager.stage_artifact.__name__
+])):
+  with mock.patch('.'.join([
+  self.__module__, TestStager.__name__,
+  TestStager._download_file.__name__
+  ])):
+self.assertEqual([names.DATAFLOW_SDK_TARBALL_FILE],
+ self.stager.stage_job_resources(
+ options, staging_location=staging_dir))
+
+  def test_sdk_location_remote_wheel_file(self):
 staging_dir = self.make_temp_dir()
 sdk_filename = 'apache_beam-1.0.0-cp27-cp27mu-manylinux1_x86_64.whl'
-sdk_location = 'gs://my-gcs-bucket/' + sdk_filename
+sdk_location = '/tmp/remote/my-bucket/' + sdk_filename
 
 options = PipelineOptions()
-options.view_as(GoogleCloudOptions).staging_location = staging_dir
 self.update_options(options)
 options.view_as(SetupOptions).sdk_location = sdk_location
 
-with mock.patch('apache_beam.runners.dataflow.internal.'
-'dependency._dependency_file_copy'):
-  self.assertEqual(
-  [sdk_filename],
-  dependency.stage_job_resources(options))
+# We can not rely on actual remote file systems paths hence making
+# '/tmp/remote/' a new remote path.
+def is_remote_path(dummy_self, path):
+  return path.startswith('/tmp/remote/')
+
+with mock.patch('.'.join([
+self.__module__, TestStager.__name__, 
TestStager.stage_artifact.__name__
+])):
+  with mock.patch('.'.join([
+  self.__module__, TestStager.__name__,
+  TestStager._download_file.__name__
+  ])):
+with mock.patch(
+'.'.join([
+self.__module__, TestStager.__name__,
+TestStager._is_remote_path.__name__
+]), is_remote_path):
+  self.assertEqual([sdk_filename],
+   self.stager.stage_job_resources(
+   options, staging_location=staging_dir))
 
   def test_sdk_location_http(self):
 staging_dir = self.make_temp_dir()
 sdk_location = 'http://storage.googleapis.com/my-gcs-bucket/tarball.tar.gz'
 
 options = PipelineOptions()
-options.view_as(GoogleCloudOptions).staging_location = staging_dir
 self.update_options(options)
 options.view_as(SetupOptions).sdk_location = sdk_location
 
-def file_download(_, to_folder):
-  tarball_path = os.path.join(to_folder, 'sdk-tarball')
-  with open(tarball_path, 'w') as f:
+def file_download(dummy_self, _, to_path):
 
 Review comment:
   Marking _download_file static has made it impossible to extend it and we 
will not be able to provide its implementation in the subclass without monkey 
patching it.


This 

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-16 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102730&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102730
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 16/May/18 23:58
Start Date: 16/May/18 23:58
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188805559
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager_test.py
 ##
 @@ -420,134 +375,167 @@ def 
test_sdk_location_local_directory_not_present(self):
 sdk_location = 'nosuchdir'
 with self.assertRaises(RuntimeError) as cm:
   options = PipelineOptions()
-  options.view_as(GoogleCloudOptions).staging_location = staging_dir
   self.update_options(options)
   options.view_as(SetupOptions).sdk_location = sdk_location
 
-  dependency.stage_job_resources(options)
+  self.stager.stage_job_resources(options, staging_location=staging_dir)
 self.assertEqual(
 'The file "%s" cannot be found. Its '
 'location was specified by the --sdk_location command-line option.' %
-sdk_location,
-cm.exception.args[0])
+sdk_location, cm.exception.args[0])
 
-  def test_sdk_location_gcs_source_file(self):
+  def test_sdk_location_remote_source_file(self):
 staging_dir = self.make_temp_dir()
 sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz'
 
 options = PipelineOptions()
-options.view_as(GoogleCloudOptions).staging_location = staging_dir
 self.update_options(options)
 options.view_as(SetupOptions).sdk_location = sdk_location
 
-with mock.patch('apache_beam.runners.dataflow.internal.'
-'dependency._dependency_file_copy'):
-  self.assertEqual(
-  [names.DATAFLOW_SDK_TARBALL_FILE],
-  dependency.stage_job_resources(options))
-
-  def test_sdk_location_gcs_wheel_file(self):
+with mock.patch('.'.join([
+self.__module__, TestStager.__name__, 
TestStager.stage_artifact.__name__
+])):
+  with mock.patch('.'.join([
+  self.__module__, TestStager.__name__,
+  TestStager._download_file.__name__
+  ])):
+self.assertEqual([names.DATAFLOW_SDK_TARBALL_FILE],
+ self.stager.stage_job_resources(
+ options, staging_location=staging_dir))
+
+  def test_sdk_location_remote_wheel_file(self):
 staging_dir = self.make_temp_dir()
 sdk_filename = 'apache_beam-1.0.0-cp27-cp27mu-manylinux1_x86_64.whl'
-sdk_location = 'gs://my-gcs-bucket/' + sdk_filename
+sdk_location = '/tmp/remote/my-bucket/' + sdk_filename
 
 options = PipelineOptions()
-options.view_as(GoogleCloudOptions).staging_location = staging_dir
 self.update_options(options)
 options.view_as(SetupOptions).sdk_location = sdk_location
 
-with mock.patch('apache_beam.runners.dataflow.internal.'
-'dependency._dependency_file_copy'):
-  self.assertEqual(
-  [sdk_filename],
-  dependency.stage_job_resources(options))
+# We can not rely on actual remote file systems paths hence making
+# '/tmp/remote/' a new remote path.
+def is_remote_path(dummy_self, path):
 
 Review comment:
  Yes, as I don't see any other way to spoof a remote path.


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 102730)
Time Spent: 13h 50m  (was: 13h 40m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 13h 50m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]




[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-16 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102728=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102728
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 16/May/18 23:57
Start Date: 16/May/18 23:57
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188805479
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager_test.py
 ##
 @@ -420,134 +375,167 @@ def 
test_sdk_location_local_directory_not_present(self):
 sdk_location = 'nosuchdir'
 with self.assertRaises(RuntimeError) as cm:
   options = PipelineOptions()
-  options.view_as(GoogleCloudOptions).staging_location = staging_dir
   self.update_options(options)
   options.view_as(SetupOptions).sdk_location = sdk_location
 
-  dependency.stage_job_resources(options)
+  self.stager.stage_job_resources(options, staging_location=staging_dir)
 self.assertEqual(
 'The file "%s" cannot be found. Its '
 'location was specified by the --sdk_location command-line option.' %
-sdk_location,
-cm.exception.args[0])
+sdk_location, cm.exception.args[0])
 
-  def test_sdk_location_gcs_source_file(self):
+  def test_sdk_location_remote_source_file(self):
 staging_dir = self.make_temp_dir()
 sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz'
 
 options = PipelineOptions()
-options.view_as(GoogleCloudOptions).staging_location = staging_dir
 self.update_options(options)
 options.view_as(SetupOptions).sdk_location = sdk_location
 
-with mock.patch('apache_beam.runners.dataflow.internal.'
-'dependency._dependency_file_copy'):
-  self.assertEqual(
-  [names.DATAFLOW_SDK_TARBALL_FILE],
-  dependency.stage_job_resources(options))
-
-  def test_sdk_location_gcs_wheel_file(self):
+with mock.patch('.'.join([
+self.__module__, TestStager.__name__, 
TestStager.stage_artifact.__name__
+])):
+  with mock.patch('.'.join([
+  self.__module__, TestStager.__name__,
+  TestStager._download_file.__name__
+  ])):
+self.assertEqual([names.DATAFLOW_SDK_TARBALL_FILE],
+ self.stager.stage_job_resources(
+ options, staging_location=staging_dir))
+
+  def test_sdk_location_remote_wheel_file(self):
 staging_dir = self.make_temp_dir()
 sdk_filename = 'apache_beam-1.0.0-cp27-cp27mu-manylinux1_x86_64.whl'
-sdk_location = 'gs://my-gcs-bucket/' + sdk_filename
+sdk_location = '/tmp/remote/my-bucket/' + sdk_filename
 
 options = PipelineOptions()
-options.view_as(GoogleCloudOptions).staging_location = staging_dir
 self.update_options(options)
 options.view_as(SetupOptions).sdk_location = sdk_location
 
-with mock.patch('apache_beam.runners.dataflow.internal.'
-'dependency._dependency_file_copy'):
-  self.assertEqual(
-  [sdk_filename],
-  dependency.stage_job_resources(options))
+# We can not rely on actual remote file systems paths hence making
+# '/tmp/remote/' a new remote path.
+def is_remote_path(dummy_self, path):
+  return path.startswith('/tmp/remote/')
+
+with mock.patch('.'.join([
+self.__module__, TestStager.__name__, 
TestStager.stage_artifact.__name__
+])):
+  with mock.patch('.'.join([
+  self.__module__, TestStager.__name__,
+  TestStager._download_file.__name__
+  ])):
+with mock.patch(
+'.'.join([
+self.__module__, TestStager.__name__,
+TestStager._is_remote_path.__name__
+]), is_remote_path):
+  self.assertEqual([sdk_filename],
+   self.stager.stage_job_resources(
+   options, staging_location=staging_dir))
 
   def test_sdk_location_http(self):
 staging_dir = self.make_temp_dir()
 sdk_location = 'http://storage.googleapis.com/my-gcs-bucket/tarball.tar.gz'
 
 options = PipelineOptions()
-options.view_as(GoogleCloudOptions).staging_location = staging_dir
 self.update_options(options)
 options.view_as(SetupOptions).sdk_location = sdk_location
 
-def file_download(_, to_folder):
-  tarball_path = os.path.join(to_folder, 'sdk-tarball')
-  with open(tarball_path, 'w') as f:
+def file_download(dummy_self, _, to_path):
+  with open(to_path, 'w') as f:
 f.write('Package content.')
-  return tarball_path
+  return to_path
 
-with mock.patch('apache_beam.runners.dataflow.internal.'
-'dependency._dependency_file_download', 

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-16 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102724=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102724
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 16/May/18 23:54
Start Date: 16/May/18 23:54
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188805008
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager_test.py
 ##
 @@ -420,134 +375,167 @@ def 
test_sdk_location_local_directory_not_present(self):
 sdk_location = 'nosuchdir'
 with self.assertRaises(RuntimeError) as cm:
   options = PipelineOptions()
-  options.view_as(GoogleCloudOptions).staging_location = staging_dir
   self.update_options(options)
   options.view_as(SetupOptions).sdk_location = sdk_location
 
-  dependency.stage_job_resources(options)
+  self.stager.stage_job_resources(options, staging_location=staging_dir)
 self.assertEqual(
 'The file "%s" cannot be found. Its '
 'location was specified by the --sdk_location command-line option.' %
-sdk_location,
-cm.exception.args[0])
+sdk_location, cm.exception.args[0])
 
-  def test_sdk_location_gcs_source_file(self):
+  def test_sdk_location_remote_source_file(self):
 staging_dir = self.make_temp_dir()
 sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz'
 
 options = PipelineOptions()
-options.view_as(GoogleCloudOptions).staging_location = staging_dir
 self.update_options(options)
 options.view_as(SetupOptions).sdk_location = sdk_location
 
-with mock.patch('apache_beam.runners.dataflow.internal.'
-'dependency._dependency_file_copy'):
-  self.assertEqual(
-  [names.DATAFLOW_SDK_TARBALL_FILE],
-  dependency.stage_job_resources(options))
-
-  def test_sdk_location_gcs_wheel_file(self):
+with mock.patch('.'.join([
+self.__module__, TestStager.__name__, 
TestStager.stage_artifact.__name__
+])):
+  with mock.patch('.'.join([
+  self.__module__, TestStager.__name__,
+  TestStager._download_file.__name__
+  ])):
+self.assertEqual([names.DATAFLOW_SDK_TARBALL_FILE],
+ self.stager.stage_job_resources(
+ options, staging_location=staging_dir))
+
+  def test_sdk_location_remote_wheel_file(self):
 staging_dir = self.make_temp_dir()
 sdk_filename = 'apache_beam-1.0.0-cp27-cp27mu-manylinux1_x86_64.whl'
-sdk_location = 'gs://my-gcs-bucket/' + sdk_filename
+sdk_location = '/tmp/remote/my-bucket/' + sdk_filename
 
 options = PipelineOptions()
-options.view_as(GoogleCloudOptions).staging_location = staging_dir
 self.update_options(options)
 options.view_as(SetupOptions).sdk_location = sdk_location
 
-with mock.patch('apache_beam.runners.dataflow.internal.'
-'dependency._dependency_file_copy'):
-  self.assertEqual(
-  [sdk_filename],
-  dependency.stage_job_resources(options))
+# We can not rely on actual remote file systems paths hence making
+# '/tmp/remote/' a new remote path.
+def is_remote_path(dummy_self, path):
+  return path.startswith('/tmp/remote/')
+
+with mock.patch('.'.join([
+self.__module__, TestStager.__name__, 
TestStager.stage_artifact.__name__
+])):
+  with mock.patch('.'.join([
+  self.__module__, TestStager.__name__,
+  TestStager._download_file.__name__
+  ])):
+with mock.patch(
+'.'.join([
+self.__module__, TestStager.__name__,
+TestStager._is_remote_path.__name__
+]), is_remote_path):
+  self.assertEqual([sdk_filename],
+   self.stager.stage_job_resources(
+   options, staging_location=staging_dir))
 
   def test_sdk_location_http(self):
 staging_dir = self.make_temp_dir()
 sdk_location = 'http://storage.googleapis.com/my-gcs-bucket/tarball.tar.gz'
 
 options = PipelineOptions()
-options.view_as(GoogleCloudOptions).staging_location = staging_dir
 self.update_options(options)
 options.view_as(SetupOptions).sdk_location = sdk_location
 
-def file_download(_, to_folder):
-  tarball_path = os.path.join(to_folder, 'sdk-tarball')
-  with open(tarball_path, 'w') as f:
+def file_download(dummy_self, _, to_path):
+  with open(to_path, 'w') as f:
 f.write('Package content.')
-  return tarball_path
+  return to_path
 
-with mock.patch('apache_beam.runners.dataflow.internal.'
-'dependency._dependency_file_download', 

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-16 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102723=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102723
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 16/May/18 23:53
Start Date: 16/May/18 23:53
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188804835
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager.py
 ##
 @@ -130,19 +99,43 @@ def _download_file(self, from_url, to_path):
   except Exception:
 logging.info('Failed to download Artifact from %s', from_url)
 raise
+elif from_url.startswith('gs://') or to_path.startswith('gs://'):
+  from apache_beam.io.gcp import gcsio
+  if from_url.startswith('gs://') and to_path.startswith('gs://'):
+# Both files are GCS files so copy.
+gcsio.GcsIO().copy(from_url, to_path)
+  elif to_path.startswith('gs://'):
 
 Review comment:
   done


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 102723)
Time Spent: 13h 20m  (was: 13h 10m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 13h 20m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-16 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102722=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102722
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 16/May/18 23:52
Start Date: 16/May/18 23:52
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188804700
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager.py
 ##
 @@ -78,39 +78,8 @@ class Stager(object):
   """Stager identifies and copies the appropriate artifacts to the staging
   location."""
 
-  def _copy_file(self, from_path, to_path):
-"""Copies a local file to a GCS file or vice versa."""
-logging.info('file copy from %s to %s.', from_path, to_path)
-if from_path.startswith('gs://') or to_path.startswith('gs://'):
-  from apache_beam.io.gcp import gcsio
-  if from_path.startswith('gs://') and to_path.startswith('gs://'):
-# Both files are GCS files so copy.
-gcsio.GcsIO().copy(from_path, to_path)
-  elif to_path.startswith('gs://'):
-# Only target is a GCS file, read local file and upload.
-with open(from_path, 'rb') as f:
-  with gcsio.GcsIO().open(to_path, mode='wb') as g:
-pfun = functools.partial(f.read, gcsio.WRITE_CHUNK_SIZE)
-for chunk in iter(pfun, ''):
-  g.write(chunk)
-  else:
-# Source is a GCS file but target is local file.
-with gcsio.GcsIO().open(from_path, mode='rb') as g:
-  with open(to_path, 'wb') as f:
-pfun = functools.partial(g.read, gcsio.DEFAULT_READ_BUFFER_SIZE)
-for chunk in iter(pfun, ''):
-  f.write(chunk)
-else:
-  # Branch used only for unit tests and integration tests.
-  # In such environments GCS support is not available.
-  if not os.path.isdir(os.path.dirname(to_path)):
-logging.info(
-'Created folder (since we have not done yet, and any errors '
-'will follow): %s ', os.path.dirname(to_path))
-os.mkdir(os.path.dirname(to_path))
-  shutil.copyfile(from_path, to_path)
-
-  def _download_file(self, from_url, to_path):
+  @staticmethod
+  def _download_file(from_url, to_path):
 
 Review comment:
   As it also supports http:// and gs://, I would prefer calling it from_url.
   Note: `from` cannot be used, as it is a reserved keyword.


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 102722)
Time Spent: 13h 10m  (was: 13h)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 13h 10m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-16 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102721=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102721
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 16/May/18 23:50
Start Date: 16/May/18 23:50
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188804467
 
 

 ##
 File path: sdks/python/apache_beam/runners/dataflow/internal/apiclient.py
 ##
 @@ -746,17 +745,32 @@ def translate_scalar_counter_float(accumulator, 
metric_update_proto):
 metric_update_proto.floatingPoint = accumulator.value
 
 
-class _ParameterizedStager(Stager):
-  def __init__(self, stage_artifact):
-super(_ParameterizedStager, self).__init__()
-self.stage_artifact_method = stage_artifact
+class _LegacyDataflowStager(Stager):
+  # TODO(silviuc): Staged files should have a job specific prefix.
 
 Review comment:
   Removed it


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 102721)
Time Spent: 13h  (was: 12h 50m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 13h
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-16 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102720=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102720
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 16/May/18 23:50
Start Date: 16/May/18 23:50
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188804353
 
 

 ##
 File path: sdks/python/apache_beam/runners/dataflow/internal/apiclient.py
 ##
 @@ -746,17 +745,32 @@ def translate_scalar_counter_float(accumulator, 
metric_update_proto):
 metric_update_proto.floatingPoint = accumulator.value
 
 
-class _ParameterizedStager(Stager):
-  def __init__(self, stage_artifact):
-super(_ParameterizedStager, self).__init__()
-self.stage_artifact_method = stage_artifact
+class _LegacyDataflowStager(Stager):
+  # TODO(silviuc): Staged files should have a job specific prefix.
+  # To prevent several jobs in the same project stomping on each other due to a
+  # shared staging location.
+  def __init__(self, dataflow_application_client):
+super(_LegacyDataflowStager, self).__init__()
+self.stage_artifact_method = dataflow_application_client._gcs_file_copy
 
 Review comment:
   done


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 102720)
Time Spent: 12h 50m  (was: 12h 40m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 12h 50m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-16 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102716=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102716
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 16/May/18 23:46
Start Date: 16/May/18 23:46
Worklog Time Spent: 10m 
  Work Description: tvalentyn commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188798067
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager_test.py
 ##
 @@ -420,134 +375,167 @@ def 
test_sdk_location_local_directory_not_present(self):
 sdk_location = 'nosuchdir'
 with self.assertRaises(RuntimeError) as cm:
   options = PipelineOptions()
-  options.view_as(GoogleCloudOptions).staging_location = staging_dir
   self.update_options(options)
   options.view_as(SetupOptions).sdk_location = sdk_location
 
-  dependency.stage_job_resources(options)
+  self.stager.stage_job_resources(options, staging_location=staging_dir)
 self.assertEqual(
 'The file "%s" cannot be found. Its '
 'location was specified by the --sdk_location command-line option.' %
-sdk_location,
-cm.exception.args[0])
+sdk_location, cm.exception.args[0])
 
-  def test_sdk_location_gcs_source_file(self):
+  def test_sdk_location_remote_source_file(self):
 staging_dir = self.make_temp_dir()
 sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz'
 
 options = PipelineOptions()
-options.view_as(GoogleCloudOptions).staging_location = staging_dir
 self.update_options(options)
 options.view_as(SetupOptions).sdk_location = sdk_location
 
-with mock.patch('apache_beam.runners.dataflow.internal.'
-'dependency._dependency_file_copy'):
-  self.assertEqual(
-  [names.DATAFLOW_SDK_TARBALL_FILE],
-  dependency.stage_job_resources(options))
-
-  def test_sdk_location_gcs_wheel_file(self):
+with mock.patch('.'.join([
+self.__module__, TestStager.__name__, 
TestStager.stage_artifact.__name__
+])):
+  with mock.patch('.'.join([
+  self.__module__, TestStager.__name__,
+  TestStager._download_file.__name__
+  ])):
+self.assertEqual([names.DATAFLOW_SDK_TARBALL_FILE],
+ self.stager.stage_job_resources(
+ options, staging_location=staging_dir))
+
+  def test_sdk_location_remote_wheel_file(self):
 staging_dir = self.make_temp_dir()
 sdk_filename = 'apache_beam-1.0.0-cp27-cp27mu-manylinux1_x86_64.whl'
-sdk_location = 'gs://my-gcs-bucket/' + sdk_filename
+sdk_location = '/tmp/remote/my-bucket/' + sdk_filename
 
 options = PipelineOptions()
-options.view_as(GoogleCloudOptions).staging_location = staging_dir
 self.update_options(options)
 options.view_as(SetupOptions).sdk_location = sdk_location
 
-with mock.patch('apache_beam.runners.dataflow.internal.'
-'dependency._dependency_file_copy'):
-  self.assertEqual(
-  [sdk_filename],
-  dependency.stage_job_resources(options))
+# We can not rely on actual remote file systems paths hence making
+# '/tmp/remote/' a new remote path.
+def is_remote_path(dummy_self, path):
+  return path.startswith('/tmp/remote/')
+
+with mock.patch('.'.join([
+self.__module__, TestStager.__name__, 
TestStager.stage_artifact.__name__
+])):
+  with mock.patch('.'.join([
+  self.__module__, TestStager.__name__,
+  TestStager._download_file.__name__
+  ])):
+with mock.patch(
+'.'.join([
+self.__module__, TestStager.__name__,
+TestStager._is_remote_path.__name__
+]), is_remote_path):
+  self.assertEqual([sdk_filename],
+   self.stager.stage_job_resources(
+   options, staging_location=staging_dir))
 
   def test_sdk_location_http(self):
 staging_dir = self.make_temp_dir()
 sdk_location = 'http://storage.googleapis.com/my-gcs-bucket/tarball.tar.gz'
 
 options = PipelineOptions()
-options.view_as(GoogleCloudOptions).staging_location = staging_dir
 self.update_options(options)
 options.view_as(SetupOptions).sdk_location = sdk_location
 
-def file_download(_, to_folder):
-  tarball_path = os.path.join(to_folder, 'sdk-tarball')
-  with open(tarball_path, 'w') as f:
+def file_download(dummy_self, _, to_path):
+  with open(to_path, 'w') as f:
 f.write('Package content.')
-  return tarball_path
+  return to_path
 
-with mock.patch('apache_beam.runners.dataflow.internal.'
-'dependency._dependency_file_download', 

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-16 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102719=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102719
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 16/May/18 23:46
Start Date: 16/May/18 23:46
Worklog Time Spent: 10m 
  Work Description: tvalentyn commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188799057
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager_test.py
 ##
 @@ -420,134 +375,167 @@ def 
test_sdk_location_local_directory_not_present(self):
 sdk_location = 'nosuchdir'
 with self.assertRaises(RuntimeError) as cm:
   options = PipelineOptions()
-  options.view_as(GoogleCloudOptions).staging_location = staging_dir
   self.update_options(options)
   options.view_as(SetupOptions).sdk_location = sdk_location
 
-  dependency.stage_job_resources(options)
+  self.stager.stage_job_resources(options, staging_location=staging_dir)
 self.assertEqual(
 'The file "%s" cannot be found. Its '
 'location was specified by the --sdk_location command-line option.' %
-sdk_location,
-cm.exception.args[0])
+sdk_location, cm.exception.args[0])
 
-  def test_sdk_location_gcs_source_file(self):
+  def test_sdk_location_remote_source_file(self):
 staging_dir = self.make_temp_dir()
 sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz'
 
 options = PipelineOptions()
-options.view_as(GoogleCloudOptions).staging_location = staging_dir
 self.update_options(options)
 options.view_as(SetupOptions).sdk_location = sdk_location
 
-with mock.patch('apache_beam.runners.dataflow.internal.'
-'dependency._dependency_file_copy'):
-  self.assertEqual(
-  [names.DATAFLOW_SDK_TARBALL_FILE],
-  dependency.stage_job_resources(options))
-
-  def test_sdk_location_gcs_wheel_file(self):
+with mock.patch('.'.join([
+self.__module__, TestStager.__name__, 
TestStager.stage_artifact.__name__
+])):
+  with mock.patch('.'.join([
+  self.__module__, TestStager.__name__,
+  TestStager._download_file.__name__
+  ])):
+self.assertEqual([names.DATAFLOW_SDK_TARBALL_FILE],
+ self.stager.stage_job_resources(
+ options, staging_location=staging_dir))
+
+  def test_sdk_location_remote_wheel_file(self):
 staging_dir = self.make_temp_dir()
 sdk_filename = 'apache_beam-1.0.0-cp27-cp27mu-manylinux1_x86_64.whl'
-sdk_location = 'gs://my-gcs-bucket/' + sdk_filename
+sdk_location = '/tmp/remote/my-bucket/' + sdk_filename
 
 options = PipelineOptions()
-options.view_as(GoogleCloudOptions).staging_location = staging_dir
 self.update_options(options)
 options.view_as(SetupOptions).sdk_location = sdk_location
 
-with mock.patch('apache_beam.runners.dataflow.internal.'
-'dependency._dependency_file_copy'):
-  self.assertEqual(
-  [sdk_filename],
-  dependency.stage_job_resources(options))
+# We can not rely on actual remote file systems paths hence making
+# '/tmp/remote/' a new remote path.
+def is_remote_path(dummy_self, path):
+  return path.startswith('/tmp/remote/')
+
+with mock.patch('.'.join([
+self.__module__, TestStager.__name__, 
TestStager.stage_artifact.__name__
+])):
+  with mock.patch('.'.join([
+  self.__module__, TestStager.__name__,
+  TestStager._download_file.__name__
+  ])):
+with mock.patch(
+'.'.join([
+self.__module__, TestStager.__name__,
+TestStager._is_remote_path.__name__
+]), is_remote_path):
+  self.assertEqual([sdk_filename],
+   self.stager.stage_job_resources(
+   options, staging_location=staging_dir))
 
   def test_sdk_location_http(self):
 staging_dir = self.make_temp_dir()
 sdk_location = 'http://storage.googleapis.com/my-gcs-bucket/tarball.tar.gz'
 
 options = PipelineOptions()
-options.view_as(GoogleCloudOptions).staging_location = staging_dir
 self.update_options(options)
 options.view_as(SetupOptions).sdk_location = sdk_location
 
-def file_download(_, to_folder):
-  tarball_path = os.path.join(to_folder, 'sdk-tarball')
-  with open(tarball_path, 'w') as f:
+def file_download(dummy_self, _, to_path):
+  with open(to_path, 'w') as f:
 f.write('Package content.')
-  return tarball_path
+  return to_path
 
-with mock.patch('apache_beam.runners.dataflow.internal.'
-'dependency._dependency_file_download', 

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-16 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102718=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102718
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 16/May/18 23:46
Start Date: 16/May/18 23:46
Worklog Time Spent: 10m 
  Work Description: tvalentyn commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188802535
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager_test.py
 ##
 @@ -420,134 +375,167 @@ def 
test_sdk_location_local_directory_not_present(self):
 sdk_location = 'nosuchdir'
 with self.assertRaises(RuntimeError) as cm:
   options = PipelineOptions()
-  options.view_as(GoogleCloudOptions).staging_location = staging_dir
   self.update_options(options)
   options.view_as(SetupOptions).sdk_location = sdk_location
 
-  dependency.stage_job_resources(options)
+  self.stager.stage_job_resources(options, staging_location=staging_dir)
 self.assertEqual(
 'The file "%s" cannot be found. Its '
 'location was specified by the --sdk_location command-line option.' %
-sdk_location,
-cm.exception.args[0])
+sdk_location, cm.exception.args[0])
 
-  def test_sdk_location_gcs_source_file(self):
+  def test_sdk_location_remote_source_file(self):
 staging_dir = self.make_temp_dir()
 sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz'
 
 options = PipelineOptions()
-options.view_as(GoogleCloudOptions).staging_location = staging_dir
 self.update_options(options)
 options.view_as(SetupOptions).sdk_location = sdk_location
 
-with mock.patch('apache_beam.runners.dataflow.internal.'
-'dependency._dependency_file_copy'):
-  self.assertEqual(
-  [names.DATAFLOW_SDK_TARBALL_FILE],
-  dependency.stage_job_resources(options))
-
-  def test_sdk_location_gcs_wheel_file(self):
+with mock.patch('.'.join([
+self.__module__, TestStager.__name__, 
TestStager.stage_artifact.__name__
+])):
+  with mock.patch('.'.join([
+  self.__module__, TestStager.__name__,
+  TestStager._download_file.__name__
+  ])):
+self.assertEqual([names.DATAFLOW_SDK_TARBALL_FILE],
+ self.stager.stage_job_resources(
+ options, staging_location=staging_dir))
+
+  def test_sdk_location_remote_wheel_file(self):
 staging_dir = self.make_temp_dir()
 sdk_filename = 'apache_beam-1.0.0-cp27-cp27mu-manylinux1_x86_64.whl'
-sdk_location = 'gs://my-gcs-bucket/' + sdk_filename
+sdk_location = '/tmp/remote/my-bucket/' + sdk_filename
 
 options = PipelineOptions()
-options.view_as(GoogleCloudOptions).staging_location = staging_dir
 self.update_options(options)
 options.view_as(SetupOptions).sdk_location = sdk_location
 
-with mock.patch('apache_beam.runners.dataflow.internal.'
-'dependency._dependency_file_copy'):
-  self.assertEqual(
-  [sdk_filename],
-  dependency.stage_job_resources(options))
+# We can not rely on actual remote file systems paths hence making
+# '/tmp/remote/' a new remote path.
+def is_remote_path(dummy_self, path):
+  return path.startswith('/tmp/remote/')
+
+with mock.patch('.'.join([
+self.__module__, TestStager.__name__, 
TestStager.stage_artifact.__name__
+])):
+  with mock.patch('.'.join([
+  self.__module__, TestStager.__name__,
+  TestStager._download_file.__name__
+  ])):
+with mock.patch(
+'.'.join([
+self.__module__, TestStager.__name__,
+TestStager._is_remote_path.__name__
+]), is_remote_path):
+  self.assertEqual([sdk_filename],
+   self.stager.stage_job_resources(
+   options, staging_location=staging_dir))
 
   def test_sdk_location_http(self):
 staging_dir = self.make_temp_dir()
 sdk_location = 'http://storage.googleapis.com/my-gcs-bucket/tarball.tar.gz'
 
 options = PipelineOptions()
-options.view_as(GoogleCloudOptions).staging_location = staging_dir
 self.update_options(options)
 options.view_as(SetupOptions).sdk_location = sdk_location
 
-def file_download(_, to_folder):
-  tarball_path = os.path.join(to_folder, 'sdk-tarball')
-  with open(tarball_path, 'w') as f:
+def file_download(dummy_self, _, to_path):
 
 Review comment:
   Would it be simpler to implement desirable default behavior in TestStager()? 
Also why do we have dummy_self in the picture here?


This is an automated message from the 

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-16 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102714&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102714
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 16/May/18 23:46
Start Date: 16/May/18 23:46
Worklog Time Spent: 10m 
  Work Description: tvalentyn commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188795976
 
 

 ##
 File path: sdks/python/apache_beam/runners/dataflow/internal/apiclient.py
 ##
 @@ -746,17 +745,32 @@ def translate_scalar_counter_float(accumulator, 
metric_update_proto):
 metric_update_proto.floatingPoint = accumulator.value
 
 
-class _ParameterizedStager(Stager):
-  def __init__(self, stage_artifact):
-super(_ParameterizedStager, self).__init__()
-self.stage_artifact_method = stage_artifact
+class _LegacyDataflowStager(Stager):
+  # TODO(silviuc): Staged files should have a job specific prefix.
 
 Review comment:
   Can you please check if this TODO is still relevant and remove it if not?


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 102714)
Time Spent: 12h 10m  (was: 12h)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 12h 10m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-16 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102713&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102713
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 16/May/18 23:46
Start Date: 16/May/18 23:46
Worklog Time Spent: 10m 
  Work Description: tvalentyn commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188801882
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager_test.py
 ##
 @@ -420,134 +375,167 @@ def 
test_sdk_location_local_directory_not_present(self):
 sdk_location = 'nosuchdir'
 with self.assertRaises(RuntimeError) as cm:
   options = PipelineOptions()
-  options.view_as(GoogleCloudOptions).staging_location = staging_dir
   self.update_options(options)
   options.view_as(SetupOptions).sdk_location = sdk_location
 
-  dependency.stage_job_resources(options)
+  self.stager.stage_job_resources(options, staging_location=staging_dir)
 self.assertEqual(
 'The file "%s" cannot be found. Its '
 'location was specified by the --sdk_location command-line option.' %
-sdk_location,
-cm.exception.args[0])
+sdk_location, cm.exception.args[0])
 
-  def test_sdk_location_gcs_source_file(self):
+  def test_sdk_location_remote_source_file(self):
 staging_dir = self.make_temp_dir()
 sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz'
 
 options = PipelineOptions()
-options.view_as(GoogleCloudOptions).staging_location = staging_dir
 self.update_options(options)
 options.view_as(SetupOptions).sdk_location = sdk_location
 
-with mock.patch('apache_beam.runners.dataflow.internal.'
-'dependency._dependency_file_copy'):
-  self.assertEqual(
-  [names.DATAFLOW_SDK_TARBALL_FILE],
-  dependency.stage_job_resources(options))
-
-  def test_sdk_location_gcs_wheel_file(self):
+with mock.patch('.'.join([
+self.__module__, TestStager.__name__, 
TestStager.stage_artifact.__name__
+])):
+  with mock.patch('.'.join([
+  self.__module__, TestStager.__name__,
+  TestStager._download_file.__name__
+  ])):
+self.assertEqual([names.DATAFLOW_SDK_TARBALL_FILE],
+ self.stager.stage_job_resources(
+ options, staging_location=staging_dir))
+
+  def test_sdk_location_remote_wheel_file(self):
 staging_dir = self.make_temp_dir()
 sdk_filename = 'apache_beam-1.0.0-cp27-cp27mu-manylinux1_x86_64.whl'
-sdk_location = 'gs://my-gcs-bucket/' + sdk_filename
+sdk_location = '/tmp/remote/my-bucket/' + sdk_filename
 
 options = PipelineOptions()
-options.view_as(GoogleCloudOptions).staging_location = staging_dir
 self.update_options(options)
 options.view_as(SetupOptions).sdk_location = sdk_location
 
-with mock.patch('apache_beam.runners.dataflow.internal.'
-'dependency._dependency_file_copy'):
-  self.assertEqual(
-  [sdk_filename],
-  dependency.stage_job_resources(options))
+# We can not rely on actual remote file systems paths hence making
+# '/tmp/remote/' a new remote path.
+def is_remote_path(dummy_self, path):
 
 Review comment:
   Are you sure we need to do this?


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 102713)
Time Spent: 12h 10m  (was: 12h)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 12h 10m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent 

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-16 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102717&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102717
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 16/May/18 23:46
Start Date: 16/May/18 23:46
Worklog Time Spent: 10m 
  Work Description: tvalentyn commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188796273
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager.py
 ##
 @@ -78,39 +78,8 @@ class Stager(object):
   """Stager identifies and copies the appropriate artifacts to the staging
   location."""
 
-  def _copy_file(self, from_path, to_path):
-"""Copies a local file to a GCS file or vice versa."""
-logging.info('file copy from %s to %s.', from_path, to_path)
-if from_path.startswith('gs://') or to_path.startswith('gs://'):
-  from apache_beam.io.gcp import gcsio
-  if from_path.startswith('gs://') and to_path.startswith('gs://'):
-# Both files are GCS files so copy.
-gcsio.GcsIO().copy(from_path, to_path)
-  elif to_path.startswith('gs://'):
-# Only target is a GCS file, read local file and upload.
-with open(from_path, 'rb') as f:
-  with gcsio.GcsIO().open(to_path, mode='wb') as g:
-pfun = functools.partial(f.read, gcsio.WRITE_CHUNK_SIZE)
-for chunk in iter(pfun, ''):
-  g.write(chunk)
-  else:
-# Source is a GCS file but target is local file.
-with gcsio.GcsIO().open(from_path, mode='rb') as g:
-  with open(to_path, 'wb') as f:
-pfun = functools.partial(g.read, gcsio.DEFAULT_READ_BUFFER_SIZE)
-for chunk in iter(pfun, ''):
-  f.write(chunk)
-else:
-  # Branch used only for unit tests and integration tests.
-  # In such environments GCS support is not available.
-  if not os.path.isdir(os.path.dirname(to_path)):
-logging.info(
-'Created folder (since we have not done yet, and any errors '
-'will follow): %s ', os.path.dirname(to_path))
-os.mkdir(os.path.dirname(to_path))
-  shutil.copyfile(from_path, to_path)
-
-  def _download_file(self, from_url, to_path):
+  @staticmethod
+  def _download_file(from_url, to_path):
 
 Review comment:
   Consider calling first argument `from_path` or `from`.


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 102717)
Time Spent: 12.5h  (was: 12h 20m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 12.5h
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-16 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102715&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102715
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 16/May/18 23:46
Start Date: 16/May/18 23:46
Worklog Time Spent: 10m 
  Work Description: tvalentyn commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188796621
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager.py
 ##
 @@ -130,19 +99,43 @@ def _download_file(self, from_url, to_path):
   except Exception:
 logging.info('Failed to download Artifact from %s', from_url)
 raise
+elif from_url.startswith('gs://') or to_path.startswith('gs://'):
+  from apache_beam.io.gcp import gcsio
+  if from_url.startswith('gs://') and to_path.startswith('gs://'):
+# Both files are GCS files so copy.
+gcsio.GcsIO().copy(from_url, to_path)
+  elif to_path.startswith('gs://'):
 
 Review comment:
  Echoing my comment from the previous iteration: we only use this method for 
downloads into a local folder. So the part that uploads to GCS would be dead 
code; let's remove it.


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 102715)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 12h 10m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-16 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102712&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102712
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 16/May/18 23:46
Start Date: 16/May/18 23:46
Worklog Time Spent: 10m 
  Work Description: tvalentyn commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188794959
 
 

 ##
 File path: sdks/python/apache_beam/runners/dataflow/internal/apiclient.py
 ##
 @@ -746,17 +745,32 @@ def translate_scalar_counter_float(accumulator, 
metric_update_proto):
 metric_update_proto.floatingPoint = accumulator.value
 
 
-class _ParameterizedStager(Stager):
-  def __init__(self, stage_artifact):
-super(_ParameterizedStager, self).__init__()
-self.stage_artifact_method = stage_artifact
+class _LegacyDataflowStager(Stager):
+  # TODO(silviuc): Staged files should have a job specific prefix.
+  # To prevent several jobs in the same project stomping on each other due to a
+  # shared staging location.
+  def __init__(self, dataflow_application_client):
+super(_LegacyDataflowStager, self).__init__()
+self.stage_artifact_method = dataflow_application_client._gcs_file_copy
 
 Review comment:
   The composition would be more transparent if we store the reference to the 
dataflow client in the constructor, and call 
`self.dataflow_application_client._gcs_file_copy(...)` in stage_artifact.


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 102712)
Time Spent: 12h  (was: 11h 50m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 12h
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-16 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102680&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102680
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 16/May/18 21:50
Start Date: 16/May/18 21:50
Worklog Time Spent: 10m 
  Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor 
and clean dependency.py to make it reusable with artifact service
URL: https://github.com/apache/beam/pull/5251#issuecomment-389677258
 
 
   Run Python Dataflow ValidatesRunner


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 102680)
Time Spent: 11h 50m  (was: 11h 40m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 11h 50m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-16 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102669&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102669
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 16/May/18 21:28
Start Date: 16/May/18 21:28
Worklog Time Spent: 10m 
  Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor 
and clean dependency.py to make it reusable with artifact service
URL: https://github.com/apache/beam/pull/5251#issuecomment-389671701
 
 
   Applied the review comments based on our discussion. PTAL


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 102669)
Time Spent: 11h 40m  (was: 11.5h)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 11h 40m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-16 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102651&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102651
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 16/May/18 21:18
Start Date: 16/May/18 21:18
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188746923
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager.py
 ##
 @@ -0,0 +1,573 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Support for installing custom code and required dependencies.
+
+Workflows, with the exception of very simple ones, are organized in multiple
+modules and packages. Typically, these modules and packages have
+dependencies on other standard libraries. Beam relies on the Python
+setuptools package to handle these scenarios. For further details please read:
+https://pythonhosted.org/an_example_pypi_project/setuptools.html
+
+When a runner tries to run a pipeline it will check for a --requirements_file
+and a --setup_file option.
+
+If --setup_file is present then it is assumed that the folder containing the
+file specified by the option has the typical layout required by setuptools and
+it will run 'python setup.py sdist' to produce a source distribution. The
+resulting tarball (a .tar or .tar.gz file) will be staged at the staging
+location specified as job option. When a worker starts it will check for the
+presence of this file and will run 'easy_install tarball' to install the
+package in the worker.
+
+If --requirements_file is present then the file specified by the option will be
+staged in the staging location.  When a worker starts it will check for the
+presence of this file and will run 'pip install -r requirements.txt'. A
+requirements file can be easily generated by running 'pip freeze -r
+requirements.txt'. The reason a runner does not run this automatically is
+because quite often only a small fraction of the dependencies present in a
+requirements.txt file are actually needed for remote execution and therefore a
+one-time manual trimming is desirable.
+
+TODO(silviuc): Should we allow several setup packages?
+TODO(silviuc): We should allow customizing the exact command for setup build.
+"""
+import functools
+import glob
+import logging
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+
+import pkg_resources
+
+from apache_beam.internal import pickler
+from apache_beam.io.filesystems import FileSystems
+from apache_beam.options.pipeline_options import SetupOptions
+# TODO(angoenka): Remove reference to dataflow internal names
+from apache_beam.runners.dataflow.internal import names
+from apache_beam.utils import processes
+
+# All constants are for internal use only; no backwards-compatibility
+# guarantees.
+
+# Standard file names used for staging files.
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz'
+REQUIREMENTS_FILE = 'requirements.txt'
+EXTRA_PACKAGES_FILE = 'extra_packages.txt'
+
+# Package names for distributions
+BEAM_PACKAGE_NAME = 'apache-beam'
+
+
+class Stager(object):
+  """Stager identifies and copies the appropriate artifacts to the staging
+  location."""
+
+  def _copy_file(self, from_path, to_path):
+"""Copies a local file to a GCS file or vice versa."""
+logging.info('file copy from %s to %s.', from_path, to_path)
+if from_path.startswith('gs://') or to_path.startswith('gs://'):
+  from apache_beam.io.gcp import gcsio
+  if from_path.startswith('gs://') and to_path.startswith('gs://'):
+# Both files are GCS files so copy.
+gcsio.GcsIO().copy(from_path, to_path)
+  elif to_path.startswith('gs://'):
+# Only target is a GCS file, read local file and upload.
+with open(from_path, 'rb') as f:
+  with gcsio.GcsIO().open(to_path, mode='wb') as g:
+pfun = functools.partial(f.read, gcsio.WRITE_CHUNK_SIZE)
+for chunk in iter(pfun, ''):
+  

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-16 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102653&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102653
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 16/May/18 21:18
Start Date: 16/May/18 21:18
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188729612
 
 

 ##
 File path: sdks/python/apache_beam/runners/dataflow/internal/apiclient.py
 ##
 @@ -731,6 +746,18 @@ def translate_scalar_counter_float(accumulator, 
metric_update_proto):
 metric_update_proto.floatingPoint = accumulator.value
 
 
+class _ParameterizedStager(Stager):
 
 Review comment:
   Done


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 102653)
Time Spent: 10.5h  (was: 10h 20m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 10.5h
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-16 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102657&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102657
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 16/May/18 21:18
Start Date: 16/May/18 21:18
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188750502
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager.py
 ##
 @@ -0,0 +1,573 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Support for installing custom code and required dependencies.
+
+Workflows, with the exception of very simple ones, are organized in multiple
+modules and packages. Typically, these modules and packages have
+dependencies on other standard libraries. Beam relies on the Python
+setuptools package to handle these scenarios. For further details please read:
+https://pythonhosted.org/an_example_pypi_project/setuptools.html
+
+When a runner tries to run a pipeline it will check for a --requirements_file
+and a --setup_file option.
+
+If --setup_file is present then it is assumed that the folder containing the
+file specified by the option has the typical layout required by setuptools and
+it will run 'python setup.py sdist' to produce a source distribution. The
+resulting tarball (a .tar or .tar.gz file) will be staged at the staging
+location specified as job option. When a worker starts it will check for the
+presence of this file and will run 'easy_install tarball' to install the
+package in the worker.
+
+If --requirements_file is present then the file specified by the option will be
+staged in the staging location.  When a worker starts it will check for the
+presence of this file and will run 'pip install -r requirements.txt'. A
+requirements file can be easily generated by running 'pip freeze -r
+requirements.txt'. The reason a runner does not run this automatically is
+because quite often only a small fraction of the dependencies present in a
+requirements.txt file are actually needed for remote execution and therefore a
+one-time manual trimming is desirable.
+
+TODO(silviuc): Should we allow several setup packages?
+TODO(silviuc): We should allow customizing the exact command for setup build.
+"""
+import functools
+import glob
+import logging
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+
+import pkg_resources
+
+from apache_beam.internal import pickler
+from apache_beam.io.filesystems import FileSystems
+from apache_beam.options.pipeline_options import SetupOptions
+# TODO(angoenka): Remove reference to dataflow internal names
+from apache_beam.runners.dataflow.internal import names
+from apache_beam.utils import processes
+
+# All constants are for internal use only; no backwards-compatibility
+# guarantees.
+
+# Standard file names used for staging files.
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz'
+REQUIREMENTS_FILE = 'requirements.txt'
+EXTRA_PACKAGES_FILE = 'extra_packages.txt'
+
+# Package names for distributions
+BEAM_PACKAGE_NAME = 'apache-beam'
+
+
+class Stager(object):
+  """Stager identifies and copies the appropriate artifacts to the staging
+  location."""
+
+  def _copy_file(self, from_path, to_path):
+"""Copies a local file to a GCS file or vice versa."""
+logging.info('file copy from %s to %s.', from_path, to_path)
+if from_path.startswith('gs://') or to_path.startswith('gs://'):
+  from apache_beam.io.gcp import gcsio
+  if from_path.startswith('gs://') and to_path.startswith('gs://'):
+# Both files are GCS files so copy.
+gcsio.GcsIO().copy(from_path, to_path)
+  elif to_path.startswith('gs://'):
+# Only target is a GCS file, read local file and upload.
+with open(from_path, 'rb') as f:
+  with gcsio.GcsIO().open(to_path, mode='wb') as g:
+pfun = functools.partial(f.read, gcsio.WRITE_CHUNK_SIZE)
+for chunk in iter(pfun, ''):
+  

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-16 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102658&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102658
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 16/May/18 21:18
Start Date: 16/May/18 21:18
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188746725
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager.py
 ##
 @@ -0,0 +1,573 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Support for installing custom code and required dependencies.
+
+Workflows, with the exception of very simple ones, are organized in multiple
+modules and packages. Typically, these modules and packages have
+dependencies on other standard libraries. Beam relies on the Python
+setuptools package to handle these scenarios. For further details please read:
+https://pythonhosted.org/an_example_pypi_project/setuptools.html
+
+When a runner tries to run a pipeline it will check for a --requirements_file
+and a --setup_file option.
+
+If --setup_file is present then it is assumed that the folder containing the
+file specified by the option has the typical layout required by setuptools and
+it will run 'python setup.py sdist' to produce a source distribution. The
+resulting tarball (a .tar or .tar.gz file) will be staged at the staging
+location specified as job option. When a worker starts it will check for the
+presence of this file and will run 'easy_install tarball' to install the
+package in the worker.
+
+If --requirements_file is present then the file specified by the option will be
+staged in the staging location.  When a worker starts it will check for the
+presence of this file and will run 'pip install -r requirements.txt'. A
+requirements file can be easily generated by running 'pip freeze -r
+requirements.txt'. The reason a runner does not run this automatically is
+because quite often only a small fraction of the dependencies present in a
+requirements.txt file are actually needed for remote execution and therefore a
+one-time manual trimming is desirable.
+
+TODO(silviuc): Should we allow several setup packages?
+TODO(silviuc): We should allow customizing the exact command for setup build.
+"""
+import functools
+import glob
+import logging
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+
+import pkg_resources
+
+from apache_beam.internal import pickler
+from apache_beam.io.filesystems import FileSystems
+from apache_beam.options.pipeline_options import SetupOptions
+# TODO(angoenka): Remove reference to dataflow internal names
+from apache_beam.runners.dataflow.internal import names
+from apache_beam.utils import processes
+
+# All constants are for internal use only; no backwards-compatibility
+# guarantees.
+
+# Standard file names used for staging files.
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz'
+REQUIREMENTS_FILE = 'requirements.txt'
+EXTRA_PACKAGES_FILE = 'extra_packages.txt'
+
+# Package names for distributions
+BEAM_PACKAGE_NAME = 'apache-beam'
+
+
+class Stager(object):
+  """Stager identifies and copies the appropriate artifacts to the staging
+  location."""
+
+  def _copy_file(self, from_path, to_path):
+"""Copies a local file to a GCS file or vice versa."""
+logging.info('file copy from %s to %s.', from_path, to_path)
+if from_path.startswith('gs://') or to_path.startswith('gs://'):
+  from apache_beam.io.gcp import gcsio
+  if from_path.startswith('gs://') and to_path.startswith('gs://'):
+# Both files are GCS files so copy.
+gcsio.GcsIO().copy(from_path, to_path)
+  elif to_path.startswith('gs://'):
+# Only target is a GCS file, read local file and upload.
+with open(from_path, 'rb') as f:
+  with gcsio.GcsIO().open(to_path, mode='wb') as g:
+pfun = functools.partial(f.read, gcsio.WRITE_CHUNK_SIZE)
+for chunk in iter(pfun, ''):
+  

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-16 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102656&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102656
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 16/May/18 21:18
Start Date: 16/May/18 21:18
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188743575
 
 

 ##
 File path: sdks/python/apache_beam/runners/dataflow/internal/dependency.py
 ##
 @@ -607,65 +151,9 @@ def get_sdk_name_and_version():
 def get_sdk_package_name():
 
 Review comment:
   Making this a module function of apiclient


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 102656)
Time Spent: 11h  (was: 10h 50m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 11h
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-16 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102655&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102655
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 16/May/18 21:18
Start Date: 16/May/18 21:18
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188746649
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager.py
 ##
 @@ -0,0 +1,573 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Support for installing custom code and required dependencies.
+
+Workflows, with the exception of very simple ones, are organized in multiple
+modules and packages. Typically, these modules and packages have
+dependencies on other standard libraries. Beam relies on the Python
+setuptools package to handle these scenarios. For further details please read:
+https://pythonhosted.org/an_example_pypi_project/setuptools.html
+
+When a runner tries to run a pipeline it will check for a --requirements_file
+and a --setup_file option.
+
+If --setup_file is present then it is assumed that the folder containing the
+file specified by the option has the typical layout required by setuptools and
+it will run 'python setup.py sdist' to produce a source distribution. The
+resulting tarball (a .tar or .tar.gz file) will be staged at the staging
+location specified as job option. When a worker starts it will check for the
+presence of this file and will run 'easy_install tarball' to install the
+package in the worker.
+
+If --requirements_file is present then the file specified by the option will be
+staged in the staging location.  When a worker starts it will check for the
+presence of this file and will run 'pip install -r requirements.txt'. A
+requirements file can be easily generated by running 'pip freeze -r
+requirements.txt'. The reason a runner does not run this automatically is
+because quite often only a small fraction of the dependencies present in a
+requirements.txt file are actually needed for remote execution and therefore a
+one-time manual trimming is desirable.
+
+TODO(silviuc): Should we allow several setup packages?
+TODO(silviuc): We should allow customizing the exact command for setup build.
+"""
+import functools
+import glob
+import logging
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+
+import pkg_resources
+
+from apache_beam.internal import pickler
+from apache_beam.io.filesystems import FileSystems
+from apache_beam.options.pipeline_options import SetupOptions
+# TODO(angoenka): Remove reference to dataflow internal names
+from apache_beam.runners.dataflow.internal import names
+from apache_beam.utils import processes
+
+# All constants are for internal use only; no backwards-compatibility
+# guarantees.
+
+# Standard file names used for staging files.
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz'
+REQUIREMENTS_FILE = 'requirements.txt'
+EXTRA_PACKAGES_FILE = 'extra_packages.txt'
+
+# Package names for distributions
+BEAM_PACKAGE_NAME = 'apache-beam'
+
+
+class Stager(object):
+  """Stager identifies and copies the appropriate artifacts to the staging
+  location."""
+
+  def _copy_file(self, from_path, to_path):
+"""Copies a local file to a GCS file or vice versa."""
+logging.info('file copy from %s to %s.', from_path, to_path)
+if from_path.startswith('gs://') or to_path.startswith('gs://'):
+  from apache_beam.io.gcp import gcsio
+  if from_path.startswith('gs://') and to_path.startswith('gs://'):
+# Both files are GCS files so copy.
+gcsio.GcsIO().copy(from_path, to_path)
+  elif to_path.startswith('gs://'):
+# Only target is a GCS file, read local file and upload.
+with open(from_path, 'rb') as f:
+  with gcsio.GcsIO().open(to_path, mode='wb') as g:
+pfun = functools.partial(f.read, gcsio.WRITE_CHUNK_SIZE)
+for chunk in iter(pfun, ''):
+  

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-16 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102659&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102659
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 16/May/18 21:18
Start Date: 16/May/18 21:18
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188775399
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager.py
 ##
 @@ -0,0 +1,573 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Support for installing custom code and required dependencies.
+
+Workflows, with the exception of very simple ones, are organized in multiple
+modules and packages. Typically, these modules and packages have
+dependencies on other standard libraries. Beam relies on the Python
+setuptools package to handle these scenarios. For further details please read:
+https://pythonhosted.org/an_example_pypi_project/setuptools.html
+
+When a runner tries to run a pipeline it will check for a --requirements_file
+and a --setup_file option.
+
+If --setup_file is present then it is assumed that the folder containing the
+file specified by the option has the typical layout required by setuptools and
+it will run 'python setup.py sdist' to produce a source distribution. The
+resulting tarball (a .tar or .tar.gz file) will be staged at the staging
+location specified as job option. When a worker starts it will check for the
+presence of this file and will run 'easy_install tarball' to install the
+package in the worker.
+
+If --requirements_file is present then the file specified by the option will be
+staged in the staging location.  When a worker starts it will check for the
+presence of this file and will run 'pip install -r requirements.txt'. A
+requirements file can be easily generated by running 'pip freeze -r
+requirements.txt'. The reason a runner does not run this automatically is
+because quite often only a small fraction of the dependencies present in a
+requirements.txt file are actually needed for remote execution and therefore a
+one-time manual trimming is desirable.
+
+TODO(silviuc): Should we allow several setup packages?
+TODO(silviuc): We should allow customizing the exact command for setup build.
+"""
+import functools
+import glob
+import logging
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+
+import pkg_resources
+
+from apache_beam.internal import pickler
+from apache_beam.io.filesystems import FileSystems
+from apache_beam.options.pipeline_options import SetupOptions
+# TODO(angoenka): Remove reference to dataflow internal names
+from apache_beam.runners.dataflow.internal import names
+from apache_beam.utils import processes
+
+# All constants are for internal use only; no backwards-compatibility
+# guarantees.
+
+# Standard file names used for staging files.
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz'
+REQUIREMENTS_FILE = 'requirements.txt'
+EXTRA_PACKAGES_FILE = 'extra_packages.txt'
+
+# Package names for distributions
+BEAM_PACKAGE_NAME = 'apache-beam'
+
+
+class Stager(object):
+  """Stager identifies and copies the appropriate artifacts to the staging
+  location."""
+
+  def _copy_file(self, from_path, to_path):
+"""Copies a local file to a GCS file or vice versa."""
 
 Review comment:
   done


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 102659)
Time Spent: 11.5h  (was: 11h 20m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> 

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-16 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102652&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102652
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 16/May/18 21:18
Start Date: 16/May/18 21:18
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188746406
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager.py
 ##
 @@ -0,0 +1,573 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Support for installing custom code and required dependencies.
+
+Workflows, with the exception of very simple ones, are organized in multiple
+modules and packages. Typically, these modules and packages have
+dependencies on other standard libraries. Beam relies on the Python
+setuptools package to handle these scenarios. For further details please read:
+https://pythonhosted.org/an_example_pypi_project/setuptools.html
+
+When a runner tries to run a pipeline it will check for a --requirements_file
+and a --setup_file option.
+
+If --setup_file is present then it is assumed that the folder containing the
+file specified by the option has the typical layout required by setuptools and
+it will run 'python setup.py sdist' to produce a source distribution. The
+resulting tarball (a .tar or .tar.gz file) will be staged at the staging
+location specified as job option. When a worker starts it will check for the
+presence of this file and will run 'easy_install tarball' to install the
+package in the worker.
+
+If --requirements_file is present then the file specified by the option will be
+staged in the staging location.  When a worker starts it will check for the
+presence of this file and will run 'pip install -r requirements.txt'. A
+requirements file can be easily generated by running 'pip freeze -r
+requirements.txt'. The reason a runner does not run this automatically is
+because quite often only a small fraction of the dependencies present in a
+requirements.txt file are actually needed for remote execution and therefore a
+one-time manual trimming is desirable.
+
+TODO(silviuc): Should we allow several setup packages?
+TODO(silviuc): We should allow customizing the exact command for setup build.
+"""
+import functools
+import glob
+import logging
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+
+import pkg_resources
+
+from apache_beam.internal import pickler
+from apache_beam.io.filesystems import FileSystems
+from apache_beam.options.pipeline_options import SetupOptions
+# TODO(angoenka): Remove reference to dataflow internal names
+from apache_beam.runners.dataflow.internal import names
+from apache_beam.utils import processes
+
+# All constants are for internal use only; no backwards-compatibility
+# guarantees.
+
+# Standard file names used for staging files.
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz'
+REQUIREMENTS_FILE = 'requirements.txt'
+EXTRA_PACKAGES_FILE = 'extra_packages.txt'
+
+# Package names for distributions
+BEAM_PACKAGE_NAME = 'apache-beam'
+
+
+class Stager(object):
+  """Stager identifies and copies the appropriate artifacts to the staging
+  location."""
+
+  def _copy_file(self, from_path, to_path):
+"""Copies a local file to a GCS file or vice versa."""
+logging.info('file copy from %s to %s.', from_path, to_path)
+if from_path.startswith('gs://') or to_path.startswith('gs://'):
+  from apache_beam.io.gcp import gcsio
+  if from_path.startswith('gs://') and to_path.startswith('gs://'):
+# Both files are GCS files so copy.
+gcsio.GcsIO().copy(from_path, to_path)
+  elif to_path.startswith('gs://'):
+# Only target is a GCS file, read local file and upload.
+with open(from_path, 'rb') as f:
+  with gcsio.GcsIO().open(to_path, mode='wb') as g:
+pfun = functools.partial(f.read, gcsio.WRITE_CHUNK_SIZE)
+for chunk in iter(pfun, ''):
+  

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-16 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102654&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102654
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 16/May/18 21:18
Start Date: 16/May/18 21:18
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188746609
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager.py
 ##
 @@ -0,0 +1,573 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Support for installing custom code and required dependencies.
+
+Workflows, with the exception of very simple ones, are organized in multiple
+modules and packages. Typically, these modules and packages have
+dependencies on other standard libraries. Beam relies on the Python
+setuptools package to handle these scenarios. For further details please read:
+https://pythonhosted.org/an_example_pypi_project/setuptools.html
+
+When a runner tries to run a pipeline it will check for a --requirements_file
+and a --setup_file option.
+
+If --setup_file is present then it is assumed that the folder containing the
+file specified by the option has the typical layout required by setuptools and
+it will run 'python setup.py sdist' to produce a source distribution. The
+resulting tarball (a .tar or .tar.gz file) will be staged at the staging
+location specified as job option. When a worker starts it will check for the
+presence of this file and will run 'easy_install tarball' to install the
+package in the worker.
+
+If --requirements_file is present then the file specified by the option will be
+staged in the staging location.  When a worker starts it will check for the
+presence of this file and will run 'pip install -r requirements.txt'. A
+requirements file can be easily generated by running 'pip freeze -r
+requirements.txt'. The reason a runner does not run this automatically is
+because quite often only a small fraction of the dependencies present in a
+requirements.txt file are actually needed for remote execution and therefore a
+one-time manual trimming is desirable.
+
+TODO(silviuc): Should we allow several setup packages?
+TODO(silviuc): We should allow customizing the exact command for setup build.
+"""
+import functools
+import glob
+import logging
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+
+import pkg_resources
+
+from apache_beam.internal import pickler
+from apache_beam.io.filesystems import FileSystems
+from apache_beam.options.pipeline_options import SetupOptions
+# TODO(angoenka): Remove reference to dataflow internal names
+from apache_beam.runners.dataflow.internal import names
+from apache_beam.utils import processes
+
+# All constants are for internal use only; no backwards-compatibility
+# guarantees.
+
+# Standard file names used for staging files.
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz'
+REQUIREMENTS_FILE = 'requirements.txt'
+EXTRA_PACKAGES_FILE = 'extra_packages.txt'
+
+# Package names for distributions
+BEAM_PACKAGE_NAME = 'apache-beam'
+
+
+class Stager(object):
+  """Stager identifies and copies the appropriate artifacts to the staging
+  location."""
+
+  def _copy_file(self, from_path, to_path):
+"""Copies a local file to a GCS file or vice versa."""
+logging.info('file copy from %s to %s.', from_path, to_path)
+if from_path.startswith('gs://') or to_path.startswith('gs://'):
+  from apache_beam.io.gcp import gcsio
+  if from_path.startswith('gs://') and to_path.startswith('gs://'):
+# Both files are GCS files so copy.
+gcsio.GcsIO().copy(from_path, to_path)
+  elif to_path.startswith('gs://'):
+# Only target is a GCS file, read local file and upload.
+with open(from_path, 'rb') as f:
+  with gcsio.GcsIO().open(to_path, mode='wb') as g:
+pfun = functools.partial(f.read, gcsio.WRITE_CHUNK_SIZE)
+for chunk in iter(pfun, ''):
+  

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-16 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102592&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102592
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 16/May/18 18:34
Start Date: 16/May/18 18:34
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188729509
 
 

 ##
 File path: sdks/python/apache_beam/runners/dataflow/internal/apiclient.py
 ##
 @@ -731,6 +746,18 @@ def translate_scalar_counter_float(accumulator, 
metric_update_proto):
 metric_update_proto.floatingPoint = accumulator.value
 
 
+class _ParameterizedStager(Stager):
 
 Review comment:
   Done


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 102592)
Time Spent: 10h  (was: 9h 50m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 10h
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-16 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102593&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102593
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 16/May/18 18:34
Start Date: 16/May/18 18:34
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188729509
 
 

 ##
 File path: sdks/python/apache_beam/runners/dataflow/internal/apiclient.py
 ##
 @@ -731,6 +746,18 @@ def translate_scalar_counter_float(accumulator, 
metric_update_proto):
 metric_update_proto.floatingPoint = accumulator.value
 
 
+class _ParameterizedStager(Stager):
 
 Review comment:
   Done


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 102593)
Time Spent: 10h 10m  (was: 10h)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 10h 10m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-15 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102342&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102342
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 15/May/18 23:07
Start Date: 15/May/18 23:07
Worklog Time Spent: 10m 
  Work Description: tvalentyn commented on issue #5251: [BEAM-3883] 
Refactor and clean dependency.py to make it reusable with artifact service
URL: https://github.com/apache/beam/pull/5251#issuecomment-389342407
 
 
   In my opinion, it will be easier to follow and maintain the helper functions if 
we explicitly remove the access to the state of the objects (`self`) when it is 
not needed. I would argue, we should at least make them `@staticmethods`. From 
that point on, the difference between a helper function in the module or a 
private static method that does not access the class, is rather syntactic. I 
think it's a little more overhead to have them as static methods, but I don't 
mind that if you feel strongly about associating the helpers with the `Stager` 
class.
   
   You could also declare helper functions as inner functions of class methods 
that need them, if they are only used in that particular method. I would 
consider that for short helper functions.


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 102342)
Time Spent: 9h 50m  (was: 9h 40m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 9h 50m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-15 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102299&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102299
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 15/May/18 22:01
Start Date: 15/May/18 22:01
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188450008
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager.py
 ##
 @@ -0,0 +1,573 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Support for installing custom code and required dependencies.
+
+Workflows, with the exception of very simple ones, are organized in multiple
+modules and packages. Typically, these modules and packages have
+dependencies on other standard libraries. Beam relies on the Python
+setuptools package to handle these scenarios. For further details please read:
+https://pythonhosted.org/an_example_pypi_project/setuptools.html
+
+When a runner tries to run a pipeline it will check for a --requirements_file
+and a --setup_file option.
+
+If --setup_file is present then it is assumed that the folder containing the
+file specified by the option has the typical layout required by setuptools and
+it will run 'python setup.py sdist' to produce a source distribution. The
+resulting tarball (a .tar or .tar.gz file) will be staged at the staging
+location specified as job option. When a worker starts it will check for the
+presence of this file and will run 'easy_install tarball' to install the
+package in the worker.
+
+If --requirements_file is present then the file specified by the option will be
+staged in the staging location.  When a worker starts it will check for the
+presence of this file and will run 'pip install -r requirements.txt'. A
+requirements file can be easily generated by running 'pip freeze -r
+requirements.txt'. The reason a runner does not run this automatically is
+because quite often only a small fraction of the dependencies present in a
+requirements.txt file are actually needed for remote execution and therefore a
+one-time manual trimming is desirable.
+
+TODO(silviuc): Should we allow several setup packages?
+TODO(silviuc): We should allow customizing the exact command for setup build.
+"""
+import functools
+import glob
+import logging
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+
+import pkg_resources
+
+from apache_beam.internal import pickler
+from apache_beam.io.filesystems import FileSystems
+from apache_beam.options.pipeline_options import SetupOptions
+# TODO(angoenka): Remove reference to dataflow internal names
+from apache_beam.runners.dataflow.internal import names
+from apache_beam.utils import processes
+
+# All constants are for internal use only; no backwards-compatibility
+# guarantees.
+
+# Standard file names used for staging files.
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz'
+REQUIREMENTS_FILE = 'requirements.txt'
+EXTRA_PACKAGES_FILE = 'extra_packages.txt'
+
+# Package names for distributions
+BEAM_PACKAGE_NAME = 'apache-beam'
+
+
+class Stager(object):
+  """Stager identifies and copies the appropriate artifacts to the staging
+  location."""
+
+  def _copy_file(self, from_path, to_path):
+"""Copies a local file to a GCS file or vice versa."""
+logging.info('file copy from %s to %s.', from_path, to_path)
+if from_path.startswith('gs://') or to_path.startswith('gs://'):
+  from apache_beam.io.gcp import gcsio
+  if from_path.startswith('gs://') and to_path.startswith('gs://'):
+# Both files are GCS files so copy.
+gcsio.GcsIO().copy(from_path, to_path)
+  elif to_path.startswith('gs://'):
+# Only target is a GCS file, read local file and upload.
+with open(from_path, 'rb') as f:
+  with gcsio.GcsIO().open(to_path, mode='wb') as g:
+pfun = functools.partial(f.read, gcsio.WRITE_CHUNK_SIZE)
+for chunk in iter(pfun, ''):
+  

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-15 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=102301&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-102301
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 15/May/18 22:01
Start Date: 15/May/18 22:01
Worklog Time Spent: 10m 
  Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor 
and clean dependency.py to make it reusable with artifact service
URL: https://github.com/apache/beam/pull/5251#issuecomment-389327626
 
 
   I would like to keep all the functionality of Stager within the class 
instead of fragmenting it between Stager class and module.
   As none of these functions are reusable and are not intended to be reused in 
any way, splitting things in module and class will only make code harder to 
follow and maintain.


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 102301)
Time Spent: 9h 40m  (was: 9.5h)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 9h 40m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-14 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=101959&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-101959
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 15/May/18 01:11
Start Date: 15/May/18 01:11
Worklog Time Spent: 10m 
  Work Description: tvalentyn commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188141417
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager.py
 ##
 @@ -0,0 +1,573 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Support for installing custom code and required dependencies.
+
+Workflows, with the exception of very simple ones, are organized in multiple
+modules and packages. Typically, these modules and packages have
+dependencies on other standard libraries. Beam relies on the Python
+setuptools package to handle these scenarios. For further details please read:
+https://pythonhosted.org/an_example_pypi_project/setuptools.html
+
+When a runner tries to run a pipeline it will check for a --requirements_file
+and a --setup_file option.
+
+If --setup_file is present then it is assumed that the folder containing the
+file specified by the option has the typical layout required by setuptools and
+it will run 'python setup.py sdist' to produce a source distribution. The
+resulting tarball (a .tar or .tar.gz file) will be staged at the staging
+location specified as job option. When a worker starts it will check for the
+presence of this file and will run 'easy_install tarball' to install the
+package in the worker.
+
+If --requirements_file is present then the file specified by the option will be
+staged in the staging location.  When a worker starts it will check for the
+presence of this file and will run 'pip install -r requirements.txt'. A
+requirements file can be easily generated by running 'pip freeze -r
+requirements.txt'. The reason a runner does not run this automatically is
+because quite often only a small fraction of the dependencies present in a
+requirements.txt file are actually needed for remote execution and therefore a
+one-time manual trimming is desirable.
+
+TODO(silviuc): Should we allow several setup packages?
+TODO(silviuc): We should allow customizing the exact command for setup build.
+"""
+import functools
+import glob
+import logging
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+
+import pkg_resources
+
+from apache_beam.internal import pickler
+from apache_beam.io.filesystems import FileSystems
+from apache_beam.options.pipeline_options import SetupOptions
+# TODO(angoenka): Remove reference to dataflow internal names
+from apache_beam.runners.dataflow.internal import names
+from apache_beam.utils import processes
+
+# All constants are for internal use only; no backwards-compatibility
+# guarantees.
+
+# Standard file names used for staging files.
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz'
+REQUIREMENTS_FILE = 'requirements.txt'
+EXTRA_PACKAGES_FILE = 'extra_packages.txt'
+
+# Package names for distributions
+BEAM_PACKAGE_NAME = 'apache-beam'
+
+
+class Stager(object):
+  """Stager identifies and copies the appropriate artifacts to the staging
+  location."""
+
+  def _copy_file(self, from_path, to_path):
+"""Copies a local file to a GCS file or vice versa."""
 
 Review comment:
   
   This method will only be used  to download files from GCS to local folder, 
the part that handles uploads to GCS would be a dead code, let's remove it. 
Uploads will be handled by stage_artifact.
   
   I think it would be easier to reason about the code if we have 
`_download_file` method and GCS / HTTP helper methods as follows:
   ```
   def _download_file(from_path, to_path):
 if from_path starts.with('gcs://'):
   _download_from_gcs(from_path, to_path)
   ...
 else if from_path starts.with('http'):
   _download_from_http(from_path, to_path)
   ...  
 else: 
   # used in tests 
   ...
   ```
   
   Also since 

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-14 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=101967&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-101967
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 15/May/18 01:11
Start Date: 15/May/18 01:11
Worklog Time Spent: 10m 
  Work Description: tvalentyn commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188141414
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager.py
 ##
 @@ -0,0 +1,573 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Support for installing custom code and required dependencies.
+
+Workflows, with the exception of very simple ones, are organized in multiple
+modules and packages. Typically, these modules and packages have
+dependencies on other standard libraries. Beam relies on the Python
+setuptools package to handle these scenarios. For further details please read:
+https://pythonhosted.org/an_example_pypi_project/setuptools.html
+
+When a runner tries to run a pipeline it will check for a --requirements_file
+and a --setup_file option.
+
+If --setup_file is present then it is assumed that the folder containing the
+file specified by the option has the typical layout required by setuptools and
+it will run 'python setup.py sdist' to produce a source distribution. The
+resulting tarball (a .tar or .tar.gz file) will be staged at the staging
+location specified as job option. When a worker starts it will check for the
+presence of this file and will run 'easy_install tarball' to install the
+package in the worker.
+
+If --requirements_file is present then the file specified by the option will be
+staged in the staging location.  When a worker starts it will check for the
+presence of this file and will run 'pip install -r requirements.txt'. A
+requirements file can be easily generated by running 'pip freeze -r
+requirements.txt'. The reason a runner does not run this automatically is
+because quite often only a small fraction of the dependencies present in a
+requirements.txt file are actually needed for remote execution and therefore a
+one-time manual trimming is desirable.
+
+TODO(silviuc): Should we allow several setup packages?
+TODO(silviuc): We should allow customizing the exact command for setup build.
+"""
+import functools
+import glob
+import logging
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+
+import pkg_resources
+
+from apache_beam.internal import pickler
+from apache_beam.io.filesystems import FileSystems
+from apache_beam.options.pipeline_options import SetupOptions
+# TODO(angoenka): Remove reference to dataflow internal names
+from apache_beam.runners.dataflow.internal import names
+from apache_beam.utils import processes
+
+# All constants are for internal use only; no backwards-compatibility
+# guarantees.
+
+# Standard file names used for staging files.
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz'
+REQUIREMENTS_FILE = 'requirements.txt'
+EXTRA_PACKAGES_FILE = 'extra_packages.txt'
+
+# Package names for distributions
+BEAM_PACKAGE_NAME = 'apache-beam'
+
+
+class Stager(object):
+  """Stager identifies and copies the appropriate artifacts to the staging
+  location."""
+
+  def _copy_file(self, from_path, to_path):
+"""Copies a local file to a GCS file or vice versa."""
+logging.info('file copy from %s to %s.', from_path, to_path)
+if from_path.startswith('gs://') or to_path.startswith('gs://'):
+  from apache_beam.io.gcp import gcsio
+  if from_path.startswith('gs://') and to_path.startswith('gs://'):
+# Both files are GCS files so copy.
+gcsio.GcsIO().copy(from_path, to_path)
+  elif to_path.startswith('gs://'):
+# Only target is a GCS file, read local file and upload.
+with open(from_path, 'rb') as f:
+  with gcsio.GcsIO().open(to_path, mode='wb') as g:
+pfun = functools.partial(f.read, gcsio.WRITE_CHUNK_SIZE)
+for chunk in iter(pfun, ''):
+  

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-14 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=101960&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-101960
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 15/May/18 01:11
Start Date: 15/May/18 01:11
Worklog Time Spent: 10m 
  Work Description: tvalentyn commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188141423
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager.py
 ##
 @@ -0,0 +1,573 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Support for installing custom code and required dependencies.
+
+Workflows, with the exception of very simple ones, are organized in multiple
+modules and packages. Typically, these modules and packages have
+dependencies on other standard libraries. Beam relies on the Python
+setuptools package to handle these scenarios. For further details please read:
+https://pythonhosted.org/an_example_pypi_project/setuptools.html
+
+When a runner tries to run a pipeline it will check for a --requirements_file
+and a --setup_file option.
+
+If --setup_file is present then it is assumed that the folder containing the
+file specified by the option has the typical layout required by setuptools and
+it will run 'python setup.py sdist' to produce a source distribution. The
+resulting tarball (a .tar or .tar.gz file) will be staged at the staging
+location specified as job option. When a worker starts it will check for the
+presence of this file and will run 'easy_install tarball' to install the
+package in the worker.
+
+If --requirements_file is present then the file specified by the option will be
+staged in the staging location.  When a worker starts it will check for the
+presence of this file and will run 'pip install -r requirements.txt'. A
+requirements file can be easily generated by running 'pip freeze -r
+requirements.txt'. The reason a runner does not run this automatically is
+because quite often only a small fraction of the dependencies present in a
+requirements.txt file are actually needed for remote execution and therefore a
+one-time manual trimming is desirable.
+
+TODO(silviuc): Should we allow several setup packages?
+TODO(silviuc): We should allow customizing the exact command for setup build.
+"""
+import functools
+import glob
+import logging
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+
+import pkg_resources
+
+from apache_beam.internal import pickler
+from apache_beam.io.filesystems import FileSystems
+from apache_beam.options.pipeline_options import SetupOptions
+# TODO(angoenka): Remove reference to dataflow internal names
+from apache_beam.runners.dataflow.internal import names
+from apache_beam.utils import processes
+
+# All constants are for internal use only; no backwards-compatibility
+# guarantees.
+
+# Standard file names used for staging files.
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz'
+REQUIREMENTS_FILE = 'requirements.txt'
+EXTRA_PACKAGES_FILE = 'extra_packages.txt'
+
+# Package names for distributions
+BEAM_PACKAGE_NAME = 'apache-beam'
+
+
+class Stager(object):
+  """Stager identifies and copies the appropriate artifacts to the staging
+  location."""
+
+  def _copy_file(self, from_path, to_path):
+"""Copies a local file to a GCS file or vice versa."""
+logging.info('file copy from %s to %s.', from_path, to_path)
+if from_path.startswith('gs://') or to_path.startswith('gs://'):
+  from apache_beam.io.gcp import gcsio
+  if from_path.startswith('gs://') and to_path.startswith('gs://'):
+# Both files are GCS files so copy.
+gcsio.GcsIO().copy(from_path, to_path)
+  elif to_path.startswith('gs://'):
+# Only target is a GCS file, read local file and upload.
+with open(from_path, 'rb') as f:
+  with gcsio.GcsIO().open(to_path, mode='wb') as g:
+pfun = functools.partial(f.read, gcsio.WRITE_CHUNK_SIZE)
+for chunk in iter(pfun, ''):
+  

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-14 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=101966&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-101966
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 15/May/18 01:11
Start Date: 15/May/18 01:11
Worklog Time Spent: 10m 
  Work Description: tvalentyn commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188141429
 
 

 ##
 File path: sdks/python/apache_beam/runners/dataflow/internal/dependency.py
 ##
 @@ -607,65 +151,9 @@ def get_sdk_name_and_version():
 def get_sdk_package_name():
 
 Review comment:
   
   Let's implement this in LegacyDataflowStager (see another comment).


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 101966)
Time Spent: 9h 10m  (was: 9h)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 9h 10m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-14 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=101963&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-101963
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 15/May/18 01:11
Start Date: 15/May/18 01:11
Worklog Time Spent: 10m 
  Work Description: tvalentyn commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188141413
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager.py
 ##
 @@ -0,0 +1,573 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Support for installing custom code and required dependencies.
+
+Workflows, with the exception of very simple ones, are organized in multiple
+modules and packages. Typically, these modules and packages have
+dependencies on other standard libraries. Beam relies on the Python
+setuptools package to handle these scenarios. For further details please read:
+https://pythonhosted.org/an_example_pypi_project/setuptools.html
+
+When a runner tries to run a pipeline it will check for a --requirements_file
+and a --setup_file option.
+
+If --setup_file is present then it is assumed that the folder containing the
+file specified by the option has the typical layout required by setuptools and
+it will run 'python setup.py sdist' to produce a source distribution. The
+resulting tarball (a .tar or .tar.gz file) will be staged at the staging
+location specified as job option. When a worker starts it will check for the
+presence of this file and will run 'easy_install tarball' to install the
+package in the worker.
+
+If --requirements_file is present then the file specified by the option will be
+staged in the staging location.  When a worker starts it will check for the
+presence of this file and will run 'pip install -r requirements.txt'. A
+requirements file can be easily generated by running 'pip freeze -r
+requirements.txt'. The reason a runner does not run this automatically is
+because quite often only a small fraction of the dependencies present in a
+requirements.txt file are actually needed for remote execution and therefore a
+one-time manual trimming is desirable.
+
+TODO(silviuc): Should we allow several setup packages?
+TODO(silviuc): We should allow customizing the exact command for setup build.
+"""
+import functools
+import glob
+import logging
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+
+import pkg_resources
+
+from apache_beam.internal import pickler
+from apache_beam.io.filesystems import FileSystems
+from apache_beam.options.pipeline_options import SetupOptions
+# TODO(angoenka): Remove reference to dataflow internal names
+from apache_beam.runners.dataflow.internal import names
+from apache_beam.utils import processes
+
+# All constants are for internal use only; no backwards-compatibility
+# guarantees.
+
+# Standard file names used for staging files.
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz'
+REQUIREMENTS_FILE = 'requirements.txt'
+EXTRA_PACKAGES_FILE = 'extra_packages.txt'
+
+# Package names for distributions
+BEAM_PACKAGE_NAME = 'apache-beam'
+
+
+class Stager(object):
+  """Stager identifies and copies the appropriate artifacts to the staging
+  location."""
+
+  def _copy_file(self, from_path, to_path):
+"""Copies a local file to a GCS file or vice versa."""
+logging.info('file copy from %s to %s.', from_path, to_path)
+if from_path.startswith('gs://') or to_path.startswith('gs://'):
+  from apache_beam.io.gcp import gcsio
+  if from_path.startswith('gs://') and to_path.startswith('gs://'):
+# Both files are GCS files so copy.
+gcsio.GcsIO().copy(from_path, to_path)
+  elif to_path.startswith('gs://'):
+# Only target is a GCS file, read local file and upload.
+with open(from_path, 'rb') as f:
+  with gcsio.GcsIO().open(to_path, mode='wb') as g:
+pfun = functools.partial(f.read, gcsio.WRITE_CHUNK_SIZE)
+for chunk in iter(pfun, ''):
+  

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-14 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=101961&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-101961
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 15/May/18 01:11
Start Date: 15/May/18 01:11
Worklog Time Spent: 10m 
  Work Description: tvalentyn commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188141416
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager.py
 ##
 @@ -0,0 +1,573 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Support for installing custom code and required dependencies.
+
+Workflows, with the exception of very simple ones, are organized in multiple
+modules and packages. Typically, these modules and packages have
+dependencies on other standard libraries. Beam relies on the Python
+setuptools package to handle these scenarios. For further details please read:
+https://pythonhosted.org/an_example_pypi_project/setuptools.html
+
+When a runner tries to run a pipeline it will check for a --requirements_file
+and a --setup_file option.
+
+If --setup_file is present then it is assumed that the folder containing the
+file specified by the option has the typical layout required by setuptools and
+it will run 'python setup.py sdist' to produce a source distribution. The
+resulting tarball (a .tar or .tar.gz file) will be staged at the staging
+location specified as job option. When a worker starts it will check for the
+presence of this file and will run 'easy_install tarball' to install the
+package in the worker.
+
+If --requirements_file is present then the file specified by the option will be
+staged in the staging location.  When a worker starts it will check for the
+presence of this file and will run 'pip install -r requirements.txt'. A
+requirements file can be easily generated by running 'pip freeze -r
+requirements.txt'. The reason a runner does not run this automatically is
+because quite often only a small fraction of the dependencies present in a
+requirements.txt file are actually needed for remote execution and therefore a
+one-time manual trimming is desirable.
+
+TODO(silviuc): Should we allow several setup packages?
+TODO(silviuc): We should allow customizing the exact command for setup build.
+"""
+import functools
+import glob
+import logging
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+
+import pkg_resources
+
+from apache_beam.internal import pickler
+from apache_beam.io.filesystems import FileSystems
+from apache_beam.options.pipeline_options import SetupOptions
+# TODO(angoenka): Remove reference to dataflow internal names
+from apache_beam.runners.dataflow.internal import names
+from apache_beam.utils import processes
+
+# All constants are for internal use only; no backwards-compatibility
+# guarantees.
+
+# Standard file names used for staging files.
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz'
+REQUIREMENTS_FILE = 'requirements.txt'
+EXTRA_PACKAGES_FILE = 'extra_packages.txt'
+
+# Package names for distributions
+BEAM_PACKAGE_NAME = 'apache-beam'
+
+
+class Stager(object):
+  """Stager identifies and copies the appropriate artifacts to the staging
+  location."""
+
+  def _copy_file(self, from_path, to_path):
+"""Copies a local file to a GCS file or vice versa."""
+logging.info('file copy from %s to %s.', from_path, to_path)
+if from_path.startswith('gs://') or to_path.startswith('gs://'):
+  from apache_beam.io.gcp import gcsio
+  if from_path.startswith('gs://') and to_path.startswith('gs://'):
+# Both files are GCS files so copy.
+gcsio.GcsIO().copy(from_path, to_path)
+  elif to_path.startswith('gs://'):
+# Only target is a GCS file, read local file and upload.
+with open(from_path, 'rb') as f:
+  with gcsio.GcsIO().open(to_path, mode='wb') as g:
+pfun = functools.partial(f.read, gcsio.WRITE_CHUNK_SIZE)
+for chunk in iter(pfun, ''):
+  

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-14 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=101964&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-101964
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 15/May/18 01:11
Start Date: 15/May/18 01:11
Worklog Time Spent: 10m 
  Work Description: tvalentyn commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188141422
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager.py
 ##
 @@ -0,0 +1,573 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Support for installing custom code and required dependencies.
+
+Workflows, with the exception of very simple ones, are organized in multiple
+modules and packages. Typically, these modules and packages have
+dependencies on other standard libraries. Beam relies on the Python
+setuptools package to handle these scenarios. For further details please read:
+https://pythonhosted.org/an_example_pypi_project/setuptools.html
+
+When a runner tries to run a pipeline it will check for a --requirements_file
+and a --setup_file option.
+
+If --setup_file is present then it is assumed that the folder containing the
+file specified by the option has the typical layout required by setuptools and
+it will run 'python setup.py sdist' to produce a source distribution. The
+resulting tarball (a .tar or .tar.gz file) will be staged at the staging
+location specified as job option. When a worker starts it will check for the
+presence of this file and will run 'easy_install tarball' to install the
+package in the worker.
+
+If --requirements_file is present then the file specified by the option will be
+staged in the staging location.  When a worker starts it will check for the
+presence of this file and will run 'pip install -r requirements.txt'. A
+requirements file can be easily generated by running 'pip freeze -r
+requirements.txt'. The reason a runner does not run this automatically is
+because quite often only a small fraction of the dependencies present in a
+requirements.txt file are actually needed for remote execution and therefore a
+one-time manual trimming is desirable.
+
+TODO(silviuc): Should we allow several setup packages?
+TODO(silviuc): We should allow customizing the exact command for setup build.
+"""
+import functools
+import glob
+import logging
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+
+import pkg_resources
+
+from apache_beam.internal import pickler
+from apache_beam.io.filesystems import FileSystems
+from apache_beam.options.pipeline_options import SetupOptions
+# TODO(angoenka): Remove reference to dataflow internal names
+from apache_beam.runners.dataflow.internal import names
+from apache_beam.utils import processes
+
+# All constants are for internal use only; no backwards-compatibility
+# guarantees.
+
+# Standard file names used for staging files.
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz'
+REQUIREMENTS_FILE = 'requirements.txt'
+EXTRA_PACKAGES_FILE = 'extra_packages.txt'
+
+# Package names for distributions
+BEAM_PACKAGE_NAME = 'apache-beam'
+
+
+class Stager(object):
+  """Stager identifies and copies the appropriate artifacts to the staging
+  location."""
+
+  def _copy_file(self, from_path, to_path):
+"""Copies a local file to a GCS file or vice versa."""
+logging.info('file copy from %s to %s.', from_path, to_path)
+if from_path.startswith('gs://') or to_path.startswith('gs://'):
+  from apache_beam.io.gcp import gcsio
+  if from_path.startswith('gs://') and to_path.startswith('gs://'):
+# Both files are GCS files so copy.
+gcsio.GcsIO().copy(from_path, to_path)
+  elif to_path.startswith('gs://'):
+# Only target is a GCS file, read local file and upload.
+with open(from_path, 'rb') as f:
+  with gcsio.GcsIO().open(to_path, mode='wb') as g:
+pfun = functools.partial(f.read, gcsio.WRITE_CHUNK_SIZE)
+for chunk in iter(pfun, ''):
+  

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-14 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=101962&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-101962
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 15/May/18 01:11
Start Date: 15/May/18 01:11
Worklog Time Spent: 10m 
  Work Description: tvalentyn commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188141415
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager.py
 ##
 @@ -0,0 +1,573 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Support for installing custom code and required dependencies.
+
+Workflows, with the exception of very simple ones, are organized in multiple
+modules and packages. Typically, these modules and packages have
+dependencies on other standard libraries. Beam relies on the Python
+setuptools package to handle these scenarios. For further details please read:
+https://pythonhosted.org/an_example_pypi_project/setuptools.html
+
+When a runner tries to run a pipeline it will check for a --requirements_file
+and a --setup_file option.
+
+If --setup_file is present then it is assumed that the folder containing the
+file specified by the option has the typical layout required by setuptools and
+it will run 'python setup.py sdist' to produce a source distribution. The
+resulting tarball (a .tar or .tar.gz file) will be staged at the staging
+location specified as job option. When a worker starts it will check for the
+presence of this file and will run 'easy_install tarball' to install the
+package in the worker.
+
+If --requirements_file is present then the file specified by the option will be
+staged in the staging location.  When a worker starts it will check for the
+presence of this file and will run 'pip install -r requirements.txt'. A
+requirements file can be easily generated by running 'pip freeze -r
+requirements.txt'. The reason a runner does not run this automatically is
+because quite often only a small fraction of the dependencies present in a
+requirements.txt file are actually needed for remote execution and therefore a
+one-time manual trimming is desirable.
+
+TODO(silviuc): Should we allow several setup packages?
+TODO(silviuc): We should allow customizing the exact command for setup build.
+"""
+import functools
+import glob
+import logging
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+
+import pkg_resources
+
+from apache_beam.internal import pickler
+from apache_beam.io.filesystems import FileSystems
+from apache_beam.options.pipeline_options import SetupOptions
+# TODO(angoenka): Remove reference to dataflow internal names
+from apache_beam.runners.dataflow.internal import names
+from apache_beam.utils import processes
+
+# All constants are for internal use only; no backwards-compatibility
+# guarantees.
+
+# Standard file names used for staging files.
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz'
+REQUIREMENTS_FILE = 'requirements.txt'
+EXTRA_PACKAGES_FILE = 'extra_packages.txt'
+
+# Package names for distributions
+BEAM_PACKAGE_NAME = 'apache-beam'
+
+
+class Stager(object):
+  """Stager identifies and copies the appropriate artifacts to the staging
+  location."""
+
+  def _copy_file(self, from_path, to_path):
+"""Copies a local file to a GCS file or vice versa."""
+logging.info('file copy from %s to %s.', from_path, to_path)
+if from_path.startswith('gs://') or to_path.startswith('gs://'):
+  from apache_beam.io.gcp import gcsio
+  if from_path.startswith('gs://') and to_path.startswith('gs://'):
+# Both files are GCS files so copy.
+gcsio.GcsIO().copy(from_path, to_path)
+  elif to_path.startswith('gs://'):
+# Only target is a GCS file, read local file and upload.
+with open(from_path, 'rb') as f:
+  with gcsio.GcsIO().open(to_path, mode='wb') as g:
+pfun = functools.partial(f.read, gcsio.WRITE_CHUNK_SIZE)
+for chunk in iter(pfun, ''):
+  

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-14 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=101965&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-101965
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 15/May/18 01:11
Start Date: 15/May/18 01:11
Worklog Time Spent: 10m 
  Work Description: tvalentyn commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188141420
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager.py
 ##
 @@ -0,0 +1,573 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Support for installing custom code and required dependencies.
+
+Workflows, with the exception of very simple ones, are organized in multiple
+modules and packages. Typically, these modules and packages have
+dependencies on other standard libraries. Beam relies on the Python
+setuptools package to handle these scenarios. For further details please read:
+https://pythonhosted.org/an_example_pypi_project/setuptools.html
+
+When a runner tries to run a pipeline it will check for a --requirements_file
+and a --setup_file option.
+
+If --setup_file is present then it is assumed that the folder containing the
+file specified by the option has the typical layout required by setuptools and
+it will run 'python setup.py sdist' to produce a source distribution. The
+resulting tarball (a .tar or .tar.gz file) will be staged at the staging
+location specified as job option. When a worker starts it will check for the
+presence of this file and will run 'easy_install tarball' to install the
+package in the worker.
+
+If --requirements_file is present then the file specified by the option will be
+staged in the staging location.  When a worker starts it will check for the
+presence of this file and will run 'pip install -r requirements.txt'. A
+requirements file can be easily generated by running 'pip freeze -r
+requirements.txt'. The reason a runner does not run this automatically is
+because quite often only a small fraction of the dependencies present in a
+requirements.txt file are actually needed for remote execution and therefore a
+one-time manual trimming is desirable.
+
+TODO(silviuc): Should we allow several setup packages?
+TODO(silviuc): We should allow customizing the exact command for setup build.
+"""
+import functools
+import glob
+import logging
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+
+import pkg_resources
+
+from apache_beam.internal import pickler
+from apache_beam.io.filesystems import FileSystems
+from apache_beam.options.pipeline_options import SetupOptions
+# TODO(angoenka): Remove reference to dataflow internal names
+from apache_beam.runners.dataflow.internal import names
+from apache_beam.utils import processes
+
+# All constants are for internal use only; no backwards-compatibility
+# guarantees.
+
+# Standard file names used for staging files.
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz'
+REQUIREMENTS_FILE = 'requirements.txt'
+EXTRA_PACKAGES_FILE = 'extra_packages.txt'
+
+# Package names for distributions
+BEAM_PACKAGE_NAME = 'apache-beam'
+
+
+class Stager(object):
+  """Stager identifies and copies the appropriate artifacts to the staging
+  location."""
+
+  def _copy_file(self, from_path, to_path):
+"""Copies a local file to a GCS file or vice versa."""
+logging.info('file copy from %s to %s.', from_path, to_path)
+if from_path.startswith('gs://') or to_path.startswith('gs://'):
+  from apache_beam.io.gcp import gcsio
+  if from_path.startswith('gs://') and to_path.startswith('gs://'):
+# Both files are GCS files so copy.
+gcsio.GcsIO().copy(from_path, to_path)
+  elif to_path.startswith('gs://'):
+# Only target is a GCS file, read local file and upload.
+with open(from_path, 'rb') as f:
+  with gcsio.GcsIO().open(to_path, mode='wb') as g:
+pfun = functools.partial(f.read, gcsio.WRITE_CHUNK_SIZE)
+for chunk in iter(pfun, ''):
+  

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-14 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=101958&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-101958
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 15/May/18 01:11
Start Date: 15/May/18 01:11
Worklog Time Spent: 10m 
  Work Description: tvalentyn commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188141421
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager.py
 ##
 @@ -0,0 +1,573 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Support for installing custom code and required dependencies.
+
+Workflows, with the exception of very simple ones, are organized in multiple
+modules and packages. Typically, these modules and packages have
+dependencies on other standard libraries. Beam relies on the Python
+setuptools package to handle these scenarios. For further details please read:
+https://pythonhosted.org/an_example_pypi_project/setuptools.html
+
+When a runner tries to run a pipeline it will check for a --requirements_file
+and a --setup_file option.
+
+If --setup_file is present then it is assumed that the folder containing the
+file specified by the option has the typical layout required by setuptools and
+it will run 'python setup.py sdist' to produce a source distribution. The
+resulting tarball (a .tar or .tar.gz file) will be staged at the staging
+location specified as job option. When a worker starts it will check for the
+presence of this file and will run 'easy_install tarball' to install the
+package in the worker.
+
+If --requirements_file is present then the file specified by the option will be
+staged in the staging location.  When a worker starts it will check for the
+presence of this file and will run 'pip install -r requirements.txt'. A
+requirements file can be easily generated by running 'pip freeze -r
+requirements.txt'. The reason a runner does not run this automatically is
+because quite often only a small fraction of the dependencies present in a
+requirements.txt file are actually needed for remote execution and therefore a
+one-time manual trimming is desirable.
+
+TODO(silviuc): Should we allow several setup packages?
+TODO(silviuc): We should allow customizing the exact command for setup build.
+"""
+import functools
+import glob
+import logging
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+
+import pkg_resources
+
+from apache_beam.internal import pickler
+from apache_beam.io.filesystems import FileSystems
+from apache_beam.options.pipeline_options import SetupOptions
+# TODO(angoenka): Remove reference to dataflow internal names
+from apache_beam.runners.dataflow.internal import names
+from apache_beam.utils import processes
+
+# All constants are for internal use only; no backwards-compatibility
+# guarantees.
+
+# Standard file names used for staging files.
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz'
+REQUIREMENTS_FILE = 'requirements.txt'
+EXTRA_PACKAGES_FILE = 'extra_packages.txt'
+
+# Package names for distributions
+BEAM_PACKAGE_NAME = 'apache-beam'
+
+
+class Stager(object):
+  """Stager identifies and copies the appropriate artifacts to the staging
+  location."""
+
+  def _copy_file(self, from_path, to_path):
+"""Copies a local file to a GCS file or vice versa."""
+logging.info('file copy from %s to %s.', from_path, to_path)
+if from_path.startswith('gs://') or to_path.startswith('gs://'):
+  from apache_beam.io.gcp import gcsio
+  if from_path.startswith('gs://') and to_path.startswith('gs://'):
+# Both files are GCS files so copy.
+gcsio.GcsIO().copy(from_path, to_path)
+  elif to_path.startswith('gs://'):
+# Only target is a GCS file, read local file and upload.
+with open(from_path, 'rb') as f:
+  with gcsio.GcsIO().open(to_path, mode='wb') as g:
+pfun = functools.partial(f.read, gcsio.WRITE_CHUNK_SIZE)
+for chunk in iter(pfun, ''):
+  

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-14 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=101957&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-101957
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 15/May/18 01:11
Start Date: 15/May/18 01:11
Worklog Time Spent: 10m 
  Work Description: tvalentyn commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r188141411
 
 

 ##
 File path: sdks/python/apache_beam/runners/dataflow/internal/apiclient.py
 ##
 @@ -731,6 +746,18 @@ def translate_scalar_counter_float(accumulator, 
metric_update_proto):
 metric_update_proto.floatingPoint = accumulator.value
 
 
+class _ParameterizedStager(Stager):
 
 Review comment:
   
   Let's call this `_LegacyDataflowStager`, and take an instance of 
`DataflowApplicationClient` in the constructor, then let's call 
`dataflow_application_client._gcs_file_copy()` in stage_artifact.


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 101957)
Time Spent: 7h 50m  (was: 7h 40m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 7h 50m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-14 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=101843&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-101843
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 14/May/18 19:19
Start Date: 14/May/18 19:19
Worklog Time Spent: 10m 
  Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor 
and clean dependency.py to make it reusable with artifact service
URL: https://github.com/apache/beam/pull/5251#issuecomment-388931882
 
 
   Run Python Dataflow ValidatesRunner


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 101843)
Time Spent: 7h 40m  (was: 7.5h)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 7h 40m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-11 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=101288&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-101288
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 11/May/18 20:05
Start Date: 11/May/18 20:05
Worklog Time Spent: 10m 
  Work Description: angoenka commented on issue #5273: [BEAM-3883] Adding 
Client to push artifacts to artifact staging service
URL: https://github.com/apache/beam/pull/5273#issuecomment-388471936
 
 
   Replied on the comment in PR #5251 


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 101288)
Time Spent: 7.5h  (was: 7h 20m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 7.5h
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-11 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=101285=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-101285
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 11/May/18 19:58
Start Date: 11/May/18 19:58
Worklog Time Spent: 10m 
  Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor 
and clean dependency.py to make it reusable with artifact service
URL: https://github.com/apache/beam/pull/5251#issuecomment-388470371
 
 
   From the perspective of fixing the entanglement in the code, I do not want 
to handle it in this PR as it is unrelated to streamlining dependency.py. The 
current entanglement is in the apiclient.py which does file staging and reuses 
some of the same state. I think this should be handled separately as it's not in 
the scope of this PR. 
   Regarding the naming of GCSStager, it's because it's not intended to be used 
outside apiclient and is only meant to be used for the Dataflow runner. I would be 
happy to rename it as long as the name signifies that it's not meant for use 
elsewhere. I can also make it a private class if it makes sense.
   "ParameterizedStager" seems to be a good name and just to make it private, I 
will make it "_ParameterizedStager".


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 101285)
Time Spent: 7h 20m  (was: 7h 10m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 7h 20m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-10 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100939=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100939
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 11/May/18 01:15
Start Date: 11/May/18 01:15
Worklog Time Spent: 10m 
  Work Description: angoenka commented on issue #5273: [BEAM-3883] Adding 
Client to push artifacts to artifact staging service
URL: https://github.com/apache/beam/pull/5273#issuecomment-388230845
 
 
   I have updated the PR based on the discussion 
https://github.com/apache/beam/pull/5251#issuecomment-388163831
   PTAL 


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 100939)
Time Spent: 7h 10m  (was: 7h)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 7h 10m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-10 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100931=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100931
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 11/May/18 00:47
Start Date: 11/May/18 00:47
Worklog Time Spent: 10m 
  Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor 
and clean dependency.py to make it reusable with artifact service
URL: https://github.com/apache/beam/pull/5251#issuecomment-388226972
 
 
   @tvalentyn I have merged filehandler in stager based on your suggestion. PTAL


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 100931)
Time Spent: 7h  (was: 6h 50m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 7h
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-10 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100807=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100807
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 10/May/18 20:19
Start Date: 10/May/18 20:19
Worklog Time Spent: 10m 
  Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor 
and clean dependency.py to make it reusable with artifact service
URL: https://github.com/apache/beam/pull/5251#issuecomment-388173088
 
 
   I agree with the general structure you mentioned. And just to rephrase it I 
would call them
   1. Collect artifacts
   2. Commit artifacts
   
   The key difference in the current code is that it commits individual 
artifacts when it finds them while collecting. This certainly makes the code 
messy.
   However there are some challenges as how we download artifacts which can 
depend upon the setup. 
   To reuse the collection code we will need to pass a download_file method which 
can download a remote file to the local file system from a variety of sources.
   And then we will have an upload_file which will be used to commit artifacts.
   Which brings us very close to the current implementation and make me less 
inclined towards this change.
   FileHandler has just 2 methods, download_file and upload_file, which clearly 
outline the expectations of a runner. This makes stager completely 
reusable with different FileHandlers.
   
   Please let me know if it makes sense.


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 100807)
Time Spent: 6h 50m  (was: 6h 40m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 6h 50m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-10 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100789=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100789
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 10/May/18 19:44
Start Date: 10/May/18 19:44
Worklog Time Spent: 10m 
  Work Description: tvalentyn commented on issue #5251: [BEAM-3883] 
Refactor and clean dependency.py to make it reusable with artifact service
URL: https://github.com/apache/beam/pull/5251#issuecomment-388163831
 
 
   I think we can simplify the logic here. During  resource staging for Python 
SDK we need to stage  pipeline artifacts (SDK, workflow main session, 
requirements.txt, cached version of the packages in requirements.txt, maybe 
workflow tarball). 
   
   Staging happens in two steps: 
   
   1. If the artifact is not local, download the artifact from artifact source 
location to a local temp folder.
   2. Stage the artifact from local folder into the staging location.
   
   Depending on the execution environment, we can have different staging 
locations: 
   - All portable runners, including ULR should stage artifacts to an Artifact 
Server  over GRPC. 
   - Dataflow Runner for non-portable pipelines need to stage artifacts to a 
GCS bucket. 
   
   Step 2 needs to be different for different stager implementations, however 
the first step does not. I don't see a reason to implement prestaging the 
dependencies  separately for each stager. Being able to stage SDK from GCS, 
HTTP, or other location is a capability of SDK, regardless of the runner, so I 
think it should be common. We can support other locations as well when the need 
arises.
   
   This said, I suggest the following sketch for the abstractions:
   
   ```
   class Stager(object):
 def stage_job_resources(options, temp_dir=None, ...):
  ''' Materializes all resources to be staged in a local folder, stages 
artifacts 
  one by one using _stage_artifact(), and calls _commit_manifest() at the 
end. 
  SDK: will be staged from PyPI if sdk_location=default, otherwise from 
sdk_location. 
  sdk_location can be a path to local directory, GCS path or HTTP URL.
  Extra packages are staged from local directory.
  Packages from requirements.txt are downloaded from PyPI into temporary 
folder, 
  then staged to a staging location. 
  ...
  ''' 
  # move existing functionality from dependency.stage_job_resources() here.
   
   
 def stage_artifact(local_path_to_artifact, artefact_name):
   """ Stages the artifact to self._staging_location, if successful returns 
True and 
   adds artifact_name to the manifest of artifacts that have been staged.
   """
   raise NotImplementedError
   
 def commit_manifest():
   """Commits manifest through Artifact API."""
   raise NotImplementedError
   
   class GcsStager(Stager):
  # Stager for legacy Dataflow pipelines
  def stage_artifact(local_path_to_artifact, artefact_name):
 # check that self.staging_location is a GCS bucket
 # copy the artifact to 
gs://self.staging_location/artefact_name+some_suffix 
   
 def commit_manifest():
pass # No need to do anything here for legacy pipelines.
   
   class ArtifactServerStager(Stager):
 def stage_artifact(local_path_to_artifact, artefact_name):
 # Implementation that talks to Artifact Server via Fn Artifact API.
   
 def commit_manifest(local_path_to_artifact, artefact_name):
 # Implementation that talks to Artifact Server via Fn Artifact API.
   ```
   
   What do you think?
   


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 100789)
Time Spent: 6h 40m  (was: 6.5h)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 6h 40m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in 

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-10 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100779=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100779
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 10/May/18 19:15
Start Date: 10/May/18 19:15
Worklog Time Spent: 10m 
  Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor 
and clean dependency.py to make it reusable with artifact service
URL: https://github.com/apache/beam/pull/5251#issuecomment-388156058
 
 
   Run Python Dataflow ValidatesContainer


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 100779)
Time Spent: 6.5h  (was: 6h 20m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 6.5h
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-09 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100382=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100382
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 10/May/18 02:38
Start Date: 10/May/18 02:38
Worklog Time Spent: 10m 
  Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor 
and clean dependency.py to make it reusable with artifact service
URL: https://github.com/apache/beam/pull/5251#issuecomment-387934312
 
 
   Run Python Dataflow ValidatesRunner
   
   


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 100382)
Time Spent: 6h 20m  (was: 6h 10m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 6h 20m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-09 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100381=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100381
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 10/May/18 02:37
Start Date: 10/May/18 02:37
Worklog Time Spent: 10m 
  Work Description: angoenka commented on issue #5273: [BEAM-3883] Adding 
Client to push artifacts to artifact staging service
URL: https://github.com/apache/beam/pull/5273#issuecomment-387934066
 
 
   Run Python Dataflow ValidatesRunner
   
   


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 100381)
Time Spent: 6h 10m  (was: 6h)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 6h 10m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-09 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100380=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100380
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 10/May/18 02:35
Start Date: 10/May/18 02:35
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5273: 
[BEAM-3883] Adding Client to push artifacts to artifact staging service
URL: https://github.com/apache/beam/pull/5273#discussion_r187226786
 
 

 ##
 File path: 
sdks/python/apache_beam/runners/portability/artifact_service_client.py
 ##
 @@ -0,0 +1,104 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""A :class:`FileHandler` to work with :class:`ArtifactStagingServiceStub`.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from apache_beam.portability.api import beam_artifact_api_pb2
+from apache_beam.portability.api import beam_artifact_api_pb2_grpc
+from apache_beam.runners.portability.stager import FileHandler
+
+
+class ArtifactStagingFileHandler(FileHandler):
+  """:class:`FileHandler` to push files to ArtifactStagingService.
+
+  The class keeps track of pushed files and user is expected to call
+  :fun:`commit_manifest` once all files are uploaded.
+  Once :fun:`commit_manifest` is called, no further operations can be performed
+  on the class.
+
+  Note: This class is not thread safe and user of this class should ensure
+  thread safety.
+  """
+
+  def __init__(self, artifact_service_channel):
+"""Creates a new FileHandler to upload file to ArtifactStagingService.
+
+Args:
+  artifact_service_channel: Channel used to interact with
+ArtifactStagingService.User owns the channel and should close it when
+finished.
+"""
+super(ArtifactStagingFileHandler, self).__init__()
+self._artifact_staging_stub = beam_artifact_api_pb2_grpc.\
+ArtifactStagingServiceStub(channel=artifact_service_channel)
+self._artifacts = []
+self.closed = False
+
+  def file_copy(self, from_path, to_path):
 
 Review comment:
   Sure. Made this change in the parent PR


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 100380)
Time Spent: 6h  (was: 5h 50m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 6h
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-09 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100377=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100377
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 10/May/18 02:35
Start Date: 10/May/18 02:35
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5273: 
[BEAM-3883] Adding Client to push artifacts to artifact staging service
URL: https://github.com/apache/beam/pull/5273#discussion_r187199182
 
 

 ##
 File path: 
sdks/python/apache_beam/runners/portability/artifact_service_client.py
 ##
 @@ -0,0 +1,104 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""A :class:`FileHandler` to work with :class:`ArtifactStagingServiceStub`.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from apache_beam.portability.api import beam_artifact_api_pb2
+from apache_beam.portability.api import beam_artifact_api_pb2_grpc
+from apache_beam.runners.portability.stager import FileHandler
+
+
+class ArtifactStagingFileHandler(FileHandler):
+  """:class:`FileHandler` to push files to ArtifactStagingService.
+
+  The class keeps track of pushed files and user is expected to call
+  :fun:`commit_manifest` once all files are uploaded.
+  Once :fun:`commit_manifest` is called, no further operations can be performed
+  on the class.
+
+  Note: This class is not thread safe and user of this class should ensure
+  thread safety.
+  """
+
+  def __init__(self, artifact_service_channel):
+"""Creates a new FileHandler to upload file to ArtifactStagingService.
+
+Args:
+  artifact_service_channel: Channel used to interact with
+ArtifactStagingService.User owns the channel and should close it when
+finished.
+"""
+super(ArtifactStagingFileHandler, self).__init__()
+self._artifact_staging_stub = beam_artifact_api_pb2_grpc.\
+ArtifactStagingServiceStub(channel=artifact_service_channel)
+self._artifacts = []
+self.closed = False
+
+  def file_copy(self, from_path, to_path):
+"""Uploads a file to ArtifactStagingService.
+
+Note: Downloading/copying file from remote server is not supported.
+Args:
+  from_path: Path of file to be uploaded.
+  to_path: File name on the artifact server.
+"""
+self._check_closed()
+if not os.path.isfile(from_path):
+  raise ValueError(
+  'Can only copy local file to artifact server. from_path: {0} '
+  'to_path: {1}'.format(from_path, to_path))
+
+def artifact_request_generator():
+  metadata = beam_artifact_api_pb2.ArtifactMetadata(name=to_path)
+  request = beam_artifact_api_pb2.PutArtifactRequest(metadata=metadata)
+  yield request
+  with open(from_path, 'rb') as f:
+while True:
+  chunk = f.read(2 << 12)  # 4kb
+  if not chunk:
+break
+  request = beam_artifact_api_pb2.PutArtifactRequest(
+  data=beam_artifact_api_pb2.ArtifactChunk(data=chunk))
+  yield request
+  self._artifacts.append(metadata)
+
+response = self._artifact_staging_stub.PutArtifact(
+artifact_request_generator())
+print(response)
+
+  def file_download(self, from_url, to_path):
+self._check_closed()
+return super(ArtifactStagingFileHandler, self).file_download(
+from_url, to_path)
+
+  def commit_manifest(self):
 
 Review comment:
   Makes sense.
   Will not commit in case of exception or error.


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 100377)
Time Spent: 5h 40m  (was: 5.5h)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
>

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-09 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100378=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100378
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 10/May/18 02:35
Start Date: 10/May/18 02:35
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5273: 
[BEAM-3883] Adding Client to push artifacts to artifact staging service
URL: https://github.com/apache/beam/pull/5273#discussion_r187190033
 
 

 ##
 File path: 
sdks/python/apache_beam/runners/portability/artifact_service_client.py
 ##
 @@ -0,0 +1,104 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""A :class:`FileHandler` to work with :class:`ArtifactStagingServiceStub`.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from apache_beam.portability.api import beam_artifact_api_pb2
+from apache_beam.portability.api import beam_artifact_api_pb2_grpc
+from apache_beam.runners.portability.stager import FileHandler
+
+
+class ArtifactStagingFileHandler(FileHandler):
+  """:class:`FileHandler` to push files to ArtifactStagingService.
+
+  The class keeps track of pushed files and user is expected to call
+  :fun:`commit_manifest` once all files are uploaded.
+  Once :fun:`commit_manifest` is called, no further operations can be performed
+  on the class.
+
+  Note: This class is not thread safe and user of this class should ensure
+  thread safety.
+  """
+
+  def __init__(self, artifact_service_channel):
+"""Creates a new FileHandler to upload file to ArtifactStagingService.
+
+Args:
+  artifact_service_channel: Channel used to interact with
+ArtifactStagingService.User owns the channel and should close it when
+finished.
+"""
+super(ArtifactStagingFileHandler, self).__init__()
+self._artifact_staging_stub = beam_artifact_api_pb2_grpc.\
+ArtifactStagingServiceStub(channel=artifact_service_channel)
+self._artifacts = []
+self.closed = False
+
+  def file_copy(self, from_path, to_path):
+"""Uploads a file to ArtifactStagingService.
+
+Note: Downloading/copying file from remote server is not supported.
+Args:
+  from_path: Path of file to be uploaded.
+  to_path: File name on the artifact server.
+"""
+self._check_closed()
+if not os.path.isfile(from_path):
+  raise ValueError(
+  'Can only copy local file to artifact server. from_path: {0} '
+  'to_path: {1}'.format(from_path, to_path))
+
+def artifact_request_generator():
+  metadata = beam_artifact_api_pb2.ArtifactMetadata(name=to_path)
+  request = beam_artifact_api_pb2.PutArtifactRequest(metadata=metadata)
+  yield request
+  with open(from_path, 'rb') as f:
+while True:
+  chunk = f.read(2 << 12)  # 4kb
+  if not chunk:
+break
+  request = beam_artifact_api_pb2.PutArtifactRequest(
+  data=beam_artifact_api_pb2.ArtifactChunk(data=chunk))
+  yield request
+  self._artifacts.append(metadata)
+
+response = self._artifact_staging_stub.PutArtifact(
+artifact_request_generator())
+print(response)
 
 Review comment:
  Removed.


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 100378)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 5h 50m
> 

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-09 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100376=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100376
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 10/May/18 02:35
Start Date: 10/May/18 02:35
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5273: 
[BEAM-3883] Adding Client to push artifacts to artifact staging service
URL: https://github.com/apache/beam/pull/5273#discussion_r187189738
 
 

 ##
 File path: 
sdks/python/apache_beam/runners/portability/artifact_service_client.py
 ##
 @@ -0,0 +1,104 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""A :class:`FileHandler` to work with :class:`ArtifactStagingServiceStub`.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from apache_beam.portability.api import beam_artifact_api_pb2
+from apache_beam.portability.api import beam_artifact_api_pb2_grpc
+from apache_beam.runners.portability.stager import FileHandler
+
+
+class ArtifactStagingFileHandler(FileHandler):
+  """:class:`FileHandler` to push files to ArtifactStagingService.
+
+  The class keeps track of pushed files and user is expected to call
+  :fun:`commit_manifest` once all files are uploaded.
+  Once :fun:`commit_manifest` is called, no further operations can be performed
+  on the class.
+
+  Note: This class is not thread safe and user of this class should ensure
+  thread safety.
+  """
+
+  def __init__(self, artifact_service_channel):
+"""Creates a new FileHandler to upload file to ArtifactStagingService.
+
+Args:
+  artifact_service_channel: Channel used to interact with
+ArtifactStagingService.User owns the channel and should close it when
+finished.
+"""
+super(ArtifactStagingFileHandler, self).__init__()
+self._artifact_staging_stub = beam_artifact_api_pb2_grpc.\
+ArtifactStagingServiceStub(channel=artifact_service_channel)
+self._artifacts = []
+self.closed = False
+
+  def file_copy(self, from_path, to_path):
+"""Uploads a file to ArtifactStagingService.
+
+Note: Downloading/copying file from remote server is not supported.
+Args:
+  from_path: Path of file to be uploaded.
+  to_path: File name on the artifact server.
+"""
+self._check_closed()
+if not os.path.isfile(from_path):
+  raise ValueError(
+  'Can only copy local file to artifact server. from_path: {0} '
+  'to_path: {1}'.format(from_path, to_path))
+
+def artifact_request_generator():
+  metadata = beam_artifact_api_pb2.ArtifactMetadata(name=to_path)
+  request = beam_artifact_api_pb2.PutArtifactRequest(metadata=metadata)
+  yield request
+  with open(from_path, 'rb') as f:
+while True:
+  chunk = f.read(2 << 12)  # 4kb
 
 Review comment:
  Sure, we can go up to 2MB. gRPC has a payload size limit of 4MB, so 2MB should 
be OK.


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 100376)
Time Spent: 5.5h  (was: 5h 20m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 5.5h
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-09 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100379=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100379
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 10/May/18 02:35
Start Date: 10/May/18 02:35
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5273: 
[BEAM-3883] Adding Client to push artifacts to artifact staging service
URL: https://github.com/apache/beam/pull/5273#discussion_r187191092
 
 

 ##
 File path: 
sdks/python/apache_beam/runners/portability/artifact_service_client.py
 ##
 @@ -0,0 +1,104 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""A :class:`FileHandler` to work with :class:`ArtifactStagingServiceStub`.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from apache_beam.portability.api import beam_artifact_api_pb2
+from apache_beam.portability.api import beam_artifact_api_pb2_grpc
+from apache_beam.runners.portability.stager import FileHandler
+
+
+class ArtifactStagingFileHandler(FileHandler):
+  """:class:`FileHandler` to push files to ArtifactStagingService.
+
+  The class keeps track of pushed files and user is expected to call
+  :fun:`commit_manifest` once all files are uploaded.
+  Once :fun:`commit_manifest` is called, no further operations can be performed
+  on the class.
+
+  Note: This class is not thread safe and user of this class should ensure
+  thread safety.
+  """
+
+  def __init__(self, artifact_service_channel):
+"""Creates a new FileHandler to upload file to ArtifactStagingService.
+
+Args:
+  artifact_service_channel: Channel used to interact with
+ArtifactStagingService.User owns the channel and should close it when
+finished.
+"""
+super(ArtifactStagingFileHandler, self).__init__()
+self._artifact_staging_stub = beam_artifact_api_pb2_grpc.\
+ArtifactStagingServiceStub(channel=artifact_service_channel)
+self._artifacts = []
+self.closed = False
+
+  def file_copy(self, from_path, to_path):
+"""Uploads a file to ArtifactStagingService.
+
+Note: Downloading/copying file from remote server is not supported.
+Args:
+  from_path: Path of file to be uploaded.
+  to_path: File name on the artifact server.
+"""
+self._check_closed()
+if not os.path.isfile(from_path):
+  raise ValueError(
+  'Can only copy local file to artifact server. from_path: {0} '
+  'to_path: {1}'.format(from_path, to_path))
+
+def artifact_request_generator():
+  metadata = beam_artifact_api_pb2.ArtifactMetadata(name=to_path)
+  request = beam_artifact_api_pb2.PutArtifactRequest(metadata=metadata)
+  yield request
+  with open(from_path, 'rb') as f:
+while True:
+  chunk = f.read(2 << 12)  # 4kb
+  if not chunk:
+break
+  request = beam_artifact_api_pb2.PutArtifactRequest(
+  data=beam_artifact_api_pb2.ArtifactChunk(data=chunk))
+  yield request
+  self._artifacts.append(metadata)
+
+response = self._artifact_staging_stub.PutArtifact(
+artifact_request_generator())
+print(response)
+
+  def file_download(self, from_url, to_path):
 
 Review comment:
   - download_file? 
   Sure. Made this change in the parent PR
   - Is from_url in the same format as to_path in upload_file? If so they 
should use consistent terminology.
   Not really. In the case of upload_file, to_path is just the name of the file 
in the staging location, while in download_file, from_url can be any URL 
from which we can potentially download a file, such as http:// or gs://.
   - why do we need this file at all? I thought the artifact stager only 
uploads files. Is this a test helper method?
   The stager preps the files to be uploaded and uploads them.
   A case where download is required is when a package tarball is provided 
as an HTTP URL or on GCS.


This is an automated message from the Apache Git 

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-09 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100308=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100308
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 09/May/18 22:44
Start Date: 09/May/18 22:44
Worklog Time Spent: 10m 
  Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor 
and clean dependency.py to make it reusable with artifact service
URL: https://github.com/apache/beam/pull/5251#issuecomment-387897316
 
 
   Run Python Dataflow ValidatesRunner
   
   


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 100308)
Time Spent: 5h 20m  (was: 5h 10m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 5h 20m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-09 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100282=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100282
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 09/May/18 21:55
Start Date: 09/May/18 21:55
Worklog Time Spent: 10m 
  Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor 
and clean dependency.py to make it reusable with artifact service
URL: https://github.com/apache/beam/pull/5251#issuecomment-387887629
 
 
   Run Python Dataflow ValidatesRunner
   
   


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 100282)
Time Spent: 5h 10m  (was: 5h)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 5h 10m
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-09 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100281=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100281
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 09/May/18 21:53
Start Date: 09/May/18 21:53
Worklog Time Spent: 10m 
  Work Description: angoenka commented on issue #5251: [BEAM-3883] Refactor 
and clean dependency.py to make it reusable with artifact service
URL: https://github.com/apache/beam/pull/5251#issuecomment-387887066
 
 
   Run Python Dataflow ValidatesRunner
   
   


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 100281)
Time Spent: 5h  (was: 4h 50m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 5h
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-09 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100274=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100274
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 09/May/18 21:49
Start Date: 09/May/18 21:49
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r187133776
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager.py
 ##
 @@ -0,0 +1,556 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Support for installing custom code and required dependencies.
+
+Workflows, with the exception of very simple ones, are organized in multiple
+modules and packages. Typically, these modules and packages have
+dependencies on other standard libraries. Beam relies on the Python
+setuptools package to handle these scenarios. For further details please read:
+https://pythonhosted.org/an_example_pypi_project/setuptools.html
+
+When a runner tries to run a pipeline it will check for a --requirements_file
+and a --setup_file option.
+
+If --setup_file is present then it is assumed that the folder containing the
+file specified by the option has the typical layout required by setuptools and
+it will run 'python setup.py sdist' to produce a source distribution. The
+resulting tarball (a .tar or .tar.gz file) will be staged at the staging
+location specified as job option. When a worker starts it will check for the
+presence of this file and will run 'easy_install tarball' to install the
+package in the worker.
+
+If --requirements_file is present then the file specified by the option will be
+staged in the staging location.  When a worker starts it will check for the
+presence of this file and will run 'pip install -r requirements.txt'. A
+requirements file can be easily generated by running 'pip freeze -r
+requirements.txt'. The reason a runner does not run this automatically is
+because quite often only a small fraction of the dependencies present in a
+requirements.txt file are actually needed for remote execution and therefore a
+one-time manual trimming is desirable.
+
+TODO(silviuc): Staged files should have a job specific prefix.
+To prevent several jobs in the same project stomping on each other due to a
 
 Review comment:
   Makes sense.
   Also removing the TODO, as different jobs in a project should have different 
staging directories.


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
---

Worklog Id: (was: 100274)
Time Spent: 4.5h  (was: 4h 20m)

> Python SDK stages artifacts when talking to job server
> --
>
> Key: BEAM-3883
> URL: https://issues.apache.org/jira/browse/BEAM-3883
> Project: Beam
>  Issue Type: Sub-task
>  Components: sdk-py-core
>Reporter: Ben Sidhom
>Assignee: Ankur Goenka
>Priority: Major
>  Time Spent: 4.5h
>  Remaining Estimate: 0h
>
> The Python SDK does not currently stage its user-defined functions or 
> dependencies when talking to the job API. Artifacts that need to be staged 
> include the user code itself, any SDK components not included in the 
> container image, and the list of Python packages that must be installed at 
> runtime.
>  
> Artifacts that are currently expected can be found in the harness boot code: 
> [https://github.com/apache/beam/blob/58e3b06bee7378d2d8db1c8dd534b415864f63e1/sdks/python/container/boot.go#L52.]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-09 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100273=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100273
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 09/May/18 21:49
Start Date: 09/May/18 21:49
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r187161722
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager.py
 ##
 @@ -0,0 +1,556 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Support for installing custom code and required dependencies.
+
+Workflows, with the exception of very simple ones, are organized in multiple
+modules and packages. Typically, these modules and packages have
+dependencies on other standard libraries. Beam relies on the Python
+setuptools package to handle these scenarios. For further details please read:
+https://pythonhosted.org/an_example_pypi_project/setuptools.html
+
+When a runner tries to run a pipeline it will check for a --requirements_file
+and a --setup_file option.
+
+If --setup_file is present then it is assumed that the folder containing the
+file specified by the option has the typical layout required by setuptools and
+it will run 'python setup.py sdist' to produce a source distribution. The
+resulting tarball (a .tar or .tar.gz file) will be staged at the staging
+location specified as job option. When a worker starts it will check for the
+presence of this file and will run 'easy_install tarball' to install the
+package in the worker.
+
+If --requirements_file is present then the file specified by the option will be
+staged in the staging location.  When a worker starts it will check for the
+presence of this file and will run 'pip install -r requirements.txt'. A
+requirements file can be easily generated by running 'pip freeze -r
+requirements.txt'. The reason a runner does not run this automatically is
+because quite often only a small fraction of the dependencies present in a
+requirements.txt file are actually needed for remote execution and therefore a
+one-time manual trimming is desirable.
+
+TODO(silviuc): Staged files should have a job specific prefix.
+To prevent several jobs in the same project stomping on each other due to a
+shared staging location.
+
+TODO(silviuc): Should we allow several setup packages?
+TODO(silviuc): We should allow customizing the exact command for setup build.
+"""
+
+import glob
+import logging
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+
+import pkg_resources
+
+from apache_beam.internal import pickler
+from apache_beam.io.filesystems import FileSystems
+from apache_beam.options.pipeline_options import SetupOptions
+# TODO(angoenka): Remove reference to dataflow internal names
+from apache_beam.runners.dataflow.internal import names
+from apache_beam.utils import processes
+
+# All constants are for internal use only; no backwards-compatibility
+# guarantees.
+
+# Standard file names used for staging files.
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz'
+REQUIREMENTS_FILE = 'requirements.txt'
+EXTRA_PACKAGES_FILE = 'extra_packages.txt'
+
+# Package names for distributions
+BEAM_PACKAGE_NAME = 'apache-beam'
+
+
+class FileHandler(object):
 
 Review comment:
   The class is needed to group the functionality of copy/download/check_remote 
as these are the 3 things which are used by stager.
   Also, file_download needs file_copy, as it copies the file from a remote 
location if it is not an HTTP URL.
   
   FileHandler is not intended to be implemented for every runner, as the SDK 
will not directly interact with the runner; instead, it will only submit the job 
to a job service and the artifacts to an artifact service, for both of which we 
have a well-defined contract. 
   However, by providing FileHandler as an argument, we keep the possibility open 
for adding other means of interacting with files, and also support the 
old runners.
   
   I will update the docstring to highlight the usage 

[jira] [Work logged] (BEAM-3883) Python SDK stages artifacts when talking to job server

2018-05-09 Thread ASF GitHub Bot (JIRA)

 [ 
https://issues.apache.org/jira/browse/BEAM-3883?focusedWorklogId=100275=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-100275
 ]

ASF GitHub Bot logged work on BEAM-3883:


Author: ASF GitHub Bot
Created on: 09/May/18 21:49
Start Date: 09/May/18 21:49
Worklog Time Spent: 10m 
  Work Description: angoenka commented on a change in pull request #5251: 
[BEAM-3883] Refactor and clean dependency.py to make it reusable with artifact 
service
URL: https://github.com/apache/beam/pull/5251#discussion_r187170162
 
 

 ##
 File path: sdks/python/apache_beam/runners/portability/stager.py
 ##
 @@ -0,0 +1,556 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Support for installing custom code and required dependencies.
+
+Workflows, with the exception of very simple ones, are organized in multiple
+modules and packages. Typically, these modules and packages have
+dependencies on other standard libraries. Beam relies on the Python
+setuptools package to handle these scenarios. For further details please read:
+https://pythonhosted.org/an_example_pypi_project/setuptools.html
+
+When a runner tries to run a pipeline it will check for a --requirements_file
+and a --setup_file option.
+
+If --setup_file is present then it is assumed that the folder containing the
+file specified by the option has the typical layout required by setuptools and
+it will run 'python setup.py sdist' to produce a source distribution. The
+resulting tarball (a .tar or .tar.gz file) will be staged at the staging
+location specified as job option. When a worker starts it will check for the
+presence of this file and will run 'easy_install tarball' to install the
+package in the worker.
+
+If --requirements_file is present then the file specified by the option will be
+staged in the staging location.  When a worker starts it will check for the
+presence of this file and will run 'pip install -r requirements.txt'. A
+requirements file can be easily generated by running 'pip freeze -r
+requirements.txt'. The reason a runner does not run this automatically is
+because quite often only a small fraction of the dependencies present in a
+requirements.txt file are actually needed for remote execution and therefore a
+one-time manual trimming is desirable.
+
+TODO(silviuc): Staged files should have a job specific prefix.
+To prevent several jobs in the same project stomping on each other due to a
+shared staging location.
+
+TODO(silviuc): Should we allow several setup packages?
+TODO(silviuc): We should allow customizing the exact command for setup build.
+"""
+
+import glob
+import logging
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+
+import pkg_resources
+
+from apache_beam.internal import pickler
+from apache_beam.io.filesystems import FileSystems
+from apache_beam.options.pipeline_options import SetupOptions
+# TODO(angoenka): Remove reference to dataflow internal names
+from apache_beam.runners.dataflow.internal import names
+from apache_beam.utils import processes
+
+# All constants are for internal use only; no backwards-compatibility
+# guarantees.
+
+# Standard file names used for staging files.
+WORKFLOW_TARBALL_FILE = 'workflow.tar.gz'
+REQUIREMENTS_FILE = 'requirements.txt'
+EXTRA_PACKAGES_FILE = 'extra_packages.txt'
+
+# Package names for distributions
+BEAM_PACKAGE_NAME = 'apache-beam'
+
+
+class FileHandler(object):
+
+  def file_copy(self, from_path, to_path):
+"""Copies a local file to a remote location or vice versa."""
+logging.info('File copy from %s to %s.', from_path, to_path)
+
+if not os.path.isdir(os.path.dirname(to_path)):
+  logging.info(
+  'Created folder (since we have not done yet, and any errors '
+  'will follow): %s ', os.path.dirname(to_path))
+  os.mkdir(os.path.dirname(to_path))
+shutil.copyfile(from_path, to_path)
+
+  def file_download(self, from_url, to_path):
+"""Downloads a file over http/https from a or copy them from a remote
+location."""
+if from_url.startswith('http://') or from_url.startswith('https://'):
+  # TODO(silviuc): We should cache 

  1   2   >