Eila, I believe you are using version 2.0.0. The use of the --download flag is fixed at head (I do not recall the exact version of the fix; it could be 2.2 or 2.3). If possible, please try to use a newer version of Beam.
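For reference, here is a rough sketch of how to check which SDK version the Datalab kernel is actually importing and how to move to a newer one (this assumes a notebook cell and that pip can reach PyPI from your environment; apache-beam[gcp] is the current distribution name, whereas google-cloud-dataflow was the 2.0.x-era wrapper):

# Check the SDK version the notebook kernel imports
import apache_beam as beam
print(beam.__version__)  # reports 2.0.0 in your case

# Then, from a shell or notebook cell, upgrade and restart the kernel, e.g.:
#   pip install --upgrade apache-beam[gcp]

If I remember correctly, newer SDKs stage the tarball with "pip download" rather than the removed "pip install --download", so the pip==9.0.3 pin should no longer be needed once you upgrade.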
Also, as Luke suggested, we would welcome any contributions to the documentation.

On Fri, Aug 24, 2018 at 1:26 PM, Lukasz Cwik <lc...@google.com> wrote:

> It seems like we only mention the need for pip 7.0.0 on the python
> quickstart page https://beam.apache.org/get-started/quickstart-py/
>
> Would you like to submit a change to update it?
>
> On Wed, Aug 22, 2018 at 9:31 AM OrielResearch Eila Arich-Landkof <e...@orielresearch.org> wrote:
>
>> The issue was with the pip version: --download was deprecated. I don't
>> know where this needs to be mentioned / fixed.
>>
>> Running
>>
>> pip install pip==9.0.3
>>
>> solved the issue.
>>
>> Thanks,
>> eila
>>
>> On Wed, Aug 22, 2018 at 11:20 AM OrielResearch Eila Arich-Landkof <e...@orielresearch.org> wrote:
>>
>>> I tried a simple pipeline that runs perfectly on the local runner and
>>> hits the same issue on Dataflow; see below. Is there anything in the
>>> environment that needs to be updated that I am not aware of?
>>>
>>> Many thanks for any reference.
>>> Eila
>>>
>>> import apache_beam as beam
>>> from apache_beam.options.pipeline_options import (
>>>     PipelineOptions, GoogleCloudOptions, StandardOptions)
>>>
>>> BUCKET_URL = 'gs://archs4'  # bucket placeholder
>>>
>>> options = PipelineOptions()
>>> google_cloud_options = options.view_as(GoogleCloudOptions)
>>> google_cloud_options.project = 'PROJECT-ID'
>>> google_cloud_options.job_name = 'try-debug'
>>> google_cloud_options.staging_location = '%s/staging' % BUCKET_URL  # 'gs://archs4/staging'
>>> google_cloud_options.temp_location = '%s/tmp' % BUCKET_URL  # 'gs://archs4/temp'
>>> options.view_as(StandardOptions).runner = 'DataflowRunner'
>>>
>>> p1 = beam.Pipeline(options=options)
>>>
>>> (p1 | 'read' >> beam.io.ReadFromText('gs://dataflow-samples/shakespeare/kinglear.txt')
>>>     | 'write' >> beam.io.WriteToText('gs://bucket/test.txt', num_shards=1)
>>> )
>>>
>>> p1.run().wait_until_finish()
>>>
>>> will fire the following error:
>>>
>>> CalledProcessErrorTraceback (most recent call last)
>>> <ipython-input-17-b4be63f7802f> in <module>()
>>>       5 )
>>>       6
>>> ----> 7 p1.run().wait_until_finish()
>>>
>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/pipeline.pyc in run(self, test_runner_api)
>>>     174     finally:
>>>     175       shutil.rmtree(tmpdir)
>>> --> 176     return self.runner.run(self)
>>>     177
>>>     178   def __enter__(self):
>>>
>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/dataflow_runner.pyc in run(self, pipeline)
>>>     250     # Create the job
>>>     251     result = DataflowPipelineResult(
>>> --> 252         self.dataflow_client.create_job(self.job), self)
>>>     253
>>>     254     self._metrics = DataflowMetrics(self.dataflow_client, result, self.job)
>>>
>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/utils/retry.pyc in wrapper(*args, **kwargs)
>>>     166     while True:
>>>     167       try:
>>> --> 168         return fun(*args, **kwargs)
>>>     169       except Exception as exn:  # pylint: disable=broad-except
>>>     170         if not retry_filter(exn):
>>>
>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/internal/apiclient.pyc in create_job(self, job)
>>>     423   def create_job(self, job):
>>>     424     """Creates job description. May stage and/or submit for remote execution."""
>>> --> 425     self.create_job_description(job)
>>>     426
>>>     427     # Stage and submit the job when necessary
>>>
>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/internal/apiclient.pyc in create_job_description(self, job)
>>>     446     """Creates a job described by the workflow proto."""
>>>     447     resources = dependency.stage_job_resources(
>>> --> 448         job.options, file_copy=self._gcs_file_copy)
>>>     449     job.proto.environment = Environment(
>>>     450         packages=resources, options=job.options,
>>>
>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/internal/dependency.pyc in stage_job_resources(options, file_copy, build_setup_args, temp_dir, populate_requirements_cache)
>>>     377     else:
>>>     378       sdk_remote_location = setup_options.sdk_location
>>> --> 379     _stage_beam_sdk_tarball(sdk_remote_location, staged_path, temp_dir)
>>>     380     resources.append(names.DATAFLOW_SDK_TARBALL_FILE)
>>>     381   else:
>>>
>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/internal/dependency.pyc in _stage_beam_sdk_tarball(sdk_remote_location, staged_path, temp_dir)
>>>     462   elif sdk_remote_location == 'pypi':
>>>     463     logging.info('Staging the SDK tarball from PyPI to %s', staged_path)
>>> --> 464     _dependency_file_copy(_download_pypi_sdk_package(temp_dir), staged_path)
>>>     465   else:
>>>     466     raise RuntimeError(
>>>
>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/internal/dependency.pyc in _download_pypi_sdk_package(temp_dir)
>>>     525               '--no-binary', ':all:', '--no-deps']
>>>     526   logging.info('Executing command: %s', cmd_args)
>>> --> 527   processes.check_call(cmd_args)
>>>     528   zip_expected = os.path.join(
>>>     529       temp_dir, '%s-%s.zip' % (package_name, version))
>>>
>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/utils/processes.pyc in check_call(*args, **kwargs)
>>>      42   if force_shell:
>>>      43     kwargs['shell'] = True
>>> ---> 44   return subprocess.check_call(*args, **kwargs)
>>>      45
>>>      46
>>>
>>> /usr/local/envs/py2env/lib/python2.7/subprocess.pyc in check_call(*popenargs, **kwargs)
>>>     188     if cmd is None:
>>>     189       cmd = popenargs[0]
>>> --> 190     raise CalledProcessError(retcode, cmd)
>>>     191   return 0
>>>     192
>>>
>>> CalledProcessError: Command '['/usr/local/envs/py2env/bin/python', '-m', 'pip', 'install', '--download', '/tmp/tmpyyiizo', 'google-cloud-dataflow==2.0.0', '--no-binary', ':all:', '--no-deps']' returned non-zero exit status 2
>>>
>>> On Wed, Aug 22, 2018 at 10:39 AM OrielResearch Eila Arich-Landkof <e...@orielresearch.org> wrote:
>>>
>>>> Hello all,
>>>>
>>>> I am running a pipeline that used to execute on Dataflow with no
>>>> issues. I am using the Datalab environment. See the error below; to my
>>>> understanding, it happens before the pipeline code is executed.
>>>> Any idea what went wrong?
>>>>
>>>> Thanks,
>>>> Eila
>>>>
>>>> Executing the pipeline:
>>>>
>>>> p.run().wait_until_finish()
>>>>
>>>> The following error is being fired:
>>>>
>>>> INFO:root:Executing command: ['/usr/local/envs/py2env/bin/python', 'setup.py', 'sdist', '--dist-dir', '/tmp/tmp_B0gnK']
>>>> INFO:root:Starting GCS upload to gs://archs4/staging/label-archs4-annotation-15.1534948236.075799/workflow.tar.gz...
>>>> INFO:oauth2client.client:Attempting refresh to obtain initial access_token
>>>> INFO:root:Completed GCS upload to gs://archs4/staging/label-archs4-annotation-15.1534948236.075799/workflow.tar.gz
>>>> INFO:root:Staging the SDK tarball from PyPI to gs://archs4/staging/label-archs4-annotation-15.1534948236.075799/dataflow_python_sdk.tar
>>>> INFO:root:Executing command: ['/usr/local/envs/py2env/bin/python', '-m', 'pip', 'install', '--download', '/tmp/tmp_B0gnK', 'google-cloud-dataflow==2.0.0', '--no-binary', ':all:', '--no-deps']
>>>>
>>>> CalledProcessErrorTraceback (most recent call last)
>>>> <ipython-input-27-1e5aeb8b7d9b> in <module>()
>>>> ----> 1 p.run().wait_until_finish()
>>>>
>>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/pipeline.pyc in run(self, test_runner_api)
>>>>     174     finally:
>>>>     175       shutil.rmtree(tmpdir)
>>>> --> 176     return self.runner.run(self)
>>>>     177
>>>>     178   def __enter__(self):
>>>>
>>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/dataflow_runner.pyc in run(self, pipeline)
>>>>     250     # Create the job
>>>>     251     result = DataflowPipelineResult(
>>>> --> 252         self.dataflow_client.create_job(self.job), self)
>>>>     253
>>>>     254     self._metrics = DataflowMetrics(self.dataflow_client, result, self.job)
>>>>
>>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/utils/retry.pyc in wrapper(*args, **kwargs)
>>>>     166     while True:
>>>>     167       try:
>>>> --> 168         return fun(*args, **kwargs)
>>>>     169       except Exception as exn:  # pylint: disable=broad-except
>>>>     170         if not retry_filter(exn):
>>>>
>>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/internal/apiclient.pyc in create_job(self, job)
>>>>     423   def create_job(self, job):
>>>>     424     """Creates job description. May stage and/or submit for remote execution."""
>>>> --> 425     self.create_job_description(job)
>>>>     426
>>>>     427     # Stage and submit the job when necessary
>>>>
>>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/internal/apiclient.pyc in create_job_description(self, job)
>>>>     446     """Creates a job described by the workflow proto."""
>>>>     447     resources = dependency.stage_job_resources(
>>>> --> 448         job.options, file_copy=self._gcs_file_copy)
>>>>     449     job.proto.environment = Environment(
>>>>     450         packages=resources, options=job.options,
>>>>
>>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/internal/dependency.pyc in stage_job_resources(options, file_copy, build_setup_args, temp_dir, populate_requirements_cache)
>>>>     377     else:
>>>>     378       sdk_remote_location = setup_options.sdk_location
>>>> --> 379     _stage_beam_sdk_tarball(sdk_remote_location, staged_path, temp_dir)
>>>>     380     resources.append(names.DATAFLOW_SDK_TARBALL_FILE)
>>>>     381   else:
>>>>
>>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/internal/dependency.pyc in _stage_beam_sdk_tarball(sdk_remote_location, staged_path, temp_dir)
>>>>     462   elif sdk_remote_location == 'pypi':
>>>>     463     logging.info('Staging the SDK tarball from PyPI to %s', staged_path)
>>>> --> 464     _dependency_file_copy(_download_pypi_sdk_package(temp_dir), staged_path)
>>>>     465   else:
>>>>     466     raise RuntimeError(
>>>>
>>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/internal/dependency.pyc in _download_pypi_sdk_package(temp_dir)
>>>>     525               '--no-binary', ':all:', '--no-deps']
>>>>     526   logging.info('Executing command: %s', cmd_args)
>>>> --> 527   processes.check_call(cmd_args)
>>>>     528   zip_expected = os.path.join(
>>>>     529       temp_dir, '%s-%s.zip' % (package_name, version))
>>>>
>>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/utils/processes.pyc in check_call(*args, **kwargs)
>>>>      42   if force_shell:
>>>>      43     kwargs['shell'] = True
>>>> ---> 44   return subprocess.check_call(*args, **kwargs)
>>>>      45
>>>>      46
>>>>
>>>> /usr/local/envs/py2env/lib/python2.7/subprocess.pyc in check_call(*popenargs, **kwargs)
>>>>     188     if cmd is None:
>>>>     189       cmd = popenargs[0]
>>>> --> 190     raise CalledProcessError(retcode, cmd)
>>>>     191   return 0
>>>>     192
>>>>
>>>> CalledProcessError: Command '['/usr/local/envs/py2env/bin/python', '-m', 'pip', 'install', '--download', '/tmp/tmp_B0gnK', 'google-cloud-dataflow==2.0.0', '--no-binary', ':all:', '--no-deps']' returned non-zero exit status 2
>>>>
>>>> --
>>>> Eila
>>>> www.orielresearch.org
>>>> https://www.meetup.com/Deep-Learning-In-Production/
>>>>
>>>
>>> --
>>> Eila
>>> www.orielresearch.org
>>> https://www.meetup.com/Deep-Learning-In-Production/
>>>
>>
>> --
>> Eila
>> www.orielresearch.org
>> https://www.meetup.com/Deep-Learning-In-Production/