Hi all,
I have created a Dataflow pipeline in batch mode using the Apache Beam Python
SDK. I am using a non-public dependency, 'uplight-telemetry', which I have
specified via the extra_package parameter when creating the pipeline_options
object. However, the pipeline fails to load with the error *No module
named 'uplight_telemetry'*.
The code that creates pipeline_options is as follows:
def __create_pipeline_options_dataflow(job_name):
    # Set up the Dataflow runner options
    gcp_project_id = os.environ.get(GCP_PROJECT_ID)
    current_dir = os.path.dirname(os.path.abspath(__file__))
    print("current_dir=", current_dir)
    setup_file_path = os.path.join(current_dir, '..', '..', 'setup.py')
    print("Set-up file path=", setup_file_path)
    # TODO: Move file to proper location
    uplight_telemetry_tar_file_path = os.path.join(
        current_dir, '..', '..', '..', 'non-public-dependencies',
        'uplight-telemetry-1.0.0.tar.gz')
    # TODO: Move to environment variables
    pipeline_options = {
        'project': gcp_project_id,
        'region': "us-east1",
        'job_name': job_name,  # Provide a unique job name
        'temp_location': f'gs://{TAS_GCS_BUCKET_NAME_PREFIX}{os.getenv("UP_PLATFORM_ENV")}/temp',
        'staging_location': f'gs://{TAS_GCS_BUCKET_NAME_PREFIX}{os.getenv("UP_PLATFORM_ENV")}/staging',
        'runner': 'DataflowRunner',
        'save_main_session': True,
        'service_account_email': os.environ.get(SERVICE_ACCOUNT),
        # 'network': f'projects/{gcp_project_id}/global/networks/default',
        'subnetwork': os.environ.get(SUBNETWORK_URL),
        'setup_file': setup_file_path,
        'extra_package': uplight_telemetry_tar_file_path,
        # 'template_location': 'gcr.io/dataflow-templates-base/python310-template-launcher-base'
    }
    print("Pipeline options created for job-name", job_name)
    logger.debug(f"pipeline_options created as {pipeline_options}")
    return pipeline_options
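For context, the returned dictionary is consumed roughly like this (a
simplified sketch of my driver code; the job name and the Create transform
are just placeholders):

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

options_dict = __create_pipeline_options_dataflow("my-batch-job")  # placeholder name
# Keyword arguments are applied as overrides on top of any parsed flags.
pipeline_options = PipelineOptions(**options_dict)

with beam.Pipeline(options=pipeline_options) as p:
    _ = p | beam.Create([1, 2, 3])  # placeholder transform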
Why is Dataflow not trying to install this package from extra_package?
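For reference, my understanding is that this dictionary key should behave
like the --extra_package command-line flag. This is a sketch of the
flag-based form I would have expected it to be equivalent to (the project
and paths here are illustrative, not my real values):

from apache_beam.options.pipeline_options import PipelineOptions

pipeline_options = PipelineOptions([
    '--runner=DataflowRunner',
    '--project=my-project',  # illustrative
    '--region=us-east1',
    '--setup_file=./setup.py',
    '--extra_package=./non-public-dependencies/uplight-telemetry-1.0.0.tar.gz',
])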