[GitHub] [airflow] bhirsz commented on a diff in pull request #26915: Rewrite system tests for ML Engine service

GitBox Thu, 06 Oct 2022 22:46:00 -0700


bhirsz commented on code in PR #26915:
URL: https://github.com/apache/airflow/pull/26915#discussion_r989701818



##########
tests/system/providers/google/cloud/ml_engine/example_mlengine.py:
##########
@@ -37,70 +40,74 @@
     MLEngineStartBatchPredictionJobOperator,
     MLEngineStartTrainingJobOperator,
 )
+from airflow.providers.google.cloud.transfers.local_to_gcs import 
LocalFilesystemToGCSOperator
 from airflow.providers.google.cloud.utils import mlengine_operator_utils
+from airflow.utils.trigger_rule import TriggerRule
 
-PROJECT_ID = os.environ.get("GCP_PROJECT_ID", "example-project")
+DAG_ID = "example_gcp_mlengine"
+BASE_DIR = pathlib.Path(__file__).parent.resolve()
+PREDICT_FILE_NAME = 'predict.json'
+PATH_TO_PREDICT_FILE = BASE_DIR / PREDICT_FILE_NAME
 
-MODEL_NAME = os.environ.get("GCP_MLENGINE_MODEL_NAME", "model_name")
-
-SAVED_MODEL_PATH = os.environ.get("GCP_MLENGINE_SAVED_MODEL_PATH", 
"gs://INVALID BUCKET NAME/saved-model/")
-JOB_DIR = os.environ.get("GCP_MLENGINE_JOB_DIR", "gs://INVALID BUCKET 
NAME/keras-job-dir")
-PREDICTION_INPUT = os.environ.get(
-    "GCP_MLENGINE_PREDICTION_INPUT", "gs://INVALID BUCKET 
NAME/prediction_input.json"
-)
+PROJECT_ID = os.environ.get("GCP_PROJECT_ID")
+ENV_ID = os.environ.get("SYSTEM_TESTS_ENV_ID")
+MODEL_NAME = os.environ.get("GCP_MLENGINE_MODEL_NAME", 
f"example_mlengine_model_{ENV_ID}")
+BUCKET_NAME = os.environ.get("BUCKET_NAME", 
f"example_mlengine_bucket_{ENV_ID}")
+BUCKET_PATH = os.environ.get("BUCKET_PATH", f"gs://{BUCKET_NAME}")
+JOB_DIR = os.environ.get("GCP_MLENGINE_JOB_DIR", f"{BUCKET_PATH}/job-dir")
+SAVED_MODEL_PATH = os.environ.get("GCP_MLENGINE_SAVED_MODEL_PATH", 
f"{JOB_DIR}/")
+PREDICTION_INPUT = os.environ.get("GCP_MLENGINE_PREDICTION_INPUT", 
f"{BUCKET_PATH}/{PREDICT_FILE_NAME}")
 PREDICTION_OUTPUT = os.environ.get(
-    "GCP_MLENGINE_PREDICTION_OUTPUT", "gs://INVALID BUCKET 
NAME/prediction_output"
+    "GCP_MLENGINE_PREDICTION_OUTPUT", "gs://INVALID BUCKET 
NAME/prediction_output/"
+)
+TRAINER_URI = os.environ.get(
+    "GCP_MLENGINE_TRAINER_URI",
+    "gs://system-tests-resources/example_gcp_mlengine/trainer-0.1.tar.gz",
+)
+TRAINER_PY_MODULE = os.environ.get(
+    "GCP_MLENGINE_TRAINER_TRAINER_PY_MODULE",
+    "trainer.task",
 )
-TRAINER_URI = os.environ.get("GCP_MLENGINE_TRAINER_URI", "gs://INVALID BUCKET 
NAME/trainer.tar.gz")
-TRAINER_PY_MODULE = os.environ.get("GCP_MLENGINE_TRAINER_TRAINER_PY_MODULE", 
"trainer.task")
+SUMMARY_TMP = os.environ.get("GCP_MLENGINE_DATAFLOW_TMP", 
f"{BUCKET_PATH}/tmp/")
+SUMMARY_STAGING = os.environ.get("GCP_MLENGINE_DATAFLOW_STAGING", 
f"{BUCKET_PATH}/staging/")
 
-SUMMARY_TMP = os.environ.get("GCP_MLENGINE_DATAFLOW_TMP", "gs://INVALID BUCKET 
NAME/tmp/")
-SUMMARY_STAGING = os.environ.get("GCP_MLENGINE_DATAFLOW_STAGING", 
"gs://INVALID BUCKET NAME/staging/")
+
+def generate_model_predict_input_data() -> list[int]:
+    return [i for i in range(0, 201, 10)]
 
 
 with models.DAG(
-    "example_gcp_mlengine",
+    dag_id=DAG_ID,
+    schedule="@once",
     start_date=datetime(2021, 1, 1),
     catchup=False,
-    tags=['example'],
+    tags=['example', 'ml_engine'],
     params={"model_name": MODEL_NAME},
 ) as dag:
-    hyperparams: dict[str, Any] = {
-        'goal': 'MAXIMIZE',
-        'hyperparameterMetricTag': 'metric1',
-        'maxTrials': 30,
-        'maxParallelTrials': 1,
-        'enableTrialEarlyStopping': True,
-        'params': [],
-    }
-
-    hyperparams['params'].append(
-        {
-            'parameterName': 'hidden1',
-            'type': 'INTEGER',
-            'minValue': 40,
-            'maxValue': 400,
-            'scaleType': 'UNIT_LINEAR_SCALE',
-        }
+    create_bucket = GCSCreateBucketOperator(
+        task_id="create-bucket",
+        bucket_name=BUCKET_NAME,
     )
 
-    hyperparams['params'].append(
-        {'parameterName': 'numRnnCells', 'type': 'DISCRETE', 'discreteValues': 
[1, 2, 3, 4]}
+    def write_predict_file(path_to_file: str):
+        predict_data = generate_model_predict_input_data()
+        with open(path_to_file, 'w') as file:
+            for p in predict_data:
+                file.write(f'{{"input_layer": [{p}]}}\n')
+
+    write_data = PythonOperator(
+        task_id="write-predict-data-file",
+        python_callable=write_predict_file,
+        op_args=(PATH_TO_PREDICT_FILE,),
     )

Review Comment:
   The code is fine, but since the introduction of the ``@task`` decorator, 
using it is now the recommended way to define Python tasks: 
https://airflow.apache.org/docs/apache-airflow/stable/howto/operator/python.html



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [airflow] bhirsz commented on a diff in pull request #26915: Rewrite system tests for ML Engine service

Reply via email to