Lee-W commented on code in PR #32683:
URL: https://github.com/apache/airflow/pull/32683#discussion_r1267484537


##########
airflow/providers/amazon/aws/hooks/sagemaker.py:
##########
@@ -1073,14 +1074,19 @@ def start_pipeline(
         :param display_name: The name this pipeline execution will have in the 
UI. Doesn't need to be unique.
         :param pipeline_params: Optional parameters for the pipeline.
             All parameters supplied need to already be present in the pipeline 
definition.
-        :param wait_for_completion: Will only return once the pipeline is 
complete if true.
-        :param check_interval: How long to wait between checks for pipeline 
status when waiting for
-            completion.
-        :param verbose: Whether to print steps details when waiting for 
completion.
-            Defaults to true, consider turning off for pipelines that have 
thousands of steps.
 
         :return: the ARN of the pipeline execution launched.
         """
+        if wait_for_completion or check_interval is not None:
+            warnings.warn(
+                "parameter `wait_for_completion` and `check_interval` are 
deprecated, "
+                "remove them and call check_status yourself if you want to 
wait for completion",
+                AirflowProviderDeprecationWarning,
+                stacklevel=2,
+            )
+        if check_interval is None:
+            check_interval = 30

Review Comment:
   May I know why we want to deprecate `check_interval`?



##########
airflow/providers/amazon/aws/operators/sagemaker.py:
##########
@@ -1024,22 +1032,57 @@ def __init__(
         self.pipeline_params = pipeline_params
         self.wait_for_completion = wait_for_completion
         self.check_interval = check_interval
+        self.waiter_max_attempts = waiter_max_attempts
         self.verbose = verbose
+        self.deferrable = deferrable
 
     def execute(self, context: Context) -> str:
         arn = self.hook.start_pipeline(
             pipeline_name=self.pipeline_name,
             display_name=self.display_name,
             pipeline_params=self.pipeline_params,
-            wait_for_completion=self.wait_for_completion,
-            check_interval=self.check_interval,
-            verbose=self.verbose,
         )
         self.log.info(
             "Starting a new execution for pipeline %s, running with ARN %s", 
self.pipeline_name, arn
         )
+        if self.deferrable:
+            self.defer(
+                trigger=SageMakerPipelineTrigger(
+                    waiter_type=SageMakerPipelineTrigger.Type.COMPLETE,
+                    pipeline_execution_arn=arn,
+                    waiter_delay=self.check_interval,
+                    waiter_max_attempts=self.waiter_max_attempts,
+                    aws_conn_id=self.aws_conn_id,
+                ),
+                method_name="execute_complete",
+            )
+        elif self.wait_for_completion:
+            waiter = self.hook.get_waiter("PipelineExecutionComplete")
+            wait(
+                waiter=waiter,
+                waiter_delay=self.check_interval,
+                waiter_max_attempts=self.waiter_max_attempts,
+                args={"PipelineExecutionArn": arn},
+                failure_message="Error while waiting for pipeline execution to 
complete",
+                status_message="Pipeline execution status",
+                status_args=["PipelineExecutionStatus", "FailureReason"],
+            )
+            self.hook.check_status(
+                arn,
+                "PipelineExecutionStatus",
+                lambda p: self.hook.describe_pipeline_exec(p, self.verbose),
+                self.check_interval,
+                non_terminal_states=self.hook.pipeline_non_terminal_states,
+                max_ingestion_time=self.waiter_max_attempts * 
self.check_interval,
+            )
         return arn
 
+    def execute_complete(self, context: Context, event: dict[str, Any] | None 
= None) -> str:
+        if event is None or event["status"] != "success":
+            raise AirflowException(f"Failure during pipeline execution: 
{event}")
+        else:
+            return event["value"]

Review Comment:
   ```suggestion
           return event["value"]
   ```



##########
airflow/providers/amazon/aws/hooks/sagemaker.py:
##########
@@ -1073,14 +1074,19 @@ def start_pipeline(
         :param display_name: The name this pipeline execution will have in the 
UI. Doesn't need to be unique.
         :param pipeline_params: Optional parameters for the pipeline.
             All parameters supplied need to already be present in the pipeline 
definition.
-        :param wait_for_completion: Will only return once the pipeline is 
complete if true.
-        :param check_interval: How long to wait between checks for pipeline 
status when waiting for
-            completion.
-        :param verbose: Whether to print steps details when waiting for 
completion.
-            Defaults to true, consider turning off for pipelines that have 
thousands of steps.

Review Comment:
   Should we mark them as deprecated instead of removing the docstring?



##########
airflow/providers/amazon/aws/triggers/sagemaker.py:
##########
@@ -115,3 +121,79 @@ async def run(self):
                 status_args=[self._get_response_status_key(self.job_type)],
             )
             yield TriggerEvent({"status": "success", "message": "Job 
completed."})
+
+
+class SageMakerPipelineTrigger(BaseTrigger):
+    """Trigger to wait for a sagemaker pipeline execution to finish."""
+
+    class Type(IntEnum):
+        """Type of waiter to use."""
+
+        COMPLETE = 1
+        STOPPED = 2
+
+    def __init__(
+        self,
+        waiter_type: Type,
+        pipeline_execution_arn: str,
+        waiter_delay: int,
+        waiter_max_attempts: int,
+        aws_conn_id: str,
+    ):
+        self.waiter_type = waiter_type
+        self.pipeline_execution_arn = pipeline_execution_arn
+        self.waiter_delay = waiter_delay
+        self.waiter_max_attempts = waiter_max_attempts
+        self.aws_conn_id = aws_conn_id
+
+    def serialize(self) -> tuple[str, dict[str, Any]]:
+        return (
+            "airflow.providers.amazon.aws.triggers.sagemaker.SageMakerTrigger",
+            {
+                "waiter_type": self.waiter_type.value,  # saving the int value 
here
+                "pipeline_execution_arn": self.pipeline_execution_arn,
+                "waiter_delay": self.waiter_delay,
+                "waiter_max_attempts": self.waiter_max_attempts,
+                "aws_conn_id": self.aws_conn_id,
+            },
+        )
+
+    _waiter_name = {
+        Type.COMPLETE: "PipelineExecutionComplete",
+        Type.STOPPED: "PipelineExecutionStopped",
+    }
+
+    async def run(self) -> AsyncIterator[TriggerEvent]:
+        attempts = 0
+        hook = SageMakerHook(aws_conn_id=self.aws_conn_id)
+        async with hook.async_conn as conn:
+            waiter = hook.get_waiter(self._waiter_name[self.waiter_type])
+            while attempts < self.waiter_max_attempts:
+                attempts = attempts + 1
+                try:
+                    await waiter.wait(
+                        PipelineExecutionArn=self.pipeline_execution_arn, 
WaiterConfig={"MaxAttempts": 1}
+                    )
+                    yield TriggerEvent({"status": "success", "value": 
self.pipeline_execution_arn})
+                    break  # we reach this point only if the waiter met a 
success criteria

Review Comment:
   not related to this PR.
   
   ---
   
   @pankajastro I remember you previously suggested we use `return` here. But I 
think using `break` here seems to be a better option. WDYT?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to