dstandish commented on code in PR #30618:
URL: https://github.com/apache/airflow/pull/30618#discussion_r1169174297
##########
airflow/providers/google/cloud/sensors/gcs.py:
##########
@@ -307,10 +310,37 @@ def poke(self, context: Context) -> bool:
self._matches = hook.list(self.bucket, prefix=self.prefix)
return bool(self._matches)
- def execute(self, context: Context) -> list[str]:
+ def execute(self, context: Context):
"""Overridden to allow matches to be passed"""
- super().execute(context)
- return self._matches
+ if not self.deferrable:
+ super().execute(context)
+ return self._matches
+ else:
+ self.defer(
+ timeout=timedelta(seconds=self.timeout),
+ trigger=GCSPrefixBlobTrigger(
+ bucket=self.bucket,
+ prefix=self.prefix,
+ poke_interval=self.poke_interval,
+ google_cloud_conn_id=self.google_cloud_conn_id,
+ hook_params={
+ "delegate_to": self.delegate_to,
+ "impersonation_chain": self.impersonation_chain,
+ },
+ ),
+ method_name="execute_complete",
+ )
+
+ def execute_complete(self, context: dict[str, Any], event: dict[str, str |
list[str]]) -> str | list[str]:
+ """
+ Callback for when the trigger fires - returns immediately.
+ Relies on trigger to throw an exception, otherwise it assumes
execution was
+ successful.
+ """
+ self.log.info("Sensor checks existence of objects: %s, %s",
self.bucket, self.prefix)
Review Comment:
```suggestion
self.log.info("Checking for existence of object: %s, %s",
self.bucket, self.prefix)
```
maybe tiny bit better?
##########
airflow/providers/google/cloud/triggers/gcs.py:
##########
@@ -97,3 +97,76 @@ async def _object_exists(self, hook: GCSAsyncHook,
bucket_name: str, object_name
if object_response:
return "success"
return "pending"
+
+
+class GCSPrefixBlobTrigger(GCSBlobTrigger):
+ """
+ A trigger that fires and it looks for all the objects in the given bucket
+ which matches the given prefix if not found sleep for certain interval and
checks again.
+
+ :param bucket: the bucket in the google cloud storage where the objects
are residing.
+ :param prefix: The prefix of the blob_names to match in the Google cloud
storage bucket
+ :param google_cloud_conn_id: reference to the Google Connection
+ :param poke_interval: polling period in seconds to check
+ """
+
+ def __init__(
+ self,
+ bucket: str,
+ prefix: str,
+ poke_interval: float,
+ google_cloud_conn_id: str,
+ hook_params: dict[str, Any],
+ ):
+ super().__init__(
+ bucket=bucket,
+ object_name=prefix,
+ poke_interval=poke_interval,
+ google_cloud_conn_id=google_cloud_conn_id,
+ hook_params=hook_params,
+ )
+ self.prefix = prefix
+
+ def serialize(self) -> tuple[str, dict[str, Any]]:
+ """Serializes GCSPrefixBlobTrigger arguments and classpath."""
+ return (
+ "airflow.providers.google.cloud.triggers.gcs.GCSPrefixBlobTrigger",
+ {
+ "bucket": self.bucket,
+ "prefix": self.prefix,
+ "poke_interval": self.poke_interval,
+ "google_cloud_conn_id": self.google_cloud_conn_id,
+ "hook_params": self.hook_params,
+ },
+ )
+
+ async def run(self) -> AsyncIterator["TriggerEvent"]:
+ """Simple loop until the matches are found for the given prefix on the
bucket."""
+ try:
+ hook = self._get_async_hook()
+ while True:
+ res = await self._list_blobs_with_prefix(
+ hook=hook, bucket_name=self.bucket, prefix=self.prefix
+ )
+ if len(res) > 0:
+ yield TriggerEvent(
+ {"status": "success", "message": "Successfully
completed", "matches": res}
+ )
+ await asyncio.sleep(self.poke_interval)
+ except Exception as e:
+ yield TriggerEvent({"status": "error", "message": str(e)})
+ return
+
+ async def _list_blobs_with_prefix(self, hook: GCSAsyncHook, bucket_name:
str, prefix: str) -> list[str]:
+ """
+ Returns list of blobs which matches the given prefix for the given
bucket.
Review Comment:
```suggestion
Returns names of blobs which matches the given prefix for the given
bucket.
```
##########
airflow/providers/google/cloud/sensors/gcs.py:
##########
@@ -307,10 +310,37 @@ def poke(self, context: Context) -> bool:
self._matches = hook.list(self.bucket, prefix=self.prefix)
return bool(self._matches)
- def execute(self, context: Context) -> list[str]:
+ def execute(self, context: Context):
"""Overridden to allow matches to be passed"""
- super().execute(context)
- return self._matches
+ if not self.deferrable:
+ super().execute(context)
+ return self._matches
+ else:
+ self.defer(
+ timeout=timedelta(seconds=self.timeout),
+ trigger=GCSPrefixBlobTrigger(
+ bucket=self.bucket,
+ prefix=self.prefix,
+ poke_interval=self.poke_interval,
+ google_cloud_conn_id=self.google_cloud_conn_id,
+ hook_params={
+ "delegate_to": self.delegate_to,
+ "impersonation_chain": self.impersonation_chain,
+ },
+ ),
+ method_name="execute_complete",
+ )
+
+ def execute_complete(self, context: dict[str, Any], event: dict[str, str |
list[str]]) -> str | list[str]:
+ """
+ Callback for when the trigger fires - returns immediately.
Review Comment:
```suggestion
Callback for when the trigger fires; returns immediately.
```
saw same in https://github.com/apache/airflow/pull/30579
##########
airflow/providers/google/cloud/sensors/gcs.py:
##########
@@ -307,10 +310,37 @@ def poke(self, context: Context) -> bool:
self._matches = hook.list(self.bucket, prefix=self.prefix)
return bool(self._matches)
- def execute(self, context: Context) -> list[str]:
+ def execute(self, context: Context):
"""Overridden to allow matches to be passed"""
- super().execute(context)
- return self._matches
+ if not self.deferrable:
+ super().execute(context)
+ return self._matches
+ else:
+ self.defer(
+ timeout=timedelta(seconds=self.timeout),
+ trigger=GCSPrefixBlobTrigger(
+ bucket=self.bucket,
+ prefix=self.prefix,
+ poke_interval=self.poke_interval,
+ google_cloud_conn_id=self.google_cloud_conn_id,
+ hook_params={
+ "delegate_to": self.delegate_to,
+ "impersonation_chain": self.impersonation_chain,
+ },
+ ),
+ method_name="execute_complete",
+ )
+
+ def execute_complete(self, context: dict[str, Any], event: dict[str, str |
list[str]]) -> str | list[str]:
+ """
+ Callback for when the trigger fires - returns immediately.
+ Relies on trigger to throw an exception, otherwise it assumes
execution was
Review Comment:
Is this true anymore? Looks like it checks now whether success. Saw same
in https://github.com/apache/airflow/pull/30579
##########
docs/apache-airflow-providers-google/operators/cloud/gcs.rst:
##########
@@ -199,8 +199,20 @@ Use the
:class:`~airflow.providers.google.cloud.sensors.gcs.GCSObjectsWithPrefix
:start-after: [START howto_sensor_object_with_prefix_exists_task]
:end-before: [END howto_sensor_object_with_prefix_exists_task]
+You can set the ``deferrable`` param to True if you want this sensor to run
asynchronously - leading to efficient
+utilization of resources in your Airflow deployment. However the triggerer
component needs to be up and running
Review Comment:
```suggestion
utilization of resources in your Airflow deployment. However the triggerer
component needs to be enabled
```
##########
airflow/providers/google/cloud/triggers/gcs.py:
##########
@@ -97,3 +97,76 @@ async def _object_exists(self, hook: GCSAsyncHook,
bucket_name: str, object_name
if object_response:
return "success"
return "pending"
+
+
+class GCSPrefixBlobTrigger(GCSBlobTrigger):
+ """
+ A trigger that fires and it looks for all the objects in the given bucket
+ which matches the given prefix if not found sleep for certain interval and
checks again.
Review Comment:
```suggestion
Looks for objects in bucket matching the prefix.
If none found, sleep for interval and check again. Otherwise, return
matches.
```
##########
docs/apache-airflow-providers-google/operators/cloud/gcs.rst:
##########
@@ -199,8 +199,20 @@ Use the
:class:`~airflow.providers.google.cloud.sensors.gcs.GCSObjectsWithPrefix
:start-after: [START howto_sensor_object_with_prefix_exists_task]
:end-before: [END howto_sensor_object_with_prefix_exists_task]
+You can set the ``deferrable`` param to True if you want this sensor to run
asynchronously - leading to efficient
Review Comment:
```suggestion
You can set the ``deferrable`` param to True if you want this sensor to run
asynchronously, leading to more efficient
```
##########
docs/apache-airflow-providers-google/operators/cloud/gcs.rst:
##########
@@ -199,8 +199,20 @@ Use the
:class:`~airflow.providers.google.cloud.sensors.gcs.GCSObjectsWithPrefix
:start-after: [START howto_sensor_object_with_prefix_exists_task]
:end-before: [END howto_sensor_object_with_prefix_exists_task]
+You can set the ``deferrable`` param to True if you want this sensor to run
asynchronously - leading to efficient
+utilization of resources in your Airflow deployment. However the triggerer
component needs to be up and running
Review Comment:
above two comments also apply to https://github.com/apache/airflow/pull/30579
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]