dstandish commented on code in PR #30618:
URL: https://github.com/apache/airflow/pull/30618#discussion_r1178249248
##########
airflow/providers/google/cloud/sensors/gcs.py:
##########
@@ -307,10 +314,35 @@ def poke(self, context: Context) -> bool:
self._matches = hook.list(self.bucket, prefix=self.prefix)
return bool(self._matches)
- def execute(self, context: Context) -> list[str]:
+ def execute(self, context: Context):
"""Overridden to allow matches to be passed"""
- super().execute(context)
- return self._matches
+ if not self.deferrable:
+ super().execute(context)
+ return self._matches
+ else:
+ self.defer(
+ timeout=timedelta(seconds=self.timeout),
+ trigger=GCSPrefixBlobTrigger(
+ bucket=self.bucket,
+ prefix=self.prefix,
+ poke_interval=self.poke_interval,
+ google_cloud_conn_id=self.google_cloud_conn_id,
+ hook_params={
+ "impersonation_chain": self.impersonation_chain,
+ },
+ ),
+ method_name="execute_complete",
+ )
+
+    def execute_complete(self, context: dict[str, Any], event: dict[str, str | list[str]]) -> str | list[str]:
+ """
+ Callback for when the trigger fires; returns immediately.
+ Relies on trigger to throw a success event
+ """
+        self.log.info("Checking for existence of object: %s, %s", self.bucket, self.prefix)
Review Comment:
```suggestion
        self.log.info("Checking for existence of object: %s, %s", self.bucket, self.prefix)
```
sorry to nit pick this but here you are not checking -- the checking has
already happened (in the trigger) and you're resuming from trigger and
checking status
##########
airflow/providers/google/cloud/triggers/gcs.py:
##########
@@ -200,3 +200,76 @@ async def _is_blob_updated_after(
if blob_updated_time > target_date:
return True, {"status": "success", "message": "success"}
return False, {"status": "pending", "message": "pending"}
+
+
+class GCSPrefixBlobTrigger(GCSBlobTrigger):
+ """
+ Looks for objects in bucket matching a prefix.
+    If none found, sleep for interval and check again. Otherwise, return matches.
+
+    :param bucket: the bucket in the google cloud storage where the objects are residing.
+    :param prefix: The prefix of the blob_names to match in the Google cloud storage bucket
+ :param google_cloud_conn_id: reference to the Google Connection
+ :param poke_interval: polling period in seconds to check
+ """
+
+ def __init__(
+ self,
+ bucket: str,
+ prefix: str,
+ poke_interval: float,
+ google_cloud_conn_id: str,
+ hook_params: dict[str, Any],
+ ):
+ super().__init__(
+ bucket=bucket,
+ object_name=prefix,
+ poke_interval=poke_interval,
+ google_cloud_conn_id=google_cloud_conn_id,
+ hook_params=hook_params,
+ )
+ self.prefix = prefix
+
+ def serialize(self) -> tuple[str, dict[str, Any]]:
+ """Serializes GCSPrefixBlobTrigger arguments and classpath."""
+ return (
+ "airflow.providers.google.cloud.triggers.gcs.GCSPrefixBlobTrigger",
+ {
+ "bucket": self.bucket,
+ "prefix": self.prefix,
+ "poke_interval": self.poke_interval,
+ "google_cloud_conn_id": self.google_cloud_conn_id,
+ "hook_params": self.hook_params,
+ },
+ )
+
+ async def run(self) -> AsyncIterator[TriggerEvent]:
+        """Simple loop until the matches are found for the given prefix on the bucket."""
+ try:
+ hook = self._get_async_hook()
+ while True:
+ res = await self._list_blobs_with_prefix(
+ hook=hook, bucket_name=self.bucket, prefix=self.prefix
+ )
+ if len(res) > 0:
+ yield TriggerEvent(
+                        {"status": "success", "message": "Successfully completed", "matches": res}
+ )
+ await asyncio.sleep(self.poke_interval)
+ except Exception as e:
+ yield TriggerEvent({"status": "error", "message": str(e)})
+ return
+
+    async def _list_blobs_with_prefix(self, hook: GCSAsyncHook, bucket_name: str, prefix: str) -> list[str]:
+ """
+ Returns names of blobs which match the given prefix for a given bucket.
+
+        :param bucket_name: The Google Cloud Storage bucket where the object is.
Review Comment:
docstring missing param `hook`
##########
airflow/providers/google/cloud/triggers/gcs.py:
##########
@@ -200,3 +200,76 @@ async def _is_blob_updated_after(
if blob_updated_time > target_date:
return True, {"status": "success", "message": "success"}
return False, {"status": "pending", "message": "pending"}
+
+
+class GCSPrefixBlobTrigger(GCSBlobTrigger):
+ """
+ Looks for objects in bucket matching a prefix.
+    If none found, sleep for interval and check again. Otherwise, return matches.
+
+    :param bucket: the bucket in the google cloud storage where the objects are residing.
+    :param prefix: The prefix of the blob_names to match in the Google cloud storage bucket
+ :param google_cloud_conn_id: reference to the Google Connection
+ :param poke_interval: polling period in seconds to check
+ """
+
+ def __init__(
+ self,
+ bucket: str,
+ prefix: str,
+ poke_interval: float,
Review Comment:
missing from docstring
##########
airflow/providers/google/cloud/triggers/gcs.py:
##########
@@ -200,3 +200,76 @@ async def _is_blob_updated_after(
if blob_updated_time > target_date:
return True, {"status": "success", "message": "success"}
return False, {"status": "pending", "message": "pending"}
+
+
+class GCSPrefixBlobTrigger(GCSBlobTrigger):
+ """
+ Looks for objects in bucket matching a prefix.
+    If none found, sleep for interval and check again. Otherwise, return matches.
+
+    :param bucket: the bucket in the google cloud storage where the objects are residing.
+    :param prefix: The prefix of the blob_names to match in the Google cloud storage bucket
+ :param google_cloud_conn_id: reference to the Google Connection
+ :param poke_interval: polling period in seconds to check
+ """
+
+ def __init__(
+ self,
+ bucket: str,
+ prefix: str,
+ poke_interval: float,
+ google_cloud_conn_id: str,
+ hook_params: dict[str, Any],
+ ):
+ super().__init__(
+ bucket=bucket,
+ object_name=prefix,
+ poke_interval=poke_interval,
+ google_cloud_conn_id=google_cloud_conn_id,
+ hook_params=hook_params,
+ )
+ self.prefix = prefix
+
+ def serialize(self) -> tuple[str, dict[str, Any]]:
+ """Serializes GCSPrefixBlobTrigger arguments and classpath."""
+ return (
+ "airflow.providers.google.cloud.triggers.gcs.GCSPrefixBlobTrigger",
+ {
+ "bucket": self.bucket,
+ "prefix": self.prefix,
+ "poke_interval": self.poke_interval,
+ "google_cloud_conn_id": self.google_cloud_conn_id,
+ "hook_params": self.hook_params,
+ },
+ )
+
+ async def run(self) -> AsyncIterator[TriggerEvent]:
+        """Simple loop until the matches are found for the given prefix on the bucket."""
+ try:
+ hook = self._get_async_hook()
+ while True:
+ res = await self._list_blobs_with_prefix(
Review Comment:
post-2.6 with logging in triggerer, we can now consider logging every now
and again messages like "checking for files". nice to know from logs that the
trigger is still doing something. not saying you must do in this PR just a
reminder that it's something that makes more sense now with 2.6 than it did
previously.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]