aminghadersohi commented on code in PR #38576:
URL: https://github.com/apache/superset/pull/38576#discussion_r3482508888
##########
tests/unit_tests/utils/test_screenshot_cache_fix.py:
##########
@@ -149,38 +155,37 @@ def test_cache_saved_only_when_image_generated(
assert cached_value["status"] == "Updated"
assert cached_value["image"] is not None
- def test_no_intermediate_cache_during_computing(
- self, mocker: MockerFixture, screenshot_obj, mock_user
- ):
- """Test that cache is not saved during COMPUTING state."""
+ def test_computing_status_written_to_cache_early(
+ self,
+ mocker: MockerFixture,
+ screenshot_obj: BaseScreenshot,
+ mock_user: MagicMock,
+ ) -> None:
+ """compute_and_cache writes COMPUTING to cache before taking the
screenshot
+ so concurrent tasks can detect it and avoid duplicate work."""
+ mocker.patch(DISTRIBUTED_LOCK_PATH)
mocker.patch(BASE_SCREENSHOT_PATH + ".get_from_cache_key",
return_value=None)
BaseScreenshot.cache = MockCache()
- # Mock get_screenshot to check cache state during execution
- def check_cache_during_screenshot(*args, **kwargs):
- # At this point, we're in COMPUTING state
- # Cache should NOT be set yet
+ def check_cache_during_screenshot(*args: object, **kwargs: object) ->
bytes:
cache_key = screenshot_obj.get_cache_key()
cached_value = BaseScreenshot.cache.get(cache_key)
- # Cache should be empty during screenshot generation
- assert cached_value is None, (
- "Cache should not be saved during COMPUTING state"
+ assert cached_value is not None, (
+ "Cache should be set to COMPUTING before screenshot starts"
)
+ assert cached_value["status"] == "Computing"
return b"image_data"
mocker.patch(
BASE_SCREENSHOT_PATH + ".get_screenshot",
side_effect=check_cache_during_screenshot,
)
- # Mock resize to avoid PIL errors with fake image data
mocker.patch(
BASE_SCREENSHOT_PATH + ".resize_image",
return_value=b"resized_image_data"
)
Review Comment:
Fixed — added `MagicMock`, `MockerFixture`, `BaseScreenshot` type
annotations and `-> None` return type to `test_stale_computing_triggers_retry`.
##########
superset/utils/screenshots.py:
##########
@@ -280,43 +281,66 @@ def compute_and_cache( # pylint:
disable=too-many-arguments
:return: Image payload
"""
cache_key = cache_key or self.get_cache_key(window_size, thumb_size)
- cache_payload = self.get_from_cache_key(cache_key) or
ScreenshotCachePayload()
- if not cache_payload.should_trigger_task(force=force):
- logger.info(
- "Skipping compute - already processed for thumbnail: %s",
cache_key
- )
- return
-
- window_size = window_size or self.window_size
- thumb_size = thumb_size or self.thumb_size
- logger.info("Processing url for thumbnail: %s", cache_key)
- cache_payload.computing()
- image = None
- # Assuming all sorts of things can go wrong with Selenium
try:
- logger.info("trying to generate screenshot")
- with
event_logger.log_context(f"screenshot.compute.{self.thumbnail_type}"):
- image = self.get_screenshot(user=user, window_size=window_size)
- except Exception as ex: # pylint: disable=broad-except
- logger.warning("Failed at generating thumbnail %s", ex,
exc_info=True)
- cache_payload.error()
- if image and window_size != thumb_size:
- try:
- image = self.resize_image(image, thumb_size=thumb_size)
- except Exception as ex: # pylint: disable=broad-except
- logger.warning("Failed at resizing thumbnail %s", ex,
exc_info=True)
- cache_payload.error()
+ with DistributedLock(
+ namespace="thumbnail",
+ key=cache_key,
+ ttl_seconds=app.config["THUMBNAIL_COMPUTING_CACHE_TTL"],
+ ):
Review Comment:
The lease-expiry window is intentionally closed by the Celery
`soft_time_limit=300` on all three thumbnail tasks (`cache_chart_thumbnail`,
`cache_dashboard_thumbnail`, `cache_dashboard_screenshot`), which is less than
`THUMBNAIL_COMPUTING_CACHE_TTL=360s`. A task that runs longer than 300s is
killed by Celery before the 360s lock expires, so the lease cannot outlive the
holder in normal operation. `config.py` documents this relationship:
```python
# Should exceed the task soft_time_limit (300 s) by a margin.
THUMBNAIL_COMPUTING_CACHE_TTL = int(timedelta(seconds=360).total_seconds())
```
The owner-token concern (unconditional release deleting a newer holder's
key) is a pre-existing property of the `DistributedLock` backend that predates
this PR. Fixing it would require changing `SET NX EX` to store a random token
and compare-and-delete on release — a worthwhile follow-up but out of scope
here.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]