[
https://issues.apache.org/jira/browse/BEAM-8537?focusedWorklogId=379618&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-379618
]
ASF GitHub Bot logged work on BEAM-8537:
----------------------------------------
Author: ASF GitHub Bot
Created on: 30/Jan/20 21:41
Start Date: 30/Jan/20 21:41
Worklog Time Spent: 10m
Work Description: robertwb commented on pull request #10375: [BEAM-8537]
Provide WatermarkEstimator to track watermark
URL: https://github.com/apache/beam/pull/10375#discussion_r373204325
##########
File path: sdks/python/apache_beam/runners/common.py
##########
@@ -503,6 +534,182 @@ def invoke_process(self,
windowed_value, self.process_method(windowed_value.value))
class _ThreadsafeWatermarkEstimator(object):
  """A threadsafe wrapper which wraps a WatermarkEstimator with a locking
  mechanism to guarantee multi-thread safety.

  Calls that reach the wrapped estimator through this wrapper's public
  methods, or through ``__getattr__`` forwarding, are made while ``lock`` is
  held. The ``*_with_lock`` variants acquire nothing themselves and expect
  the caller to already hold the lock.
  """
  def __init__(self, watermark_estimator, lock):
    """Wraps ``watermark_estimator`` so it is only accessed under ``lock``.

    Args:
      watermark_estimator: the ``WatermarkEstimator`` to guard.
      lock: the lock guarding access; it is used as a context manager and
        must provide ``locked()``.

    Raises:
      ValueError: if ``watermark_estimator`` is not a ``WatermarkEstimator``.
    """
    from apache_beam.io.iobase import WatermarkEstimator
    if not isinstance(watermark_estimator, WatermarkEstimator):
      raise ValueError('Initializing Threadsafe requires a WatermarkEstimator')
    self._watermark_estimator = watermark_estimator
    self._lock = lock

  def __getattr__(self, attr):
    """Forwards unknown attributes to the wrapped estimator under the lock.

    Returns a function that, when called, acquires the lock and calls the
    attribute of the same name on the wrapped estimator.

    Raises:
      AttributeError: if the wrapped estimator has no such attribute, or if
        the wrapper's own fields have not been set yet.
    """
    # __getattr__ only runs when normal lookup fails. If the wrapper's own
    # fields are missing (e.g. the instance was created through __new__, as
    # copy and unpickling do), reading them through ``self`` below would
    # re-enter __getattr__ forever; fail fast with AttributeError instead.
    if attr in ('_watermark_estimator', '_lock'):
      raise AttributeError(attr)
    if hasattr(self._watermark_estimator, attr):
      def method_wrapper(*args, **kw):
        with self._lock:
          return getattr(self._watermark_estimator, attr)(*args, **kw)
      return method_wrapper
    raise AttributeError(attr)

  def get_estimator_state_with_lock(self):
    """Returns the wrapped estimator's state without acquiring the lock.

    Raises:
      RuntimeError: if the lock is not currently held.
    """
    # The caller should hold the lock before entering this function.
    # NOTE(review): assuming a threading.Lock-like object, locked() only
    # reports that the lock is held, not that the caller is the holder.
    if not self._lock.locked():
      raise RuntimeError('Expected lock to be held to guarantee thread-safe '
                         'access.')
    return self._watermark_estimator.get_estimator_state()

  def get_estimator_state(self):
    """Acquires the lock and returns the wrapped estimator's state."""
    with self._lock:
      return self.get_estimator_state_with_lock()

  def current_watermark_with_lock(self):
    """Returns the wrapped estimator's watermark without acquiring the lock.

    Raises:
      RuntimeError: if the lock is not currently held.
    """
    # The caller should hold the lock before entering this function.
    if not self._lock.locked():
      raise RuntimeError('Expected lock to be held to guarantee thread-safe '
                         'access.')
    return self._watermark_estimator.current_watermark()

  def current_watermark(self):
    """Acquires the lock and returns the wrapped estimator's watermark."""
    with self._lock:
      return self.current_watermark_with_lock()

  def observe_timestamp(self, timestamp):
    """Passes ``timestamp`` to the wrapped estimator while holding the lock.

    Raises:
      ValueError: if ``timestamp`` is not a ``Timestamp``.
    """
    # Type-check before taking the lock so a bad argument never blocks.
    if not isinstance(timestamp, Timestamp):
      raise ValueError('Input of observe_timestamp should be a Timestamp '
                       'object')
    with self._lock:
      self._watermark_estimator.observe_timestamp(timestamp)
+
+
+class _ThreadsafeRestrictionTracker(object):
+ """A thread-safe wrapper which wraps a `RestrictionTracker`.
+
+ This wrapper guarantees synchronization of modifying restrictions across
+ multiple threads.
+ """
+
+ def __init__(self, restriction_tracker, lock):
+ from apache_beam.io.iobase import RestrictionTracker
+ if not isinstance(restriction_tracker, RestrictionTracker):
+ raise ValueError(
+ 'Initialize ThreadsafeRestrictionTracker requires'
+ 'RestrictionTracker.')
+ self._restriction_tracker = restriction_tracker
+ # Records an absolute timestamp when defer_remainder is called.
+ self._deferred_timestamp = None
+ self._lock = lock
+ self._deferred_residual = None
+ self._deferred_watermark = None
+
+ def current_restriction(self):
+ with self._lock:
+ return self._restriction_tracker.current_restriction()
+
+ def try_claim(self, position):
+ with self._lock:
+ return self._restriction_tracker.try_claim(position)
+
+ def defer_remainder(self, deferred_time=None):
+ """Performs self-checkpoint on current processing restriction with an
+ expected resuming time.
+
+ Self-checkpoint could happen during processing elements. When executing an
+ DoFn.process(), you may want to stop processing an element and resuming
+ later if current element has been processed quit a long time or you also
+ want to have some outputs from other elements. ``defer_remainder()`` can be
+ called on per element if needed.
+
+ Args:
+ deferred_time: A relative ``Duration`` that indicates the ideal time gap
+ between now and resuming, or an absolute ``Timestamp`` for resuming
+ execution time. If the time_delay is None, the deferred work will be
+ executed as soon as possible.
+ """
+
+ # Record current time for calculating deferred_time later.
+ with self._lock:
+ self._deferred_timestamp = Timestamp.now()
+ if (deferred_time and
+ not isinstance(deferred_time, Duration) and
+ not isinstance(deferred_time, Timestamp)):
+ raise ValueError('The timestamp of deter_remainder() should be a '
+ 'Duration or a Timestamp, or None.')
+ self._deferred_watermark = deferred_time
+ checkpoint = self.try_split(0)
+ if checkpoint:
+ _, self._deferred_residual = checkpoint
+
+ def check_done(self):
+ with self._lock:
+ return self._restriction_tracker.check_done()
+
+ def current_progress(self):
+ with self._lock:
+ return self._restriction_tracker.current_progress()
+
+ def try_split(self, fraction_of_remainder):
+ # The caller should hold the lock before entering this function.
+ if not self._lock.locked():
+ raise RuntimeError('Expected lock to be held to guarantee thread-safe '
+ 'access.')
+ return self._restriction_tracker.try_split(fraction_of_remainder)
+
+ def deferred_status(self):
+ # type: () -> Optional[Tuple[Any, Timestamp]]
+ """Returns deferred work which is produced by ``defer_remainder()``.
+
+ When there is a self-checkpoint performed, the system needs to fulfill the
+ DelayedBundleApplication with deferred_work for a ProcessBundleResponse.
+ The system calls this API to get deferred_residual with watermark together
+ to help the runner to schedule a future work.
+
+ Returns: (deferred_residual, time_delay) if having any residual, else None.
+ """
+ if self._deferred_residual:
+ # If _deferred_watermark is None, create Duration(0).
+ if not self._deferred_watermark:
Review comment:
Let's rename this, as it's not really a watermark.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
Issue Time Tracking
-------------------
Worklog Id: (was: 379618)
> Provide WatermarkEstimatorProvider for different types of WatermarkEstimator
> ----------------------------------------------------------------------------
>
> Key: BEAM-8537
> URL: https://issues.apache.org/jira/browse/BEAM-8537
> Project: Beam
> Issue Type: Improvement
> Components: sdk-py-core, sdk-py-harness
> Reporter: Boyuan Zhang
> Assignee: Boyuan Zhang
> Priority: Major
> Time Spent: 7h 50m
> Remaining Estimate: 0h
>
> This is a follow-up to the in-progress PR:
> https://github.com/apache/beam/pull/9794.
> The current implementation in PR 9794 provides a default implementation of
> WatermarkEstimator. As further work, we want to make WatermarkEstimator a
> pure interface. We'll provide a WatermarkEstimatorProvider that can create
> a custom WatermarkEstimator per windowed value. It should be similar to how
> we track restrictions for SDF:
> WatermarkEstimator <---> RestrictionTracker
> WatermarkEstimatorProvider <---> RestrictionTrackerProvider
> WatermarkEstimatorParam <---> RestrictionDoFnParam
--
This message was sent by Atlassian Jira
(v8.3.4#803005)