[
https://issues.apache.org/jira/browse/BEAM-7949?focusedWorklogId=361608&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-361608
]
ASF GitHub Bot logged work on BEAM-7949:
----------------------------------------
Author: ASF GitHub Bot
Created on: 20/Dec/19 11:55
Start Date: 20/Dec/19 11:55
Worklog Time Spent: 10m
Work Description: mxm commented on pull request #10246: [BEAM-7949] Add
time-based cache threshold support in the data service of the Python SDK harness
URL: https://github.com/apache/beam/pull/10246#discussion_r360343183
##########
File path: sdks/python/apache_beam/runners/worker/data_plane.py
##########
@@ -69,24 +70,90 @@
class ClosableOutputStream(OutputStream):
"""A Outputstream for use with CoderImpls that has a close() method."""
+ def __init__(self, close_callback=None):
+ super(ClosableOutputStream, self).__init__()
+ self._close_callback = close_callback
+
+ def close(self):
+ if self._close_callback:
+ self._close_callback(self.get())
+
+ @staticmethod
+ def create(close_callback,
+ flush_callback,
+ data_buffer_time_limit_ms):
+ if data_buffer_time_limit_ms > 0:
+ return TimeBasedBufferingClosableOutputStream(
+ close_callback,
+ flush_callback=flush_callback,
+ time_flush_threshold_ms=data_buffer_time_limit_ms)
+ else:
+ return SizeBasedBufferingClosableOutputStream(
+ close_callback, flush_callback=flush_callback)
+
+
+class SizeBasedBufferingClosableOutputStream(ClosableOutputStream):
+ """A size-based buffering OutputStream."""
+
def __init__(self,
close_callback=None, # type: Optional[Callable[[bytes], None]]
flush_callback=None, # type: Optional[Callable[[bytes], None]]
- flush_threshold=_DEFAULT_FLUSH_THRESHOLD):
- super(ClosableOutputStream, self).__init__()
- self._close_callback = close_callback
+ size_flush_threshold=_DEFAULT_SIZE_FLUSH_THRESHOLD):
+ super(SizeBasedBufferingClosableOutputStream,
self).__init__(close_callback)
self._flush_callback = flush_callback
- self._flush_threshold = flush_threshold
+ self._size_flush_threshold = size_flush_threshold
# This must be called explicitly to avoid flushing partial elements.
def maybe_flush(self):
- if self._flush_callback and self.size() > self._flush_threshold:
+ if self.size() > self._size_flush_threshold:
+ self.flush()
+
+ def flush(self):
+ if self._flush_callback:
self._flush_callback(self.get())
self._clear()
+
+class TimeBasedBufferingClosableOutputStream(
+ SizeBasedBufferingClosableOutputStream):
+ """A buffering OutputStream with both time-based and size-based."""
+
+ def __init__(self,
+ close_callback=None,
+ flush_callback=None,
+ size_flush_threshold=_DEFAULT_SIZE_FLUSH_THRESHOLD,
+ time_flush_threshold_ms=_DEFAULT_TIME_FLUSH_THRESHOLD_MS):
+ super(TimeBasedBufferingClosableOutputStream, self).__init__(
+ close_callback, flush_callback, size_flush_threshold)
+ assert time_flush_threshold_ms > 0
+ self._time_flush_threshold_ms = time_flush_threshold_ms
+ self._flush_lock = threading.Lock()
+ self._schedule_lock = threading.Lock()
+ self._closed = False
+ self._schedule_periodic_flush()
+
+ def flush(self):
+ with self._flush_lock:
+ super(TimeBasedBufferingClosableOutputStream, self).flush()
+
def close(self):
- if self._close_callback:
- self._close_callback(self.get())
+ with self._schedule_lock:
+ self._closed = True
+ if self._flush_timer:
+ self._flush_timer.cancel()
+ self._flush_timer = None
+ super(TimeBasedBufferingClosableOutputStream, self).close()
+
+ def _schedule_periodic_flush(self):
+ def _periodic_flush():
+ with self._schedule_lock:
+ if not self._closed:
+ self.flush()
+ self._schedule_periodic_flush()
Review comment:
Also, this creates a `Thread` for every flush. We might just want to use a
single thread, e.g. https://stackoverflow.com/a/12435256/2225100
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
Issue Time Tracking
-------------------
Worklog Id: (was: 361608)
Time Spent: 3h 10m (was: 3h)
> Add time-based cache threshold support in the data service of the Python SDK
> harness
> ------------------------------------------------------------------------------------
>
> Key: BEAM-7949
> URL: https://issues.apache.org/jira/browse/BEAM-7949
> Project: Beam
> Issue Type: Sub-task
> Components: sdk-py-harness
> Reporter: sunjincheng
> Priority: Major
> Time Spent: 3h 10m
> Remaining Estimate: 0h
>
> Currently, only a size-based cache threshold is supported in the data service
> of the Python SDK harness. It should also support a time-based cache threshold.
> This is very important, especially for streaming jobs, which are sensitive to
> delay.
--
This message was sent by Atlassian Jira
(v8.3.4#803005)