[ 
https://issues.apache.org/jira/browse/BEAM-8335?focusedWorklogId=395354&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-395354
 ]

ASF GitHub Bot logged work on BEAM-8335:
----------------------------------------

                Author: ASF GitHub Bot
            Created on: 29/Feb/20 01:03
            Start Date: 29/Feb/20 01:03
    Worklog Time Spent: 10m 
      Work Description: pabloem commented on pull request #10497: [BEAM-8335] 
Add the ReverseTestStream
URL: https://github.com/apache/beam/pull/10497#discussion_r385979045
 
 

 ##########
 File path: sdks/python/apache_beam/testing/test_stream.py
 ##########
 @@ -314,3 +355,239 @@ def from_runner_api_parameter(ptransform, payload, context):
         coder=coder,
         events=[Event.from_runner_api(e, coder) for e in payload.events],
         output_tags=output_tags)
+
+
+class TimingInfo(object):
+  def __init__(self, processing_time, watermark):
+    self._processing_time = timestamp.Timestamp.of(processing_time)
+    self._watermark = timestamp.Timestamp.of(watermark)
+
+  @property
+  def processing_time(self):
+    return self._processing_time
+
+  @property
+  def watermark(self):
+    return self._watermark
+
+  def __repr__(self):
+    return '({}, {}, {})'.format(
+        self.event_timestamp, self.processing_time, self.watermark)
+
+
+class PairWithTiming(PTransform):
+  """Pairs the input element with timing information.
+
+  Input: element; output: KV(element, timing information)
+  Where timing information := (processing time, watermark)
+
+  This is used in the ReverseTestStream implementation to replay watermark
+  advancements.
+  """
+
+  URN = "beam:transform:pair_with_timing:v1"
+
+  def expand(self, pcoll):
+    return pvalue.PCollection.from_(pcoll)
+
+
+class ReverseTestStream(PTransform):
+  """A Transform that can create TestStream events from a stream of elements.
+
+  This currently assumes that the pipeline is being run on a single machine
+  and elements come in order and are outputted in the same order that they came
+  in.
+  """
+  class Format(Enum):
+    TEST_STREAM_EVENTS = 1
+    TEST_STREAM_FILE_RECORDS = 2
+    SERIALIZED_TEST_STREAM_FILE_RECORDS = 3
+
+  def __init__(
+      self, sample_resolution_sec, output_tag, coder=None, output_format=None):
+    self._sample_resolution_sec = sample_resolution_sec
+    self._output_tag = output_tag
+    self._output_format = output_format if output_format \
+                          else ReverseTestStream.Format.TEST_STREAM_EVENTS
+    self._coder = coder if coder else beam.coders.FastPrimitivesCoder()
+
+  def expand(self, pcoll):
+    generator = (
+        _TestStreamFileRecordGenerator(coder=self._coder) if (
+            self._output_format in (
+                self.Format.TEST_STREAM_FILE_RECORDS,
+                self.Format.SERIALIZED_TEST_STREAM_FILE_RECORDS)) else
+        _TestStreamEventGenerator())
+
+    ret = (
+        pcoll
+        | beam.WindowInto(beam.window.GlobalWindows())
+
+        # First get the initial timing information. This will be used to start
+        # the periodic timers which will generate processing time and watermark
+        # advancements every `sample_resolution_sec`.
+        | 'initial timing' >> PairWithTiming()
+
+        # Next, map every element to the same key so that only a single timer is
+        # started for this given ReverseTestStream.
+        | beam.Map(lambda x: (0, x))
+
+        # Next, pass-through each element which will be paired with its timing
+        # info in the next step. Also, start the periodic timers. We use timers
+        # in this situation to capture watermark advancements that occur when
+        # there are no elements being produced upstream.
+        | beam.ParDo(
+            _WatermarkEventGenerator(
+                output_tag=self._output_tag,
+                sample_resolution_sec=self._sample_resolution_sec))
+
+        # Next, retrieve the timing information for watermark events that were
+        # generated in the previous step. This is because elements generated
+        # through the timers don't have their timing information yet.
+        | 'timing info for watermarks' >> PairWithTiming()
+
+        # Format the events properly.
+        | beam.ParDo(generator))
+
+    if self._output_format == self.Format.SERIALIZED_TEST_STREAM_FILE_RECORDS:
+
+      def serializer(e):
+        return e.SerializeToString()
+
+      ret = ret | 'serializer' >> beam.Map(serializer)
+
+    return ret
+
+
+class _WatermarkEventGenerator(beam.DoFn):
+  # Used to return the initial timing information.
+  EXECUTE_ONCE_STATE = beam.transforms.userstate.BagStateSpec(
+      name='execute_once_state', coder=beam.coders.FastPrimitivesCoder())
+  WATERMARK_TRACKER = TimerSpec('watermark_tracker', TimeDomain.REAL_TIME)
+
+  def __init__(self, output_tag, sample_resolution_sec=0.1):
+    self._output_tag = output_tag
+    self._sample_resolution_sec = sample_resolution_sec
+
+  @on_timer(WATERMARK_TRACKER)
+  def on_watermark_tracker(
+      self,
+      timestamp=beam.DoFn.TimestampParam,
+      window=beam.DoFn.WindowParam,
+      watermark_tracker=beam.DoFn.TimerParam(WATERMARK_TRACKER)):
+    next_sample_time = (timestamp.micros * 1e-6) + self._sample_resolution_sec
+    watermark_tracker.set(next_sample_time)
 
 Review comment:
   We may want to be careful with time skew on the processing time timer 
firings. If they skew consistently, the time we think it is in this transform 
may be very different from the time it actually is.
 
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


Issue Time Tracking
-------------------

    Worklog Id:     (was: 395354)
    Time Spent: 77h 40m  (was: 77.5h)

> Add streaming support to Interactive Beam
> -----------------------------------------
>
>                 Key: BEAM-8335
>                 URL: https://issues.apache.org/jira/browse/BEAM-8335
>             Project: Beam
>          Issue Type: Improvement
>          Components: runner-py-interactive
>            Reporter: Sam Rohde
>            Assignee: Sam Rohde
>            Priority: Major
>          Time Spent: 77h 40m
>  Remaining Estimate: 0h
>
> This issue tracks the work items to introduce streaming support to the 
> Interactive Beam experience. This will allow users to:
>  * Write and run a streaming job in IPython
>  * Automatically cache records from unbounded sources
>  * Add a replay experience that replays all cached records to simulate the 
> original pipeline execution
>  * Add controls to play/pause/stop/step individual elements from the cached 
> records
>  * Add ability to inspect/visualize unbounded PCollections



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to