lostluck commented on a change in pull request #15743:
URL: https://github.com/apache/beam/pull/15743#discussion_r732027485
##########
File path: sdks/go/pkg/beam/core/runtime/exec/window.go
##########
@@ -96,3 +96,23 @@ func (w *WindowInto) Down(ctx context.Context) error {
func (w *WindowInto) String() string {
return fmt.Sprintf("WindowInto[%v]. Out:%v", w.Fn, w.Out.ID())
}
+
+// WindowMapper defines an interface maps windows from a main input window
space
+// to windows from a side input window space. Used during side input
materialization.
+type WindowMapper interface {
+ MapWindow(w typex.Window) (typex.Window, error)
+}
+
+type windowMapper struct {
+ wfn *window.Fn
+}
+
+func (f *windowMapper) MapWindow(w typex.Window) (typex.Window, error) {
+ candidates := assignWindows(f.wfn, w.MaxTimestamp())
+ if len(candidates) == 0 {
+ return nil, fmt.Errorf("failed to map main input window to side
input window with WindowFn %v", f.wfn.String())
+ }
+ // Return latest candidate window in terms of event time (only relevant
for sliding windows)
+ // Sliding windows append the latest window first in assignWindows.
+ return candidates[0], nil
Review comment:
This is returning the 1st candidate. Is this correct? Shouldn't it be
the last candidate `candidates[len(candidates)-1]`?
Python uses the last candidate....
https://github.com/apache/beam/blob/master/sdks/python/apache_beam/transforms/sideinputs.py#L65
And generates them like so,
https://github.com/apache/beam/blob/aa4edda39ceb8d7a80f56bd37caa6233dba7de5d/sdks/python/apache_beam/transforms/window.py#L494
which matches how we assign them in Go:
https://github.com/apache/beam/blob/aa4edda39ceb8d7a80f56bd37caa6233dba7de5d/sdks/go/pkg/beam/core/runtime/exec/window.go#L72
Java also does the same thing:
https://github.com/apache/beam/blob/aa4edda39ceb8d7a80f56bd37caa6233dba7de5d/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/windowing/SlidingWindows.java#L111
But statically constructs the window.
https://github.com/apache/beam/blob/aa4edda39ceb8d7a80f56bd37caa6233dba7de5d/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/windowing/SlidingWindows.java#L133
calling it the "earliest window" instead of the "latest" window.
The bit that tipped me off to the inconsistency is that, in your unit test,
the side input window ends much later than the fixed window, which doesn't
make sense processing-wise: why wait for additional later data and
delay main input processing until the watermark passes that later time?
##########
File path: sdks/go/pkg/beam/core/runtime/exec/window_test.go
##########
@@ -113,3 +113,47 @@ func TestAssignWindow(t *testing.T) {
}
}
}
+
+func TestMapWindow(t *testing.T) {
+ tests := []struct {
+ name string
+ wfn *window.Fn
+ in typex.Window
+ expected typex.Window
+ }{
+ {
+ "interval to global",
+ window.NewGlobalWindows(),
+ window.IntervalWindow{Start: 0, End: 1000},
+ window.GlobalWindow{},
+ },
+ {
+ "global to global",
+ window.NewGlobalWindows(),
+ window.GlobalWindow{},
+ window.GlobalWindow{},
+ },
+ {
+ "interval to interval",
+ window.NewFixedWindows(1000 * time.Millisecond),
+ window.IntervalWindow{Start: 0, End: 100},
+ window.IntervalWindow{Start: 0, End: 1000},
+ },
+ {
+ "interval to sliding",
+ window.NewSlidingWindows(500*time.Millisecond,
1000*time.Millisecond),
+ window.IntervalWindow{Start: 0, End: 600},
+ window.IntervalWindow{Start: 500, End: 1500},
Review comment:
The "earliest" window should be 0-1000 here I think.
Since this one is trickier, I suggest we copy the testing values that Java's
unit test uses (minus the offsets, which we don't support at present)
https://github.com/apache/beam/blob/4b7b74673b647c8d964b4877a8d66d47096acce4/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/windowing/SlidingWindowsTest.java#L175
##########
File path: sdks/go/test/integration/primitives/windowinto.go
##########
@@ -93,6 +94,41 @@ func WindowSums_Lifted(s beam.Scope) {
WindowSums(s.Scope("Lifted"), stats.SumPerKey)
}
+// ValidateWindowedSideInputs checks that side inputs have accurate windowing
information when used.
+func ValidateWindowedSideInputs(s beam.Scope) {
+ timestampedData := beam.ParDo(s, &createTimestampedData{Data: []int{1,
2, 3}}, beam.Impulse(s))
+
+ timestampedData = beam.DropKey(s, timestampedData)
+
+ windowSize := 1 * time.Second
+
+ validateSums := func(s beam.Scope, wfn, sideFn *window.Fn, in, side
beam.PCollection, expected ...interface{}) {
+ wData := beam.WindowInto(s, wfn, in)
+ wSide := beam.WindowInto(s, sideFn, side)
+
+ sums := beam.ParDo(s, sumSideInputs, wData,
beam.SideInput{Input: wSide})
+
+ sums = beam.WindowInto(s, window.NewGlobalWindows(), sums)
+
+ passert.Equals(s, sums, expected...)
+ }
+
+ validateSums(s.Scope("Fixed-Global"),
window.NewFixedWindows(windowSize), window.NewGlobalWindows(), timestampedData,
timestampedData, 7, 8, 9)
+ validateSums(s.Scope("Fixed-Same"), window.NewFixedWindows(windowSize),
window.NewFixedWindows(windowSize), timestampedData, timestampedData, 2, 4, 6)
+ validateSums(s.Scope("Fixed-Big"), window.NewFixedWindows(windowSize),
window.NewFixedWindows(10*time.Second), timestampedData, timestampedData, 7, 8,
9)
+ validateSums(s.Scope("Fixed-Sliding"),
window.NewFixedWindows(windowSize), window.NewSlidingWindows(windowSize,
2*windowSize), timestampedData, timestampedData, 7, 4, 6)
+ validateSums(s.Scope("Sliding-Fixed"),
window.NewSlidingWindows(windowSize, 2*windowSize),
window.NewFixedWindows(windowSize), timestampedData, timestampedData, 2, 3, 4,
5, 6, 3)
Review comment:
Just so I understand what's going on for these sums: we should
probably add a clarifying comment for them, as they are harder to figure out
quickly than the plain fixed ones.
For Fixed-Sliding
Main: With window size 1, each window contains 1 element (1, 2, 3)
Side: window size 2, with a new window starting every 1 (the period). So we
have [1], [1,2], [2,3], [3]
So what gets computed here should be with earliest windows:
(1, [1]) = 2
(2, [1, 2]) = 5
(3, [2, 3]) = 8
What we have here does match what's implemented at least (latest windows).
(1, [1, 2]) = 4
(2, [2, 3]) = 7
(3, [3]) = 6
For sliding-Fixed:
We have
([1], [1]) = 2
([1, 2], [2]) = 3, 4
([2, 3], [3]) = 5, 6
([3], [] ) = 3
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]