johnyangk commented on a change in pull request #123: [NEMO-129] Support Beam's
WindowedWordCount example
URL: https://github.com/apache/incubator-nemo/pull/123#discussion_r224952801
##########
File path:
compiler/frontend/beam/src/main/java/org/apache/nemo/compiler/frontend/beam/transform/GroupByKeyTransform.java
##########
@@ -26,53 +32,172 @@
/**
* Group Beam KVs.
- * @param <I> input type.
+ * @param <K> key type.
+ * @param <InputT> input type.
*/
-public final class GroupByKeyTransform<I> implements Transform<I,
WindowedValue<KV<Object, List>>> {
+public final class GroupByKeyTransform<K, InputT>
+ extends AbstractTransform<KV<K, InputT>, KeyedWorkItem<K, InputT>, KV<K,
Iterable<InputT>>> {
private static final Logger LOG =
LoggerFactory.getLogger(GroupByKeyTransform.class.getName());
- private final Map<Object, List> keyToValues;
- private OutputCollector<WindowedValue<KV<Object, List>>> outputCollector;
+
+ private final SystemReduceFn reduceFn;
+ private transient TimerInternalsFactory timerInternalsFactory;
/**
* GroupByKey constructor.
*/
- public GroupByKeyTransform() {
- this.keyToValues = new HashMap<>();
+ public GroupByKeyTransform(final Map<TupleTag<?>, Coder<?>> outputCoders,
+ final TupleTag<KV<K, Iterable<InputT>>>
mainOutputTag,
+ final List<TupleTag<?>> additionalOutputTags,
+ final WindowingStrategy<?, ?> windowingStrategy,
+ final Collection<PCollectionView<?>> sideInputs,
+ final PipelineOptions options,
+ final SystemReduceFn reduceFn) {
+ super(null, /* doFn */
+ null, /* inputCoder */
+ outputCoders,
+ mainOutputTag,
+ additionalOutputTags,
+ windowingStrategy,
+ sideInputs,
+ options);
+ this.reduceFn = reduceFn;
}
+ /**
+ * This creates a new DoFn that groups elements by key and window.
+ * @param doFn original doFn.
+ * @return GroupAlsoByWindowViaWindowSetNewDoFn
+ */
@Override
- public void prepare(final Context context, final
OutputCollector<WindowedValue<KV<Object, List>>> oc) {
- this.outputCollector = oc;
+ protected DoFn wrapDoFn(final DoFn doFn) {
+ timerInternalsFactory = new InMemoryTimerInternalsFactory();
+ return
+ GroupAlsoByWindowViaWindowSetNewDoFn.create(
+ getWindowingStrategy(),
+ new InMemoryStateInternalsFactory(),
+ timerInternalsFactory,
+ getSideInputReader(),
+ reduceFn,
+ getOutputManager(),
+ getMainOutputTag());
}
@Override
- public void onData(final I element) {
- // TODO #129: support window in group by key for windowed groupByKey
- final WindowedValue<KV> windowedValue = (WindowedValue<KV>) element;
- final KV kv = windowedValue.getValue();
- keyToValues.putIfAbsent(kv.getKey(), new ArrayList());
- keyToValues.get(kv.getKey()).add(kv.getValue());
+ public void onData(final WindowedValue<KV<K, InputT>> element) {
+ // The GroupAlsoByWindowViaWindowSetNewDoFn requires KeyedWorkItem,
+ // so we convert the KV to KeyedWorkItem
+ final KV<K, InputT> kv = element.getValue();
+ final KeyedWorkItem<K, InputT> keyedWorkItem =
+ KeyedWorkItems.elementsWorkItem(kv.getKey(),
+ Collections.singletonList(element.withValue(kv.getValue())));
+
+
getDoFnRunner().processElement(WindowedValue.valueInGlobalWindow(keyedWorkItem));
}
+ /**
+ * This advances the input watermark and processing time to the timestamp
max value
+ * in order to emit all data.
+ */
@Override
- public void close() {
- // TODO #129: support window in group by key for windowed groupByKey
- if (keyToValues.isEmpty()) {
- LOG.warn("Beam GroupByKeyTransform received no data!");
- } else {
- keyToValues.entrySet().stream().map(entry ->
- WindowedValue.valueInGlobalWindow(KV.of(entry.getKey(),
entry.getValue())))
- .forEach(outputCollector::emit);
- keyToValues.clear();
- }
+ protected void beforeClose() {
+ final InMemoryTimerInternalsFactory imTimerFactory =
+ (InMemoryTimerInternalsFactory) timerInternalsFactory;
+
+ imTimerFactory.internalsMap.entrySet().stream()
+ .forEach(entry -> {
+ final K key = entry.getKey();
+ final InMemoryTimerInternals timerInternals = entry.getValue();
+
+ try {
+ // Finish any pending windows by advancing the input watermark to
infinity.
+
timerInternals.advanceInputWatermark(BoundedWindow.TIMESTAMP_MAX_VALUE);
+
+ // Finally, advance the processing time to infinity to fire any
timers.
+
timerInternals.advanceProcessingTime(BoundedWindow.TIMESTAMP_MAX_VALUE);
+
timerInternals.advanceSynchronizedProcessingTime(BoundedWindow.TIMESTAMP_MAX_VALUE);
+
+ fireEligibleTimers(key, timerInternals);
+ } catch (final Exception e) {
+ e.printStackTrace();
+ }
+ });
}
@Override
public String toString() {
final StringBuilder sb = new StringBuilder();
sb.append("GroupByKeyTransform:");
- sb.append(super.toString());
return sb.toString();
}
+
+ private void fireEligibleTimers(final K key,
+ final InMemoryTimerInternals timerInternals)
{
+ while (true) {
+ TimerInternals.TimerData timer;
+ boolean hasFired = false;
+
+ while ((timer = timerInternals.removeNextEventTimer()) != null) {
+ hasFired = true;
+ fireTimer(key, timer);
+ }
+ while ((timer = timerInternals.removeNextProcessingTimer()) != null) {
+ hasFired = true;
+ fireTimer(key, timer);
+ }
+ while ((timer = timerInternals.removeNextSynchronizedProcessingTimer())
!= null) {
+ hasFired = true;
+ fireTimer(key, timer);
+ }
+ if (!hasFired) {
+ break;
+ }
+ }
+ }
+
+ private void fireTimer(final K key,
+ final TimerInternals.TimerData timer) {
+ getDoFnRunner().processElement(
+ WindowedValue.valueInGlobalWindow(
+ KeyedWorkItems.timersWorkItem(key, Collections.singletonList(timer))));
Review comment:
Instead of wrapping each timer in its own singleton list, what do you think
about accumulating the eligible timers in `fireEligibleTimers` and passing them
to `KeyedWorkItems.timersWorkItem` as a single list of timers?
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services