guoweiM commented on a change in pull request #16:
URL: https://github.com/apache/flink-ml/pull/16#discussion_r739982935
##########
File path:
flink-ml-iteration/src/main/java/org/apache/flink/iteration/Iterations.java
##########
@@ -112,15 +145,400 @@ public static DataStreamList
iterateBoundedStreamsUntilTermination(
ReplayableDataStreamList dataStreams,
IterationConfig config,
IterationBody body) {
- Preconditions.checkArgument(
- config.getOperatorLifeCycle() ==
IterationConfig.OperatorLifeCycle.ALL_ROUND);
-
Preconditions.checkArgument(dataStreams.getReplayedDataStreams().size() == 0);
+ OperatorWrapper wrapper =
+ config.getOperatorLifeCycle() ==
IterationConfig.OperatorLifeCycle.ALL_ROUND
+ ? new AllRoundOperatorWrapper<>()
+ : new PerRoundOperatorWrapper<>();
- return IterationFactory.createIteration(
+ List<DataStream<?>> allDatastreams = new ArrayList<>();
+ allDatastreams.addAll(dataStreams.getReplayedDataStreams());
+ allDatastreams.addAll(dataStreams.getNonReplayedStreams());
+
+ Set<Integer> replayedIndices =
+ IntStream.range(0, dataStreams.getReplayedDataStreams().size())
+ .boxed()
+ .collect(Collectors.toSet());
+
+ return createIteration(
initVariableStreams,
- new DataStreamList(dataStreams.getNonReplayedStreams()),
+ new DataStreamList(allDatastreams),
+ replayedIndices,
body,
- new AllRoundOperatorWrapper(),
+ wrapper,
true);
}
+
+ @SuppressWarnings({"unchecked", "rawtypes"})
+ private static DataStreamList createIteration(
+ DataStreamList initVariableStreams,
+ DataStreamList dataStreams,
+ Set<Integer> replayedDataStreamIndices,
+ IterationBody body,
+ OperatorWrapper<?, IterationRecord<?>> initialOperatorWrapper,
+ boolean mayHaveCriteria) {
+ checkState(initVariableStreams.size() > 0, "There should be at least
one variable stream");
+
+ IterationID iterationId = new IterationID();
+
+ List<TypeInformation<?>> initVariableTypeInfos =
getTypeInfos(initVariableStreams);
+ List<TypeInformation<?>> dataStreamTypeInfos =
getTypeInfos(dataStreams);
+
+ // Add heads and inputs
+ int totalInitVariableParallelism =
+ map(
+ initVariableStreams,
+ dataStream ->
+ dataStream.getParallelism() > 0
+ ? dataStream.getParallelism()
+ : dataStream
+
.getExecutionEnvironment()
+ .getConfig()
+ .getParallelism())
+ .stream()
+ .mapToInt(i -> i)
+ .sum();
+ DataStreamList initVariableInputs = addInputs(initVariableStreams,
false);
+ DataStreamList headStreams =
+ addHeads(
+ initVariableStreams,
+ initVariableInputs,
+ iterationId,
+ totalInitVariableParallelism,
+ false,
+ 0);
+
+ DataStreamList dataStreamInputs = addInputs(dataStreams, true);
+ if (replayedDataStreamIndices.size() > 0) {
+ dataStreamInputs =
+ addReplayer(
+ headStreams.get(0),
+ dataStreams,
+ dataStreamInputs,
+ replayedDataStreamIndices);
+ }
+
+ // Create the iteration body. We map the inputs of iteration body into
the draft sources,
+ // which serve as the start points to build the draft subgraph.
+ StreamExecutionEnvironment env =
initVariableStreams.get(0).getExecutionEnvironment();
+ DraftExecutionEnvironment draftEnv =
+ new DraftExecutionEnvironment(env, initialOperatorWrapper);
+ DataStreamList draftHeadStreams =
+ addDraftSources(headStreams, draftEnv, initVariableTypeInfos);
+ DataStreamList draftDataStreamInputs =
+ addDraftSources(dataStreamInputs, draftEnv,
dataStreamTypeInfos);
+
+ IterationBodyResult iterationBodyResult =
+ body.process(draftHeadStreams, draftDataStreamInputs);
+
ensuresTransformationAdded(iterationBodyResult.getFeedbackVariableStreams(),
draftEnv);
+ ensuresTransformationAdded(iterationBodyResult.getOutputStreams(),
draftEnv);
+ draftEnv.copyToActualEnvironment();
+
+ // Add tails and co-locate them with the heads.
+ DataStreamList feedbackStreams =
+
getActualDataStreams(iterationBodyResult.getFeedbackVariableStreams(),
draftEnv);
+ checkState(
+ feedbackStreams.size() == initVariableStreams.size(),
+ "The number of feedback streams "
+ + feedbackStreams.size()
+ + " does not match the initialized one "
+ + initVariableStreams.size());
+ for (int i = 0; i < feedbackStreams.size(); ++i) {
+ checkState(
+ feedbackStreams.get(i).getParallelism() ==
headStreams.get(i).getParallelism(),
+ String.format(
+ "The feedback stream %d have different parallelism
%d with the initial stream, which is %d",
+ i,
+ feedbackStreams.get(i).getParallelism(),
+ headStreams.get(i).getParallelism()));
+ }
+
+ DataStreamList tails = addTails(feedbackStreams, iterationId, 0);
+ for (int i = 0; i < headStreams.size(); ++i) {
+ String coLocationGroupKey = "co-" + iterationId.toHexString() +
"-" + i;
+
headStreams.get(i).getTransformation().setCoLocationGroupKey(coLocationGroupKey);
+
tails.get(i).getTransformation().setCoLocationGroupKey(coLocationGroupKey);
+ }
+
+ checkState(
+ mayHaveCriteria ||
iterationBodyResult.getTerminationCriteria() == null,
+ "The current iteration type does not support the termination
criteria.");
+
+ if (iterationBodyResult.getTerminationCriteria() != null) {
+ addCriteriaStream(
+ iterationBodyResult.getTerminationCriteria(),
+ iterationId,
+ env,
+ draftEnv,
+ initVariableStreams,
+ headStreams,
+ totalInitVariableParallelism);
+ }
+
+ return
addOutputs(getActualDataStreams(iterationBodyResult.getOutputStreams(),
draftEnv));
+ }
+
+ private static DataStreamList addReplayer(
+ DataStream<?> firstHeadStream,
+ DataStreamList originalDataStreams,
+ DataStreamList dataStreamInputs,
+ Set<Integer> replayedDataStreamIndices) {
+
+ List<DataStream<?>> result = new ArrayList<>(dataStreamInputs.size());
+ for (int i = 0; i < dataStreamInputs.size(); ++i) {
+ if (!replayedDataStreamIndices.contains(i)) {
+ result.add(dataStreamInputs.get(i));
+ continue;
+ }
+
+ // Note that the HeadOperator broadcasts the globally aligned events,
+ // thus the operator does not need to emit them to the side output specially.
+ DataStream<?> replayedInput =
+ ((SingleOutputStreamOperator<IterationRecord<?>>)
firstHeadStream)
+
.getSideOutput(HeadOperator.ALIGN_NOTIFY_OUTPUT_TAG)
+ .map(x -> x, dataStreamInputs.get(i).getType())
+ .setParallelism(1)
+ .name("signal-change-typeinfo")
+ .broadcast()
+ .union(dataStreamInputs.get(i))
+ .transform(
+ "Replayer-"
+ + originalDataStreams
+ .get(i)
+ .getTransformation()
+ .getName(),
+ dataStreamInputs.get(i).getType(),
+ (OneInputStreamOperator) new
ReplayOperator<>())
+
.setParallelism(dataStreamInputs.get(i).getParallelism());
+ result.add(replayedInput);
+ }
+
+ return new DataStreamList(result);
+ }
+
+ private static void addCriteriaStream(
+ DataStream<?> draftCriteriaStream,
+ IterationID iterationId,
+ StreamExecutionEnvironment env,
+ DraftExecutionEnvironment draftEnv,
+ DataStreamList initVariableStreams,
+ DataStreamList headStreams,
+ int totalInitVariableParallelism) {
+ // deal with the criteria streams
Review comment:
Should "deal" be "deals" here (i.e. "deals with the criteria streams")?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]