tdas commented on a change in pull request #33336:
URL: https://github.com/apache/spark/pull/33336#discussion_r670668748
##########
File path:
sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FlatMapGroupsWithStateExec.scala
##########
@@ -404,3 +402,72 @@ case class FlatMapGroupsWithStateExec(
copy(child = newLeft, initialState = newRight)
}
+object FlatMapGroupsWithStateExec {
+
+ def foundDuplicateInitialKeyException(): Exception = {
+ throw new IllegalArgumentException("The initial state provided contained "
+
+ "multiple rows(state) with the same key. Make sure to de-duplicate the "
+
+ "initial state before passing it.")
+ }
+
+ /**
+ * Special handling for when the child relation is a batch relation.
+ * If the initial state is provided, we create an instance of the
CoGroupExec, if the initial
+ * state is not provided we create an instance of the MapGroupsExec
+ */
+ // scalastyle:off argcount
+ def forBatch(
+ userFunc: (Any, Iterator[Any], LogicalGroupState[Any]) => Iterator[Any],
+ keyDeserializer: Expression,
+ valueDeserializer: Expression,
+ initialStateDeserializer: Expression,
+ groupingAttributes: Seq[Attribute],
+ initialStateGroupAttrs: Seq[Attribute],
+ dataAttributes: Seq[Attribute],
+ initialStateDataAttrs: Seq[Attribute],
+ outputObjAttr: Attribute,
+ timeoutConf: GroupStateTimeout,
+ hasInitialState: Boolean,
+ initialState: SparkPlan,
+ child: SparkPlan): SparkPlan = {
+ if (hasInitialState) {
+ val watermarkPresent = child.output.exists {
+ case a: Attribute if a.metadata.contains(EventTimeWatermark.delayKey)
=> true
+ case _ => false
+ }
+ val func = (keyRow: Any, values: Iterator[Any], states: Iterator[Any])
=> {
+ // Check if there is only one state for every key.
+ var foundInitialStateForKey = false
+ val optionalState = states.map { initialState =>
+ if (foundInitialStateForKey) {
+ foundDuplicateInitialKeyException()
+ }
+ foundInitialStateForKey = true
+ initialState
+ }.toSeq
+
+ // Create group state object
+ val groupState = GroupStateImpl.createForStreaming(
+ optionalState.headOption,
+ System.currentTimeMillis,
+ GroupStateImpl.NO_TIMESTAMP,
+ timeoutConf,
+ hasTimedOut = false,
+ watermarkPresent)
+
+ // Call user function with the state and values for this key
+ userFunc(keyRow, values, groupState)
+ }
+ new CoGroupExec(
+ func, keyDeserializer, valueDeserializer, initialStateDeserializer,
groupingAttributes,
+ initialStateGroupAttrs, dataAttributes, initialStateDataAttrs,
outputObjAttr,
+ child, initialState) {
+ override def outputPartitioning: Partitioning =
child.outputPartitioning
Review comment:
I am actually not sure if this is correct, even for MapGroupsExec. This
operator outputs rows with completely new columns... completely different
from the key-value columns of the input. So the output cannot be partitioned in the
same way as the input is.
It's safer to not specify this. If you don't specify it, then the worst case is
that an extra shuffle will be added if the next operator needs its input to be
partitioned in some way. If it is specified and it is wrong, then the next
operator will end up incorrectly assuming that things are partitioned in some way
and produce incorrect results.
So you were right about this being fishy.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]