dtarima commented on code in PR #45181: URL: https://github.com/apache/spark/pull/45181#discussion_r1526282544
########## sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala: ########## @@ -193,10 +193,40 @@ private[sql] object Dataset { */ @Stable class Dataset[T] private[sql]( - @DeveloperApi @Unstable @transient val queryExecution: QueryExecution, + @DeveloperApi @Unstable @transient val queryUnpersisted: QueryExecution, @DeveloperApi @Unstable @transient val encoder: Encoder[T]) extends Serializable { + @volatile private var queryPersisted: Option[(Array[Boolean], QueryExecution)] = None + + def queryExecution: QueryExecution = { Review Comment: This method should probably have `@DeveloperApi @Unstable`, and remove `@DeveloperApi` annotation from `queryUnpersisted` above. ########## sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala: ########## @@ -193,10 +193,40 @@ private[sql] object Dataset { */ @Stable class Dataset[T] private[sql]( - @DeveloperApi @Unstable @transient val queryExecution: QueryExecution, + @DeveloperApi @Unstable @transient val queryUnpersisted: QueryExecution, @DeveloperApi @Unstable @transient val encoder: Encoder[T]) extends Serializable { + @volatile private var queryPersisted: Option[(Array[Boolean], QueryExecution)] = None + + def queryExecution: QueryExecution = { + val cacheStatesSign = queryUnpersisted.computeCacheStateSignature() + // If all children aren't cached, directly return the queryUnpersisted + if (cacheStatesSign.forall(b => !b)) { Review Comment: nit: `cacheStatesSign.forall(_ == false)` is a bit more readable, and I think it'll make the comment unnecessary ########## sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala: ########## @@ -369,6 +375,20 @@ class QueryExecution( Utils.redact(sparkSession.sessionState.conf.stringRedactionPattern, message) } + /** + * This method performs a pre-order traversal and return a boolean Array + * representing whether some nodes of the logical tree are persisted. + */ + def computeCacheStateSignature(): Array[Boolean] = { Review Comment: How about using `BitSet` for persistence state representation? It'll be easier to work with and it's more efficient. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org