HyukjinKwon commented on a change in pull request #33422:
URL: https://github.com/apache/spark/pull/33422#discussion_r673112471
##########
File path: sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
##########
@@ -1947,6 +1947,31 @@ class Dataset[T] private[sql](
CollectMetrics(name, (expr +: exprs).map(_.named), logicalPlan)
}
+ /**
+ * Observe (named) metrics through an `org.apache.spark.sql.Observation` instance.
+ * This is equivalent to calling `observe(String, Column, Column*)` but does not require
+ * adding `org.apache.spark.sql.util.QueryExecutionListener` to the Spark session.
+ * This method does not support streaming datasets.
+ *
+ * A user can retrieve the metrics by accessing `org.apache.spark.sql.Observation.get`.
+ *
+ * {{{
+ * // Observe row count (rows) and highest id (maxid) in the Dataset while writing it
+ * val observation = Observation("my_metrics")
+ * val observed_ds = ds.observe(observation, count(lit(1)).as("rows"), max($"id").as("maxid"))
+ * observed_ds.write.parquet("ds.parquet")
+ * val metrics = observation.get
+ * }}}
+ *
+ * @throws IllegalArgumentException If this is a streaming Dataset (this.isStreaming == true)
+ *
+ * @group typedrel
+ * @since 3.3.0
+ */
+ def observe(observation: Observation, expr: Column, exprs: Column*): Dataset[T] = {
Review comment:
oh yeah!
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]