Github user mengxr commented on a diff in the pull request:
https://github.com/apache/spark/pull/10960#discussion_r51533776
--- Diff:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Corr.scala
---
@@ -29,165 +28,95 @@ import org.apache.spark.sql.types._
* Definition of Pearson correlation can be found at
*
http://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient
*/
-case class Corr(
- left: Expression,
- right: Expression,
- mutableAggBufferOffset: Int = 0,
- inputAggBufferOffset: Int = 0)
- extends ImperativeAggregate {
-
- def this(left: Expression, right: Expression) =
- this(left, right, mutableAggBufferOffset = 0, inputAggBufferOffset = 0)
-
- override def children: Seq[Expression] = Seq(left, right)
+case class Corr(x: Expression, y: Expression) extends DeclarativeAggregate
{
+ override def children: Seq[Expression] = Seq(x, y)
override def nullable: Boolean = true
-
override def dataType: DataType = DoubleType
-
override def inputTypes: Seq[AbstractDataType] = Seq(DoubleType,
DoubleType)
- override def checkInputDataTypes(): TypeCheckResult = {
- if (left.dataType.isInstanceOf[DoubleType] &&
right.dataType.isInstanceOf[DoubleType]) {
- TypeCheckResult.TypeCheckSuccess
+ protected val count = AttributeReference("count", DoubleType, nullable =
false)()
+ protected val xAvg = AttributeReference("xAvg", DoubleType, nullable =
false)()
+ protected val yAvg = AttributeReference("yAvg", DoubleType, nullable =
false)()
+ protected val ck = AttributeReference("ck", DoubleType, nullable =
false)()
+ protected val xMk = AttributeReference("xMk", DoubleType, nullable =
false)()
+ protected val yMk = AttributeReference("yMk", DoubleType, nullable =
false)()
+
+ override val aggBufferAttributes: Seq[AttributeReference] = Seq(count,
xAvg, yAvg, ck, xMk, yMk)
+
+ override val initialValues: Seq[Expression] = Seq(
+ /* count = */ Literal(0.0),
+ /* xAvg = */ Literal(0.0),
+ /* yAvg = */ Literal(0.0),
+ /* ck = */ Literal(0.0),
+ /* xMk = */ Literal(0.0),
+ /* yMk = */ Literal(0.0)
+ )
+
+ override lazy val updateExpressions: Seq[Expression] = {
+ val n = count + Literal(1.0)
+ val dx = x - xAvg
+ val dxN = dx / n
+ val dy = y - yAvg
+ val dyN = dy / n
+ val newXAvg = xAvg + dxN
+ val newYAvg = yAvg + dyN
+ val newCk = ck + dx * (dy - dyN)
--- End diff --
This is equivalent to the original update rule. However, for reference
purposes, we shouldn't change it, so that users can easily map the formula on
the Wikipedia page to the implementation here. Shall we change it back to
`y - newYAvg`? The same applies to the two lines below.
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes to enable it, or if the feature is enabled but not working,
please contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]