Jefffrey commented on code in PR #22764:
URL: https://github.com/apache/datafusion/pull/22764#discussion_r3373731073
##########
datafusion/sqllogictest/test_files/window.slt:
##########
@@ -6604,6 +6604,142 @@ ORDER BY i;
3 1
4 NULL
+# Covariance/correlation sliding-window regression test. Verifies correct
+# results across row removals and a NULL-gap empty-frame transition.
+query IRRR
+SELECT
+ column1,
+ covar_pop(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
+ ),
+ covar_samp(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
+ ),
+ corr(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
+ )
+FROM (
+ VALUES
+ (1, 10.0, 5.0),
+ (2, NULL, NULL),
+ (3, NULL, NULL),
+ (4, 30.0, 10.0),
+ (5, 40.0, 20.0),
+ (6, 50.0, 10.0)
+);
+----
+1 0 NULL NULL
+2 0 NULL NULL
+3 NULL NULL NULL
+4 0 NULL NULL
+5 25 50 1
+6 -25 -50 -1
+
+# Multi-row covariance/correlation sliding-window regression test. Verifies
+# correct accumulation when valid rows enter the frame after a reset.
+query IRRR
+SELECT
+ column1,
+ covar_pop(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
+ ),
+ covar_samp(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
+ ),
+ corr(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
+ )
+FROM (
+ VALUES
+ (1, 10.0, 5.0),
+ (2, NULL, NULL),
+ (3, NULL, NULL),
+ (4, 30.0, 10.0),
+ (5, 40.0, 20.0),
+ (6, 50.0, 10.0)
+);
+----
+1 0 NULL NULL
+2 0 NULL NULL
+3 0 NULL NULL
+4 0 NULL NULL
+5 25 50 1
+6 0 0 0
Review Comment:
similarly
```sql
memory D SELECT
column1,
covar_pop(column2, column3) OVER (
ORDER BY column1
ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
) as covar_pop,
covar_samp(column2, column3) OVER (
ORDER BY column1
ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
) as covar_samp,
corr(column2, column3) OVER (
ORDER BY column1
ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
) as corr
FROM (
VALUES
(1, 10.0, 5.0),
(2, NULL, NULL),
(3, NULL, NULL),
(4, 30.0, 10.0),
(5, 40.0, 20.0),
(6, 50.0, 10.0)
) t(column1,column2,column3);
┌─────────┬───────────┬────────────┬────────┐
│ column1 │ covar_pop │ covar_samp │ corr │
│ int32 │ double │ double │ double │
├─────────┼───────────┼────────────┼────────┤
│ 1 │ 0.0 │ NULL │ nan │
│ 2 │ 0.0 │ NULL │ nan │
│ 3 │ 0.0 │ NULL │ nan │
│ 4 │ 0.0 │ NULL │ nan │
│ 5 │ 25.0 │ 50.0 │ 1.0 │
│ 6 │ 0.0 │ 0.0 │ 0.0 │
└─────────┴───────────┴────────────┴────────┘
```
##########
datafusion/sqllogictest/test_files/window.slt:
##########
@@ -6604,6 +6604,142 @@ ORDER BY i;
3 1
4 NULL
+# Covariance/correlation sliding-window regression test. Verifies correct
+# results across row removals and a NULL-gap empty-frame transition.
+query IRRR
+SELECT
+ column1,
+ covar_pop(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
+ ),
+ covar_samp(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
+ ),
+ corr(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
+ )
+FROM (
+ VALUES
+ (1, 10.0, 5.0),
+ (2, NULL, NULL),
+ (3, NULL, NULL),
+ (4, 30.0, 10.0),
+ (5, 40.0, 20.0),
+ (6, 50.0, 10.0)
+);
+----
+1 0 NULL NULL
+2 0 NULL NULL
+3 NULL NULL NULL
+4 0 NULL NULL
+5 25 50 1
+6 -25 -50 -1
+
Review Comment:
comparison against duckdb
```sql
memory D SELECT
column1,
covar_pop(column2, column3) OVER (
ORDER BY column1
ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
) as covar_pop,
covar_samp(column2, column3) OVER (
ORDER BY column1
ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
) as covar_samp,
corr(column2, column3) OVER (
ORDER BY column1
ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
) as corr
FROM (
VALUES
(1, 10.0, 5.0),
(2, NULL, NULL),
(3, NULL, NULL),
(4, 30.0, 10.0),
(5, 40.0, 20.0),
(6, 50.0, 10.0)
) t(column1, column2, column3);
┌─────────┬───────────┬────────────┬────────┐
│ column1 │ covar_pop │ covar_samp │ corr │
│ int32 │ double │ double │ double │
├─────────┼───────────┼────────────┼────────┤
│ 1 │ 0.0 │ NULL │ nan │
│ 2 │ 0.0 │ NULL │ nan │
│ 3 │ NULL │ NULL │ NULL │
│ 4 │ 0.0 │ NULL │ nan │
│ 5 │ 25.0 │ 50.0 │ 1.0 │
│ 6 │ -25.0 │ -50.0 │ -1.0 │
└─────────┴───────────┴────────────┴────────┘
```
theres a few places it has `nan` instead of `null` for `corr`, dont know how
significant this is
##########
datafusion/sqllogictest/test_files/window.slt:
##########
@@ -6604,6 +6604,142 @@ ORDER BY i;
3 1
4 NULL
+# Covariance/correlation sliding-window regression test. Verifies correct
+# results across row removals and a NULL-gap empty-frame transition.
+query IRRR
+SELECT
+ column1,
+ covar_pop(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
+ ),
+ covar_samp(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
+ ),
+ corr(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
+ )
+FROM (
+ VALUES
+ (1, 10.0, 5.0),
+ (2, NULL, NULL),
+ (3, NULL, NULL),
+ (4, 30.0, 10.0),
+ (5, 40.0, 20.0),
+ (6, 50.0, 10.0)
+);
+----
+1 0 NULL NULL
+2 0 NULL NULL
+3 NULL NULL NULL
+4 0 NULL NULL
+5 25 50 1
+6 -25 -50 -1
+
+# Multi-row covariance/correlation sliding-window regression test. Verifies
+# correct accumulation when valid rows enter the frame after a reset.
+query IRRR
+SELECT
+ column1,
+ covar_pop(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
+ ),
+ covar_samp(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
+ ),
+ corr(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
+ )
+FROM (
+ VALUES
+ (1, 10.0, 5.0),
+ (2, NULL, NULL),
+ (3, NULL, NULL),
+ (4, 30.0, 10.0),
+ (5, 40.0, 20.0),
+ (6, 50.0, 10.0)
+);
+----
+1 0 NULL NULL
+2 0 NULL NULL
+3 0 NULL NULL
+4 0 NULL NULL
+5 25 50 1
+6 0 0 0
+
+# Covariance/correlation sliding-window regression test. Rows with NULL in
+# either input column must not contribute to the aggregate state.
+query IRRR
+SELECT
+ column1,
+ covar_pop(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 3 PRECEDING AND CURRENT ROW
+ ),
+ covar_samp(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 3 PRECEDING AND CURRENT ROW
+ ),
+ corr(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 3 PRECEDING AND CURRENT ROW
+ )
+FROM (
+ VALUES
+ (1, 10.0, 5.0),
+ (2, 20.0, NULL),
+ (3, NULL, 15.0),
+ (4, 30.0, 10.0),
+ (5, 40.0, 20.0)
+);
+----
+1 0 NULL NULL
+2 0 NULL NULL
+3 0 NULL NULL
+4 25 50 1
+5 25 50 1
Review Comment:
```sql
memory D SELECT
column1,
covar_pop(column2, column3) OVER (
ORDER BY column1
ROWS BETWEEN 3 PRECEDING AND CURRENT ROW
) as covar_pop,
covar_samp(column2, column3) OVER (
ORDER BY column1
ROWS BETWEEN 3 PRECEDING AND CURRENT ROW
) as covar_samp,
corr(column2, column3) OVER (
ORDER BY column1
ROWS BETWEEN 3 PRECEDING AND CURRENT ROW
) as corr
FROM (
VALUES
(1, 10.0, 5.0),
(2, 20.0, NULL),
(3, NULL, 15.0),
(4, 30.0, 10.0),
(5, 40.0, 20.0)
) t(column1,column2,column3);
┌─────────┬───────────┬────────────┬────────┐
│ column1 │ covar_pop │ covar_samp │ corr │
│ int32 │ double │ double │ double │
├─────────┼───────────┼────────────┼────────┤
│ 1 │ 0.0 │ NULL │ nan │
│ 2 │ 0.0 │ NULL │ nan │
│ 3 │ 0.0 │ NULL │ nan │
│ 4 │ 25.0 │ 50.0 │ 1.0 │
│ 5 │ 25.0 │ 50.0 │ 1.0 │
└─────────┴───────────┴────────────┴────────┘
```
##########
datafusion/sqllogictest/test_files/window.slt:
##########
@@ -6604,6 +6604,142 @@ ORDER BY i;
3 1
4 NULL
+# Covariance/correlation sliding-window regression test. Verifies correct
+# results across row removals and a NULL-gap empty-frame transition.
+query IRRR
+SELECT
+ column1,
+ covar_pop(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
+ ),
+ covar_samp(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
+ ),
+ corr(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
+ )
+FROM (
+ VALUES
+ (1, 10.0, 5.0),
+ (2, NULL, NULL),
+ (3, NULL, NULL),
+ (4, 30.0, 10.0),
+ (5, 40.0, 20.0),
+ (6, 50.0, 10.0)
+);
+----
+1 0 NULL NULL
+2 0 NULL NULL
+3 NULL NULL NULL
+4 0 NULL NULL
+5 25 50 1
+6 -25 -50 -1
+
+# Multi-row covariance/correlation sliding-window regression test. Verifies
+# correct accumulation when valid rows enter the frame after a reset.
+query IRRR
+SELECT
+ column1,
+ covar_pop(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
+ ),
+ covar_samp(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
+ ),
+ corr(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
+ )
+FROM (
+ VALUES
+ (1, 10.0, 5.0),
+ (2, NULL, NULL),
+ (3, NULL, NULL),
+ (4, 30.0, 10.0),
+ (5, 40.0, 20.0),
+ (6, 50.0, 10.0)
+);
+----
+1 0 NULL NULL
+2 0 NULL NULL
+3 0 NULL NULL
+4 0 NULL NULL
+5 25 50 1
+6 0 0 0
+
+# Covariance/correlation sliding-window regression test. Rows with NULL in
+# either input column must not contribute to the aggregate state.
+query IRRR
+SELECT
+ column1,
+ covar_pop(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 3 PRECEDING AND CURRENT ROW
+ ),
+ covar_samp(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 3 PRECEDING AND CURRENT ROW
+ ),
+ corr(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 3 PRECEDING AND CURRENT ROW
+ )
+FROM (
+ VALUES
+ (1, 10.0, 5.0),
+ (2, 20.0, NULL),
+ (3, NULL, 15.0),
+ (4, 30.0, 10.0),
+ (5, 40.0, 20.0)
+);
+----
+1 0 NULL NULL
+2 0 NULL NULL
+3 0 NULL NULL
+4 25 50 1
+5 25 50 1
+
+# Variance/stddev sliding-window regression test. Verifies that retracting
+# the last valid row resets the aggregate state.
+query IRRRR
+SELECT
+ column1,
+ var_pop(column2) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
+ ),
+ var_samp(column2) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
+ ),
+ stddev_pop(column2) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
+ ),
+ stddev_samp(column2) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
+ )
+FROM (
+ VALUES
+ (1, 10.0),
+ (2, NULL),
+ (3, NULL),
+ (4, 30.0),
+ (5, 40.0)
+);
+----
+1 0 NULL 0 NULL
+2 0 NULL 0 NULL
+3 NULL NULL NULL NULL
+4 0 NULL 0 NULL
+5 25 50 5 7.071067811865
Review Comment:
```sql
memory D SELECT
column1,
var_pop(column2) OVER (
ORDER BY column1
ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
) as var_pop,
var_samp(column2) OVER (
ORDER BY column1
ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
) as var_samp,
stddev_pop(column2) OVER (
ORDER BY column1
ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
) as stddev_pop,
stddev_samp(column2) OVER (
ORDER BY column1
ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
) as stddev_samp
FROM (
VALUES
(1, 10.0),
(2, NULL),
(3, NULL),
(4, 30.0),
(5, 40.0)
) t(column1, column2);
┌─────────┬─────────┬──────────┬────────────┬────────────────────┐
│ column1 │ var_pop │ var_samp │ stddev_pop │ stddev_samp │
│ int32 │ double │ double │ double │ double │
├─────────┼─────────┼──────────┼────────────┼────────────────────┤
│ 1 │ 0.0 │ NULL │ 0.0 │ NULL │
│ 2 │ 0.0 │ NULL │ 0.0 │ NULL │
│ 3 │ NULL │ NULL │ NULL │ NULL │
│ 4 │ 0.0 │ NULL │ 0.0 │ NULL │
│ 5 │ 25.0 │ 50.0 │ 5.0 │ 7.0710678118654755 │
└─────────┴─────────┴──────────┴────────────┴────────────────────┘
```
looks good
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]