alamb commented on code in PR #4505:
URL: https://github.com/apache/arrow-datafusion/pull/4505#discussion_r1039916942
##########
datafusion/core/tests/sqllogictests/test_files/aggregate.slt:
##########
@@ -274,187 +271,290 @@ SELECT approx_distinct(c9) AS a, approx_distinct(c9) AS
b FROM aggregate_test_10
----
100 100
-# TODO: csv_query_approx_percentile_cont
+## This test executes the APPROX_PERCENTILE_CONT aggregation against the test
Review Comment:
👍 thank you for porting this over
##########
datafusion/core/tests/sqllogictests/test_files/aggregate.slt:
##########
@@ -274,187 +271,367 @@ SELECT approx_distinct(c9) AS a, approx_distinct(c9) AS
b FROM aggregate_test_10
----
100 100
-# TODO: csv_query_approx_percentile_cont
+## This test executes the APPROX_PERCENTILE_CONT aggregation against the test
+## data, asserting the estimated quantiles are ±5% their actual values.
+##
+## Actual quantiles calculated with:
+##
+## ```r
+## read_csv("./testing/data/csv/aggregate_test_100.csv") |>
+## select_if(is.numeric) |>
+## summarise_all(~ quantile(., c(0.1, 0.5, 0.9)))
+## ```
+##
+## Giving:
+##
+## ```text
+## c2 c3 c4 c5 c6 c7 c8 c9
c10 c11 c12
+## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
<dbl> <dbl> <dbl>
+## 1 1 -95.3 -22925. -1882606710 -7.25e18 18.9 2671. 472608672.
1.83e18 0.109 0.0714
+## 2 3 15.5 4599 377164262 1.13e18 134. 30634 2365817608.
9.30e18 0.491 0.551
+## 3 5 102. 25334. 1991374996. 7.37e18 231 57518. 3776538487.
1.61e19 0.834 0.946
+## ```
+##
+## Column `c12` is omitted due to a large relative error (~10%) due to the
small
+## float values.
+
+#csv_query_approx_percentile_cont (c2)
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c2, 0.1) AS DOUBLE) / 1.0) < 0.05)
AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c2, 0.5) AS DOUBLE) / 3.0) < 0.05)
AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c2, 0.9) AS DOUBLE) / 5.0) < 0.05)
AS q FROM aggregate_test_100
+----
+true
+
+# csv_query_approx_percentile_cont (c3)
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c3, 0.1) AS DOUBLE) / -95.3) <
0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c3, 0.5) AS DOUBLE) / 15.5) <
0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c3, 0.9) AS DOUBLE) / 102.0) <
0.05) AS q FROM aggregate_test_100
+----
+true
+
+# csv_query_approx_percentile_cont (c4)
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c4, 0.1) AS DOUBLE) / -22925.0) <
0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c4, 0.5) AS DOUBLE) / 4599.0) <
0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c4, 0.9) AS DOUBLE) / 25334.0) <
0.05) AS q FROM aggregate_test_100
+----
+true
+
+# csv_query_approx_percentile_cont (c5)
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c5, 0.1) AS DOUBLE) /
-1882606710.0) < 0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c5, 0.5) AS DOUBLE) / 377164262.0)
< 0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c5, 0.9) AS DOUBLE) /
1991374996.0) < 0.05) AS q FROM aggregate_test_100
+----
+true
+
+# csv_query_approx_percentile_cont (c6)
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c6, 0.1) AS DOUBLE) /
-7250000000000000000) < 0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c6, 0.5) AS DOUBLE) /
1130000000000000000) < 0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c6, 0.9) AS DOUBLE) /
7370000000000000000) < 0.05) AS q FROM aggregate_test_100
+----
+true
+
+# csv_query_approx_percentile_cont (c7)
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c7, 0.1) AS DOUBLE) / 18.9) <
0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c7, 0.5) AS DOUBLE) / 134.0) <
0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c7, 0.9) AS DOUBLE) / 231.0) <
0.05) AS q FROM aggregate_test_100
+----
+true
+
+# csv_query_approx_percentile_cont (c8)
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c8, 0.1) AS DOUBLE) / 2671.0) <
0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c8, 0.5) AS DOUBLE) / 30634.0) <
0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c8, 0.9) AS DOUBLE) / 57518.0) <
0.05) AS q FROM aggregate_test_100
+----
+true
+
+# csv_query_approx_percentile_cont (c9)
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c9, 0.1) AS DOUBLE) / 472608672.0)
< 0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c9, 0.5) AS DOUBLE) /
2365817608.0) < 0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c9, 0.9) AS DOUBLE) /
3776538487.0) < 0.05) AS q FROM aggregate_test_100
+----
+true
+
+# csv_query_approx_percentile_cont (c10)
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c10, 0.1) AS DOUBLE) /
1830000000000000000) < 0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c10, 0.5) AS DOUBLE) /
9300000000000000000) < 0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c10, 0.9) AS DOUBLE) /
16100000000000000000) < 0.05) AS q FROM aggregate_test_100
+----
+true
+
+# csv_query_approx_percentile_cont (c11)
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c11, 0.1) AS DOUBLE) / 0.109) <
0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c11, 0.5) AS DOUBLE) / 0.491) <
0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c11, 0.9) AS DOUBLE) / 0.834) <
0.05) AS q FROM aggregate_test_100
+----
+true
-# TODO: fix decimal places
# csv_query_cube_avg
-# query TIR
-# SELECT c1, c2, AVG(c3) FROM aggregate_test_100_by_sql GROUP BY CUBE (c1, c2)
ORDER BY c1, c2
-# ----
-# a 1 -17.6
-# a 2 -15.333333333333334
-# a 3 -4.5
-# a 4 -32
-# a 5 -32
-# a -18.333333333333332
-# b 1 31.666666666666668
-# b 2 25.5
-# b 3 -42
-# b 4 -44.6
-# b 5 -0.2
-# b -5.842105263157895
-# c 1 47.5
-# c 2 -55.57142857142857
-# c 3 47.5
-# c 4 -10.75
-# c 5 12
-# c -1.3333333333333333
-# d 1 -8.142857142857142
-# d 2 109.33333333333333
-# d 3 41.333333333333336
-# d 4 54
-# d 5 -49.5
-# d 25.444444444444443
-# e 1 75.66666666666667
-# e 2 37.8
-# e 3 48
-# e 4 37.285714285714285
-# e 5 -11
-# e 40.333333333333336
-# 1 16.681818181818183
-# 2 8.363636363636363
-# 3 20.789473684210527
-# 4 1.2608695652173914
-# 5 -13.857142857142858
-# 7.81
-
-# TODO: fix decimal places
+query TIR
+SELECT c1, c2, AVG(c3) FROM aggregate_test_100_by_sql GROUP BY CUBE (c1, c2)
ORDER BY c1, c2
+----
Review Comment:
For large outputs there is some sort of sqllogictest standards:
https://duckdb.org/dev/sqllogictest/result_verification
```
mode output_hash
```
And
https://www.sqlite.org/sqllogictest/doc/trunk/about.wiki
```
The "hash-threshold" record sets a limit on the number of values that can
appear in a result set. If the number of values exceeds this, then instead of
recording each individual value in the full test script, an MD5 hash of all
values is computed in stored. This makes the full test scripts much shorter,
but at the cost of obscuring the results. If the hash-threshold is 0, then
results are never hashed. A hash-threshold of 10 or 20 is recommended. During
debugging, it is advantage to set the hash-threshold to zero so that all
results can be seen.
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]