[GitHub] [arrow-datafusion] mvanschellebeeck commented on a diff in pull request #4505: Add some more aggregate sqllogictests and remove rust tests

GitBox Sun, 04 Dec 2022 08:22:01 -0800


mvanschellebeeck commented on code in PR #4505:
URL: https://github.com/apache/arrow-datafusion/pull/4505#discussion_r1038996124



##########
datafusion/core/tests/sqllogictests/test_files/aggregate.slt:
##########
@@ -274,187 +271,367 @@ SELECT approx_distinct(c9) AS a, approx_distinct(c9) AS 
b FROM aggregate_test_10
 ----
 100 100
 
-# TODO: csv_query_approx_percentile_cont
+## This test executes the APPROX_PERCENTILE_CONT aggregation against the test
+## data, asserting the estimated quantiles are ±5% their actual values.
+##
+## Actual quantiles calculated with:
+##
+## ```r
+## read_csv("./testing/data/csv/aggregate_test_100.csv") |>
+##     select_if(is.numeric) |>
+##     summarise_all(~ quantile(., c(0.1, 0.5, 0.9)))
+## ```
+##
+## Giving:
+##
+## ```text
+##      c2    c3      c4           c5       c6    c7     c8          c9     
c10   c11    c12
+##   <dbl> <dbl>   <dbl>        <dbl>    <dbl> <dbl>  <dbl>       <dbl>   
<dbl> <dbl>  <dbl>
+## 1     1 -95.3 -22925. -1882606710  -7.25e18  18.9  2671.  472608672. 
1.83e18 0.109 0.0714
+## 2     3  15.5   4599    377164262   1.13e18 134.  30634  2365817608. 
9.30e18 0.491 0.551
+## 3     5 102.   25334.  1991374996.  7.37e18 231   57518. 3776538487. 
1.61e19 0.834 0.946
+## ```
+##
+## Column `c12` is omitted due to a large relative error (~10%) due to the 
small
+## float values.
+
+#csv_query_approx_percentile_cont (c2)
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c2, 0.1) AS DOUBLE) / 1.0) < 0.05) 
AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c2, 0.5) AS DOUBLE) / 3.0) < 0.05) 
AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c2, 0.9) AS DOUBLE) / 5.0) < 0.05) 
AS q FROM aggregate_test_100
+----
+true
+
+# csv_query_approx_percentile_cont (c3)
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c3, 0.1) AS DOUBLE) / -95.3) < 
0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c3, 0.5) AS DOUBLE) / 15.5) < 
0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c3, 0.9) AS DOUBLE) / 102.0) < 
0.05) AS q FROM aggregate_test_100
+----
+true
+
+# csv_query_approx_percentile_cont (c4)
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c4, 0.1) AS DOUBLE) / -22925.0) < 
0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c4, 0.5) AS DOUBLE) / 4599.0) < 
0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c4, 0.9) AS DOUBLE) / 25334.0) < 
0.05) AS q FROM aggregate_test_100
+----
+true
+
+# csv_query_approx_percentile_cont (c5)
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c5, 0.1) AS DOUBLE) / 
-1882606710.0) < 0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c5, 0.5) AS DOUBLE) / 377164262.0) 
< 0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c5, 0.9) AS DOUBLE) / 
1991374996.0) < 0.05) AS q FROM aggregate_test_100
+----
+true
+
+# csv_query_approx_percentile_cont (c6)
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c6, 0.1) AS DOUBLE) / 
-7250000000000000000) < 0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c6, 0.5) AS DOUBLE) / 
1130000000000000000) < 0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c6, 0.9) AS DOUBLE) / 
7370000000000000000) < 0.05) AS q FROM aggregate_test_100
+----
+true
+
+# csv_query_approx_percentile_cont (c7)
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c7, 0.1) AS DOUBLE) / 18.9) < 
0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c7, 0.5) AS DOUBLE) / 134.0) < 
0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c7, 0.9) AS DOUBLE) / 231.0) < 
0.05) AS q FROM aggregate_test_100
+----
+true
+
+# csv_query_approx_percentile_cont (c8)
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c8, 0.1) AS DOUBLE) / 2671.0) < 
0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c8, 0.5) AS DOUBLE) / 30634.0) < 
0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c8, 0.9) AS DOUBLE) / 57518.0) < 
0.05) AS q FROM aggregate_test_100
+----
+true
+
+# csv_query_approx_percentile_cont (c9)
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c9, 0.1) AS DOUBLE) / 472608672.0) 
< 0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c9, 0.5) AS DOUBLE) / 
2365817608.0) < 0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c9, 0.9) AS DOUBLE) / 
3776538487.0) < 0.05) AS q FROM aggregate_test_100
+----
+true
+
+# csv_query_approx_percentile_cont (c10)
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c10, 0.1) AS DOUBLE) / 
1830000000000000000) < 0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c10, 0.5) AS DOUBLE) / 
9300000000000000000) < 0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c10, 0.9) AS DOUBLE) / 
16100000000000000000) < 0.05) AS q FROM aggregate_test_100
+----
+true
+
+# csv_query_approx_percentile_cont (c11)
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c11, 0.1) AS DOUBLE) /  0.109) < 
0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c11, 0.5) AS DOUBLE) / 0.491) < 
0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c11, 0.9) AS DOUBLE) / 0.834) < 
0.05) AS q FROM aggregate_test_100
+----
+true
 
-# TODO: fix decimal places
 # csv_query_cube_avg
-# query TIR
-# SELECT c1, c2, AVG(c3) FROM aggregate_test_100_by_sql GROUP BY CUBE (c1, c2) 
ORDER BY c1, c2
-# ----
-# a 1 -17.6
-# a 2 -15.333333333333334
-# a 3 -4.5
-# a 4 -32
-# a 5 -32
-# a  -18.333333333333332
-# b 1 31.666666666666668
-# b 2 25.5
-# b 3 -42
-# b 4 -44.6
-# b 5 -0.2
-# b  -5.842105263157895
-# c 1 47.5
-# c 2 -55.57142857142857
-# c 3 47.5
-# c 4 -10.75
-# c 5 12
-# c  -1.3333333333333333
-# d 1 -8.142857142857142
-# d 2 109.33333333333333
-# d 3 41.333333333333336
-# d 4 54
-# d 5 -49.5
-# d  25.444444444444443
-# e 1 75.66666666666667
-# e 2 37.8
-# e 3 48
-# e 4 37.285714285714285
-# e 5 -11
-# e  40.333333333333336
-#  1 16.681818181818183
-#  2 8.363636363636363
-#  3 20.789473684210527
-#  4 1.2608695652173914
-#  5 -13.857142857142858
-#    7.81
-
-# TODO: fix decimal places
+query TIR
+SELECT c1, c2, AVG(c3) FROM aggregate_test_100_by_sql GROUP BY CUBE (c1, c2) 
ORDER BY c1, c2
+----

Review Comment:
   Yeah this was just a migrated test - should we track simplifying some of 
these tests in a separate PR/issue?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow-datafusion] mvanschellebeeck commented on a diff in pull request #4505: Add some more aggregate sqllogictests and remove rust tests

Reply via email to