[GitHub] [arrow-datafusion] xudong963 commented on a diff in pull request #4505: Add some more aggregate sqllogictests and remove rust tests

GitBox Sun, 04 Dec 2022 08:56:05 -0800


xudong963 commented on code in PR #4505:
URL: https://github.com/apache/arrow-datafusion/pull/4505#discussion_r1039000530



##########
datafusion/core/tests/sqllogictests/test_files/aggregate.slt:
##########
@@ -274,187 +271,367 @@ SELECT approx_distinct(c9) AS a, approx_distinct(c9) AS 
b FROM aggregate_test_10
 ----
 100 100
 
-# TODO: csv_query_approx_percentile_cont
+## This test executes the APPROX_PERCENTILE_CONT aggregation against the test
+## data, asserting the estimated quantiles are ±5% their actual values.
+##
+## Actual quantiles calculated with:
+##
+## ```r
+## read_csv("./testing/data/csv/aggregate_test_100.csv") |>
+##     select_if(is.numeric) |>
+##     summarise_all(~ quantile(., c(0.1, 0.5, 0.9)))
+## ```
+##
+## Giving:
+##
+## ```text
+##      c2    c3      c4           c5       c6    c7     c8          c9     
c10   c11    c12
+##   <dbl> <dbl>   <dbl>        <dbl>    <dbl> <dbl>  <dbl>       <dbl>   
<dbl> <dbl>  <dbl>
+## 1     1 -95.3 -22925. -1882606710  -7.25e18  18.9  2671.  472608672. 
1.83e18 0.109 0.0714
+## 2     3  15.5   4599    377164262   1.13e18 134.  30634  2365817608. 
9.30e18 0.491 0.551
+## 3     5 102.   25334.  1991374996.  7.37e18 231   57518. 3776538487. 
1.61e19 0.834 0.946
+## ```
+##
+## Column `c12` is omitted due to a large relative error (~10%) due to the 
small
+## float values.
+
+#csv_query_approx_percentile_cont (c2)
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c2, 0.1) AS DOUBLE) / 1.0) < 0.05) 
AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c2, 0.5) AS DOUBLE) / 3.0) < 0.05) 
AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c2, 0.9) AS DOUBLE) / 5.0) < 0.05) 
AS q FROM aggregate_test_100
+----
+true
+
+# csv_query_approx_percentile_cont (c3)
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c3, 0.1) AS DOUBLE) / -95.3) < 
0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c3, 0.5) AS DOUBLE) / 15.5) < 
0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c3, 0.9) AS DOUBLE) / 102.0) < 
0.05) AS q FROM aggregate_test_100
+----
+true
+
+# csv_query_approx_percentile_cont (c4)
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c4, 0.1) AS DOUBLE) / -22925.0) < 
0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c4, 0.5) AS DOUBLE) / 4599.0) < 
0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c4, 0.9) AS DOUBLE) / 25334.0) < 
0.05) AS q FROM aggregate_test_100
+----
+true
+
+# csv_query_approx_percentile_cont (c5)
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c5, 0.1) AS DOUBLE) / 
-1882606710.0) < 0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c5, 0.5) AS DOUBLE) / 377164262.0) 
< 0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c5, 0.9) AS DOUBLE) / 
1991374996.0) < 0.05) AS q FROM aggregate_test_100
+----
+true
+
+# csv_query_approx_percentile_cont (c6)
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c6, 0.1) AS DOUBLE) / 
-7250000000000000000) < 0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c6, 0.5) AS DOUBLE) / 
1130000000000000000) < 0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c6, 0.9) AS DOUBLE) / 
7370000000000000000) < 0.05) AS q FROM aggregate_test_100
+----
+true
+
+# csv_query_approx_percentile_cont (c7)
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c7, 0.1) AS DOUBLE) / 18.9) < 
0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c7, 0.5) AS DOUBLE) / 134.0) < 
0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c7, 0.9) AS DOUBLE) / 231.0) < 
0.05) AS q FROM aggregate_test_100
+----
+true
+
+# csv_query_approx_percentile_cont (c8)
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c8, 0.1) AS DOUBLE) / 2671.0) < 
0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c8, 0.5) AS DOUBLE) / 30634.0) < 
0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c8, 0.9) AS DOUBLE) / 57518.0) < 
0.05) AS q FROM aggregate_test_100
+----
+true
+
+# csv_query_approx_percentile_cont (c9)
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c9, 0.1) AS DOUBLE) / 472608672.0) 
< 0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c9, 0.5) AS DOUBLE) / 
2365817608.0) < 0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c9, 0.9) AS DOUBLE) / 
3776538487.0) < 0.05) AS q FROM aggregate_test_100
+----
+true
+
+# csv_query_approx_percentile_cont (c10)
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c10, 0.1) AS DOUBLE) / 
1830000000000000000) < 0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c10, 0.5) AS DOUBLE) / 
9300000000000000000) < 0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c10, 0.9) AS DOUBLE) / 
16100000000000000000) < 0.05) AS q FROM aggregate_test_100
+----
+true
+
+# csv_query_approx_percentile_cont (c11)
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c11, 0.1) AS DOUBLE) /  0.109) < 
0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c11, 0.5) AS DOUBLE) / 0.491) < 
0.05) AS q FROM aggregate_test_100
+----
+true
+
+query T
+SELECT (ABS(1 - CAST(approx_percentile_cont(c11, 0.9) AS DOUBLE) / 0.834) < 
0.05) AS q FROM aggregate_test_100
+----
+true
 
-# TODO: fix decimal places
 # csv_query_cube_avg
-# query TIR
-# SELECT c1, c2, AVG(c3) FROM aggregate_test_100_by_sql GROUP BY CUBE (c1, c2) 
ORDER BY c1, c2
-# ----
-# a 1 -17.6
-# a 2 -15.333333333333334
-# a 3 -4.5
-# a 4 -32
-# a 5 -32
-# a  -18.333333333333332
-# b 1 31.666666666666668
-# b 2 25.5
-# b 3 -42
-# b 4 -44.6
-# b 5 -0.2
-# b  -5.842105263157895
-# c 1 47.5
-# c 2 -55.57142857142857
-# c 3 47.5
-# c 4 -10.75
-# c 5 12
-# c  -1.3333333333333333
-# d 1 -8.142857142857142
-# d 2 109.33333333333333
-# d 3 41.333333333333336
-# d 4 54
-# d 5 -49.5
-# d  25.444444444444443
-# e 1 75.66666666666667
-# e 2 37.8
-# e 3 48
-# e 4 37.285714285714285
-# e 5 -11
-# e  40.333333333333336
-#  1 16.681818181818183
-#  2 8.363636363636363
-#  3 20.789473684210527
-#  4 1.2608695652173914
-#  5 -13.857142857142858
-#    7.81
-
-# TODO: fix decimal places
+query TIR
+SELECT c1, c2, AVG(c3) FROM aggregate_test_100_by_sql GROUP BY CUBE (c1, c2) 
ORDER BY c1, c2
+----

Review Comment:
   Simplifying during migration or in separate PR both ok to me. -- Seems 
during migration can reduce our workload



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow-datafusion] xudong963 commented on a diff in pull request #4505: Add some more aggregate sqllogictests and remove rust tests

Reply via email to