This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch theta_test
in repository https://gitbox.apache.org/repos/asf/datasketches-bigquery.git

commit 5ac1822978a2230641c807020629777d703cbcd3
Author: AlexanderSaydakov <[email protected]>
AuthorDate: Wed Nov 6 17:10:20 2024 -0800

    theta sketch tests
---
 tests/theta_sketch_test.js | 250 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 250 insertions(+)

diff --git a/tests/theta_sketch_test.js b/tests/theta_sketch_test.js
new file mode 100644
index 0000000..fb8e87c
--- /dev/null
+++ b/tests/theta_sketch_test.js
@@ -0,0 +1,250 @@
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+const { generate_udf_test, generate_udaf_test } = require('./unit_test_utils');
+
+// using defaults
+
+const theta_1 = `FROM_BASE64('AQQDPwEazJMDEIFfUcrcGW6ylF+DQ0nLOjDZ/9ze6gyQ')`;
+
+generate_udaf_test("theta_sketch_agg_string", {
+  input_columns: [`str`],
+  input_rows: `SELECT * FROM UNNEST(['a', 'b', 'c']) AS str`,
+  expected_output: theta_1
+});
+
+const theta_2 = `FROM_BASE64('AQQDPwEazJMDEIFfUcrcGW6aAe192ejCXiH3k2yKaRSQ')`;
+
+generate_udaf_test("theta_sketch_agg_string", {
+  input_columns: [`str`],
+  input_rows: `SELECT * FROM UNNEST(['c', 'd', 'e']) AS str`,
+  expected_output: theta_2
+});
+
+const theta_union_1 = `FROM_BASE64('AQQDPgEazJMFIQK+o5W4Mt5oB7X3Z6MJcYknIFaWEI3+GlXsNvTgWyADqD2ToYTc')`;
+
+generate_udf_test("theta_sketch_union", [{
+  inputs: [ theta_1, theta_2 ],
+  expected_output: theta_union_1
+}]);
+
+generate_udf_test("theta_sketch_get_estimate", [{
+  inputs: [ theta_union_1 ],
+  expected_output: 5
+}]);
+
+generate_udf_test("theta_sketch_to_string", [{
+  inputs: [ theta_union_1 ],
+  expected_output: `'''### Theta sketch summary:
+   num retained entries : 5
+   seed hash            : 37836
+   empty?               : false
+   ordered?             : true
+   estimation mode?     : false
+   theta (fraction)     : 1
+   theta (raw 64-bit)   : 9223372036854775807
+   estimate             : 5
+   lower bound 95% conf : 5
+   upper bound 95% conf : 5
+### End sketch summary
+'''`
+}]);
+
+const theta_intersection = `FROM_BASE64('AQMDAAAazJO3DG7lqK9ACA==')`;
+
+generate_udf_test("theta_sketch_intersection", [{
+  inputs: [ theta_1, theta_2 ],
+  expected_output: theta_intersection
+}]);
+
+generate_udf_test("theta_sketch_get_estimate", [{
+  inputs: [ theta_intersection ],
+  expected_output: 1
+}]);
+
+const theta_a_not_b = `FROM_BASE64('AQQDPwEazJMCacuPE2yA/wsYbP/ub3UGSA==')`;
+
+generate_udf_test("theta_sketch_a_not_b", [{
+  inputs: [ theta_1, theta_2 ],
+  expected_output: theta_a_not_b
+}]);
+
+generate_udf_test("theta_sketch_get_estimate", [{
+  inputs: [ theta_a_not_b ],
+  expected_output: 2
+}]);
+
+generate_udf_test("theta_sketch_jaccard_similarity", [{
+  inputs: [ theta_1, theta_2 ],
+  expected_output: `STRUCT(0.2 AS lower_bound, 0.2 AS estimate, 0.2 AS upper_bound)`
+}]);
+
+const theta_3 = 
`FROM_BASE64('AgQDNgIazJOI2wmVmcIORMwUHLq5RvX6vANDDcOYafHagm31FRxAyQgbfyZfHK6cXlu9nAyuSw3Y5yDG20eD0yTFxfYYqhtUAC6763N9mDFgY3kkzVBludYyfoSC8b+Wj3BFBSTittWUzAJlaZk1z9A7O+JY8s1Ct8UDqPUJAh6RWugNTAX1DMmVdSJ38xPtnltAG5MaqHqvBaI/40laLBeMENPyA/ClmV7GWILEhfPewrIrAAKvVcTZHAxuSIdkIwBJeACf0+gBqQ8kXsEJBJl95rkPPD0hGu9h5BK8Tq9KH7uBvpCjP+RaAq9wfeQhIEFGxK3+hsBkJB4VimzBG2THfCmRBV+wxMh8WBaKnif6K4BGEOTuXWjBjsRGcyBeAB7R4hPRWBsxbe1HWqCS1YCqqVcDHmwRyBQcDbXJW1Q6kEzf04zk/NALWss0RHxADMsBFg/MASepKd
 [...]
+
+generate_udaf_test("theta_sketch_agg_int64", {
+  input_columns: [`value`],
+  input_rows: `SELECT * FROM UNNEST(GENERATE_ARRAY(1, 10000, 1)) AS value`,
+  expected_output: theta_3
+});
+
+const theta_4 = 
`FROM_BASE64('AgQDNQIazJOL1LJ4JHHtRLEUF6w8Sd7sAB+koAWVgcm3Dc77EqYT96KDTTWAGXDaQIKKGnnNdDwmvLyMx7DJQgUYo8Vx2K1ClT05PgNA/FaIDY0vgM+YqN8zwCooyUqPctI9VhpDSrOCHy0qUwEIAqLAECZIgjtaSULFIBoXzfv3/EipBG0e27TESWXLNiY8Ru9uAtnkRRLWxCRKNAROo+Aq+AiLRvwCgm0gqn8Tf83WBu6O9SSiyB/e0w0nDpXQv65d+FouschQALNC8rue0Wjrv/olc9WiZYk8VUuOIWMx/WJkZWUEAtNezCYhTGYW+2brCHtyJ1TzWBjRw/S8VzAyYp4e6A6Asl/I6+2EUHVyL8PExFS2vSDxtAl/qMS+hSA1PYt70jsDPnQCaKcqGVupNesCkxSgHx6/kgsEahua5FiYQURAhwmiaAyIEuI2AgSP2tdLqP82wk
 [...]
+
+generate_udaf_test("theta_sketch_agg_int64", {
+  input_columns: [`value`],
+  input_rows: `SELECT * FROM UNNEST(GENERATE_ARRAY(100000, 110000, 1)) AS value`,
+  expected_output: theta_4
+});
+
+const theta_union_2 = 
`FROM_BASE64('AgQDNAIazJNqLrF78HeQGgAQL1h4k73YAH6SgBZWBzupRIa0soA0MNw5hp8OzBwti3jBP3ooNNNYAy4bSBBRRQrsGEXEeQyQgbfyZfDKfylKHtdeRmPYZKEAfMGwxoaADK5LDdjnIxttHg9MkwtZBFvncoUQZdLrpCwAuu+tzfZjFgY3kkzVAgjKHM8AEXZatv0KESev9pOSqtBnzFRvmeAA7z2OaruBSTittWUzAmVpmTXP0OzviWPLNQNxnrFb4QKAqxiTNvkIekVroDUwX1DMmVdSETC/igA8kIfLSpTAQgFRYAgTJEI7WklCxSA0L5v37/iSpBG0e27TANeU7DT6IBuTGqh6rxaI/40laLCTFoNc8kIOWqieIt/SlmV7GWILDy5tsyDjw5MNA5Cj7wAKvVcTZHDG5Ih2QjASXgAn9PoBqQ8kXsEJEmX3muQ88dXBhkQj
 [...]
+
+generate_udaf_test("theta_sketch_agg_union", {
+  input_columns: [`sketch`],
+  input_rows: `SELECT * FROM UNNEST([${theta_3}, ${theta_4}]) AS sketch`,
+  expected_output: theta_union_2
+});
+
+generate_udf_test("theta_sketch_get_estimate_and_bounds", [{
+  inputs: [ theta_union_2, 3 ],
+  expected_output: `STRUCT(19736.541348415347 AS estimate, 18927.112205958525 AS lower_bound, 20580.437426810073 AS upper_bound)`
+}]);
+
+generate_udf_test("theta_sketch_get_theta", [{
+  inputs: [ theta_union_2 ],
+  expected_output: 0.20753382913916013
+}]);
+
+generate_udf_test("theta_sketch_get_num_retained", [{
+  inputs: [ theta_union_2 ],
+  expected_output: 4096
+}]);
+
+
+// using full signatures
+
+const theta_8_111_09_1 = `FROM_BASE64('AgQDPgEajNMAAAAAADMzcwNbX0hyljVFUBHLpzFb/p08wnWFIBcXdIA=')`;
+
+generate_udaf_test("theta_sketch_agg_string_lgk_seed_p", {
+  input_columns: [`str`, 'STRUCT(8 AS lgk, 111 AS seed, 0.9 AS p) NOT AGGREGATE'],
+  input_rows: `SELECT * FROM UNNEST(['a', 'b', 'c']) AS str`,
+  expected_output: theta_8_111_09_1
+});
+
+const theta_8_111_09_2 = `FROM_BASE64('AgQDPwEajNMAAAAAADMzcwMtr6Q5SxqiqB2ct1zLfRY+Ix9R7oe55fg=')`;
+
+generate_udaf_test("theta_sketch_agg_string_lgk_seed_p", {
+  input_columns: [`str`, 'STRUCT(8 AS lgk, 111 AS seed, 0.9 AS p) NOT AGGREGATE'],
+  input_rows: `SELECT * FROM UNNEST(['c', 'd', 'e']) AS str`,
+  expected_output: theta_8_111_09_2
+});
+
+const theta_union_8_111_1 = `FROM_BASE64('AgQDPgEajNMAAAAAADMzcwVbX0hyljVFUBHLpzFb/p0xkpzZB0fW7yy/YrBjPQIWXpIeNbbo6qQ=')`;
+
+generate_udf_test("theta_sketch_union_lgk_seed", [{
+  inputs: [ theta_8_111_09_1, theta_8_111_09_2, 8, 111 ],
+  expected_output: theta_union_8_111_1
+}]);
+
+generate_udf_test("theta_sketch_get_estimate_seed", [{
+  inputs: [ theta_union_8_111_1, 111 ],
+  expected_output: 5.5555557027275215
+}]);
+
+generate_udf_test("theta_sketch_to_string_seed", [{
+  inputs: [ theta_union_8_111_1, 111 ],
+  expected_output: `'''### Theta sketch summary:
+   num retained entries : 5
+   seed hash            : 54156
+   empty?               : false
+   ordered?             : true
+   estimation mode?     : true
+   theta (fraction)     : 0.9
+   theta (raw 64-bit)   : 8301034613266972672
+   estimate             : 5.55556
+   lower bound 95% conf : 5
+   upper bound 95% conf : 9
+### End sketch summary
+'''`
+}]);
+
+const theta_intersection_111 = `FROM_BASE64('AgQDPQEajNMAAAAAADMzcwG2vpDlLGqKoA==')`;
+
+generate_udf_test("theta_sketch_intersection_seed", [{
+  inputs: [ theta_8_111_09_1, theta_8_111_09_2, 111 ],
+  expected_output: theta_intersection_111
+}]);
+
+generate_udf_test("theta_sketch_get_estimate_seed", [{
+  inputs: [ theta_intersection_111, 111 ],
+  expected_output: 1.1111111405455043
+}]);
+
+const theta_a_not_b_111 = `FROM_BASE64('AgQDPgEajNMAAAAAADMzcwJf0jI+7TTsnzCdYUgFxd0g')`;
+
+generate_udf_test("theta_sketch_a_not_b_seed", [{
+  inputs: [ theta_8_111_09_1, theta_8_111_09_2, 111 ],
+  expected_output: theta_a_not_b_111
+}]);
+
+generate_udf_test("theta_sketch_get_estimate_seed", [{
+  inputs: [ theta_a_not_b_111, 111 ],
+  expected_output: 2.2222222810910086
+}]);
+
+generate_udf_test("theta_sketch_jaccard_similarity_seed", [{
+  inputs: [ theta_8_111_09_1, theta_8_111_09_2, 111 ],
+  expected_output: `STRUCT(0.05868247546115801 AS lower_bound, 0.2 AS estimate, 0.4517325934817119 AS upper_bound)`
+}]);
+
+const theta_8_111_09_3 = 
`FROM_BASE64('AgQDNQIajNMldPmtaP2wBJcBGnTc0DnwUOjIxHsAD0IJ/loCTuIUlq9nGldDVYICpKqDD/sFVaQRDBNy1k9BHWBSsFMtZ18AyFbv794xws7sRZdFgj8n/PCK5go1FhPWdtJKjcPmrjQK59RL+x9IBE0rAx2RJLxG4t2YrxYv1ykEwNBw7dl8LODBumZKJt7opXXGEUpXoK4Iatkn8iSEEsUTAQwME52vZvljn6bBgvOoIjafqY0wSe38trB+lIO5LAttJaQFwSMrrMugsc27xPoNAt/Fe5WydFwzH0i9pWNo2ZA368EBLrVBtu75T2/WaVqMQC1Ehto4YEPlsZoopZA7rPPfOKqOjrT6Y4ikgJTP6+fsAd4kh0RBWQd5pSCZbgCFRz+dexPA7Y/0pcioD1YW/ocJYG4By3WtnosrOgpttEAAMQNH1xxiHmwK40eBLwAcZ
 [...]
+
+generate_udaf_test("theta_sketch_agg_int64_lgk_seed_p", {
+  input_columns: [`value`, 'STRUCT(8 AS lgk, 111 AS seed, 0.9 AS p) NOT AGGREGATE'],
+  input_rows: `SELECT * FROM UNNEST(GENERATE_ARRAY(1, 10000, 1)) AS value`,
+  expected_output: theta_8_111_09_3
+});
+
+const theta_8_111_09_4 = 
`FROM_BASE64('AgQDNQIajNMPlXAiArkLBYQBAOEEcZzveKZ4EjkeRoIpnfihDBodnISNDTZgfzMr9ebvCSMdBmkWEAWqgC4mPiEin7XhELsPzRkwazooGRnUqwC/D6KrZe6oaAFidQgoqYG/J9Zar0eosT8tze8EaXiNE3odgYKlAqKYiyHLI3rOMWDh/GWCGIDHtyqRW4MeNJSsBd7GUbepSN2D65BOhKCCV/ggwafqk3kA1K/8iq0BDPdSVhGmeBX8T5h23M+yANk25VRcXusaZI6AjHnM5UyHg38TXkk0SEJuJmq8yoCoS5x5Rb8HQdTqg+XIii3GppP7xMXBeevUxkC0CWLgs0JvBB6N/WEN+0WiaG8kJkouFCLp4r+L2JjF2hAj8ORL+8snWwgT7HwADyHX60qqGiulvstl8IdFRg7Ku4BdgoOJYkQMCKPF1F6mTs9cBZ4wB/20F
 [...]
+
+generate_udaf_test("theta_sketch_agg_int64_lgk_seed_p", {
+  input_columns: [`value`, 'STRUCT(8 AS lgk, 111 AS seed, 0.9 AS p) NOT AGGREGATE'],
+  input_rows: `SELECT * FROM UNNEST(GENERATE_ARRAY(100000, 110000, 1)) AS value`,
+  expected_output: theta_8_111_09_4
+});
+
+const theta_union_8_111_09_2 = 
`FROM_BASE64('AgQDNAIajNN7wYMWj7V8AQABAcII4zne8pngSOR5GgmJrC7ycBB8NDlhXwwdnISNDTZg/mZX683eBOwDfN/+0QT/LQEncQ9Qfcyx4mAtVAFxMfECcPGDVVXQ+5Cdq7teH5oyYNZ0UGRnUqwC/DUWju5C9nP+wVVpBEMIErbqogigFidQgoqYAEQ/NNXckFKwUy1nXwGQrd/fvGMOUIg8naE/zrKNm/dRH5P+eEVzCjUWE9Z21Gx/QuQPbAKJxE6Uz8K59RL+x9ICJpWBjsiQRsm4ms+VGCpQKimIsvLYJ6AwjxRoxHuZWdF/bp1nAn0cO3ZfCzgwRMuSsKwDCYZ5Jiw3E0Mttkr+HjSUrAXexlNvUpG7B9cFuRWLCzIBXBDVsk/kJb/vm9kvoQYNP1SbyA1K/8iq0BGe6krCNM8FfxPmHbcy82zyg9Q+BgnO17N
 [...]
+
+generate_udaf_test("theta_sketch_agg_union_lgk_seed", {
+  input_columns: [`sketch`, 'STRUCT(8 AS lgk, 111 AS seed) NOT AGGREGATE'],
+  input_rows: `SELECT * FROM UNNEST([${theta_8_111_09_3}, ${theta_8_111_09_4}]) AS sketch`,
+  expected_output: theta_union_8_111_09_2
+});
+
+generate_udf_test("theta_sketch_get_estimate_and_bounds_seed", [{
+  inputs: [ theta_union_8_111_09_2, 3, 111 ],
+  expected_output: `STRUCT(22034.160662067967 AS estimate, 18252.303584500878 AS lower_bound, 26589.643724271038 AS upper_bound)`
+}]);
+
+generate_udf_test("theta_sketch_get_theta_seed", [{
+  inputs: [ theta_union_8_111_09_2, 111 ],
+  expected_output: 0.011618323199426725
+}]);
+
+generate_udf_test("theta_sketch_get_num_retained_seed", [{
+  inputs: [ theta_union_8_111_09_2, 111 ],
+  expected_output: 256
+}]);


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to