This is an automated email from the ASF dual-hosted git repository. alsay pushed a commit to branch theta_test in repository https://gitbox.apache.org/repos/asf/datasketches-bigquery.git
commit 5ac1822978a2230641c807020629777d703cbcd3 Author: AlexanderSaydakov <[email protected]> AuthorDate: Wed Nov 6 17:10:20 2024 -0800 theta sketch tests --- tests/theta_sketch_test.js | 250 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 250 insertions(+) diff --git a/tests/theta_sketch_test.js b/tests/theta_sketch_test.js new file mode 100644 index 0000000..fb8e87c --- /dev/null +++ b/tests/theta_sketch_test.js @@ -0,0 +1,250 @@ + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +const { generate_udf_test, generate_udaf_test } = require('./unit_test_utils'); + +// using defaults + +const theta_1 = `FROM_BASE64('AQQDPwEazJMDEIFfUcrcGW6ylF+DQ0nLOjDZ/9ze6gyQ')`; + +generate_udaf_test("theta_sketch_agg_string", { + input_columns: [`str`], + input_rows: `SELECT * FROM UNNEST(['a', 'b', 'c']) AS str`, + expected_output: theta_1 +}); + +const theta_2 = `FROM_BASE64('AQQDPwEazJMDEIFfUcrcGW6aAe192ejCXiH3k2yKaRSQ')`; + +generate_udaf_test("theta_sketch_agg_string", { + input_columns: [`str`], + input_rows: `SELECT * FROM UNNEST(['c', 'd', 'e']) AS str`, + expected_output: theta_2 +}); + +const theta_union_1 = `FROM_BASE64('AQQDPgEazJMFIQK+o5W4Mt5oB7X3Z6MJcYknIFaWEI3+GlXsNvTgWyADqD2ToYTc')`; + +generate_udf_test("theta_sketch_union", [{ + inputs: [ theta_1, theta_2 ], + expected_output: theta_union_1 +}]); + +generate_udf_test("theta_sketch_get_estimate", [{ + inputs: [ theta_union_1 ], + expected_output: 5 +}]); + +generate_udf_test("theta_sketch_to_string", [{ + inputs: [ theta_union_1 ], + expected_output: `'''### Theta sketch summary: + num retained entries : 5 + seed hash : 37836 + empty? : false + ordered? : true + estimation mode? : false + theta (fraction) : 1 + theta (raw 64-bit) : 9223372036854775807 + estimate : 5 + lower bound 95% conf : 5 + upper bound 95% conf : 5 +### End sketch summary +'''` +}]); + +const theta_intersection = `FROM_BASE64('AQMDAAAazJO3DG7lqK9ACA==')`; + +generate_udf_test("theta_sketch_intersection", [{ + inputs: [ theta_1, theta_2 ], + expected_output: theta_intersection +}]); + +generate_udf_test("theta_sketch_get_estimate", [{ + inputs: [ theta_intersection ], + expected_output: 1 +}]); + +const theta_a_not_b = `FROM_BASE64('AQQDPwEazJMCacuPE2yA/wsYbP/ub3UGSA==')`; + +generate_udf_test("theta_sketch_a_not_b", [{ + inputs: [ theta_1, theta_2 ], + expected_output: theta_a_not_b +}]); + +generate_udf_test("theta_sketch_get_estimate", [{ + inputs: [ theta_a_not_b ], + expected_output: 2 +}]); + +generate_udf_test("theta_sketch_jaccard_similarity", [{ + inputs: [ theta_1, theta_2 ], + expected_output: `STRUCT(0.2 AS lower_bound, 0.2 AS estimate, 0.2 AS upper_bound)` +}]); + +const theta_3 = `FROM_BASE64('AgQDNgIazJOI2wmVmcIORMwUHLq5RvX6vANDDcOYafHagm31FRxAyQgbfyZfHK6cXlu9nAyuSw3Y5yDG20eD0yTFxfYYqhtUAC6763N9mDFgY3kkzVBludYyfoSC8b+Wj3BFBSTittWUzAJlaZk1z9A7O+JY8s1Ct8UDqPUJAh6RWugNTAX1DMmVdSJ38xPtnltAG5MaqHqvBaI/40laLBeMENPyA/ClmV7GWILEhfPewrIrAAKvVcTZHAxuSIdkIwBJeACf0+gBqQ8kXsEJBJl95rkPPD0hGu9h5BK8Tq9KH7uBvpCjP+RaAq9wfeQhIEFGxK3+hsBkJB4VimzBG2THfCmRBV+wxMh8WBaKnif6K4BGEOTuXWjBjsRGcyBeAB7R4hPRWBsxbe1HWqCS1YCqqVcDHmwRyBQcDbXJW1Q6kEzf04zk/NALWss0RHxADMsBFg/MASepKd [...] + +generate_udaf_test("theta_sketch_agg_int64", { + input_columns: [`value`], + input_rows: `SELECT * FROM UNNEST(GENERATE_ARRAY(1, 10000, 1)) AS value`, + expected_output: theta_3 +}); + +const theta_4 = `FROM_BASE64('AgQDNQIazJOL1LJ4JHHtRLEUF6w8Sd7sAB+koAWVgcm3Dc77EqYT96KDTTWAGXDaQIKKGnnNdDwmvLyMx7DJQgUYo8Vx2K1ClT05PgNA/FaIDY0vgM+YqN8zwCooyUqPctI9VhpDSrOCHy0qUwEIAqLAECZIgjtaSULFIBoXzfv3/EipBG0e27TESWXLNiY8Ru9uAtnkRRLWxCRKNAROo+Aq+AiLRvwCgm0gqn8Tf83WBu6O9SSiyB/e0w0nDpXQv65d+FouschQALNC8rue0Wjrv/olc9WiZYk8VUuOIWMx/WJkZWUEAtNezCYhTGYW+2brCHtyJ1TzWBjRw/S8VzAyYp4e6A6Asl/I6+2EUHVyL8PExFS2vSDxtAl/qMS+hSA1PYt70jsDPnQCaKcqGVupNesCkxSgHx6/kgsEahua5FiYQURAhwmiaAyIEuI2AgSP2tdLqP82wk [...] + +generate_udaf_test("theta_sketch_agg_int64", { + input_columns: [`value`], + input_rows: `SELECT * FROM UNNEST(GENERATE_ARRAY(100000, 110000, 1)) AS value`, + expected_output: theta_4 +}); + +const theta_union_2 = `FROM_BASE64('AgQDNAIazJNqLrF78HeQGgAQL1h4k73YAH6SgBZWBzupRIa0soA0MNw5hp8OzBwti3jBP3ooNNNYAy4bSBBRRQrsGEXEeQyQgbfyZfDKfylKHtdeRmPYZKEAfMGwxoaADK5LDdjnIxttHg9MkwtZBFvncoUQZdLrpCwAuu+tzfZjFgY3kkzVAgjKHM8AEXZatv0KESev9pOSqtBnzFRvmeAA7z2OaruBSTittWUzAmVpmTXP0OzviWPLNQNxnrFb4QKAqxiTNvkIekVroDUwX1DMmVdSETC/igA8kIfLSpTAQgFRYAgTJEI7WklCxSA0L5v37/iSpBG0e27TANeU7DT6IBuTGqh6rxaI/40laLCTFoNc8kIOWqieIt/SlmV7GWILDy5tsyDjw5MNA5Cj7wAKvVcTZHDG5Ih2QjASXgAn9PoBqQ8kXsEJEmX3muQ88dXBhkQj [...] + +generate_udaf_test("theta_sketch_agg_union", { + input_columns: [`sketch`], + input_rows: `SELECT * FROM UNNEST([${theta_3}, ${theta_4}]) AS sketch`, + expected_output: theta_union_2 +}); + +generate_udf_test("theta_sketch_get_estimate_and_bounds", [{ + inputs: [ theta_union_2, 3 ], + expected_output: `STRUCT(19736.541348415347 AS estimate, 18927.112205958525 AS lower_bound, 20580.437426810073 AS upper_bound)` +}]); + +generate_udf_test("theta_sketch_get_theta", [{ + inputs: [ theta_union_2 ], + expected_output: 0.20753382913916013 +}]); + +generate_udf_test("theta_sketch_get_num_retained", [{ + inputs: [ theta_union_2 ], + expected_output: 4096 +}]); + + +// using full signatures + +const theta_8_111_09_1 = `FROM_BASE64('AgQDPgEajNMAAAAAADMzcwNbX0hyljVFUBHLpzFb/p08wnWFIBcXdIA=')`; + +generate_udaf_test("theta_sketch_agg_string_lgk_seed_p", { + input_columns: [`str`, 'STRUCT(8 AS lgk, 111 AS seed, 0.9 AS p) NOT AGGREGATE'], + input_rows: `SELECT * FROM UNNEST(['a', 'b', 'c']) AS str`, + expected_output: theta_8_111_09_1 +}); + +const theta_8_111_09_2 = `FROM_BASE64('AgQDPwEajNMAAAAAADMzcwMtr6Q5SxqiqB2ct1zLfRY+Ix9R7oe55fg=')`; + +generate_udaf_test("theta_sketch_agg_string_lgk_seed_p", { + input_columns: [`str`, 'STRUCT(8 AS lgk, 111 AS seed, 0.9 AS p) NOT AGGREGATE'], + input_rows: `SELECT * FROM UNNEST(['c', 'd', 'e']) AS str`, + expected_output: theta_8_111_09_2 +}); + +const theta_union_8_111_1 = `FROM_BASE64('AgQDPgEajNMAAAAAADMzcwVbX0hyljVFUBHLpzFb/p0xkpzZB0fW7yy/YrBjPQIWXpIeNbbo6qQ=')`; + +generate_udf_test("theta_sketch_union_lgk_seed", [{ + inputs: [ theta_8_111_09_1, theta_8_111_09_2, 8, 111 ], + expected_output: theta_union_8_111_1 +}]); + +generate_udf_test("theta_sketch_get_estimate_seed", [{ + inputs: [ theta_union_8_111_1, 111 ], + expected_output: 5.5555557027275215 +}]); + +generate_udf_test("theta_sketch_to_string_seed", [{ + inputs: [ theta_union_8_111_1, 111 ], + expected_output: `'''### Theta sketch summary: + num retained entries : 5 + seed hash : 54156 + empty? : false + ordered? : true + estimation mode? : true + theta (fraction) : 0.9 + theta (raw 64-bit) : 8301034613266972672 + estimate : 5.55556 + lower bound 95% conf : 5 + upper bound 95% conf : 9 +### End sketch summary +'''` +}]); + +const theta_intersection_111 = `FROM_BASE64('AgQDPQEajNMAAAAAADMzcwG2vpDlLGqKoA==')`; + +generate_udf_test("theta_sketch_intersection_seed", [{ + inputs: [ theta_8_111_09_1, theta_8_111_09_2, 111 ], + expected_output: theta_intersection_111 +}]); + +generate_udf_test("theta_sketch_get_estimate_seed", [{ + inputs: [ theta_intersection_111, 111 ], + expected_output: 1.1111111405455043 +}]); + +const theta_a_not_b_111 = `FROM_BASE64('AgQDPgEajNMAAAAAADMzcwJf0jI+7TTsnzCdYUgFxd0g')`; + +generate_udf_test("theta_sketch_a_not_b_seed", [{ + inputs: [ theta_8_111_09_1, theta_8_111_09_2, 111 ], + expected_output: theta_a_not_b_111 +}]); + +generate_udf_test("theta_sketch_get_estimate_seed", [{ + inputs: [ theta_a_not_b_111, 111 ], + expected_output: 2.2222222810910086 +}]); + +generate_udf_test("theta_sketch_jaccard_similarity_seed", [{ + inputs: [ theta_8_111_09_1, theta_8_111_09_2, 111 ], + expected_output: `STRUCT(0.05868247546115801 AS lower_bound, 0.2 AS estimate, 0.4517325934817119 AS upper_bound)` +}]); + +const theta_8_111_09_3 = `FROM_BASE64('AgQDNQIajNMldPmtaP2wBJcBGnTc0DnwUOjIxHsAD0IJ/loCTuIUlq9nGldDVYICpKqDD/sFVaQRDBNy1k9BHWBSsFMtZ18AyFbv794xws7sRZdFgj8n/PCK5go1FhPWdtJKjcPmrjQK59RL+x9IBE0rAx2RJLxG4t2YrxYv1ykEwNBw7dl8LODBumZKJt7opXXGEUpXoK4Iatkn8iSEEsUTAQwME52vZvljn6bBgvOoIjafqY0wSe38trB+lIO5LAttJaQFwSMrrMugsc27xPoNAt/Fe5WydFwzH0i9pWNo2ZA368EBLrVBtu75T2/WaVqMQC1Ehto4YEPlsZoopZA7rPPfOKqOjrT6Y4ikgJTP6+fsAd4kh0RBWQd5pSCZbgCFRz+dexPA7Y/0pcioD1YW/ocJYG4By3WtnosrOgpttEAAMQNH1xxiHmwK40eBLwAcZ [...] + +generate_udaf_test("theta_sketch_agg_int64_lgk_seed_p", { + input_columns: [`value`, 'STRUCT(8 AS lgk, 111 AS seed, 0.9 AS p) NOT AGGREGATE'], + input_rows: `SELECT * FROM UNNEST(GENERATE_ARRAY(1, 10000, 1)) AS value`, + expected_output: theta_8_111_09_3 +}); + +const theta_8_111_09_4 = `FROM_BASE64('AgQDNQIajNMPlXAiArkLBYQBAOEEcZzveKZ4EjkeRoIpnfihDBodnISNDTZgfzMr9ebvCSMdBmkWEAWqgC4mPiEin7XhELsPzRkwazooGRnUqwC/D6KrZe6oaAFidQgoqYG/J9Zar0eosT8tze8EaXiNE3odgYKlAqKYiyHLI3rOMWDh/GWCGIDHtyqRW4MeNJSsBd7GUbepSN2D65BOhKCCV/ggwafqk3kA1K/8iq0BDPdSVhGmeBX8T5h23M+yANk25VRcXusaZI6AjHnM5UyHg38TXkk0SEJuJmq8yoCoS5x5Rb8HQdTqg+XIii3GppP7xMXBeevUxkC0CWLgs0JvBB6N/WEN+0WiaG8kJkouFCLp4r+L2JjF2hAj8ORL+8snWwgT7HwADyHX60qqGiulvstl8IdFRg7Ku4BdgoOJYkQMCKPF1F6mTs9cBZ4wB/20F [...] + +generate_udaf_test("theta_sketch_agg_int64_lgk_seed_p", { + input_columns: [`value`, 'STRUCT(8 AS lgk, 111 AS seed, 0.9 AS p) NOT AGGREGATE'], + input_rows: `SELECT * FROM UNNEST(GENERATE_ARRAY(100000, 110000, 1)) AS value`, + expected_output: theta_8_111_09_4 +}); + +const theta_union_8_111_09_2 = `FROM_BASE64('AgQDNAIajNN7wYMWj7V8AQABAcII4zne8pngSOR5GgmJrC7ycBB8NDlhXwwdnISNDTZg/mZX683eBOwDfN/+0QT/LQEncQ9Qfcyx4mAtVAFxMfECcPGDVVXQ+5Cdq7teH5oyYNZ0UGRnUqwC/DUWju5C9nP+wVVpBEMIErbqogigFidQgoqYAEQ/NNXckFKwUy1nXwGQrd/fvGMOUIg8naE/zrKNm/dRH5P+eEVzCjUWE9Z21Gx/QuQPbAKJxE6Uz8K59RL+x9ICJpWBjsiQRsm4ms+VGCpQKimIsvLYJ6AwjxRoxHuZWdF/bp1nAn0cO3ZfCzgwRMuSsKwDCYZ5Jiw3E0Mttkr+HjSUrAXexlNvUpG7B9cFuRWLCzIBXBDVsk/kJb/vm9kvoQYNP1SbyA1K/8iq0BGe6krCNM8FfxPmHbcy82zyg9Q+BgnO17N [...] + +generate_udaf_test("theta_sketch_agg_union_lgk_seed", { + input_columns: [`sketch`, 'STRUCT(8 AS lgk, 111 AS seed) NOT AGGREGATE'], + input_rows: `SELECT * FROM UNNEST([${theta_8_111_09_3}, ${theta_8_111_09_4}]) AS sketch`, + expected_output: theta_union_8_111_09_2 +}); + +generate_udf_test("theta_sketch_get_estimate_and_bounds_seed", [{ + inputs: [ theta_union_8_111_09_2, 3, 111 ], + expected_output: `STRUCT(22034.160662067967 AS estimate, 18252.303584500878 AS lower_bound, 26589.643724271038 AS upper_bound)` +}]); + +generate_udf_test("theta_sketch_get_theta_seed", [{ + inputs: [ theta_union_8_111_09_2, 111 ], + expected_output: 0.011618323199426725 +}]); + +generate_udf_test("theta_sketch_get_num_retained_seed", [{ + inputs: [ theta_union_8_111_09_2, 111 ], + expected_output: 256 +}]); --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
