This is an automated email from the ASF dual-hosted git repository. leerho pushed a commit to branch improve_theta_code_docs in repository https://gitbox.apache.org/repos/asf/datasketches-bigquery.git
commit f644cfadec9798e6853f924cfd557b4b2ea9b5db Author: Lee Rhodes <[email protected]> AuthorDate: Tue Sep 10 11:48:13 2024 -0700 Improve code docs for Theta. I want to see how this looks before proceeding to the other sketches. --- theta/sqlx/theta_sketch_a_not_b.sqlx | 14 ++++++++++++-- theta/sqlx/theta_sketch_agg_string.sqlx | 21 +++++++++++++++------ theta/sqlx/theta_sketch_agg_string_lgk_seed_p.sqlx | 21 ++++++++++++++------- theta/sqlx/theta_sketch_agg_union.sqlx | 16 ++++++++++++---- theta/sqlx/theta_sketch_get_estimate.sqlx | 11 +++++++++-- .../sqlx/theta_sketch_get_estimate_and_bounds.sqlx | 13 ++++++++++--- theta/sqlx/theta_sketch_jaccard_similarity.sqlx | 8 ++++++-- theta/sqlx/theta_sketch_scalar_intersection.sqlx | 18 ++++++++++++++---- theta/sqlx/theta_sketch_scalar_union.sqlx | 16 ++++++++++++---- theta/sqlx/theta_sketch_to_string.sqlx | 10 +++++++--- 10 files changed, 111 insertions(+), 37 deletions(-) diff --git a/theta/sqlx/theta_sketch_a_not_b.sqlx b/theta/sqlx/theta_sketch_a_not_b.sqlx index d16b91a..9b36bcf 100644 --- a/theta/sqlx/theta_sketch_a_not_b.sqlx +++ b/theta/sqlx/theta_sketch_a_not_b.sqlx @@ -25,11 +25,21 @@ LANGUAGE js OPTIONS ( library=["gs://$GCS_BUCKET/theta_sketch.js"], description = '''Computes a sketch that represents the scalar set difference: sketchA and not sketchB. + Param sketchA: the first sketch "A" as bytes. Param sketchB: the second sketch "B" as bytes. Param seed: This is used to confirm that the given sketches were configured with the correct seed. -Returns: a Compact, Compressed Theta Sketch, as bytes, from which the set difference cardinality can be obtained. -For more details: https://datasketches.apache.org/docs/Theta/ThetaSketchFramework.html''' +Returns: a Compact, Compressed Theta Sketch, as BYTES. + +For more information: + - https://datasketches.apache.org/docs/Theta/ThetaSketchSetOps.html + - https://datasketches.apache.org/docs/Theta/ThetaAccuracy.html + - https://datasketches.apache.org/docs/Theta/ThetaSketchSetOpsAccuracy.html + - https://datasketches.apache.org/docs/Theta/ThetaAccuracyPlots.html + - https://datasketches.apache.org/docs/Theta/ThetaErrorTable.html + - https://datasketches.apache.org/docs/Theta/ThetaSize.html + - https://datasketches.apache.org/docs/Theta/ThetaSketchFramework.html + ''' ) AS R""" const default_seed = BigInt(Module.DEFAULT_SEED); try { diff --git a/theta/sqlx/theta_sketch_agg_string.sqlx b/theta/sqlx/theta_sketch_agg_string.sqlx index 583a8a4..73590f6 100644 --- a/theta/sqlx/theta_sketch_agg_string.sqlx +++ b/theta/sqlx/theta_sketch_agg_string.sqlx @@ -23,12 +23,21 @@ CREATE OR REPLACE AGGREGATE FUNCTION ${self()}(str STRING) RETURNS BYTES OPTIONS ( description = '''Creates a sketch that represents the cardinality of the given STRING column. + Param str: the STRING column of identifiers. -Param lg_k: 12 (assume default) -Param seed: 9001 (assume default) -Param p: 1.0 (assume default) -Returns: a Compact, Compressed Theta Sketch, as bytes, from which the cardinality can be obtained. -For more details: https://datasketches.apache.org/docs/Theta/ThetaSketchFramework.html''' +Param lg_k: assume default = 12 +Param seed: assume default = 9001 +Param p: assume default = 1.0 +Returns: a Compact, Compressed Theta Sketch, as BYTES. + +For more information: + - https://datasketches.apache.org/docs/Theta/ThetaAccuracy.html + - https://datasketches.apache.org/docs/Theta/ThetaAccuracyPlots.html + - https://datasketches.apache.org/docs/Theta/ThetaErrorTable.html + - https://datasketches.apache.org/docs/Theta/ThetaSize.html + - https://datasketches.apache.org/docs/Theta/ThetaPSampling.html + - https://datasketches.apache.org/docs/Theta/ThetaSketchFramework.html + ''' ) AS ( - $BQ_DATASET.theta_sketch_agg_string_lgk_seed_p(str, STRUCT<BYTEINT, INT64, FLOAT64>(null, null, null)) + $BQ_DATASET.theta_sketch_agg_string_lgk_seed_p(str, STRUCT<BYTEINT, INT64, FLOAT64>(NULL, NULL, NULL)) ); diff --git a/theta/sqlx/theta_sketch_agg_string_lgk_seed_p.sqlx b/theta/sqlx/theta_sketch_agg_string_lgk_seed_p.sqlx index c676f0c..33ca2c2 100644 --- a/theta/sqlx/theta_sketch_agg_string_lgk_seed_p.sqlx +++ b/theta/sqlx/theta_sketch_agg_string_lgk_seed_p.sqlx @@ -25,14 +25,21 @@ LANGUAGE js OPTIONS ( library=["gs://$GCS_BUCKET/theta_sketch.mjs"], description = '''Creates a sketch that represents the cardinality of the given STRING column. + Param str: the STRING column of identifiers. -Param lg_k: the sketch accuracy/size parameter as an integer in the range [4, 26]. -Param seed: the seed to be used by the underlying hash function. -Param p: sampling probability (initial theta). The default is 1, so the sketch retains -all entries until it reaches the limit, at which point it goes into the estimation mode -and reduces the effective sampling probability (theta) as necessary. -Returns: a Compact, Compressed Theta Sketch, as bytes, from which the cardinality can be obtained. -For more details: https://datasketches.apache.org/docs/Theta/ThetaSketchFramework.html''' +Param lg_k: the sketch accuracy/size parameter as a BYTEINT in the range [4, 26]. A NULL specifies the default of 12. +Param seed: the seed to be used by the underlying hash function. A NULL specifies the default of 9001. +Param p: up-front sampling probability. A NULL specifies the default of 1.0. +Returns: a Compact, Compressed Theta Sketch, as BYTES. + +For more information: + - https://datasketches.apache.org/docs/Theta/ThetaAccuracy.html + - https://datasketches.apache.org/docs/Theta/ThetaAccuracyPlots.html + - https://datasketches.apache.org/docs/Theta/ThetaErrorTable.html + - https://datasketches.apache.org/docs/Theta/ThetaSize.html + - https://datasketches.apache.org/docs/Theta/ThetaPSampling.html + - https://datasketches.apache.org/docs/Theta/ThetaSketchFramework.html + ''' ) AS R""" import ModuleFactory from "gs://$GCS_BUCKET/theta_sketch.mjs"; var Module = await ModuleFactory(); diff --git a/theta/sqlx/theta_sketch_agg_union.sqlx b/theta/sqlx/theta_sketch_agg_union.sqlx index 023c1a8..49c3633 100644 --- a/theta/sqlx/theta_sketch_agg_union.sqlx +++ b/theta/sqlx/theta_sketch_agg_union.sqlx @@ -25,11 +25,19 @@ LANGUAGE js OPTIONS ( library=["gs://$GCS_BUCKET/theta_sketch.mjs"], description = '''Creates a sketch that represents the union of the given column of sketches. -Param sketch: the column of sketches. Each as bytes. -Param lg_k: the sketch accuracy/size parameter as an integer in the range [4, 26]. + +Param sketch: the column of sketches. Each as BYTES. +Param lg_k: the sketch accuracy/size parameter as a BYTEINT in the range [4, 26]. Param seed: This is used to confirm that the given sketches were configured with the correct seed. -Returns: a Compact, Compressed Theta Sketch, as bytes, from which the union cardinality can be obtained. -For more details: https://datasketches.apache.org/docs/Theta/ThetaSketchFramework.html''' +Returns: a Compact, Compressed Theta Sketch, as BYTES. + +For more information: + - https://datasketches.apache.org/docs/Theta/ThetaAccuracy.html + - https://datasketches.apache.org/docs/Theta/ThetaAccuracyPlots.html + - https://datasketches.apache.org/docs/Theta/ThetaErrorTable.html + - https://datasketches.apache.org/docs/Theta/ThetaSize.html + - https://datasketches.apache.org/docs/Theta/ThetaSketchFramework.html +''' ) AS R""" import ModuleFactory from "gs://$GCS_BUCKET/theta_sketch.mjs"; var Module = await ModuleFactory(); diff --git a/theta/sqlx/theta_sketch_get_estimate.sqlx b/theta/sqlx/theta_sketch_get_estimate.sqlx index d37e6e7..f51c50f 100644 --- a/theta/sqlx/theta_sketch_get_estimate.sqlx +++ b/theta/sqlx/theta_sketch_get_estimate.sqlx @@ -25,10 +25,17 @@ LANGUAGE js OPTIONS ( library=["gs://$GCS_BUCKET/theta_sketch.js"], description = '''Gets cardinality estimate and bounds from given sketch. -Param sketch: The given sketch to query as bytes. + +Param sketch: The given sketch to query as BYTES. Param seed: This is used to confirm that the given sketch was configured with the correct seed. Returns: a FLOAT64 value as the cardinality estimate. -For more details: https://datasketches.apache.org/docs/Theta/ThetaSketchFramework.html''' + +For more information: + - https://datasketches.apache.org/docs/Theta/ThetaAccuracy.html + - https://datasketches.apache.org/docs/Theta/ThetaAccuracyPlots.html + - https://datasketches.apache.org/docs/Theta/ThetaErrorTable.html + - https://datasketches.apache.org/docs/Theta/ThetaSketchFramework.html +''' ) AS R""" const default_seed = BigInt(Module.DEFAULT_SEED); try { diff --git a/theta/sqlx/theta_sketch_get_estimate_and_bounds.sqlx b/theta/sqlx/theta_sketch_get_estimate_and_bounds.sqlx index 0929a25..635259c 100644 --- a/theta/sqlx/theta_sketch_get_estimate_and_bounds.sqlx +++ b/theta/sqlx/theta_sketch_get_estimate_and_bounds.sqlx @@ -25,13 +25,20 @@ LANGUAGE js OPTIONS ( library=["gs://$GCS_BUCKET/theta_sketch.js"], description = '''Gets cardinality estimate and bounds from given sketch. -Param sketch: The given sketch to query as bytes. + +Param sketch: The given sketch to query as BYTES. Param num_std_devs: The returned bounds will be based on the statistical confidence interval determined by the given number of standard deviations from the returned estimate. This number may be one of {1,2,3}, where 1 represents 68% confidence, 2 represents 95% confidence and 3 represents 99.7% confidence. For example, if the given num_std_devs = 2 and the returned values are {1000, 990, 1010} that means that with 95% confidence, the true value lies within the range [990, 1010]. Param seed: This is used to confirm that the given sketch was configured with the correct seed. -Returns: a struct with 3 FLOAT64 values as {estimate, lower_bound, upper_bound}. -For more details: https://datasketches.apache.org/docs/Theta/ThetaSketchFramework.html''' +Returns: a STRUCT with three FLOAT64 values as {estimate, lower_bound, upper_bound}. + +For more information: + - https://datasketches.apache.org/docs/Theta/ThetaAccuracy.html + - https://datasketches.apache.org/docs/Theta/ThetaAccuracyPlots.html + - https://datasketches.apache.org/docs/Theta/ThetaErrorTable.html + - https://datasketches.apache.org/docs/Theta/ThetaSketchFramework.html +''' ) AS R""" const default_seed = BigInt(Module.DEFAULT_SEED); try { diff --git a/theta/sqlx/theta_sketch_jaccard_similarity.sqlx b/theta/sqlx/theta_sketch_jaccard_similarity.sqlx index 0ba0c86..5d80fb1 100644 --- a/theta/sqlx/theta_sketch_jaccard_similarity.sqlx +++ b/theta/sqlx/theta_sketch_jaccard_similarity.sqlx @@ -28,11 +28,15 @@ OPTIONS ( J(A,B) = (A ^ B)/(A U B) is used to measure how similar the two sketches are to each other. If J = 1.0, the sketches are considered equal. If J = 0, the two sketches are disjoint. A Jaccard of .95 means the overlap between the two sets is 95% of the union of the two sets. + Param sketchA: the first sketch as bytes. Param sketchB: the second sketch as bytes. Param seed: This is used to confirm that the given sketches were configured with the correct seed. -Returns: a struct with 3 floating-point values {lower_bound, estimate, upper_bound} of the Jaccard index. -For more details: https://datasketches.apache.org/docs/Theta/ThetaSketchFramework.html''' +Returns: a STRUCT with three FLOAT64 values {lower_bound, estimate, upper_bound} of the Jaccard index. + +For more information: + - https://datasketches.apache.org/docs/Theta/ThetaSketchFramework.html + ''' ) AS R""" const default_seed = BigInt(Module.DEFAULT_SEED); try { diff --git a/theta/sqlx/theta_sketch_scalar_intersection.sqlx b/theta/sqlx/theta_sketch_scalar_intersection.sqlx index cdc15e9..24a5621 100644 --- a/theta/sqlx/theta_sketch_scalar_intersection.sqlx +++ b/theta/sqlx/theta_sketch_scalar_intersection.sqlx @@ -25,11 +25,21 @@ LANGUAGE js OPTIONS ( library=["gs://$GCS_BUCKET/theta_sketch.js"], description = '''Computes a sketch that represents the scalar intersection of the two given sketches. -Param sketchA: the first sketch as bytes. -Param sketchB: the second sketch as bytes. + +Param sketchA: the first sketch as BYTES. +Param sketchB: the second sketch as BYTES. Param seed: This is used to confirm that the given sketches were configured with the correct seed. -Returns: a Compact, Compressed Theta Sketch, as bytes, from which the intersection cardinality can be obtained. -For more details: https://datasketches.apache.org/docs/Theta/ThetaSketchFramework.html''' +Returns: a Compact, Compressed Theta Sketch, as BYTES. + +For more information: + - https://datasketches.apache.org/docs/Theta/ThetaSketchSetOps.html + - https://datasketches.apache.org/docs/Theta/ThetaAccuracy.html + - https://datasketches.apache.org/docs/Theta/ThetaSketchSetOpsAccuracy.html + - https://datasketches.apache.org/docs/Theta/ThetaAccuracyPlots.html + - https://datasketches.apache.org/docs/Theta/ThetaErrorTable.html + - https://datasketches.apache.org/docs/Theta/ThetaSize.html + - https://datasketches.apache.org/docs/Theta/ThetaSketchFramework.html +''' ) AS R""" const default_seed = BigInt(Module.DEFAULT_SEED); try { diff --git a/theta/sqlx/theta_sketch_scalar_union.sqlx b/theta/sqlx/theta_sketch_scalar_union.sqlx index 6c13f09..d3fd346 100644 --- a/theta/sqlx/theta_sketch_scalar_union.sqlx +++ b/theta/sqlx/theta_sketch_scalar_union.sqlx @@ -25,12 +25,20 @@ LANGUAGE js OPTIONS ( library=["gs://$GCS_BUCKET/theta_sketch.js"], description = '''Computes a sketch that represents the scalar union of the two given sketches. -Param sketchA: the first sketch as bytes. -Param sketchB: the second sketch as bytes. + +Param sketchA: the first sketch as BYTES. +Param sketchB: the second sketch as BYTES. Param lg_k: the sketch accuracy/size parameter as an integer in the range [4, 26]. Param seed: This is used to confirm that the given sketches were configured with the correct seed. -Returns: a Compact, Compressed Theta Sketch, as bytes, from which the union cardinality can be obtained. -For more details: https://datasketches.apache.org/docs/Theta/ThetaSketchFramework.html''' +Returns: a Compact, Compressed Theta Sketch, as BYTES. + +For more information: + - https://datasketches.apache.org/docs/Theta/ThetaAccuracy.html + - https://datasketches.apache.org/docs/Theta/ThetaAccuracyPlots.html + - https://datasketches.apache.org/docs/Theta/ThetaErrorTable.html + - https://datasketches.apache.org/docs/Theta/ThetaSize.html + - https://datasketches.apache.org/docs/Theta/ThetaSketchFramework.html +''' ) AS R""" const default_lg_k = Number(Module.DEFAULT_LG_K); const default_seed = BigInt(Module.DEFAULT_SEED); diff --git a/theta/sqlx/theta_sketch_to_string.sqlx b/theta/sqlx/theta_sketch_to_string.sqlx index 180cbf2..abebf42 100644 --- a/theta/sqlx/theta_sketch_to_string.sqlx +++ b/theta/sqlx/theta_sketch_to_string.sqlx @@ -25,10 +25,14 @@ LANGUAGE js OPTIONS ( library=["gs://$GCS_BUCKET/theta_sketch.js"], description = '''Returns a summary string that represents the state of the given sketch. -Param sketch: the given sketch as bytes. + +Param sketch: the given sketch as BYTES. Param seed: This is used to confirm that the given sketch was configured with the correct seed. -Returns: a string that represents the state of the given sketch. -For more details: https://datasketches.apache.org/docs/Theta/ThetaSketchFramework.html''' +Returns: a STRING that represents the state of the given sketch. + +For more information: + - https://datasketches.apache.org/docs/Theta/ThetaSketchFramework.html + ''' ) AS R""" const default_seed = BigInt(Module.DEFAULT_SEED); try { --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
