cboumalh commented on code in PR #51298: URL: https://github.com/apache/spark/pull/51298#discussion_r2308322075
########## sql/api/src/main/scala/org/apache/spark/sql/functions.scala: ########## @@ -3552,6 +3715,154 @@ object functions { hll_union(Column(columnName1), Column(columnName2), allowDifferentLgConfigK) } + /** + * Subtracts two binary representations of Datasketches ThetaSketch objects, using a + * Datasketches AnotB object. Uses default log nominal entries. + * + * @group misc_funcs + * @since 4.1.0 + */ + def theta_difference(c1: Column, c2: Column): Column = + Column.fn("theta_difference", c1, c2) + + /** + * Subtracts two binary representations of Datasketches ThetaSketch objects, using a + * Datasketches AnotB object. Uses default log nominal entries. + * + * @group misc_funcs + * @since 4.1.0 + */ + def theta_difference(columnName1: String, columnName2: String): Column = { + theta_difference(Column(columnName1), Column(columnName2)) + } + + /** + * Subtracts two binary representations of Datasketches ThetaSketch objects, using a + * Datasketches AnotB object. Allows setting of log nominal entries for the difference buffer. + * + * @group misc_funcs + * @since 4.1.0 + */ + def theta_difference(c1: Column, c2: Column, lgNomEntries: Int): Column = + Column.fn("theta_difference", c1, c2, lit(lgNomEntries)) + + /** + * Subtracts two binary representations of Datasketches ThetaSketch objects, using a + * Datasketches AnotB object. Allows setting of log nominal entries for the difference buffer. + * + * @group misc_funcs + * @since 4.1.0 + */ + def theta_difference(columnName1: String, columnName2: String, lgNomEntries: Int): Column = { + theta_difference(Column(columnName1), Column(columnName2), lgNomEntries) + } + + /** + * Intersects two binary representations of Datasketches ThetaSketch objects, using a + * Datasketches Intersection object. Uses default log nominal entries. 
+ * + * @group misc_funcs + * @since 4.1.0 + */ + def theta_intersection(c1: Column, c2: Column): Column = + Column.fn("theta_intersection", c1, c2) + + /** + * Intersects two binary representations of Datasketches ThetaSketch objects, using a + * Datasketches Intersection object. Uses default log nominal entries. + * + * @group misc_funcs + * @since 4.1.0 + */ + def theta_intersection(columnName1: String, columnName2: String): Column = { + theta_intersection(Column(columnName1), Column(columnName2)) + } + + /** + * Intersects two binary representations of Datasketches ThetaSketch objects, using a + * Datasketches Intersection object. Allows setting of log nominal entries for the intersection + * buffer. + * + * @group misc_funcs + * @since 4.1.0 + */ + def theta_intersection(c1: Column, c2: Column, lgNomEntries: Int): Column = + Column.fn("theta_intersection", c1, c2, lit(lgNomEntries)) + + /** + * Intersects two binary representations of Datasketches ThetaSketch objects, using a + * Datasketches Intersection object. Allows setting of log nominal entries for the intersection + * buffer. + * + * @group misc_funcs + * @since 4.1.0 + */ + def theta_intersection(columnName1: String, columnName2: String, lgNomEntries: Int): Column = { + theta_intersection(Column(columnName1), Column(columnName2), lgNomEntries) + } + + /** + * Returns the estimated number of unique values given the binary representation of a + * Datasketches ThetaSketch. + * + * @group misc_funcs + * @since 4.1.0 + */ + def theta_sketch_estimate(c: Column): Column = Column.fn("theta_sketch_estimate", c) + + /** + * Returns the estimated number of unique values given the binary representation of a + * Datasketches ThetaSketch. + * + * @group misc_funcs + * @since 4.1.0 + */ + def theta_sketch_estimate(columnName: String): Column = { + theta_sketch_estimate(Column(columnName)) + } + + /** + * Merges two binary representations of Datasketches ThetaSketch objects, using a Datasketches + * Union object. 
Uses default log nominal entries. Review Comment: I was just trying to point out that, for example, `def theta_union(columnName1: String, columnName2: String)` uses the default value of log nominal entries, whereas `def theta_union(c1: Column, c2: Column, lgNomEntries: Int)` lets the user set the value explicitly. Please let me know what you think. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org