cboumalh commented on code in PR #52551:
URL: https://github.com/apache/spark/pull/52551#discussion_r2421534317
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ThetaSketchUtils.scala:
##########
@@ -36,6 +39,11 @@ object ThetaSketchUtils {
final val MAX_LG_NOM_LONGS = 26
final val DEFAULT_LG_NOM_LONGS = 12
+ // Family constants for ThetaSketch
Review Comment:
similar to above with the nominal values, can we add a small explanation to
what the difference is here between both families
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/thetasketchesAggregates.scala:
##########
@@ -59,10 +59,12 @@ case class FinalizedSketch(sketch: CompactSketch) extends
ThetaSketchState {
*
* See [[https://datasketches.apache.org/docs/Theta/ThetaSketches.html]] for
more information.
*
- * @param left
+ * @param child
* child expression against which unique counting will occur
- * @param right
+ * @param lgNomEntriesExpr
Review Comment:
We can consider making the parameter renaming update in this whole file to
be consistent and here:
https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/thetasketchesExpressions.scala
##########
common/utils/src/main/resources/error/error-conditions.json:
##########
@@ -5605,6 +5605,12 @@
],
"sqlState" : "428EK"
},
+ "THETA_INVALID_FAMILY" : {
+ "message" : [
+ "Invalid call to <function>; the `family` parameter must be one of:
<validFamilies>. Got: <value>."
Review Comment:
super nit, but can we change `Got: <value>` to `, got: <value>` or `, but
got: <value>` to stay consistent with the rest of the file?
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ThetaSketchUtils.scala:
##########
@@ -53,6 +61,26 @@ object ThetaSketchUtils {
}
}
+ /**
+ * Converts a family string to DataSketches Family enum.
+ * Throws a Spark SQL exception if the family name is invalid.
+ *
+ * @param familyName The family name string
+ * @param prettyName The display name of the function/expression for error
messages
+ * @return The corresponding DataSketches Family enum value
+ */
+ def parseFamily(familyName: String, prettyName: String): Family = {
Review Comment:
Let's add a test for this new function here:
https://github.com/cboumalh/spark/blob/master/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ThetaSketchUtilsSuite.scala
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/thetasketchesAggregates.scala:
##########
@@ -122,15 +142,16 @@ case class ThetaSketchAgg(
copy(inputAggBufferOffset = newInputAggBufferOffset)
override protected def withNewChildrenInternal(
Review Comment:
let's update this to work with `TernaryLike`'s withNewChildrenInternal
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/thetasketchesAggregates.scala:
##########
@@ -71,46 +73,64 @@ case class FinalizedSketch(sketch: CompactSketch) extends
ThetaSketchState {
// scalastyle:off line.size.limit
@ExpressionDescription(
usage = """
- _FUNC_(expr, lgNomEntries) - Returns the ThetaSketch compact binary
representation.
+ _FUNC_(expr, lgNomEntries, family) - Returns the ThetaSketch compact
binary representation.
`lgNomEntries` (optional) is the log-base-2 of nominal entries, with
nominal entries deciding
- the number buckets or slots for the ThetaSketch. """,
+ the number buckets or slots for the ThetaSketch.
+ `family` (optional) is the sketch family, either 'QUICKSELECT' or
'ALPHA' (defaults to
+ 'QUICKSELECT').""",
examples = """
Examples:
+ > SELECT theta_sketch_estimate(_FUNC_(col)) FROM VALUES (1), (1), (2),
(2), (3) tab(col);
+ 3
> SELECT theta_sketch_estimate(_FUNC_(col, 12)) FROM VALUES (1), (1),
(2), (2), (3) tab(col);
3
+ > SELECT theta_sketch_estimate(_FUNC_(col, 15, 'ALPHA')) FROM VALUES
(1), (1), (2), (2), (3) tab(col);
+ 3
""",
group = "agg_funcs",
since = "4.1.0")
// scalastyle:on line.size.limit
case class ThetaSketchAgg(
- left: Expression,
- right: Expression,
+ child: Expression,
+ lgNomEntriesExpr: Expression,
+ familyExpr: Expression,
override val mutableAggBufferOffset: Int,
override val inputAggBufferOffset: Int)
extends TypedImperativeAggregate[ThetaSketchState]
- with BinaryLike[Expression]
Review Comment:
let's use `TernaryLike` here instead of removing this trait altogether
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/thetasketchesAggregates.scala:
##########
@@ -71,46 +73,64 @@ case class FinalizedSketch(sketch: CompactSketch) extends
ThetaSketchState {
// scalastyle:off line.size.limit
@ExpressionDescription(
usage = """
- _FUNC_(expr, lgNomEntries) - Returns the ThetaSketch compact binary
representation.
+ _FUNC_(expr, lgNomEntries, family) - Returns the ThetaSketch compact
binary representation.
`lgNomEntries` (optional) is the log-base-2 of nominal entries, with
nominal entries deciding
- the number buckets or slots for the ThetaSketch. """,
+ the number buckets or slots for the ThetaSketch.
+ `family` (optional) is the sketch family, either 'QUICKSELECT' or
'ALPHA' (defaults to
+ 'QUICKSELECT').""",
examples = """
Examples:
+ > SELECT theta_sketch_estimate(_FUNC_(col)) FROM VALUES (1), (1), (2),
(2), (3) tab(col);
+ 3
> SELECT theta_sketch_estimate(_FUNC_(col, 12)) FROM VALUES (1), (1),
(2), (2), (3) tab(col);
3
+ > SELECT theta_sketch_estimate(_FUNC_(col, 15, 'ALPHA')) FROM VALUES
(1), (1), (2), (2), (3) tab(col);
+ 3
""",
group = "agg_funcs",
since = "4.1.0")
// scalastyle:on line.size.limit
case class ThetaSketchAgg(
- left: Expression,
- right: Expression,
+ child: Expression,
+ lgNomEntriesExpr: Expression,
+ familyExpr: Expression,
override val mutableAggBufferOffset: Int,
override val inputAggBufferOffset: Int)
extends TypedImperativeAggregate[ThetaSketchState]
- with BinaryLike[Expression]
with ExpectsInputTypes {
// ThetaSketch config - mark as lazy so that they're not evaluated during
tree transformation.
- lazy val lgNomEntries: Int = {
- val lgNomEntriesInput = right.eval().asInstanceOf[Int]
+ private lazy val lgNomEntries: Int = {
+ val lgNomEntriesInput = lgNomEntriesExpr.eval().asInstanceOf[Int]
ThetaSketchUtils.checkLgNomLongs(lgNomEntriesInput, prettyName)
lgNomEntriesInput
}
- // Constructors
+ private lazy val family: Family =
+
ThetaSketchUtils.parseFamily(familyExpr.eval().asInstanceOf[UTF8String].toString,
prettyName)
Review Comment:
nit, we can break this up similar to `lgNomEntries` for readability
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]