cboumalh commented on code in PR #52551:
URL: https://github.com/apache/spark/pull/52551#discussion_r2421534317


##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ThetaSketchUtils.scala:
##########
@@ -36,6 +39,11 @@ object ThetaSketchUtils {
   final val MAX_LG_NOM_LONGS = 26
   final val DEFAULT_LG_NOM_LONGS = 12
 
+  // Family constants for ThetaSketch

Review Comment:
   similar to above with the nominal values, can we add a small explanation to 
what the difference is here between both families



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/thetasketchesAggregates.scala:
##########
@@ -59,10 +59,12 @@ case class FinalizedSketch(sketch: CompactSketch) extends 
ThetaSketchState {
  *
  * See [[https://datasketches.apache.org/docs/Theta/ThetaSketches.html]] for 
more information.
  *
- * @param left
+ * @param child
  *   child expression against which unique counting will occur
- * @param right
+ * @param lgNomEntriesExpr

Review Comment:
   We can consider making the parameter renaming update in this whole file to 
be consistent and here: 
https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/thetasketchesExpressions.scala



##########
common/utils/src/main/resources/error/error-conditions.json:
##########
@@ -5605,6 +5605,12 @@
     ],
     "sqlState" : "428EK"
   },
+  "THETA_INVALID_FAMILY" : {
+    "message" : [
+      "Invalid call to <function>; the `family` parameter must be one of: 
<validFamilies>. Got: <value>."

Review Comment:
   super nit, but can we change `Got: <value>` to `, got: <value>` or `, but 
got: <value>` to stay consistent with the rest of the file?



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ThetaSketchUtils.scala:
##########
@@ -53,6 +61,26 @@ object ThetaSketchUtils {
     }
   }
 
+  /**
+   * Converts a family string to DataSketches Family enum.
+   * Throws a Spark SQL exception if the family name is invalid.
+   *
+   * @param familyName The family name string
+   * @param prettyName The display name of the function/expression for error 
messages
+   * @return The corresponding DataSketches Family enum value
+   */
+  def parseFamily(familyName: String, prettyName: String): Family = {

Review Comment:
   Let's add a test for this new function here: 
https://github.com/cboumalh/spark/blob/master/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ThetaSketchUtilsSuite.scala



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/thetasketchesAggregates.scala:
##########
@@ -122,15 +142,16 @@ case class ThetaSketchAgg(
     copy(inputAggBufferOffset = newInputAggBufferOffset)
 
   override protected def withNewChildrenInternal(

Review Comment:
   let's update this to work with `TernaryLike`'s withNewChildrenInternal



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/thetasketchesAggregates.scala:
##########
@@ -71,46 +73,64 @@ case class FinalizedSketch(sketch: CompactSketch) extends 
ThetaSketchState {
 // scalastyle:off line.size.limit
 @ExpressionDescription(
   usage = """
-    _FUNC_(expr, lgNomEntries) - Returns the ThetaSketch compact binary 
representation.
+    _FUNC_(expr, lgNomEntries, family) - Returns the ThetaSketch compact 
binary representation.
       `lgNomEntries` (optional) is the log-base-2 of nominal entries, with 
nominal entries deciding
-      the number buckets or slots for the ThetaSketch. """,
+      the number buckets or slots for the ThetaSketch.
+      `family` (optional) is the sketch family, either 'QUICKSELECT' or 
'ALPHA' (defaults to
+      'QUICKSELECT').""",
   examples = """
     Examples:
+      > SELECT theta_sketch_estimate(_FUNC_(col)) FROM VALUES (1), (1), (2), 
(2), (3) tab(col);
+       3
       > SELECT theta_sketch_estimate(_FUNC_(col, 12)) FROM VALUES (1), (1), 
(2), (2), (3) tab(col);
        3
+      > SELECT theta_sketch_estimate(_FUNC_(col, 15, 'ALPHA')) FROM VALUES 
(1), (1), (2), (2), (3) tab(col);
+       3
   """,
   group = "agg_funcs",
   since = "4.1.0")
 // scalastyle:on line.size.limit
 case class ThetaSketchAgg(
-    left: Expression,
-    right: Expression,
+    child: Expression,
+    lgNomEntriesExpr: Expression,
+    familyExpr: Expression,
     override val mutableAggBufferOffset: Int,
     override val inputAggBufferOffset: Int)
     extends TypedImperativeAggregate[ThetaSketchState]
-    with BinaryLike[Expression]

Review Comment:
   let's use `TernaryLike` here instead of removing this trait altogether



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/thetasketchesAggregates.scala:
##########
@@ -71,46 +73,64 @@ case class FinalizedSketch(sketch: CompactSketch) extends 
ThetaSketchState {
 // scalastyle:off line.size.limit
 @ExpressionDescription(
   usage = """
-    _FUNC_(expr, lgNomEntries) - Returns the ThetaSketch compact binary 
representation.
+    _FUNC_(expr, lgNomEntries, family) - Returns the ThetaSketch compact 
binary representation.
       `lgNomEntries` (optional) is the log-base-2 of nominal entries, with 
nominal entries deciding
-      the number buckets or slots for the ThetaSketch. """,
+      the number buckets or slots for the ThetaSketch.
+      `family` (optional) is the sketch family, either 'QUICKSELECT' or 
'ALPHA' (defaults to
+      'QUICKSELECT').""",
   examples = """
     Examples:
+      > SELECT theta_sketch_estimate(_FUNC_(col)) FROM VALUES (1), (1), (2), 
(2), (3) tab(col);
+       3
       > SELECT theta_sketch_estimate(_FUNC_(col, 12)) FROM VALUES (1), (1), 
(2), (2), (3) tab(col);
        3
+      > SELECT theta_sketch_estimate(_FUNC_(col, 15, 'ALPHA')) FROM VALUES 
(1), (1), (2), (2), (3) tab(col);
+       3
   """,
   group = "agg_funcs",
   since = "4.1.0")
 // scalastyle:on line.size.limit
 case class ThetaSketchAgg(
-    left: Expression,
-    right: Expression,
+    child: Expression,
+    lgNomEntriesExpr: Expression,
+    familyExpr: Expression,
     override val mutableAggBufferOffset: Int,
     override val inputAggBufferOffset: Int)
     extends TypedImperativeAggregate[ThetaSketchState]
-    with BinaryLike[Expression]
     with ExpectsInputTypes {
 
   // ThetaSketch config - mark as lazy so that they're not evaluated during 
tree transformation.
 
-  lazy val lgNomEntries: Int = {
-    val lgNomEntriesInput = right.eval().asInstanceOf[Int]
+  private lazy val lgNomEntries: Int = {
+    val lgNomEntriesInput = lgNomEntriesExpr.eval().asInstanceOf[Int]
     ThetaSketchUtils.checkLgNomLongs(lgNomEntriesInput, prettyName)
     lgNomEntriesInput
   }
 
-  // Constructors
+  private lazy val family: Family =
+    
ThetaSketchUtils.parseFamily(familyExpr.eval().asInstanceOf[UTF8String].toString,
 prettyName)

Review Comment:
   nit, we can break this up similar to `lgNomEntries` for readability



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to