spark git commit: [SPARK-15197][DOCS] Added Scaladoc for countApprox and countByValueApprox parameters

srowen Sat, 14 May 2016 01:45:07 -0700

Repository: spark
Updated Branches:
  refs/heads/master 4210e2a6b -> 0f1f31d3a



[SPARK-15197][DOCS] Added Scaladoc for countApprox and countByValueApprox 
parameters

This pull request simply adds Scaladoc documentation of the parameters for 
countApprox and countByValueApprox.

This is an important documentation change, as it clarifies what should be 
passed in for the timeout. Without units, this was previously unclear.

I did not open a JIRA ticket per my understanding of the project contribution 
guidelines; as they state, the description in the ticket would be essentially 
just what is in the PR. If I should open one, let me know and I will do so.

Author: Nicholas Tietz <[email protected]>

Closes #12955 from ntietz/rdd-countapprox-docs.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0f1f31d3
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0f1f31d3
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0f1f31d3

Branch: refs/heads/master
Commit: 0f1f31d3a6669fbac474518cf2a871485e202bdc
Parents: 4210e2a
Author: Nicholas Tietz <[email protected]>
Authored: Sat May 14 09:44:20 2016 +0100
Committer: Sean Owen <[email protected]>
Committed: Sat May 14 09:44:20 2016 +0100

----------------------------------------------------------------------
 .../org/apache/spark/api/java/JavaRDDLike.scala | 29 ++++++++++++++++++--
 .../org/apache/spark/rdd/PairRDDFunctions.scala | 10 +++++++
 .../main/scala/org/apache/spark/rdd/RDD.scala   | 16 +++++++++++
 3 files changed, 53 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/0f1f31d3/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala 
b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala
index c17ca12..e4ccd9f 100644
--- a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala
+++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala
@@ -445,6 +445,16 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends 
Serializable {
   /**
    * Approximate version of count() that returns a potentially incomplete 
result
    * within a timeout, even if not all tasks have finished.
+   *
+   * The confidence is the probability that the error bounds of the result will
+   * contain the true value. That is, if countApprox were called repeatedly
+   * with confidence 0.9, we would expect 90% of the results to contain the
+   * true count. The confidence must be in the range [0,1] or an exception will
+   * be thrown.
+   *
+   * @param timeout maximum time to wait for the job, in milliseconds
+   * @param confidence the desired statistical confidence in the result
+   * @return a potentially incomplete result, with error bounds
    */
   def countApprox(timeout: Long, confidence: Double): 
PartialResult[BoundedDouble] =
     rdd.countApprox(timeout, confidence)
@@ -452,6 +462,8 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends 
Serializable {
   /**
    * Approximate version of count() that returns a potentially incomplete 
result
    * within a timeout, even if not all tasks have finished.
+   *
+   * @param timeout maximum time to wait for the job, in milliseconds
    */
   def countApprox(timeout: Long): PartialResult[BoundedDouble] =
     rdd.countApprox(timeout)
@@ -464,7 +476,17 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends 
Serializable {
     mapAsSerializableJavaMap(rdd.countByValue()).asInstanceOf[JMap[T, jl.Long]]
 
   /**
-   * (Experimental) Approximate version of countByValue().
+   * Approximate version of countByValue().
+   *
+   * The confidence is the probability that the error bounds of the result will
+   * contain the true value. That is, if countApprox were called repeatedly
+   * with confidence 0.9, we would expect 90% of the results to contain the
+   * true count. The confidence must be in the range [0,1] or an exception will
+   * be thrown.
+   *
+   * @param timeout maximum time to wait for the job, in milliseconds
+   * @param confidence the desired statistical confidence in the result
+   * @return a potentially incomplete result, with error bounds
    */
   def countByValueApprox(
     timeout: Long,
@@ -473,7 +495,10 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends 
Serializable {
     rdd.countByValueApprox(timeout, confidence).map(mapAsSerializableJavaMap)
 
   /**
-   * (Experimental) Approximate version of countByValue().
+   * Approximate version of countByValue().
+   *
+   * @param timeout maximum time to wait for the job, in milliseconds
+   * @return a potentially incomplete result, with error bounds
    */
   def countByValueApprox(timeout: Long): PartialResult[JMap[T, BoundedDouble]] 
=
     rdd.countByValueApprox(timeout).map(mapAsSerializableJavaMap)

http://git-wip-us.apache.org/repos/asf/spark/blob/0f1f31d3/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala 
b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
index 7936d8e..3b12448 100644
--- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
@@ -375,6 +375,16 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
   /**
    * Approximate version of countByKey that can return a partial result if it 
does
    * not finish within a timeout.
+   *
+   * The confidence is the probability that the error bounds of the result will
+   * contain the true value. That is, if countApprox were called repeatedly
+   * with confidence 0.9, we would expect 90% of the results to contain the
+   * true count. The confidence must be in the range [0,1] or an exception will
+   * be thrown.
+   *
+   * @param timeout maximum time to wait for the job, in milliseconds
+   * @param confidence the desired statistical confidence in the result
+   * @return a potentially incomplete result, with error bounds
    */
   def countByKeyApprox(timeout: Long, confidence: Double = 0.95)
       : PartialResult[Map[K, BoundedDouble]] = self.withScope {

http://git-wip-us.apache.org/repos/asf/spark/blob/0f1f31d3/core/src/main/scala/org/apache/spark/rdd/RDD.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala 
b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
index d85d0ff..e6db9b3 100644
--- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -1107,10 +1107,21 @@ abstract class RDD[T: ClassTag](
   /**
    * Approximate version of count() that returns a potentially incomplete 
result
    * within a timeout, even if not all tasks have finished.
+   *
+   * The confidence is the probability that the error bounds of the result will
+   * contain the true value. That is, if countApprox were called repeatedly
+   * with confidence 0.9, we would expect 90% of the results to contain the
+   * true count. The confidence must be in the range [0,1] or an exception will
+   * be thrown.
+   *
+   * @param timeout maximum time to wait for the job, in milliseconds
+   * @param confidence the desired statistical confidence in the result
+   * @return a potentially incomplete result, with error bounds
    */
   def countApprox(
       timeout: Long,
       confidence: Double = 0.95): PartialResult[BoundedDouble] = withScope {
+    require(0.0 <= confidence && confidence <= 1.0, s"confidence ($confidence) 
must be in [0,1]")
     val countElements: (TaskContext, Iterator[T]) => Long = { (ctx, iter) =>
       var result = 0L
       while (iter.hasNext) {
@@ -1137,10 +1148,15 @@ abstract class RDD[T: ClassTag](
 
   /**
    * Approximate version of countByValue().
+   *
+   * @param timeout maximum time to wait for the job, in milliseconds
+   * @param confidence the desired statistical confidence in the result
+   * @return a potentially incomplete result, with error bounds
    */
   def countByValueApprox(timeout: Long, confidence: Double = 0.95)
       (implicit ord: Ordering[T] = null)
       : PartialResult[Map[T, BoundedDouble]] = withScope {
+    require(0.0 <= confidence && confidence <= 1.0, s"confidence ($confidence) 
must be in [0,1]")
     if (elementClassTag.runtimeClass.isArray) {
       throw new SparkException("countByValueApprox() does not support arrays")
     }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

spark git commit: [SPARK-15197][DOCS] Added Scaladoc for countApprox and countByValueApprox parameters

Reply via email to