Repository: spark Updated Branches: refs/heads/master db4d317cc -> 28bcb9e9e
[SPARK-6370][core] Documentation: Improve all 3 docs for RDD.sample The docs for the `sample` method were insufficient, now less so. Author: mbonaci <mbon...@gmail.com> Closes #5097 from mbonaci/master and squashes the following commits: a6a9d97 [mbonaci] [SPARK-6370][core] Documentation: Improve all 3 docs for RDD.sample method Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/28bcb9e9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/28bcb9e9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/28bcb9e9 Branch: refs/heads/master Commit: 28bcb9e9e86a4b643fbf96b2b7e03928ebcfc060 Parents: db4d317 Author: mbonaci <mbon...@gmail.com> Authored: Fri Mar 20 18:30:45 2015 +0000 Committer: Sean Owen <so...@cloudera.com> Committed: Fri Mar 20 18:33:53 2015 +0000 ---------------------------------------------------------------------- .../main/scala/org/apache/spark/api/java/JavaRDD.scala | 11 +++++++++++ core/src/main/scala/org/apache/spark/rdd/RDD.scala | 6 ++++++ python/pyspark/rdd.py | 6 ++++++ 3 files changed, 23 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/28bcb9e9/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala ---------------------------------------------------------------------- diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala index 645dc3b..3e9beb6 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala @@ -101,12 +101,23 @@ class JavaRDD[T](val rdd: RDD[T])(implicit val classTag: ClassTag[T]) /** * Return a sampled subset of this RDD. + * + * @param withReplacement can elements be sampled multiple times (replaced when sampled out) + * @param fraction expected size of the sample as a fraction of this RDD's size + * without replacement: probability that each element is chosen; fraction must be [0, 1] + * with replacement: expected number of times each element is chosen; fraction must be >= 0 */ def sample(withReplacement: Boolean, fraction: Double): JavaRDD[T] = sample(withReplacement, fraction, Utils.random.nextLong) /** * Return a sampled subset of this RDD. + * + * @param withReplacement can elements be sampled multiple times (replaced when sampled out) + * @param fraction expected size of the sample as a fraction of this RDD's size + * without replacement: probability that each element is chosen; fraction must be [0, 1] + * with replacement: expected number of times each element is chosen; fraction must be >= 0 + * @param seed seed for the random number generator */ def sample(withReplacement: Boolean, fraction: Double, seed: Long): JavaRDD[T] = wrapRDD(rdd.sample(withReplacement, fraction, seed)) http://git-wip-us.apache.org/repos/asf/spark/blob/28bcb9e9/core/src/main/scala/org/apache/spark/rdd/RDD.scala ---------------------------------------------------------------------- diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index a139780..a4c74ed 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -377,6 +377,12 @@ abstract class RDD[T: ClassTag]( /** * Return a sampled subset of this RDD. + * + * @param withReplacement can elements be sampled multiple times (replaced when sampled out) + * @param fraction expected size of the sample as a fraction of this RDD's size + * without replacement: probability that each element is chosen; fraction must be [0, 1] + * with replacement: expected number of times each element is chosen; fraction must be >= 0 + * @param seed seed for the random number generator */ def sample(withReplacement: Boolean, fraction: Double, http://git-wip-us.apache.org/repos/asf/spark/blob/28bcb9e9/python/pyspark/rdd.py ---------------------------------------------------------------------- diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index bf17f51..c337a43 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -346,6 +346,12 @@ class RDD(object): """ Return a sampled subset of this RDD. + :param withReplacement: can elements be sampled multiple times (replaced when sampled out) + :param fraction: expected size of the sample as a fraction of this RDD's size + without replacement: probability that each element is chosen; fraction must be [0, 1] + with replacement: expected number of times each element is chosen; fraction must be >= 0 + :param seed: seed for the random number generator + >>> rdd = sc.parallelize(range(100), 4) >>> rdd.sample(False, 0.1, 81).count() 10 --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org