spark git commit: [SPARK-7710] [SPARK-7998] [DOCS] Docs for DataFrameStatFunctions

2015-08-24 Thread rxin
Repository: spark
Updated Branches:
  refs/heads/master 7478c8b66 -> 9ce0c7ad3


[SPARK-7710] [SPARK-7998] [DOCS] Docs for DataFrameStatFunctions

This PR contains examples on how to use some of the Stat Functions available 
for DataFrames under `df.stat`.
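
For orientation, a minimal spark-shell sketch of the kind of usage these docs describe (assumptions: a Spark 1.4+/1.5 shell where `sc`, `sqlContext` and its implicits are in scope; column names and seeds are illustrative):

// Sketch only: assumes a spark-shell session (sc, sqlContext, implicits available).
import org.apache.spark.sql.functions.rand

// Build a small DataFrame with two pseudo-random columns to compare.
val df = sc.parallelize(0 until 10).toDF("id")
  .withColumn("rand1", rand(seed = 10L))
  .withColumn("rand2", rand(seed = 27L))

// Sample covariance and Pearson correlation of the two columns; both return a Double.
val covariance  = df.stat.cov("rand1", "rand2")
val correlation = df.stat.corr("rand1", "rand2")

Both helpers live on `df.stat`, which returns the `DataFrameStatFunctions` wrapper documented in this change.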

rxin

Author: Burak Yavuz 

Closes #8378 from brkyvz/update-sql-docs.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9ce0c7ad
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9ce0c7ad
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9ce0c7ad

Branch: refs/heads/master
Commit: 9ce0c7ad333f4a3c01207e5e9ed42bcafb99d894
Parents: 7478c8b
Author: Burak Yavuz 
Authored: Mon Aug 24 13:48:01 2015 -0700
Committer: Reynold Xin 
Committed: Mon Aug 24 13:48:01 2015 -0700

--
 .../scala/org/apache/spark/sql/DataFrame.scala  |   2 +-
 .../spark/sql/DataFrameStatFunctions.scala  | 101 +++
 2 files changed, 102 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/9ce0c7ad/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index d6688b2..791c10c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -684,7 +684,7 @@ class DataFrame private[sql](
   // make it a NamedExpression.
   case Column(u: UnresolvedAttribute) => UnresolvedAlias(u)
   case Column(expr: NamedExpression) => expr
-  // Leave an unaliased explode with an empty list of names since the analzyer will generate the
+  // Leave an unaliased explode with an empty list of names since the analyzer will generate the
   // correct defaults after the nested expression's type has been resolved.
   case Column(explode: Explode) => MultiAlias(explode, Nil)
   case Column(expr: Expression) => Alias(expr, expr.prettyString)()

http://git-wip-us.apache.org/repos/asf/spark/blob/9ce0c7ad/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
index 2e68e35..69c9847 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
@@ -39,6 +39,13 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    * @param col2 the name of the second column
    * @return the covariance of the two columns.
    *
+   * {{{
+   *    val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10))
+   *      .withColumn("rand2", rand(seed=27))
+   *    df.stat.cov("rand1", "rand2")
+   *    res1: Double = 0.065...
+   * }}}
+   *
    * @since 1.4.0
    */
   def cov(col1: String, col2: String): Double = {
@@ -54,6 +61,13 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    * @param col2 the name of the column to calculate the correlation against
    * @return The Pearson Correlation Coefficient as a Double.
    *
+   * {{{
+   *    val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10))
+   *      .withColumn("rand2", rand(seed=27))
+   *    df.stat.corr("rand1", "rand2")
+   *    res1: Double = 0.613...
+   * }}}
+   *
    * @since 1.4.0
    */
   def corr(col1: String, col2: String, method: String): Double = {
@@ -69,6 +83,13 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    * @param col2 the name of the column to calculate the correlation against
    * @return The Pearson Correlation Coefficient as a Double.
    *
+   * {{{
+   *    val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10))
+   *      .withColumn("rand2", rand(seed=27))
+   *    df.stat.corr("rand1", "rand2", "pearson")
+   *    res1: Double = 0.613...
+   * }}}
+   *
    * @since 1.4.0
    */
   def corr(col1: String, col2: String): Double = {
@@ -92,6 +113,20 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    * of the DataFrame.
    * @return A DataFrame containing for the contingency table.
    *
+   * {{{
+   *    val df = sqlContext.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2),
+   *      (3, 3))).toDF("key", "value")
+   *    val ct = df.stat.crosstab("key", "value")
+   *    ct.show()
+   *    +---------+---+---+---+
+   *    |key_value|  1|  2|  3|
+   *    +---------+---+---+---+
+   *    |        2|  2|  0|  1|
+   *    |        1|  1|  1|  0|
+   *    |        3|  0|  1|  1|
+   *    +---------+---+---+---+
+   * }}}
+   *
    * @since 1.4.0
    */
   def crosstab(col1: String, col2: String): DataFrame = {

spark git commit: [SPARK-7710] [SPARK-7998] [DOCS] Docs for DataFrameStatFunctions

2015-08-24 Thread rxin
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 d003373bd -> 43dcf95e4


[SPARK-7710] [SPARK-7998] [DOCS] Docs for DataFrameStatFunctions

This PR contains examples on how to use some of the Stat Functions available 
for DataFrames under `df.stat`.
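
For orientation, a minimal spark-shell sketch of the contingency-table helper covered by these docs (assumptions: a Spark 1.4+/1.5 shell where `sc`, `sqlContext` and its implicits are in scope; the sample pairs are illustrative, in the same style as the doc example):

// Sketch only: assumes a spark-shell session (sc, sqlContext, implicits available).
val pairs = Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2), (3, 3))
val df = sc.parallelize(pairs).toDF("key", "value")

// Pair-wise frequency (contingency) table of `key` against `value`.
val ct = df.stat.crosstab("key", "value")
ct.show()

The first column of the result is the two column names joined with an underscore (`key_value` in the example table in the diff below).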

rxin

Author: Burak Yavuz 

Closes #8378 from brkyvz/update-sql-docs.

(cherry picked from commit 9ce0c7ad333f4a3c01207e5e9ed42bcafb99d894)
Signed-off-by: Reynold Xin 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/43dcf95e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/43dcf95e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/43dcf95e

Branch: refs/heads/branch-1.5
Commit: 43dcf95e42eb77c7cd545179c461bb7f9430e0e3
Parents: d003373
Author: Burak Yavuz 
Authored: Mon Aug 24 13:48:01 2015 -0700
Committer: Reynold Xin 
Committed: Mon Aug 24 13:48:09 2015 -0700

--
 .../scala/org/apache/spark/sql/DataFrame.scala  |   2 +-
 .../spark/sql/DataFrameStatFunctions.scala  | 101 +++
 2 files changed, 102 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/43dcf95e/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index 5bed299..ae341c8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -684,7 +684,7 @@ class DataFrame private[sql](
   // make it a NamedExpression.
   case Column(u: UnresolvedAttribute) => UnresolvedAlias(u)
   case Column(expr: NamedExpression) => expr
-  // Leave an unaliased explode with an empty list of names since the analzyer will generate the
+  // Leave an unaliased explode with an empty list of names since the analyzer will generate the
   // correct defaults after the nested expression's type has been resolved.
   case Column(explode: Explode) => MultiAlias(explode, Nil)
   case Column(expr: Expression) => Alias(expr, expr.prettyString)()

http://git-wip-us.apache.org/repos/asf/spark/blob/43dcf95e/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
index 2e68e35..69c9847 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
@@ -39,6 +39,13 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    * @param col2 the name of the second column
    * @return the covariance of the two columns.
    *
+   * {{{
+   *    val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10))
+   *      .withColumn("rand2", rand(seed=27))
+   *    df.stat.cov("rand1", "rand2")
+   *    res1: Double = 0.065...
+   * }}}
+   *
    * @since 1.4.0
    */
   def cov(col1: String, col2: String): Double = {
@@ -54,6 +61,13 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    * @param col2 the name of the column to calculate the correlation against
    * @return The Pearson Correlation Coefficient as a Double.
    *
+   * {{{
+   *    val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10))
+   *      .withColumn("rand2", rand(seed=27))
+   *    df.stat.corr("rand1", "rand2")
+   *    res1: Double = 0.613...
+   * }}}
+   *
    * @since 1.4.0
    */
   def corr(col1: String, col2: String, method: String): Double = {
@@ -69,6 +83,13 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    * @param col2 the name of the column to calculate the correlation against
    * @return The Pearson Correlation Coefficient as a Double.
    *
+   * {{{
+   *    val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10))
+   *      .withColumn("rand2", rand(seed=27))
+   *    df.stat.corr("rand1", "rand2", "pearson")
+   *    res1: Double = 0.613...
+   * }}}
+   *
    * @since 1.4.0
    */
   def corr(col1: String, col2: String): Double = {
@@ -92,6 +113,20 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    * of the DataFrame.
    * @return A DataFrame containing for the contingency table.
    *
+   * {{{
+   *    val df = sqlContext.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2),
+   *      (3, 3))).toDF("key", "value")
+   *    val ct = df.stat.crosstab("key", "value")
+   *    ct.show()
+   *    +---------+---+---+---+
+   *    |key_value|  1|  2|  3|
+   *    +---------+---+---+---+
+   *    |        2|  2|  0|  1|
+   *    |        1|  1|  1|  0|
+   *    |        3|  0|  1|  1|
+   *    +---------+---+---+---+
+   * }}}
+   *
    * @since 1.4.0
    */
   def crosstab(col1: String, col2: String): DataFrame = {