Repository: spark Updated Branches: refs/heads/branch-1.4 29350eef3 -> 598902b54
[SPARK-7243][SQL] Reduce size for Contingency Tables in DataFrames Reduced take size from 1e8 to 1e6. cc rxin Author: Burak Yavuz <[email protected]> Closes #5900 from brkyvz/df-cont-followup and squashes the following commits: c11e762 [Burak Yavuz] fix grammar b30ace2 [Burak Yavuz] address comments a417ba5 [Burak Yavuz] [SPARK-7243][SQL] Reduce size for Contingency Tables in DataFrames (cherry picked from commit 18340d7be55a6834918956555bf820c96769aa52) Signed-off-by: Reynold Xin <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/598902b5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/598902b5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/598902b5 Branch: refs/heads/branch-1.4 Commit: 598902b54993fb62300e9a87dd00d8f88c368ae2 Parents: 29350ee Author: Burak Yavuz <[email protected]> Authored: Tue May 5 11:01:25 2015 -0700 Committer: Reynold Xin <[email protected]> Committed: Tue May 5 11:01:34 2015 -0700 ---------------------------------------------------------------------- python/pyspark/sql/dataframe.py | 9 +++++---- .../scala/org/apache/spark/sql/DataFrameStatFunctions.scala | 9 +++++---- .../org/apache/spark/sql/execution/stat/StatFunctions.scala | 6 +++--- 3 files changed, 13 insertions(+), 11 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/598902b5/python/pyspark/sql/dataframe.py ---------------------------------------------------------------------- diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index f30a92d..17448b3 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -934,10 +934,11 @@ class DataFrame(object): def crosstab(self, col1, col2): """ Computes a pair-wise frequency table of the given columns. Also known as a contingency - table. The number of distinct values for each column should be less than 1e4. The first - column of each row will be the distinct values of `col1` and the column names will be the - distinct values of `col2`. The name of the first column will be `$col1_$col2`. Pairs that - have no occurrences will have `null` as their counts. + table. The number of distinct values for each column should be less than 1e4. At most 1e6 + non-zero pair frequencies will be returned. + The first column of each row will be the distinct values of `col1` and the column names + will be the distinct values of `col2`. The name of the first column will be `$col1_$col2`. + Pairs that have no occurrences will have `null` as their counts. :func:`DataFrame.crosstab` and :func:`DataFrameStatFunctions.crosstab` are aliases. :param col1: The name of the first column. Distinct items will make the first item of http://git-wip-us.apache.org/repos/asf/spark/blob/598902b5/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index fcf21ca..cb88dea 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -65,10 +65,11 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { /** * Computes a pair-wise frequency table of the given columns. Also known as a contingency table. - * The number of distinct values for each column should be less than 1e4. The first - * column of each row will be the distinct values of `col1` and the column names will be the - * distinct values of `col2`. The name of the first column will be `$col1_$col2`. Counts will be - * returned as `Long`s. Pairs that have no occurrences will have `null` as their counts. + * The number of distinct values for each column should be less than 1e4. At most 1e6 non-zero + * pair frequencies will be returned. + * The first column of each row will be the distinct values of `col1` and the column names will + * be the distinct values of `col2`. The name of the first column will be `$col1_$col2`. Counts + * will be returned as `Long`s. Pairs that have no occurrences will have `null` as their counts. * * @param col1 The name of the first column. Distinct items will make the first item of * each row. http://git-wip-us.apache.org/repos/asf/spark/blob/598902b5/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala index b50f606..386ac96 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala @@ -102,9 +102,9 @@ private[sql] object StatFunctions extends Logging { /** Generate a table of frequencies for the elements of two columns. */ private[sql] def crossTabulate(df: DataFrame, col1: String, col2: String): DataFrame = { val tableName = s"${col1}_$col2" - val counts = df.groupBy(col1, col2).agg(col(col1), col(col2), count("*")).take(1e8.toInt) - if (counts.length == 1e8.toInt) { - logWarning("The maximum limit of 1e8 pairs have been collected, which may not be all of " + + val counts = df.groupBy(col1, col2).agg(col(col1), col(col2), count("*")).take(1e6.toInt) + if (counts.length == 1e6.toInt) { + logWarning("The maximum limit of 1e6 pairs have been collected, which may not be all of " + "the pairs. Please try reducing the amount of distinct items in your columns.") } // get the distinct values of column 2, so that we can make them the column names --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
