This is an automated email from the ASF dual-hosted git repository.
dongjoon-hyun pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/spark-connect-swift.git
The following commit(s) were added to refs/heads/main by this push:
new 95774f2 [SPARK-57307] Support `stat.crosstab` for `DataFrame`
95774f2 is described below
commit 95774f2d5a5924b0331020e8a8740fac1baa035f
Author: Dongjoon Hyun <[email protected]>
AuthorDate: Sun Jun 7 15:29:19 2026 -0700
[SPARK-57307] Support `stat.crosstab` for `DataFrame`
### What changes were proposed in this pull request?
This PR aims to support `crosstab` for `DataFrame` by wiring the `StatCrosstab`
Spark Connect relation through `DataFrameStatFunctions`, exposed via
`DataFrame.stat` like PySpark/Scala.
```swift
public func crosstab(_ col1: String, _ col2: String) async throws ->
DataFrame
```
Unlike `stat.cov`/`stat.corr`, which return a `Double`, `crosstab` returns a
`DataFrame` (a contingency table): the first column is named `<col1>_<col2>`
and holds the distinct values of `col1`, while the remaining column names are
the distinct values of `col2`.
### Why are the changes needed?
To improve API coverage by mirroring PySpark/Scala `DataFrameStatFunctions`.
### Does this PR introduce _any_ user-facing change?
Yes, this PR adds a new API, `DataFrame.stat.crosstab`.
### How was this patch tested?
Pass the CIs.
### Was this patch authored or co-authored using generative AI tooling?
Generated-by: Claude Code (Claude Opus 4.8)
Closes #409 from dongjoon-hyun/SPARK-57307.
Authored-by: Dongjoon Hyun <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
Sources/SparkConnect/DataFrameStatFunctions.swift | 18 +++++++++++++++++-
Sources/SparkConnect/SparkConnectClient.swift | 8 ++++++++
.../DataFrameStatFunctionsTests.swift | 15 +++++++++++++++
3 files changed, 40 insertions(+), 1 deletion(-)
diff --git a/Sources/SparkConnect/DataFrameStatFunctions.swift
b/Sources/SparkConnect/DataFrameStatFunctions.swift
index 8c9bf72..3a86009 100644
--- a/Sources/SparkConnect/DataFrameStatFunctions.swift
+++ b/Sources/SparkConnect/DataFrameStatFunctions.swift
@@ -20,7 +20,7 @@
/// Statistic functions for ``DataFrame``s.
///
/// Use ``DataFrame/stat`` to access this. It mirrors PySpark's
`DataFrameStatFunctions`
-/// (`df.stat.cov`, `df.stat.corr`).
+/// (`df.stat.crosstab`, `df.stat.cov`, `df.stat.corr`).
public actor DataFrameStatFunctions: Sendable {
let df: DataFrame
@@ -28,6 +28,22 @@ public actor DataFrameStatFunctions: Sendable {
self.df = df
}
+ /// Computes a pair-wise frequency table of the given columns. Also known as
a contingency table.
+ /// The number of distinct values for each column should be less than `1e4`.
At most `1e6` non-zero
+ /// pair frequencies will be returned. The first column of each row will be
the distinct values of
+ /// `col1` and the column names will be the distinct values of `col2`. The
name of the first column
+ /// will be `<col1>_<col2>`. Counts will be returned as `Long`s. Pairs that
have no occurrences will
+ /// have zero as their counts.
+ /// - Parameters:
+ /// - col1: The name of the first column. Distinct items will make the
first item of each row.
+ /// - col2: The name of the second column. Distinct items will make the
column names of the ``DataFrame``.
+ /// - Returns: A ``DataFrame`` containing the contingency table.
+ public func crosstab(_ col1: String, _ col2: String) async throws ->
DataFrame {
+ let plan = await df.getPlan() as! Plan
+ return DataFrame(
+ spark: await df.spark, plan:
SparkConnectClient.getStatCrosstab(plan.root, col1, col2))
+ }
+
/// Calculates the sample covariance of two numerical columns of a
``DataFrame``.
/// - Parameters:
/// - col1: The name of the first column.
diff --git a/Sources/SparkConnect/SparkConnectClient.swift
b/Sources/SparkConnect/SparkConnectClient.swift
index e43fe34..d243fd0 100644
--- a/Sources/SparkConnect/SparkConnectClient.swift
+++ b/Sources/SparkConnect/SparkConnectClient.swift
@@ -612,6 +612,14 @@ public actor SparkConnectClient {
return createPlan { $0.summary = summary }
}
+ static func getStatCrosstab(_ child: Relation, _ col1: String, _ col2:
String) -> Plan {
+ var crosstab = Spark_Connect_StatCrosstab()
+ crosstab.input = child
+ crosstab.col1 = col1
+ crosstab.col2 = col2
+ return createPlan { $0.crosstab = crosstab }
+ }
+
static func getStatCov(_ child: Relation, _ col1: String, _ col2: String) ->
Plan {
var cov = Spark_Connect_StatCov()
cov.input = child
diff --git a/Tests/SparkConnectTests/DataFrameStatFunctionsTests.swift
b/Tests/SparkConnectTests/DataFrameStatFunctionsTests.swift
index 54a0315..3c9918e 100644
--- a/Tests/SparkConnectTests/DataFrameStatFunctionsTests.swift
+++ b/Tests/SparkConnectTests/DataFrameStatFunctionsTests.swift
@@ -23,6 +23,21 @@ import Testing
/// A test suite for `DataFrameStatFunctions`
@Suite(.serialized)
struct DataFrameStatFunctionsTests {
+ @Test
+ func crosstab() async throws {
+ let spark = try await SparkSession.builder.getOrCreate()
+ let df = try await spark.sql("SELECT * FROM VALUES (1, 1), (1, 2), (2, 1),
(2, 1) AS T(c1, c2)")
+ let ct = try await df.stat.crosstab("c1", "c2")
+ let columns = try await ct.columns
+ // The name of the first column is `<col1>_<col2>`.
+ #expect(columns[0] == "c1_c2")
+ // The remaining column names are the distinct values of `col2`.
+ #expect(Set(columns.dropFirst()) == ["1", "2"])
+ // One row per distinct value of `col1`.
+ #expect(try await ct.count() == 2)
+ await spark.stop()
+ }
+
@Test
func cov() async throws {
let spark = try await SparkSession.builder.getOrCreate()
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]