This is an automated email from the ASF dual-hosted git repository.

dongjoon-hyun pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/spark-connect-swift.git


The following commit(s) were added to refs/heads/main by this push:
     new 95774f2  [SPARK-57307] Support `stat.crosstab` for `DataFrame`
95774f2 is described below

commit 95774f2d5a5924b0331020e8a8740fac1baa035f
Author: Dongjoon Hyun <[email protected]>
AuthorDate: Sun Jun 7 15:29:19 2026 -0700

    [SPARK-57307] Support `stat.crosstab` for `DataFrame`
    
    ### What changes were proposed in this pull request?
    
    This PR aims to support `crosstab` for `DataFrame` by wiring the `StatCrosstab`
    Spark Connect relation through `DataFrameStatFunctions`, exposed via
    `DataFrame.stat` like PySpark/Scala.
    
    ```swift
    public func crosstab(_ col1: String, _ col2: String) async throws -> DataFrame
    ```
    
    Unlike `stat.cov`/`stat.corr`, which return a `Double`, `crosstab` returns a
    `DataFrame` (a contingency table): the first column is named `<col1>_<col2>` and
    holds the distinct values of `col1`, while the remaining column names are the
    distinct values of `col2`.
    
    ### Why are the changes needed?
    
    To improve API coverage by mirroring PySpark/Scala `DataFrameStatFunctions`.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, this PR adds a new API, `DataFrame.stat.crosstab`.
    
    ### How was this patch tested?
    
    Pass the CIs.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    Generated-by: Claude Code (Claude Opus 4.8)
    
    Closes #409 from dongjoon-hyun/SPARK-57307.
    
    Authored-by: Dongjoon Hyun <[email protected]>
    Signed-off-by: Dongjoon Hyun <[email protected]>
---
 Sources/SparkConnect/DataFrameStatFunctions.swift      | 18 +++++++++++++++++-
 Sources/SparkConnect/SparkConnectClient.swift          |  8 ++++++++
 .../DataFrameStatFunctionsTests.swift                  | 15 +++++++++++++++
 3 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/Sources/SparkConnect/DataFrameStatFunctions.swift 
b/Sources/SparkConnect/DataFrameStatFunctions.swift
index 8c9bf72..3a86009 100644
--- a/Sources/SparkConnect/DataFrameStatFunctions.swift
+++ b/Sources/SparkConnect/DataFrameStatFunctions.swift
@@ -20,7 +20,7 @@
 /// Statistic functions for ``DataFrame``s.
 ///
 /// Use ``DataFrame/stat`` to access this. It mirrors PySpark's 
`DataFrameStatFunctions`
-/// (`df.stat.cov`, `df.stat.corr`).
+/// (`df.stat.crosstab`, `df.stat.cov`, `df.stat.corr`).
 public actor DataFrameStatFunctions: Sendable {
   let df: DataFrame
 
@@ -28,6 +28,22 @@ public actor DataFrameStatFunctions: Sendable {
     self.df = df
   }
 
+  /// Computes a pair-wise frequency table of the given columns. Also known as 
a contingency table.
+  /// The number of distinct values for each column should be less than `1e4`. 
At most `1e6` non-zero
+  /// pair frequencies will be returned. The first column of each row will be 
the distinct values of
+  /// `col1` and the column names will be the distinct values of `col2`. The 
name of the first column
+  /// will be `<col1>_<col2>`. Counts will be returned as `Long`s. Pairs that 
have no occurrences will
+  /// have zero as their counts.
+  /// - Parameters:
+  ///   - col1: The name of the first column. Distinct items will make the 
first item of each row.
+  ///   - col2: The name of the second column. Distinct items will make the 
column names of the ``DataFrame``.
+  /// - Returns: A ``DataFrame`` containing the contingency table.
+  public func crosstab(_ col1: String, _ col2: String) async throws -> 
DataFrame {
+    let plan = await df.getPlan() as! Plan
+    return DataFrame(
+      spark: await df.spark, plan: 
SparkConnectClient.getStatCrosstab(plan.root, col1, col2))
+  }
+
   /// Calculates the sample covariance of two numerical columns of a 
``DataFrame``.
   /// - Parameters:
   ///   - col1: The name of the first column.
diff --git a/Sources/SparkConnect/SparkConnectClient.swift 
b/Sources/SparkConnect/SparkConnectClient.swift
index e43fe34..d243fd0 100644
--- a/Sources/SparkConnect/SparkConnectClient.swift
+++ b/Sources/SparkConnect/SparkConnectClient.swift
@@ -612,6 +612,14 @@ public actor SparkConnectClient {
     return createPlan { $0.summary = summary }
   }
 
+  static func getStatCrosstab(_ child: Relation, _ col1: String, _ col2: 
String) -> Plan {
+    var crosstab = Spark_Connect_StatCrosstab()
+    crosstab.input = child
+    crosstab.col1 = col1
+    crosstab.col2 = col2
+    return createPlan { $0.crosstab = crosstab }
+  }
+
   static func getStatCov(_ child: Relation, _ col1: String, _ col2: String) -> 
Plan {
     var cov = Spark_Connect_StatCov()
     cov.input = child
diff --git a/Tests/SparkConnectTests/DataFrameStatFunctionsTests.swift 
b/Tests/SparkConnectTests/DataFrameStatFunctionsTests.swift
index 54a0315..3c9918e 100644
--- a/Tests/SparkConnectTests/DataFrameStatFunctionsTests.swift
+++ b/Tests/SparkConnectTests/DataFrameStatFunctionsTests.swift
@@ -23,6 +23,21 @@ import Testing
 /// A test suite for `DataFrameStatFunctions`
 @Suite(.serialized)
 struct DataFrameStatFunctionsTests {
+  @Test
+  func crosstab() async throws {
+    let spark = try await SparkSession.builder.getOrCreate()
+    let df = try await spark.sql("SELECT * FROM VALUES (1, 1), (1, 2), (2, 1), 
(2, 1) AS T(c1, c2)")
+    let ct = try await df.stat.crosstab("c1", "c2")
+    let columns = try await ct.columns
+    // The name of the first column is `<col1>_<col2>`.
+    #expect(columns[0] == "c1_c2")
+    // The remaining column names are the distinct values of `col2`.
+    #expect(Set(columns.dropFirst()) == ["1", "2"])
+    // One row per distinct value of `col1`.
+    #expect(try await ct.count() == 2)
+    await spark.stop()
+  }
+
   @Test
   func cov() async throws {
     let spark = try await SparkSession.builder.getOrCreate()


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to