(spark-connect-swift) branch main updated: [SPARK-57308] Support `stat.approxQuantile` for `DataFrame`

dongjoon Sun, 07 Jun 2026 18:45:02 -0700

This is an automated email from the ASF dual-hosted git repository.

dongjoon-hyun pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/spark-connect-swift.git



The following commit(s) were added to refs/heads/main by this push:
     new d0a7ab4  [SPARK-57308] Support `stat.approxQuantile` for `DataFrame`
d0a7ab4 is described below

commit d0a7ab491f2ec29d9a0dca40dc00a2157e86766a
Author: Dongjoon Hyun <[email protected]>
AuthorDate: Sun Jun 7 18:44:44 2026 -0700

    [SPARK-57308] Support `stat.approxQuantile` for `DataFrame`
    
    ### What changes were proposed in this pull request?
    
    This PR aims to support `approxQuantile` for `DataFrame` by wiring the
    `StatApproxQuantile` Spark Connect relation through `DataFrameStatFunctions`,
    exposed via `DataFrame.stat` like PySpark/Scala.
    
    ```swift
    public func approxQuantile(_ col: String, _ probabilities: [Double], _ relativeError: Double) async throws -> [Double]
    public func approxQuantile(_ cols: [String], _ probabilities: [Double], _ relativeError: Double) async throws -> [[Double]]
    ```
    
    ### Why are the changes needed?
    
    To improve API coverage by mirroring PySpark/Scala `DataFrameStatFunctions`.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, this PR adds a new API, `DataFrame.stat.approxQuantile`.
    
    ### How was this patch tested?
    
    Pass the CIs with a newly added test in `DataFrameStatFunctionsTests`.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    Generated-by: Claude Code (Claude Opus 4.8)
    
    Closes #410 from dongjoon-hyun/SPARK-57308.
    
    Authored-by: Dongjoon Hyun <[email protected]>
    Signed-off-by: Dongjoon Hyun <[email protected]>
---
 Sources/SparkConnect/DataFrameStatFunctions.swift  | 38 ++++++++++++++++++++--
 Sources/SparkConnect/SparkConnectClient.swift      | 11 +++++++
 .../DataFrameStatFunctionsTests.swift              | 13 ++++++++
 3 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/Sources/SparkConnect/DataFrameStatFunctions.swift 
b/Sources/SparkConnect/DataFrameStatFunctions.swift
index 3a86009..29a9ecc 100644
--- a/Sources/SparkConnect/DataFrameStatFunctions.swift
+++ b/Sources/SparkConnect/DataFrameStatFunctions.swift
@@ -19,8 +19,7 @@
 
 /// Statistic functions for ``DataFrame``s.
 ///
-/// Use ``DataFrame/stat`` to access this. It mirrors PySpark's 
`DataFrameStatFunctions`
-/// (`df.stat.crosstab`, `df.stat.cov`, `df.stat.corr`).
+/// Use ``DataFrame/stat`` to access this. It mirrors PySpark's 
`DataFrameStatFunctions`.
 public actor DataFrameStatFunctions: Sendable {
   let df: DataFrame
 
@@ -66,6 +65,41 @@ public actor DataFrameStatFunctions: Sendable {
     return try await collectDouble { SparkConnectClient.getStatCorr($0, col1, 
col2, method) }
   }
 
+  /// Calculates the approximate quantiles of a numerical column of a 
``DataFrame``.
+  /// - Parameters:
+  ///   - col: The name of the numerical column.
+  ///   - probabilities: A list of quantile probabilities. Each number must 
belong to `[0, 1]`.
+  ///     For example, 0 is the minimum, 0.5 is the median, 1 is the maximum.
+  ///   - relativeError: The relative target precision to achieve (greater 
than or equal to 0).
+  ///     If set to zero, the exact quantiles are computed, which could be 
very expensive. Note that
+  ///     values greater than 1 are accepted but give the same result as 1.
+  /// - Returns: The approximate quantiles at the given probabilities.
+  public func approxQuantile(
+    _ col: String, _ probabilities: [Double], _ relativeError: Double
+  ) async throws -> [Double] {
+    return try await approxQuantile([col], probabilities, relativeError)[0]
+  }
+
+  /// Calculates the approximate quantiles of numerical columns of a 
``DataFrame``.
+  /// - Parameters:
+  ///   - cols: The names of the numerical columns.
+  ///   - probabilities: A list of quantile probabilities. Each number must 
belong to `[0, 1]`.
+  ///     For example, 0 is the minimum, 0.5 is the median, 1 is the maximum.
+  ///   - relativeError: The relative target precision to achieve (greater 
than or equal to 0).
+  ///     If set to zero, the exact quantiles are computed, which could be 
very expensive. Note that
+  ///     values greater than 1 are accepted but give the same result as 1.
+  /// - Returns: The approximate quantiles at the given probabilities of each 
column.
+  public func approxQuantile(
+    _ cols: [String], _ probabilities: [Double], _ relativeError: Double
+  ) async throws -> [[Double]] {
+    let plan = await df.getPlan() as! Plan
+    let result = DataFrame(
+      spark: await df.spark,
+      plan: SparkConnectClient.getStatApproxQuantile(plan.root, cols, 
probabilities, relativeError))
+    let quantilesPerColumn = try await result.collect()[0].get(0) as! [any 
Sendable]
+    return quantilesPerColumn.map { ($0 as! [any Sendable]).map { $0 as! 
Double } }
+  }
+
   // MARK: - Helpers
 
   /// Builds a single-value ``DataFrame`` from this ``DataFrame``'s plan using 
the given plan
diff --git a/Sources/SparkConnect/SparkConnectClient.swift 
b/Sources/SparkConnect/SparkConnectClient.swift
index d243fd0..a491c42 100644
--- a/Sources/SparkConnect/SparkConnectClient.swift
+++ b/Sources/SparkConnect/SparkConnectClient.swift
@@ -639,6 +639,17 @@ public actor SparkConnectClient {
     return createPlan { $0.corr = corr }
   }
 
+  static func getStatApproxQuantile(
+    _ child: Relation, _ cols: [String], _ probabilities: [Double], _ 
relativeError: Double
+  ) -> Plan {
+    var approxQuantile = Spark_Connect_StatApproxQuantile()
+    approxQuantile.input = child
+    approxQuantile.cols = cols
+    approxQuantile.probabilities = probabilities
+    approxQuantile.relativeError = relativeError
+    return createPlan { $0.approxQuantile = approxQuantile }
+  }
+
   static func getSort(_ child: Relation, _ cols: [String]) -> Plan {
     var sort = Sort()
     sort.input = child
diff --git a/Tests/SparkConnectTests/DataFrameStatFunctionsTests.swift 
b/Tests/SparkConnectTests/DataFrameStatFunctionsTests.swift
index 3c9918e..bda3b55 100644
--- a/Tests/SparkConnectTests/DataFrameStatFunctionsTests.swift
+++ b/Tests/SparkConnectTests/DataFrameStatFunctionsTests.swift
@@ -57,4 +57,17 @@ struct DataFrameStatFunctionsTests {
     #expect(try await df.stat.corr("c1", "c2", method: "pearson") == 1.0)
     await spark.stop()
   }
+
+  @Test
+  func approxQuantile() async throws {
+    let spark = try await SparkSession.builder.getOrCreate()
+    let df = try await spark.sql(
+      "SELECT * FROM VALUES (1, 10), (2, 20), (3, 30), (4, 40), (5, 50) AS 
T(c1, c2)")
+    // Single column: exact quantiles (relativeError 0) at min, median, max.
+    #expect(try await df.stat.approxQuantile("c1", [0.0, 0.5, 1.0], 0.0) == 
[1.0, 3.0, 5.0])
+    // Multiple columns: one array of quantiles per column.
+    let quantiles = try await df.stat.approxQuantile(["c1", "c2"], [0.0, 0.5, 
1.0], 0.0)
+    #expect(quantiles == [[1.0, 3.0, 5.0], [10.0, 30.0, 50.0]])
+    await spark.stop()
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(spark-connect-swift) branch main updated: [SPARK-57308] Support `stat.approxQuantile` for `DataFrame`

Reply via email to