This is an automated email from the ASF dual-hosted git repository.

dongjoon-hyun pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/spark-connect-swift.git


The following commit(s) were added to refs/heads/main by this push:
     new 112f838  [SPARK-57310] Support `stat.freqItems` for `DataFrame`
112f838 is described below

commit 112f838bcf58c955e57c11fae6e726e849caf213
Author: Dongjoon Hyun <[email protected]>
AuthorDate: Sun Jun 7 18:51:55 2026 -0700

    [SPARK-57310] Support `stat.freqItems` for `DataFrame`
    
    ### What changes were proposed in this pull request?
    
    This PR aims to support `freqItems` for `DataFrame` by wiring the 
`StatFreqItems`
    Spark Connect relation through `DataFrameStatFunctions`, exposed via 
`DataFrame.stat`
    like PySpark/Scala.
    
    ```swift
    public func freqItems(_ cols: [String], support: Double = 0.01) async 
throws -> DataFrame
    ```
    
    Like `stat.crosstab`, `freqItems` returns a `DataFrame` whose output 
columns are named
    `<column>_freqItems`. The optional `support` (the minimum frequency for an 
item to be
    considered frequent) defaults to `0.01`, matching PySpark/Scala.
    
    ### Why are the changes needed?
    
    To improve API coverage by mirroring PySpark/Scala `DataFrameStatFunctions`.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, this PR adds a new API, `DataFrame.stat.freqItems`.
    
    ### How was this patch tested?
    
    Pass the CIs with a newly added test, 
`DataFrameStatFunctionsTests.freqItems`.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    Generated-by: Claude Code (Claude Opus 4.8)
    
    This patch had conflicts when merged, resolved by
    Committer: Dongjoon Hyun <[email protected]>
    
    Closes #412 from dongjoon-hyun/SPARK-57310.
    
    Authored-by: Dongjoon Hyun <[email protected]>
    Signed-off-by: Dongjoon Hyun <[email protected]>
---
 Sources/SparkConnect/DataFrameStatFunctions.swift         | 15 +++++++++++++++
 Sources/SparkConnect/SparkConnectClient.swift             |  8 ++++++++
 Tests/SparkConnectTests/DataFrameStatFunctionsTests.swift | 12 ++++++++++++
 3 files changed, 35 insertions(+)

diff --git a/Sources/SparkConnect/DataFrameStatFunctions.swift 
b/Sources/SparkConnect/DataFrameStatFunctions.swift
index cfdf356..4d65ff4 100644
--- a/Sources/SparkConnect/DataFrameStatFunctions.swift
+++ b/Sources/SparkConnect/DataFrameStatFunctions.swift
@@ -127,6 +127,21 @@ public actor DataFrameStatFunctions: Sendable {
     return await sampleBy(col, fractions, Int64.random(in: 
Int64.min...Int64.max))
   }
 
+  /// Finds frequent items for columns, possibly with false positives. Uses 
the frequent element
+  /// count algorithm described in "https://doi.org/10.1145/762471.762473", 
proposed by Karp,
+  /// Schenker, and Papadimitriou.
+  /// - Parameters:
+  ///   - cols: The names of the columns to search frequent items in.
+  ///   - support: The minimum frequency for an item to be considered 
`frequent`. Should be greater
+  ///     than 1e-4.
+  /// - Returns: A ``DataFrame`` with the frequent items for each column. The 
output columns are
+  ///   named `{column}_freqItems`.
+  public func freqItems(_ cols: [String], support: Double = 0.01) async throws 
-> DataFrame {
+    let plan = await df.getPlan() as! Plan
+    return DataFrame(
+      spark: await df.spark, plan: SparkConnectClient.getFreqItems(plan.root, 
cols, support))
+  }
+
   // MARK: - Helpers
 
   /// Builds a single-value ``DataFrame`` from this ``DataFrame``'s plan using 
the given plan
diff --git a/Sources/SparkConnect/SparkConnectClient.swift 
b/Sources/SparkConnect/SparkConnectClient.swift
index 89f32eb..d424a63 100644
--- a/Sources/SparkConnect/SparkConnectClient.swift
+++ b/Sources/SparkConnect/SparkConnectClient.swift
@@ -668,6 +668,14 @@ public actor SparkConnectClient {
     return createPlan { $0.sampleBy = sampleBy }
   }
 
+  static func getFreqItems(_ child: Relation, _ cols: [String], _ support: 
Double) -> Plan {
+    var freqItems = Spark_Connect_StatFreqItems()
+    freqItems.input = child
+    freqItems.cols = cols
+    freqItems.support = support
+    return createPlan { $0.freqItems = freqItems }
+  }
+
   static func getSort(_ child: Relation, _ cols: [String]) -> Plan {
     var sort = Sort()
     sort.input = child
diff --git a/Tests/SparkConnectTests/DataFrameStatFunctionsTests.swift 
b/Tests/SparkConnectTests/DataFrameStatFunctionsTests.swift
index 578a354..56c95a7 100644
--- a/Tests/SparkConnectTests/DataFrameStatFunctionsTests.swift
+++ b/Tests/SparkConnectTests/DataFrameStatFunctionsTests.swift
@@ -85,4 +85,16 @@ struct DataFrameStatFunctionsTests {
     #expect(try await df.stat.sampleBy("key", [0: 1.0, 1: 1.0, 2: 
1.0]).count() == 99)
     await spark.stop()
   }
+
+  @Test
+  func freqItems() async throws {
+    let spark = try await SparkSession.builder.getOrCreate()
+    let df = try await spark.sql("SELECT * FROM VALUES (1, 2), (1, 2), (1, 2) 
AS T(a, b)")
+    // The result is a single-row `DataFrame` whose columns are named 
`{column}_freqItems`.
+    #expect(try await df.stat.freqItems(["a", "b"]).columns == ["a_freqItems", 
"b_freqItems"])
+    #expect(try await df.stat.freqItems(["a", "b"]).collect() == 
[Row(Array([1]), Array([2]))])
+    // `support` can be specified explicitly.
+    #expect(try await df.stat.freqItems(["a"], support: 0.5).collect() == 
[Row(Array([1]))])
+    await spark.stop()
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to