This is an automated email from the ASF dual-hosted git repository.

dongjoon-hyun pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/spark-connect-swift.git


The following commit(s) were added to refs/heads/main by this push:
     new 972244e  [SPARK-57060] Support `analyzeTable` in `Catalog`
972244e is described below

commit 972244e9c7cfd0c6ad4d641f3801d8425cfc098b
Author: Dongjoon Hyun <[email protected]>
AuthorDate: Mon May 25 18:33:58 2026 -0700

    [SPARK-57060] Support `analyzeTable` in `Catalog`
    
    ### What changes were proposed in this pull request?
    
    This PR aims to support the `Spark_Connect_AnalyzeTable` message added in 
Apache Spark Connect 4.2.0-preview5.
    - https://github.com/apache/spark/pull/55025
    
    ### Why are the changes needed?
    
    For feature parity with Spark Connect.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Pass the CIs with the newly added test case.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    Generated-by: Claude Opus 4.7
    
    Closes #388 from dongjoon-hyun/SPARK-57060.
    
    Authored-by: Dongjoon Hyun <[email protected]>
    Signed-off-by: Dongjoon Hyun <[email protected]>
---
 Sources/SparkConnect/Catalog.swift         | 16 ++++++++++++++++
 Tests/SparkConnectTests/CatalogTests.swift | 18 ++++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/Sources/SparkConnect/Catalog.swift 
b/Sources/SparkConnect/Catalog.swift
index bb3f617..eea6755 100644
--- a/Sources/SparkConnect/Catalog.swift
+++ b/Sources/SparkConnect/Catalog.swift
@@ -584,6 +584,22 @@ public actor Catalog: Sendable {
     try await df.count()
   }
 
+  /// Analyzes the given table to compute statistics that can be used by the 
query optimizer.
+  /// - Parameters:
+  ///   - tableName: A qualified or unqualified name that designates a table.
+  ///   - noScan: If true, only the table's size in bytes is collected 
without scanning the data.
+  public func analyzeTable(_ tableName: String, noScan: Bool = false) async 
throws {
+    let df = getDataFrame({
+      var analyzeTable = Spark_Connect_AnalyzeTable()
+      analyzeTable.tableName = tableName
+      analyzeTable.noScan = noScan
+      var catalog = Spark_Connect_Catalog()
+      catalog.catType = .analyzeTable(analyzeTable)
+      return catalog
+    })
+    try await df.count()
+  }
+
   /// Invalidates and refreshes all the cached data (and the associated 
metadata) for any ``DataFrame``
   /// that contains the given data source path. Path matching is by checking 
for sub-directories,
   /// i.e. "/" would invalidate everything that is cached and "/test/parent" 
would invalidate
diff --git a/Tests/SparkConnectTests/CatalogTests.swift 
b/Tests/SparkConnectTests/CatalogTests.swift
index 1456bba..eba3e9e 100644
--- a/Tests/SparkConnectTests/CatalogTests.swift
+++ b/Tests/SparkConnectTests/CatalogTests.swift
@@ -608,6 +608,24 @@ struct CatalogTests {
     await spark.stop()
   }
 
+  @Test
+  func analyzeTable() async throws {
+    let spark = try await SparkSession.builder.getOrCreate()
+    if await spark.version >= "4.2" {
+      let tableName = "TABLE_" + UUID().uuidString.replacingOccurrences(of: 
"-", with: "")
+      try await SQLHelper.withTable(spark, tableName)({
+        try await spark.range(10).write.saveAsTable(tableName)
+        try await spark.catalog.analyzeTable(tableName)
+        try await spark.catalog.analyzeTable(tableName, noScan: true)
+      })
+
+      try await #require(throws: SparkConnectError.TableOrViewNotFound) {
+        try await spark.catalog.analyzeTable("not_exist_table")
+      }
+    }
+    await spark.stop()
+  }
+
   @Test
   func refreshByPath() async throws {
     let spark = try await SparkSession.builder.getOrCreate()


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to