This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 2ddb6be4724 [SPARK-46298][PYTHON][CONNECT] Match deprecation warning, test case, and error of Catalog.createExternalTable
2ddb6be4724 is described below

commit 2ddb6be472431feceecd3daece8bafc8c80d7eb1
Author: Hyukjin Kwon <gurwls...@apache.org>
AuthorDate: Thu Dec 7 14:00:41 2023 +0900

    [SPARK-46298][PYTHON][CONNECT] Match deprecation warning, test case, and error of Catalog.createExternalTable
    
    ### What changes were proposed in this pull request?
    
    This PR adds tests for catalog error cases for `createExternalTable`.
    
    Also, this PR includes several minor cleanups:
    - Show a deprecation warning for `spark.catalog.createExternalTable` (to match the non-Spark Connect implementation); a rough usage sketch follows this list.
    - Remove `_reset` from `Catalog`, which is not used anywhere.
    - Switch the Spark Connect implementation of `spark.catalog.createExternalTable` to directly call `spark.catalog.createTable`, and remove the corresponding Python protobuf plan definition.
      - This PR does not remove the protobuf message definition itself, due to potential compatibility concerns.
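
    As an illustration of the resulting behavior, here is a rough sketch (not part of this change): it assumes an active SparkSession named `spark`, and only exercises the deprecation warning plus the schema type check, so nothing is actually created. The table name `bad_tbl` is hypothetical.

        import warnings
        from pyspark.errors import PySparkTypeError
        from pyspark.sql.types import IntegerType

        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")
            try:
                # Deprecated entry point: warns, then delegates to createTable.
                spark.catalog.createExternalTable("bad_tbl", schema=IntegerType())
            except PySparkTypeError as e:
                # createTable rejects a non-StructType schema up front (NOT_STRUCT).
                print(e)

        # The FutureWarning from createExternalTable was recorded before the error.
        assert any(issubclass(w.category, FutureWarning) for w in caught)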
    
    ### Why are the changes needed?
    
    - For feature parity.
    - To improve the test coverage.
        See https://app.codecov.io/gh/apache/spark/commit/1a651753f4e760643d719add3b16acd311454c76/blob/python/pyspark/sql/catalog.py
    
    These error cases are not currently tested.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Virtually no (except the ones described above).
    
    ### How was this patch tested?
    
    Manually ran the new unittest.
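
    As a minimal sketch, assuming a local dev environment where `pyspark` and its test dependencies are importable (the project's usual dev scripts remain the canonical way to run tests), the updated module can also be run directly with the standard library test runner:

        import unittest

        # Load and run everything in the updated test module.
        suite = unittest.defaultTestLoader.loadTestsFromName("pyspark.sql.tests.test_catalog")
        unittest.TextTestRunner(verbosity=2).run(suite)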
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No.
    
    Closes #44226 from HyukjinKwon/SPARK-46298.
    
    Authored-by: Hyukjin Kwon <gurwls...@apache.org>
    Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
---
 python/pyspark/sql/catalog.py            |  8 --------
 python/pyspark/sql/connect/catalog.py    | 22 ++++++++++++---------
 python/pyspark/sql/connect/plan.py       | 34 --------------------------------
 python/pyspark/sql/tests/test_catalog.py |  9 ++++++++-
 4 files changed, 21 insertions(+), 52 deletions(-)

diff --git a/python/pyspark/sql/catalog.py b/python/pyspark/sql/catalog.py
index b5337734b3b..6595659a4da 100644
--- a/python/pyspark/sql/catalog.py
+++ b/python/pyspark/sql/catalog.py
@@ -1237,14 +1237,6 @@ class Catalog:
         """
         self._jcatalog.refreshByPath(path)
 
-    def _reset(self) -> None:
-        """(Internal use only) Drop all existing databases (except "default"), 
tables,
-        partitions and functions, and set the current database to "default".
-
-        This is mainly used for tests.
-        """
-        self._jsparkSession.sessionState().catalog().reset()
-
 
 def _test() -> None:
     import os
diff --git a/python/pyspark/sql/connect/catalog.py b/python/pyspark/sql/connect/catalog.py
index 9143a03d324..ef1bff9d28c 100644
--- a/python/pyspark/sql/connect/catalog.py
+++ b/python/pyspark/sql/connect/catalog.py
@@ -14,6 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+from pyspark.errors import PySparkTypeError
 from pyspark.sql.connect.utils import check_dependencies
 
 check_dependencies(__name__)
@@ -215,16 +216,11 @@ class Catalog:
         schema: Optional[StructType] = None,
         **options: str,
     ) -> DataFrame:
-        catalog = plan.CreateExternalTable(
-            table_name=tableName,
-            path=path,  # type: ignore[arg-type]
-            source=source,
-            schema=schema,
-            options=options,
+        warnings.warn(
+            "createExternalTable is deprecated since Spark 4.0, please use 
createTable instead.",
+            FutureWarning,
         )
-        df = DataFrame(catalog, session=self._sparkSession)
-        df._to_table()  # Eager execution.
-        return df
+        return self.createTable(tableName, path, source, schema, **options)
 
     createExternalTable.__doc__ = PySparkCatalog.createExternalTable.__doc__
 
@@ -237,6 +233,14 @@ class Catalog:
         description: Optional[str] = None,
         **options: str,
     ) -> DataFrame:
+        if schema is not None and not isinstance(schema, StructType):
+            raise PySparkTypeError(
+                error_class="NOT_STRUCT",
+                message_parameters={
+                    "arg_name": "schema",
+                    "arg_type": type(schema).__name__,
+                },
+            )
         catalog = plan.CreateTable(
             table_name=tableName,
             path=path,  # type: ignore[arg-type]
diff --git a/python/pyspark/sql/connect/plan.py b/python/pyspark/sql/connect/plan.py
index 67a33c2b6cf..cdc06b0f31c 100644
--- a/python/pyspark/sql/connect/plan.py
+++ b/python/pyspark/sql/connect/plan.py
@@ -1997,40 +1997,6 @@ class FunctionExists(LogicalPlan):
         return plan
 
 
-class CreateExternalTable(LogicalPlan):
-    def __init__(
-        self,
-        table_name: str,
-        path: str,
-        source: Optional[str] = None,
-        schema: Optional[DataType] = None,
-        options: Mapping[str, str] = {},
-    ) -> None:
-        super().__init__(None)
-        self._table_name = table_name
-        self._path = path
-        self._source = source
-        self._schema = schema
-        self._options = options
-
-    def plan(self, session: "SparkConnectClient") -> proto.Relation:
-        plan = self._create_proto_relation()
-        plan.catalog.create_external_table.table_name = self._table_name
-        if self._path is not None:
-            plan.catalog.create_external_table.path = self._path
-        if self._source is not None:
-            plan.catalog.create_external_table.source = self._source
-        if self._schema is not None:
-            plan.catalog.create_external_table.schema.CopyFrom(
-                pyspark_types_to_proto_types(self._schema)
-            )
-        for k in self._options.keys():
-            v = self._options.get(k)
-            if v is not None:
-                plan.catalog.create_external_table.options[k] = v
-        return plan
-
-
 class CreateTable(LogicalPlan):
     def __init__(
         self,
diff --git a/python/pyspark/sql/tests/test_catalog.py b/python/pyspark/sql/tests/test_catalog.py
index b72172a402b..278fbbb2ba5 100644
--- a/python/pyspark/sql/tests/test_catalog.py
+++ b/python/pyspark/sql/tests/test_catalog.py
@@ -15,7 +15,7 @@
 # limitations under the License.
 #
 from pyspark import StorageLevel
-from pyspark.errors import AnalysisException
+from pyspark.errors import AnalysisException, PySparkTypeError
 from pyspark.sql.types import StructType, StructField, IntegerType
 from pyspark.testing.sqlutils import ReusedSQLTestCase
 
@@ -81,6 +81,13 @@ class CatalogTestsMixin:
 
                     schema = StructType([StructField("a", IntegerType(), True)])
                     description = "this a table created via 
Catalog.createTable()"
+
+                    with self.assertRaisesRegex(PySparkTypeError, "should be a struct type"):
+                        # Test deprecated API and negative error case.
+                        spark.catalog.createExternalTable(
+                            "invalid_table_creation", schema=IntegerType(), 
description=description
+                        )
+
                     spark.catalog.createTable(
                         "tab3_via_catalog", schema=schema, 
description=description
                     )


