This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 36dd531a93a [SPARK-40051][PYTHON][SQL][DOCS] Make pyspark.sql.catalog 
examples self-contained
36dd531a93a is described below

commit 36dd531a93af55ce5c2bfd8d275814ccb2846962
Author: Hyukjin Kwon <[email protected]>
AuthorDate: Mon Aug 15 08:52:54 2022 +0900

    [SPARK-40051][PYTHON][SQL][DOCS] Make pyspark.sql.catalog examples 
self-contained
    
    ### What changes were proposed in this pull request?
    
    This PR proposes to improve the examples in `pyspark.sql.catalog` by making 
each example self-contained with a brief explanation and a bit more realistic 
example.
    
    ### Why are the changes needed?
    
    To make the documentation more readable and able to copy and paste directly 
in PySpark shell.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, it changes the documentation
    
    ### How was this patch tested?
    
    Manually ran each doctests. CI also runs this.
    
    Closes #37490 from HyukjinKwon/SPARK-40051.
    
    Lead-authored-by: Hyukjin Kwon <[email protected]>
    Co-authored-by: Hyukjin Kwon <[email protected]>
    Signed-off-by: Hyukjin Kwon <[email protected]>
---
 python/pyspark/sql/catalog.py | 668 +++++++++++++++++++++++++++++++++---------
 1 file changed, 536 insertions(+), 132 deletions(-)

diff --git a/python/pyspark/sql/catalog.py b/python/pyspark/sql/catalog.py
index 10c9ab5f6d2..4a49ef1fa04 100644
--- a/python/pyspark/sql/catalog.py
+++ b/python/pyspark/sql/catalog.py
@@ -19,7 +19,6 @@ import sys
 import warnings
 from typing import Any, Callable, NamedTuple, List, Optional, TYPE_CHECKING
 
-from pyspark import since
 from pyspark.sql.dataframe import DataFrame
 from pyspark.sql.session import SparkSession
 from pyspark.sql.types import StructType
@@ -108,6 +107,10 @@ class Catalog:
         ----------
         catalogName : str
             name of the catalog to set
+
+        Examples
+        --------
+        >>> spark.catalog.setCurrentCatalog("spark_catalog")
         """
         return self._jcatalog.setCurrentCatalog(catalogName)
 
@@ -115,6 +118,11 @@ class Catalog:
         """Returns a list of catalogs in this session.
 
         .. versionadded:: 3.4.0
+
+        Returns
+        -------
+        list
+            A list of :class:`CatalogMetadata`.
         """
         iter = self._jcatalog.listCatalogs().toLocalIterator()
         catalogs = []
@@ -123,19 +131,52 @@ class Catalog:
             catalogs.append(CatalogMetadata(name=jcatalog.name, 
description=jcatalog.description))
         return catalogs
 
-    @since(2.0)
     def currentDatabase(self) -> str:
-        """Returns the current default database in this session."""
+        """
+        Returns the current default database in this session.
+
+        .. versionadded:: 2.0.0
+
+        Returns
+        -------
+        str
+            The current default database name.
+
+        Examples
+        --------
+        >>> spark.catalog.currentDatabase()
+        'default'
+        """
         return self._jcatalog.currentDatabase()
 
-    @since(2.0)
     def setCurrentDatabase(self, dbName: str) -> None:
-        """Sets the current default database in this session."""
+        """
+        Sets the current default database in this session.
+
+        .. versionadded:: 2.0.0
+
+        Examples
+        --------
+        >>> spark.catalog.setCurrentDatabase("default")
+        """
         return self._jcatalog.setCurrentDatabase(dbName)
 
-    @since(2.0)
     def listDatabases(self) -> List[Database]:
-        """Returns a list of databases available across all sessions."""
+        """
+        Returns a list of databases available across all sessions.
+
+        .. versionadded:: 2.0.0
+
+        Returns
+        -------
+        list
+            A list of :class:`Database`.
+
+        Examples
+        --------
+        >>> spark.catalog.listDatabases()
+        [Database(name='default', catalog='spark_catalog', 
description='default database', ...
+        """
         iter = self._jcatalog.listDatabases().toLocalIterator()
         databases = []
         while iter.hasNext():
@@ -159,12 +200,20 @@ class Catalog:
         Parameters
         ----------
         dbName : str
-             name of the database to check existence.
+             name of the database to get.
+
+        Returns
+        -------
+        :class:`Database`
+            The database found by the name.
 
         Examples
         --------
         >>> spark.catalog.getDatabase("default")
         Database(name='default', catalog='spark_catalog', description='default 
database', ...
+
+        Using the fully qualified name with the catalog name.
+
         >>> spark.catalog.getDatabase("spark_catalog.default")
         Database(name='default', catalog='spark_catalog', description='default 
database', ...
         """
@@ -184,38 +233,66 @@ class Catalog:
         Parameters
         ----------
         dbName : str
-             name of the database to check existence
+            name of the database to check existence
+
+            .. versionchanged:: 3.4.0
+               Allow ``dbName`` to be qualified with catalog name.
 
         Returns
         -------
         bool
             Indicating whether the database exists
 
-        .. versionchanged:: 3.4
-           Allowed ``dbName`` to be qualified with catalog name.
-
         Examples
         --------
+        Check if 'test_new_database' database exists
+
         >>> spark.catalog.databaseExists("test_new_database")
         False
-        >>> df = spark.sql("CREATE DATABASE test_new_database")
+        >>> _ = spark.sql("CREATE DATABASE test_new_database")
         >>> spark.catalog.databaseExists("test_new_database")
         True
+
+        Using the fully qualified name with the catalog name.
+
         >>> spark.catalog.databaseExists("spark_catalog.test_new_database")
         True
-        >>> df = spark.sql("DROP DATABASE test_new_database")
+        >>> _ = spark.sql("DROP DATABASE test_new_database")
         """
         return self._jcatalog.databaseExists(dbName)
 
-    @since(2.0)
     def listTables(self, dbName: Optional[str] = None) -> List[Table]:
         """Returns a list of tables/views in the specified database.
 
-        If no database is specified, the current database is used.
-        This includes all temporary views.
+        .. versionadded:: 2.0.0
+
+        Parameters
+        ----------
+        dbName : str
+            name of the database to list the tables.
+
+            .. versionchanged:: 3.4.0
+               Allow ``dbName`` to be qualified with catalog name.
+
+        Returns
+        -------
+        list
+            A list of :class:`Table`.
+
+        Notes
+        -----
+        If no database is specified, the current database and catalog
+        are used. This API includes all temporary views.
 
-        .. versionchanged:: 3.4
-           Allowed ``dbName`` to be qualified with catalog name.
+        Examples
+        --------
+        >>> spark.range(1).createTempView("test_view")
+        >>> spark.catalog.listTables()
+        [Table(name='test_view', catalog=None, namespace=[], description=None, 
...
+
+        >>> _ = spark.catalog.dropTempView("test_view")
+        >>> spark.catalog.listTables()
+        []
         """
         if dbName is None:
             dbName = self.currentDatabase()
@@ -251,19 +328,34 @@ class Catalog:
         Parameters
         ----------
         tableName : str
-                    name of the table to check existence.
+            name of the table to get.
+
+            .. versionchanged:: 3.4.0
+               Allow `tableName` to be qualified with catalog name.
+
+        Returns
+        -------
+        :class:`Table`
+            The table found by the name.
 
         Examples
         --------
-        >>> df = spark.sql("CREATE TABLE tab1 (name STRING, age INT) USING 
parquet")
-        >>> spark.catalog.getTable("tab1")
-        Table(name='tab1', catalog='spark_catalog', namespace=['default'], ...
-        >>> spark.catalog.getTable("default.tab1")
-        Table(name='tab1', catalog='spark_catalog', namespace=['default'], ...
-        >>> spark.catalog.getTable("spark_catalog.default.tab1")
-        Table(name='tab1', catalog='spark_catalog', namespace=['default'], ...
-        >>> df = spark.sql("DROP TABLE tab1")
-        >>> spark.catalog.getTable("tab1")
+        >>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
+        >>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING 
parquet")
+        >>> spark.catalog.getTable("tbl1")
+        Table(name='tbl1', catalog='spark_catalog', namespace=['default'], ...
+
+        Using the fully qualified name with the catalog name.
+
+        >>> spark.catalog.getTable("default.tbl1")
+        Table(name='tbl1', catalog='spark_catalog', namespace=['default'], ...
+        >>> spark.catalog.getTable("spark_catalog.default.tbl1")
+        Table(name='tbl1', catalog='spark_catalog', namespace=['default'], ...
+        >>> _ = spark.sql("DROP TABLE tbl1")
+
+        Throw an analysis exception when the table does not exist.
+
+        >>> spark.catalog.getTable("tbl1")
         Traceback (most recent call last):
             ...
         pyspark.sql.utils.AnalysisException: ...
@@ -283,15 +375,32 @@ class Catalog:
             isTemporary=jtable.isTemporary(),
         )
 
-    @since(2.0)
     def listFunctions(self, dbName: Optional[str] = None) -> List[Function]:
-        """Returns a list of functions registered in the specified database.
+        """
+        Returns a list of functions registered in the specified database.
+
+        .. versionadded:: 3.4.0
 
-        If no database is specified, the current database is used.
-        This includes all temporary functions.
+        Parameters
+        ----------
+        dbName : str
+            name of the database to list the functions.
+            ``dbName`` can be qualified with catalog name.
 
-        .. versionchanged:: 3.4
-           Allowed ``dbName`` to be qualified with catalog name.
+        Returns
+        -------
+        list
+            A list of :class:`Function`.
+
+        Notes
+        -----
+        If no database is specified, the current database and catalog
+        are used. This API includes all temporary functions.
+
+        Examples
+        --------
+        >>> spark.catalog.listFunctions()
+        [Function(name=...
         """
         if dbName is None:
             dbName = self.currentDatabase()
@@ -327,25 +436,30 @@ class Catalog:
         ----------
         functionName : str
             name of the function to check existence
-        dbName : str, optional
-            name of the database to check function existence in.
-            If no database is specified, the current database is used
 
-           .. deprecated:: 3.4.0
+            .. versionchanged:: 3.4.0
+               Allow ``functionName`` to be qualified with catalog name
 
+        dbName : str, optional
+            name of the database to check function existence in.
 
         Returns
         -------
         bool
             Indicating whether the function exists
 
-        .. versionchanged:: 3.4
-           Allowed ``functionName`` to be qualified with catalog name
+        Notes
+        -----
+        If no database is specified, the current database and catalog
+        are used. This API includes all temporary functions.
 
         Examples
         --------
-        >>> spark.catalog.functionExists("unexisting_function")
-        False
+        >>> spark.catalog.functionExists("count")
+        True
+
+        Using the fully qualified name for function name.
+
         >>> spark.catalog.functionExists("default.unexisting_function")
         False
         >>> 
spark.catalog.functionExists("spark_catalog.default.unexisting_function")
@@ -369,18 +483,29 @@ class Catalog:
 
         Parameters
         ----------
-        tableName : str
-                    name of the function to check existence.
+        functionName : str
+            name of the function to check existence.
+
+        Returns
+        -------
+        :class:`Function`
+            The function found by the name.
 
         Examples
         --------
         >>> func = spark.sql("CREATE FUNCTION my_func1 AS 
'test.org.apache.spark.sql.MyDoubleAvg'")
         >>> spark.catalog.getFunction("my_func1")
         Function(name='my_func1', catalog='spark_catalog', 
namespace=['default'], ...
+
+        Using the fully qualified name for function name.
+
         >>> spark.catalog.getFunction("default.my_func1")
         Function(name='my_func1', catalog='spark_catalog', 
namespace=['default'], ...
         >>> spark.catalog.getFunction("spark_catalog.default.my_func1")
         Function(name='my_func1', catalog='spark_catalog', 
namespace=['default'], ...
+
+        Throw an analysis exception when the function does not exist.
+
         >>> spark.catalog.getFunction("my_func2")
         Traceback (most recent call last):
             ...
@@ -404,26 +529,39 @@ class Catalog:
     def listColumns(self, tableName: str, dbName: Optional[str] = None) -> 
List[Column]:
         """Returns a list of columns for the given table/view in the specified 
database.
 
-         If no database is specified, the current database is used.
-
         .. versionadded:: 2.0.0
 
         Parameters
         ----------
         tableName : str
-                    name of the table to check existence
+            name of the table to list columns.
+
+            .. versionchanged:: 3.4.0
+               Allow ``tableName`` to be qualified with catalog name when 
``dbName`` is None.
+
         dbName : str, optional
-                 name of the database to check table existence in.
+            name of the database to find the table to list columns.
 
-           .. deprecated:: 3.4.0
+        Returns
+        -------
+        list
+            A list of :class:`Column`.
+
+        Notes
+        -----
+        The order of arguments here is different from that of its JVM 
counterpart
+        because Python does not support method overloading.
 
-        .. versionchanged:: 3.4
-           Allowed ``tableName`` to be qualified with catalog name when 
``dbName`` is None.
+        If no database is specified, the current database and catalog
+        are used. This API includes all temporary views.
 
-         Notes
-         -----
-         the order of arguments here is different from that of its JVM 
counterpart
-         because Python does not support method overloading.
+        Examples
+        --------
+        >>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
+        >>> _ = spark.sql("CREATE TABLE tblA (name STRING, age INT) USING 
parquet")
+        >>> spark.catalog.listColumns("tblA")
+        [Column(name='name', description=None, dataType='string', 
nullable=True, ...
+        >>> _ = spark.sql("DROP TABLE tblA")
         """
         if dbName is None:
             iter = self._jcatalog.listColumns(tableName).toLocalIterator()
@@ -459,64 +597,64 @@ class Catalog:
         Parameters
         ----------
         tableName : str
-                    name of the table to check existence
-                    If no database is specified, first try to treat 
``tableName`` as a
-                    multi-layer-namespace identifier, then try to 
``tableName`` as a normal table
-                    name in current database if necessary.
-        dbName : str, optional
-                 name of the database to check table existence in.
+            name of the table to check existence.
+            If no database is specified, first try to treat ``tableName`` as a
+            multi-layer-namespace identifier, then try to ``tableName`` as a 
normal table
+            name in current database if necessary.
 
-           .. deprecated:: 3.4.0
+            .. versionchanged:: 3.4.0
+               Allow ``tableName`` to be qualified with catalog name when 
``dbName`` is None.
 
+        dbName : str, optional
+            name of the database to check table existence in.
 
         Returns
         -------
         bool
             Indicating whether the table/view exists
 
-        .. versionchanged:: 3.4
-           Allowed ``tableName`` to be qualified with catalog name when 
``dbName`` is None.
-
         Examples
         --------
-
         This function can check if a table is defined or not:
 
         >>> spark.catalog.tableExists("unexisting_table")
         False
-        >>> df = spark.sql("CREATE TABLE tab1 (name STRING, age INT) USING 
parquet")
-        >>> spark.catalog.tableExists("tab1")
+        >>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
+        >>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING 
parquet")
+        >>> spark.catalog.tableExists("tbl1")
         True
-        >>> spark.catalog.tableExists("default.tab1")
+
+        Using the fully qualified names for tables.
+
+        >>> spark.catalog.tableExists("default.tbl1")
         True
-        >>> spark.catalog.tableExists("spark_catalog.default.tab1")
+        >>> spark.catalog.tableExists("spark_catalog.default.tbl1")
         True
-        >>> spark.catalog.tableExists("tab1", "default")
+        >>> spark.catalog.tableExists("tbl1", "default")
         True
-        >>> df = spark.sql("DROP TABLE tab1")
-        >>> spark.catalog.tableExists("unexisting_table")
-        False
+        >>> _ = spark.sql("DROP TABLE tbl1")
 
-        It also works for views:
+        Check if views exist:
 
         >>> spark.catalog.tableExists("view1")
         False
-        >>> df = spark.sql("CREATE VIEW view1 AS SELECT 1")
+        >>> _ = spark.sql("CREATE VIEW view1 AS SELECT 1")
         >>> spark.catalog.tableExists("view1")
         True
+
+        Using the fully qualified names for views.
+
         >>> spark.catalog.tableExists("default.view1")
         True
         >>> spark.catalog.tableExists("spark_catalog.default.view1")
         True
         >>> spark.catalog.tableExists("view1", "default")
         True
-        >>> df = spark.sql("DROP VIEW view1")
-        >>> spark.catalog.tableExists("view1")
-        False
+        >>> _ = spark.sql("DROP VIEW view1")
 
-        And also for temporary views:
+        Check if temporary views exist:
 
-        >>> df = spark.sql("CREATE TEMPORARY VIEW view1 AS SELECT 1")
+        >>> _ = spark.sql("CREATE TEMPORARY VIEW view1 AS SELECT 1")
         >>> spark.catalog.tableExists("view1")
         True
         >>> df = spark.sql("DROP VIEW view1")
@@ -575,27 +713,54 @@ class Catalog:
     ) -> DataFrame:
         """Creates a table based on the dataset in a data source.
 
-        It returns the DataFrame associated with the table.
-
-        The data source is specified by the ``source`` and a set of 
``options``.
-        If ``source`` is not specified, the default data source configured by
-        ``spark.sql.sources.default`` will be used. When ``path`` is 
specified, an external table is
-        created from the data at the given path. Otherwise a managed table is 
created.
-
-        Optionally, a schema can be provided as the schema of the returned 
:class:`DataFrame` and
-        created table.
-
         .. versionadded:: 2.2.0
 
+        Parameters
+        ----------
+        tableName : str
+            name of the table to create.
+
+            .. versionchanged:: 3.4.0
+               Allow ``tableName`` to be qualified with catalog name.
+
+        path : str, optional
+            the path in which the data for this table exists.
+            When ``path`` is specified, an external table is
+            created from the data at the given path. Otherwise a managed table 
is created.
+        source : str, optional
+            the source of this table such as 'parquet', 'orc', etc.
+            If ``source`` is not specified, the default data source configured 
by
+            ``spark.sql.sources.default`` will be used.
+        schema : :class:`StructType`, optional
+            the schema for this table.
+        description : str, optional
+            the description of this table.
+
+            .. versionchanged:: 3.1.0
+                Added the ``description`` parameter.
+
+        **options : dict, optional
+            extra options to specify in the table.
+
         Returns
         -------
         :class:`DataFrame`
+            The DataFrame associated with the table.
+
+        Examples
+        --------
+        Creating a managed table.
+
+        >>> _ = spark.catalog.createTable("tbl1", 
schema=spark.range(1).schema, source='parquet')
+        >>> _ = spark.sql("DROP TABLE tbl1")
 
-        .. versionchanged:: 3.1
-           Added the ``description`` parameter.
+        Creating an external table
 
-        .. versionchanged:: 3.4
-           Allowed ``tableName`` to be qualified with catalog name.
+        >>> import tempfile
+        >>> with tempfile.TemporaryDirectory() as d:
+        ...     _ = spark.catalog.createTable(
+        ...         "tbl2", schema=spark.range(1).schema, path=d, 
source='parquet')
+        >>> _ = spark.sql("DROP TABLE tbl2")
         """
         if path is not None:
             options["path"] = path
@@ -613,47 +778,76 @@ class Catalog:
             df = self._jcatalog.createTable(tableName, source, scala_datatype, 
description, options)
         return DataFrame(df, self._sparkSession)
 
-    def dropTempView(self, viewName: str) -> None:
+    def dropTempView(self, viewName: str) -> bool:
         """Drops the local temporary view with the given view name in the 
catalog.
         If the view has been cached before, then it will also be uncached.
         Returns true if this view is dropped successfully, false otherwise.
 
         .. versionadded:: 2.0.0
 
-        Notes
-        -----
-        The return type of this method was None in Spark 2.0, but changed to 
Boolean
-        in Spark 2.1.
+        Parameters
+        ----------
+        viewName : str
+            name of the temporary view to drop.
+
+        Returns
+        -------
+        bool
+            If the temporary view was successfully dropped or not.
+
+            .. versionadded:: 2.1.0
+                The return type of this method was ``None`` in Spark 2.0, but 
changed to ``bool``
+                in Spark 2.1.
 
         Examples
         --------
         >>> spark.createDataFrame([(1, 1)]).createTempView("my_table")
-        >>> spark.table("my_table").collect()
-        [Row(_1=1, _2=1)]
+
+        Dropping the temporary view.
+
         >>> spark.catalog.dropTempView("my_table")
         True
-        >>> spark.table("my_table") # doctest: +IGNORE_EXCEPTION_DETAIL
+
+        Throw an exception if the temporary view does not exist.
+
+        >>> spark.table("my_table")
         Traceback (most recent call last):
             ...
         AnalysisException: ...
         """
         return self._jcatalog.dropTempView(viewName)
 
-    def dropGlobalTempView(self, viewName: str) -> None:
+    def dropGlobalTempView(self, viewName: str) -> bool:
         """Drops the global temporary view with the given view name in the 
catalog.
-        If the view has been cached before, then it will also be uncached.
-        Returns true if this view is dropped successfully, false otherwise.
 
         .. versionadded:: 2.1.0
 
+        Parameters
+        ----------
+        viewName : str
+            name of the global view to drop.
+
+        Returns
+        -------
+        bool
+            If the global view was successfully dropped or not.
+
+        Notes
+        -----
+        If the view has been cached before, then it will also be uncached.
+
         Examples
         --------
         >>> spark.createDataFrame([(1, 1)]).createGlobalTempView("my_table")
-        >>> spark.table("global_temp.my_table").collect()
-        [Row(_1=1, _2=1)]
+
+        Dropping the global view.
+
         >>> spark.catalog.dropGlobalTempView("my_table")
         True
-        >>> spark.table("global_temp.my_table") # doctest: 
+IGNORE_EXCEPTION_DETAIL
+
+        Throw an exception if the global view does not exist.
+
+        >>> spark.table("global_temp.my_table")
         Traceback (most recent call last):
             ...
         AnalysisException: ...
@@ -674,59 +868,267 @@ class Catalog:
         warnings.warn("Deprecated in 2.3.0. Use spark.udf.register instead.", 
FutureWarning)
         return self._sparkSession.udf.register(name, f, returnType)
 
-    @since(2.0)
     def isCached(self, tableName: str) -> bool:
-        """Returns true if the table is currently cached in-memory.
+        """
+        Returns true if the table is currently cached in-memory.
+
+        .. versionadded:: 2.0.0
+
+        Parameters
+        ----------
+        tableName : str
+            name of the table to get.
+
+            .. versionchanged:: 3.4.0
+                Allow ``tableName`` to be qualified with catalog name.
+
+        Returns
+        -------
+        bool
+
+        Examples
+        --------
+        >>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
+        >>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING 
parquet")
+        >>> spark.catalog.cacheTable("tbl1")
+        >>> spark.catalog.isCached("tbl1")
+        True
+
+        Throw an analysis exception when the table does not exist.
+
+        >>> spark.catalog.isCached("not_existing_table")
+        Traceback (most recent call last):
+            ...
+        pyspark.sql.utils.AnalysisException: ...
 
-        .. versionchanged:: 3.4
-           Allowed ``tableName`` to be qualified with catalog name.
+        Using the fully qualified name for the table.
+
+        >>> spark.catalog.isCached("spark_catalog.default.tbl1")
+        True
+        >>> spark.catalog.uncacheTable("tbl1")
+        >>> _ = spark.sql("DROP TABLE tbl1")
         """
         return self._jcatalog.isCached(tableName)
 
-    @since(2.0)
     def cacheTable(self, tableName: str) -> None:
         """Caches the specified table in-memory.
 
-        .. versionchanged:: 3.4
-           Allowed ``tableName`` to be qualified with catalog name.
+        .. versionadded:: 2.0.0
+
+        Parameters
+        ----------
+        tableName : str
+            name of the table to get.
+
+            .. versionchanged:: 3.4.0
+                Allow ``tableName`` to be qualified with catalog name.
+
+        Examples
+        --------
+        >>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
+        >>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING 
parquet")
+        >>> spark.catalog.cacheTable("tbl1")
+
+        Throw an analysis exception when the table does not exist.
+
+        >>> spark.catalog.cacheTable("not_existing_table")
+        Traceback (most recent call last):
+            ...
+        pyspark.sql.utils.AnalysisException: ...
+
+        Using the fully qualified name for the table.
+
+        >>> spark.catalog.cacheTable("spark_catalog.default.tbl1")
+        >>> spark.catalog.uncacheTable("tbl1")
+        >>> _ = spark.sql("DROP TABLE tbl1")
         """
         self._jcatalog.cacheTable(tableName)
 
-    @since(2.0)
     def uncacheTable(self, tableName: str) -> None:
         """Removes the specified table from the in-memory cache.
 
-        .. versionchanged:: 3.4
-           Allowed ``tableName`` to be qualified with catalog name.
+        .. versionadded:: 2.0.0
+
+        Parameters
+        ----------
+        tableName : str
+            name of the table to get.
+
+            .. versionchanged:: 3.4.0
+                Allow ``tableName`` to be qualified with catalog name.
+
+        Examples
+        --------
+        >>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
+        >>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING 
parquet")
+        >>> spark.catalog.cacheTable("tbl1")
+        >>> spark.catalog.uncacheTable("tbl1")
+        >>> spark.catalog.isCached("tbl1")
+        False
+
+        Throw an analysis exception when the table does not exist.
+
+        >>> spark.catalog.uncacheTable("not_existing_table")  # doctest: 
+IGNORE_EXCEPTION_DETAIL
+        Traceback (most recent call last):
+            ...
+        pyspark.sql.utils.AnalysisException: ...
+
+        Using the fully qualified name for the table.
+
+        >>> spark.catalog.uncacheTable("spark_catalog.default.tbl1")
+        >>> spark.catalog.isCached("tbl1")
+        False
+        >>> _ = spark.sql("DROP TABLE tbl1")
         """
         self._jcatalog.uncacheTable(tableName)
 
-    @since(2.0)
     def clearCache(self) -> None:
-        """Removes all cached tables from the in-memory cache."""
+        """Removes all cached tables from the in-memory cache.
+
+        .. versionadded:: 2.0.0
+
+        Examples
+        --------
+        >>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
+        >>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING 
parquet")
+        >>> spark.catalog.clearCache()
+        >>> spark.catalog.isCached("tbl1")
+        False
+        >>> _ = spark.sql("DROP TABLE tbl1")
+        """
         self._jcatalog.clearCache()
 
-    @since(2.0)
     def refreshTable(self, tableName: str) -> None:
         """Invalidates and refreshes all the cached data and metadata of the 
given table.
 
-        .. versionchanged:: 3.4
-           Allowed ``tableName`` to be qualified with catalog name.
+        .. versionadded:: 2.0.0
+
+        Parameters
+        ----------
+        tableName : str
+            name of the table to get.
+
+            .. versionchanged:: 3.4.0
+                Allow ``tableName`` to be qualified with catalog name.
+
+        Examples
+        --------
+        The example below caches a table, and then removes the data.
+
+        >>> import tempfile
+        >>> with tempfile.TemporaryDirectory() as d:
+        ...     _ = spark.sql("DROP TABLE IF EXISTS tbl1")
+        ...     _ = spark.sql("CREATE TABLE tbl1 (col STRING) USING TEXT 
LOCATION '{}'".format(d))
+        ...     _ = spark.sql("INSERT INTO tbl1 SELECT 'abc'")
+        ...     spark.catalog.cacheTable("tbl1")
+        ...     spark.table("tbl1").show()
+        +---+
+        |col|
+        +---+
+        |abc|
+        +---+
+
+        Because the table is cached, it computes from the cached data as below.
+
+        >>> spark.table("tbl1").count()
+        1
+
+        After refreshing the table, it shows 0 because the data does not exist 
anymore.
+
+        >>> spark.catalog.refreshTable("tbl1")
+        >>> spark.table("tbl1").count()
+        0
+
+        Using the fully qualified name for the table.
+
+        >>> spark.catalog.refreshTable("spark_catalog.default.tbl1")
+        >>> _ = spark.sql("DROP TABLE tbl1")
         """
         self._jcatalog.refreshTable(tableName)
 
-    @since("2.1.1")
     def recoverPartitions(self, tableName: str) -> None:
         """Recovers all the partitions of the given table and update the 
catalog.
 
+        .. versionadded:: 2.1.1
+
+        Parameters
+        ----------
+        tableName : str
+            name of the table to get.
+
+        Notes
+        -----
         Only works with a partitioned table, and not a view.
+
+        Examples
+        --------
+        The example below creates a partitioned table against the existing 
directory of
+        the partitioned table. After that, it recovers the partitions.
+
+        >>> import tempfile
+        >>> with tempfile.TemporaryDirectory() as d:
+        ...     _ = spark.sql("DROP TABLE IF EXISTS tbl1")
+        ...     spark.range(1).selectExpr(
+        ...         "id as key", "id as 
value").write.partitionBy("key").mode("overwrite").save(d)
+        ...     _ = spark.sql(
+        ...          "CREATE TABLE tbl1 (key LONG, value LONG)"
+        ...          "USING parquet OPTIONS (path '{}') PARTITIONED BY 
(key)".format(d))
+        ...     spark.table("tbl1").show()
+        ...     spark.catalog.recoverPartitions("tbl1")
+        ...     spark.table("tbl1").show()
+        +-----+---+
+        |value|key|
+        +-----+---+
+        +-----+---+
+        +-----+---+
+        |value|key|
+        +-----+---+
+        |    0|  0|
+        +-----+---+
+        >>> _ = spark.sql("DROP TABLE tbl1")
         """
         self._jcatalog.recoverPartitions(tableName)
 
-    @since("2.2.0")
     def refreshByPath(self, path: str) -> None:
         """Invalidates and refreshes all the cached data (and the associated 
metadata) for any
         DataFrame that contains the given data source path.
+
+        .. versionadded:: 2.2.0
+
+        Parameters
+        ----------
+        path : str
+            the path to refresh the cache.
+
+        Examples
+        --------
+        The example below caches a table, and then removes the data.
+
+        >>> import tempfile
+        >>> with tempfile.TemporaryDirectory() as d:
+        ...     _ = spark.sql("DROP TABLE IF EXISTS tbl1")
+        ...     _ = spark.sql("CREATE TABLE tbl1 (col STRING) USING TEXT 
LOCATION '{}'".format(d))
+        ...     _ = spark.sql("INSERT INTO tbl1 SELECT 'abc'")
+        ...     spark.catalog.cacheTable("tbl1")
+        ...     spark.table("tbl1").show()
+        +---+
+        |col|
+        +---+
+        |abc|
+        +---+
+
+        Because the table is cached, it computes from the cached data as below.
+
+        >>> spark.table("tbl1").count()
+        1
+
+        After refreshing the table by path, it shows 0 because the data does 
not exist anymore.
+
+        >>> spark.catalog.refreshByPath(d)
+        >>> spark.table("tbl1").count()
+        0
+
+        >>> _ = spark.sql("DROP TABLE tbl1")
         """
         self._jcatalog.refreshByPath(path)
 
@@ -748,15 +1150,17 @@ def _test() -> None:
     os.chdir(os.environ["SPARK_HOME"])
 
     globs = pyspark.sql.catalog.__dict__.copy()
-    spark = SparkSession.builder.master("local[4]").appName("sql.catalog 
tests").getOrCreate()
-    globs["sc"] = spark.sparkContext
-    globs["spark"] = spark
+    globs["spark"] = (
+        SparkSession.builder.master("local[4]").appName("sql.catalog 
tests").getOrCreate()
+    )
     (failure_count, test_count) = doctest.testmod(
         pyspark.sql.catalog,
         globs=globs,
-        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
+        optionflags=doctest.ELLIPSIS
+        | doctest.NORMALIZE_WHITESPACE
+        | doctest.IGNORE_EXCEPTION_DETAIL,
     )
-    spark.stop()
+    globs["spark"].stop()
     if failure_count:
         sys.exit(-1)
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to