This is an automated email from the ASF dual-hosted git repository.

potiuk pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git

commit a58506b2a68f0d4533b41feb67efb0caf34e14d8
Author: Alex Ott <[email protected]>
AuthorDate: Sun Apr 24 11:36:28 2022 +0200

    Address review comments
---
 .../providers/databricks/hooks/databricks_sql.py   | 37 ++++++++--------
 docs/apache-airflow-providers-databricks/index.rst |  1 -
 .../operators/copy_into.rst                        | 48 +++------------------
 .../operators/sql.rst                              | 49 +++++-----------------
 4 files changed, 35 insertions(+), 100 deletions(-)

diff --git a/airflow/providers/databricks/hooks/databricks_sql.py b/airflow/providers/databricks/hooks/databricks_sql.py
index aa8245772a..afc165ee20 100644
--- a/airflow/providers/databricks/hooks/databricks_sql.py
+++ b/airflow/providers/databricks/hooks/databricks_sql.py
@@ -33,7 +33,24 @@ USER_AGENT_STRING = f'airflow-{__version__}'


 class DatabricksSqlHook(BaseDatabricksHook, DbApiHook):
-    """Hook to interact with Databricks SQL."""
+    """
+    Hook to interact with Databricks SQL.
+
+    :param databricks_conn_id: Reference to the
+        :ref:`Databricks connection <howto/connection:databricks>`.
+    :param http_path: Optional string specifying HTTP path of Databricks SQL Endpoint or cluster.
+        If not specified, it should be either specified in the Databricks connection's extra parameters,
+        or ``sql_endpoint_name`` must be specified.
+    :param sql_endpoint_name: Optional name of Databricks SQL Endpoint. If not specified, ``http_path``
+        must be provided as described above.
+    :param session_configuration: An optional dictionary of Spark session parameters. Defaults to None.
+        If not specified, it could be specified in the Databricks connection's extra parameters.
+    :param http_headers: An optional list of (k, v) pairs that will be set as HTTP headers
+        on every request
+    :param catalog: An optional initial catalog to use. Requires DBR version 9.0+
+    :param schema: An optional initial schema to use. Requires DBR version 9.0+
+    :param kwargs: Additional parameters internal to Databricks SQL Connector parameters
+    """

     hook_name = 'Databricks SQL'

@@ -48,24 +65,6 @@ class DatabricksSqlHook(BaseDatabricksHook, DbApiHook):
         schema: Optional[str] = None,
         **kwargs,
     ) -> None:
-        """
-        Initializes DatabricksSqlHook
-
-        :param databricks_conn_id: Reference to the
-            :ref:`Databricks connection <howto/connection:databricks>`.
-        :param http_path: Optional string specifying HTTP path of Databricks SQL Endpoint or cluster.
-            If not specified, it should be either specified in the Databricks connection's extra parameters,
-            or ``sql_endpoint_name`` must be specified.
-        :param sql_endpoint_name: Optional name of Databricks SQL Endpoint. If not specified, ``http_path``
-            must be provided as described above.
-        :param session_configuration: An optional dictionary of Spark session parameters. Defaults to None.
-            If not specified, it could be specified in the Databricks connection's extra parameters.
-        :param http_headers: An optional list of (k, v) pairs that will be set as HTTP headers
-            on every request
-        :param catalog: An optional initial catalog to use. Requires DBR version 9.0+
-        :param schema: An optional initial schema to use. Requires DBR version 9.0+
-        :param kwargs: Additional parameters internal to Databricks SQL Connector parameters
-        """
         super().__init__(databricks_conn_id)
         self._sql_conn = None
         self._token: Optional[str] = None
diff --git a/docs/apache-airflow-providers-databricks/index.rst b/docs/apache-airflow-providers-databricks/index.rst
index 968b94149b..1a6d32cab3 100644
--- a/docs/apache-airflow-providers-databricks/index.rst
+++ b/docs/apache-airflow-providers-databricks/index.rst
@@ -81,7 +81,6 @@ PIP package                   Version required
 ============================  ===================
 ``apache-airflow``            ``>=2.1.0``
 ``databricks-sql-connector``  ``>=2.0.0, <3.0.0``
->>>>>>> DatabricksSqlOperator - switch to databricks-sql-connector 2.x
 ``requests``                  ``>=2.26.0, <3``
 ============================  ===================

diff --git a/docs/apache-airflow-providers-databricks/operators/copy_into.rst b/docs/apache-airflow-providers-databricks/operators/copy_into.rst
index 1d4ef07de2..79716c256f 100644
--- a/docs/apache-airflow-providers-databricks/operators/copy_into.rst
+++ b/docs/apache-airflow-providers-databricks/operators/copy_into.rst
@@ -29,50 +29,14 @@ command.
 Using the Operator
 ------------------

-Operator loads data from a specified location into a table using a configured endpoint.
+Operator loads data from a specified location into a table using a configured endpoint. The only required parameters are:

-.. list-table::
-   :widths: 15 25
-   :header-rows: 1
+* ``table_name`` - string with the table name
+* ``file_location`` - string with the URI of data to load
+* ``file_format`` - string specifying the file format of data to load. Supported formats are ``CSV``, ``JSON``, ``AVRO``, ``ORC``, ``PARQUET``, ``TEXT``, ``BINARYFILE``.
+* One of ``sql_endpoint_name`` (name of Databricks SQL endpoint to use) or ``http_path`` (HTTP path for Databricks SQL endpoint or Databricks cluster).

-   * - Parameter
-     - Input
-   * - table_name: str
-     - Required name of the table.
-   * - file_location: str
-     - Required location of files to import.
-   * - file_format: str
-     - Required file format. Supported formats are ``CSV``, ``JSON``, ``AVRO``, ``ORC``, ``PARQUET``, ``TEXT``, ``BINARYFILE``.
-   * - sql_endpoint_name: str
-     - Optional name of Databricks SQL endpoint to use. If not specified, ``http_path`` should be provided.
-   * - http_path: str
-     - Optional HTTP path for Databricks SQL endpoint or Databricks cluster. If not specified, it should be provided in Databricks connection, or the ``sql_endpoint_name`` parameter must be set.
-   * - session_configuration: dict[str,str]
-     - optional dict specifying Spark configuration parameters that will be set for the session.
-   * - http_headers: list[tuple[str, str]]
-     - Optional list of (k, v) pairs that will be set as HTTP headers on every request
-   * - client_parameters: dict[str,str]
-     - optional additional parameters internal to Databricks SQL Connector parameters
-   * - files: list[str]]
-     - optional list of files to import. Can't be specified together with ``pattern``.
-   * - pattern: str
-     - optional regex string to match file names to import. Can't be specified together with ``files``.
-   * - expression_list: str
-     - optional string that will be used in the ``SELECT`` expression.
-   * - credential: dict[str, str]
-     - optional credential configuration for authentication against a specified location
-   * - encryption: dict[str, str]
-     - optional encryption configuration for a specified location
-   * - storage_credential: str
-     - optional Unity Catalog storage credential name for the target table
-   * - format_options: dict[str, str]
-     - optional dictionary with options specific for a given file format.
-   * - force_copy: bool
-     - optional boolean parameter to control forcing of data import (could be also specified in ``copy_options``).
-   * - copy_options: dict[str, str]
-     - optional dictionary of copy options. Right now only ``force`` option is supported.
-   * - validate: union[bool, int]]
-     - optional validation configuration. ``True`` forces validation of all rows, positive number - only N first rows. (requires Preview channel)
+Other parameters are optional and could be found in the class documentation.

 Examples
 --------
diff --git a/docs/apache-airflow-providers-databricks/operators/sql.rst b/docs/apache-airflow-providers-databricks/operators/sql.rst
index d0a1d6d337..93a3b88007 100644
--- a/docs/apache-airflow-providers-databricks/operators/sql.rst
+++ b/docs/apache-airflow-providers-databricks/operators/sql.rst
@@ -29,44 +29,17 @@ on a `Databricks SQL endpoint <https://docs.databricks.com/sql/admin/sql-endpoi
 Using the Operator
 ------------------

-Operator executes given SQL queries against configured endpoint. There are 3 ways of specifying SQL queries:
-
-1. Simple string with SQL statement.
-2. List of strings representing SQL statements.
-3. Name of the file with SQL queries. File must have ``.sql`` extension. Each query should finish with ``;<new_line>``
-
-.. list-table::
-   :widths: 15 25
-   :header-rows: 1
-
-   * - Parameter
-     - Input
-   * - sql: str or list[str]
-     - Required parameter specifying a queries to execute.
-   * - sql_endpoint_name: str
-     - Optional name of Databricks SQL endpoint to use. If not specified, ``http_path`` should be provided.
-   * - http_path: str
-     - Optional HTTP path for Databricks SQL endpoint or Databricks cluster. If not specified, it should be provided in Databricks connection, or the ``sql_endpoint_name`` parameter must be set.
-   * - parameters: dict[str, any]
-     - Optional parameters that will be used to substitute variable(s) in SQL query.
-   * - session_configuration: dict[str,str]
-     - optional dict specifying Spark configuration parameters that will be set for the session.
-   * - http_headers: list[tuple[str, str]]
-     - Optional list of (k, v) pairs that will be set as HTTP headers on every request
-   * - client_parameters: dict[str,str]
-     - optional additional parameters internal to Databricks SQL Connector parameters
-   * - catalog: str
-     - Optional initial catalog to use. Requires DBR version 9.0+
-   * - schema: str
-     - Optional initial schema to use. Requires DBR version 9.0+
-   * - output_path: str
-     - Optional path to the file to which results will be written.
-   * - output_format: str
-     - Name of the format which will be used to write results. Supported values are (case-insensitive): ``JSON`` (array of JSON objects), ``JSONL`` (each row as JSON object on a separate line), ``CSV`` (default).
-   * - csv_params: dict[str, any]
-     - Optional dictionary with parameters to customize Python CSV writer.
-   * - do_xcom_push: bool
-     - whether we should push query results (last query if multiple queries are provided) to xcom. Default: false
+Operator executes given SQL queries against configured endpoint. The only required parameters are:
+
+* ``sql`` - SQL queries to execute. There are 3 ways of specifying SQL queries:
+
+  1. Simple string with SQL statement.
+  2. List of strings representing SQL statements.
+  3. Name of the file with SQL queries. File must have ``.sql`` extension. Each query should finish with ``;<new_line>``
+
+* One of ``sql_endpoint_name`` (name of Databricks SQL endpoint to use) or ``http_path`` (HTTP path for Databricks SQL endpoint or Databricks cluster).
+
+Other parameters are optional and could be found in the class documentation.

 Examples
 --------
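
To illustrate the hook parameters documented in the patch above, here is a minimal usage
sketch (not part of the commit); the connection id and endpoint name are hypothetical:

.. code-block:: python

    from airflow.providers.databricks.hooks.databricks_sql import DatabricksSqlHook

    # Hypothetical connection and endpoint; ``http_path`` could be passed instead of
    # ``sql_endpoint_name``, or taken from the connection's extra parameters.
    hook = DatabricksSqlHook(
        databricks_conn_id="databricks_default",
        sql_endpoint_name="my-sql-endpoint",
    )

    # The hook subclasses DbApiHook (per the patch), so the generic helpers
    # such as get_records() should be usable for a simple query.
    rows = hook.get_records("SELECT 1")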
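Similarly, a sketch of a ``DatabricksCopyIntoOperator`` task using only the required
parameters listed in the reworked ``copy_into.rst``; the table name, file location, and
endpoint name are placeholders, not taken from the commit:

.. code-block:: python

    from airflow.providers.databricks.operators.databricks_sql import DatabricksCopyIntoOperator

    # All literal values below are placeholders.
    import_csv = DatabricksCopyIntoOperator(
        task_id="import_csv_into_table",
        table_name="my_schema.my_table",
        file_location="s3://my-bucket/incoming/",
        file_format="CSV",
        sql_endpoint_name="my-sql-endpoint",  # or http_path=...
        format_options={"header": "true"},  # optional, format-specific options
    )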
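And a sketch of ``DatabricksSqlOperator`` with the required ``sql`` and endpoint
parameters from the reworked ``sql.rst``; the statements and endpoint name are
placeholders:

.. code-block:: python

    from airflow.providers.databricks.operators.databricks_sql import DatabricksSqlOperator

    # ``sql`` may be a single string, a list of strings, or the name of a *.sql file.
    create_and_fill = DatabricksSqlOperator(
        task_id="create_and_fill_table",
        sql=[
            "CREATE TABLE IF NOT EXISTS my_table (id INT, name STRING)",
            "INSERT INTO my_table VALUES (1, 'test')",
        ],
        sql_endpoint_name="my-sql-endpoint",  # or http_path=...
    )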
