This is an automated email from the ASF dual-hosted git repository.
beto pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/superset.git
The following commit(s) were added to refs/heads/master by this push:
new c993c58 fix(BigQuery): explicitly quote columns in select_star
(#16822)
c993c58 is described below
commit c993c5845f8a2ca001ba18d4af988edd3e53e8bf
Author: Beto Dealmeida <[email protected]>
AuthorDate: Wed Oct 6 07:43:32 2021 -0700
fix(BigQuery): explicitly quote columns in select_star (#16822)
* fix (BigQuery): explicitly quote columns in select_star
* Fix test
* Fix SELECT * in BQ
* Add unit tests
* Remove type changes
---
superset/db_engine_specs/bigquery.py | 120 ++++++++++++--
superset/result_set.py | 4 +-
tests/unit_tests/db_engine_specs/test_bigquery.py | 190 ++++++++++++++++++++++
3 files changed, 296 insertions(+), 18 deletions(-)
diff --git a/superset/db_engine_specs/bigquery.py
b/superset/db_engine_specs/bigquery.py
index 1b7b73e..a862df5 100644
--- a/superset/db_engine_specs/bigquery.py
+++ b/superset/db_engine_specs/bigquery.py
@@ -26,9 +26,10 @@ from apispec.ext.marshmallow import MarshmallowPlugin
from flask_babel import gettext as __
from marshmallow import fields, Schema
from marshmallow.exceptions import ValidationError
-from sqlalchemy import literal_column
+from sqlalchemy import column
+from sqlalchemy.engine.base import Engine
from sqlalchemy.engine.url import make_url
-from sqlalchemy.sql.expression import ColumnClause
+from sqlalchemy.sql import sqltypes
from typing_extensions import TypedDict
from superset.databases.schemas import encrypted_field_properties,
EncryptedString
@@ -283,20 +284,6 @@ class BigQueryEngineSpec(BaseEngineSpec):
}
@classmethod
- def _get_fields(cls, cols: List[Dict[str, Any]]) -> List[ColumnClause]:
- """
- BigQuery dialect requires us to not use backtick in the fieldname
which are
- nested.
- Using literal_column handles that issue.
-
https://docs.sqlalchemy.org/en/latest/core/tutorial.html#using-more-specific-text-with-table-literal-column-and-column
- Also explicility specifying column names so we don't encounter
duplicate
- column names in the result.
- """
- return [
- literal_column(c["name"]).label(c["name"].replace(".", "__")) for
c in cols
- ]
-
- @classmethod
def epoch_to_dttm(cls) -> str:
return "TIMESTAMP_SECONDS({col})"
@@ -425,3 +412,104 @@ class BigQueryEngineSpec(BaseEngineSpec):
ma_plugin.converter.add_attribute_function(encrypted_field_properties)
spec.components.schema(cls.__name__, schema=cls.parameters_schema)
return spec.to_dict()["components"]["schemas"][cls.__name__]
+
+ @classmethod
+ def select_star( # pylint: disable=too-many-arguments
+ cls,
+ database: "Database",
+ table_name: str,
+ engine: Engine,
+ schema: Optional[str] = None,
+ limit: int = 100,
+ show_cols: bool = False,
+ indent: bool = True,
+ latest_partition: bool = True,
+ cols: Optional[List[Dict[str, Any]]] = None,
+ ) -> str:
+ """
+ Remove array structures from `SELECT *`.
+
+ BigQuery supports structures and arrays of structures, eg:
+
+ author STRUCT<name STRING, email STRING>
+ trailer ARRAY<STRUCT<key STRING, value STRING>>
+
+ When loading metadata for a table each key in the struct is displayed
as a
+ separate pseudo-column, eg:
+
+ - author
+ - author.name
+ - author.email
+ - trailer
+ - trailer.key
+ - trailer.value
+
+ When generating the `SELECT *` statement we want to remove any keys
from
+ structs inside an array, since selecting them results in an error. The
correct
+ select statement should look like this:
+
+ SELECT
+ `author`,
+ `author`.`name`,
+ `author`.`email`,
+ `trailer`
+ FROM
+ table
+
+ Selecting `trailer.key` or `trailer.value` results in an error, as
opposed to
+ selecting `author.name`, since they are keys in a structure inside an
array.
+
+ This method removes any array pseudo-columns.
+ """
+ if cols:
+ # For arrays of structs, remove the child columns, otherwise the
query
+ # will fail.
+ array_prefixes = {
+ col["name"] for col in cols if isinstance(col["type"],
sqltypes.ARRAY)
+ }
+ cols = [
+ col
+ for col in cols
+ if "." not in col["name"]
+ or col["name"].split(".")[0] not in array_prefixes
+ ]
+
+ return super().select_star(
+ database,
+ table_name,
+ engine,
+ schema,
+ limit,
+ show_cols,
+ indent,
+ latest_partition,
+ cols,
+ )
+
+ @classmethod
+ def _get_fields(cls, cols: List[Dict[str, Any]]) -> List[Any]:
+ """
+ Label columns using their fully qualified name.
+
+ BigQuery supports columns of type `struct`, which are basically
dictionaries.
+ When loading metadata for a table with struct columns, each key in the
struct
+ is displayed as a separate pseudo-column, eg:
+
+ author STRUCT<name STRING, email STRING>
+
+ Will be shown as 3 columns:
+
+ - author
+ - author.name
+ - author.email
+
+ If we select those fields:
+
+ SELECT `author`, `author`.`name`, `author`.`email` FROM table
+
+ The resulting columns will be called "author", "name", and "email",
This may
+ result in a clash with other columns. To prevent that, we explicitly
label
+ the columns using their fully qualified name, so we end up with
"author",
+ "author__name" and "author__email", respectively.
+ """
+ return [column(c["name"]).label(c["name"].replace(".", "__")) for c in
cols]
diff --git a/superset/result_set.py b/superset/result_set.py
index 1f92513..b95b5e6 100644
--- a/superset/result_set.py
+++ b/superset/result_set.py
@@ -25,7 +25,7 @@ import numpy as np
import pandas as pd
import pyarrow as pa
-from superset import db_engine_specs
+from superset.db_engine_specs import BaseEngineSpec
from superset.typing import DbapiDescription, DbapiResult
from superset.utils import core as utils
@@ -76,7 +76,7 @@ class SupersetResultSet:
self,
data: DbapiResult,
cursor_description: DbapiDescription,
- db_engine_spec: Type[db_engine_specs.BaseEngineSpec],
+ db_engine_spec: Type[BaseEngineSpec],
):
self.db_engine_spec = db_engine_spec
data = data or []
diff --git a/tests/unit_tests/db_engine_specs/test_bigquery.py
b/tests/unit_tests/db_engine_specs/test_bigquery.py
new file mode 100644
index 0000000..dcf1de7
--- /dev/null
+++ b/tests/unit_tests/db_engine_specs/test_bigquery.py
@@ -0,0 +1,190 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=unused-argument, import-outside-toplevel, protected-access
+
+from flask.ctx import AppContext
+from pytest_mock import MockFixture
+from sqlalchemy import select
+from sqlalchemy.dialects.sqlite.base import SQLiteDialect
+from sqlalchemy.sql import sqltypes
+
+
+def test_get_fields(app_context: AppContext) -> None:
+ """
+ Test the custom ``_get_fields`` method.
+
+ The method adds custom labels (aliases) to the columns to prevent
+ collision when referencing record fields. Eg, if we had these two
+ columns:
+
+ name STRING
+ project STRUCT<name STRING>
+
+ One could write this query:
+
+ SELECT
+ `name`,
+ `project`.`name`
+ FROM
+ the_table
+
+ But then both columns would get aliased as "name".
+
+ The custom method will replace the fields so that the final query
+ looks like this:
+
+ SELECT
+ `name` AS `name`,
+ `project`.`name` AS project__name
+ FROM
+ the_table
+
+ """
+ from superset.db_engine_specs.bigquery import BigQueryEngineSpec
+
+ columns = [{"name": "limit"}, {"name": "name"}, {"name": "project.name"}]
+ fields = BigQueryEngineSpec._get_fields(columns)
+
+ # generic SQL
+ query = select(fields)
+ assert (
+ str(query)
+ == 'SELECT "limit" AS "limit", name AS name, "project.name" AS
project__name'
+ )
+
+ # BigQuery-specific SQL
+ try:
+ from pybigquery.sqlalchemy_bigquery import BigQueryDialect
+ except ModuleNotFoundError:
+ return
+
+ assert str(query.compile(dialect=BigQueryDialect())) == (
+ "SELECT `limit` AS `limit`, `name` AS `name`, "
+ "`project`.`name` AS `project__name`"
+ )
+
+
+def test_select_star(mocker: MockFixture, app_context: AppContext) -> None:
+ """
+ Test the ``select_star`` method.
+
+ The method removes pseudo-columns from structures inside arrays. While
these
+ pseudo-columns show up as "columns" for metadata reasons, we can't select
them
+ in the query, as opposed to fields from non-array structures.
+ """
+ from superset.db_engine_specs.bigquery import BigQueryEngineSpec
+
+ cols = [
+ {
+ "name": "trailer",
+ "type": sqltypes.ARRAY(sqltypes.JSON()),
+ "nullable": True,
+ "comment": None,
+ "default": None,
+ "precision": None,
+ "scale": None,
+ "max_length": None,
+ },
+ {
+ "name": "trailer.key",
+ "type": sqltypes.String(),
+ "nullable": True,
+ "comment": None,
+ "default": None,
+ "precision": None,
+ "scale": None,
+ "max_length": None,
+ },
+ {
+ "name": "trailer.value",
+ "type": sqltypes.String(),
+ "nullable": True,
+ "comment": None,
+ "default": None,
+ "precision": None,
+ "scale": None,
+ "max_length": None,
+ },
+ {
+ "name": "trailer.email",
+ "type": sqltypes.String(),
+ "nullable": True,
+ "comment": None,
+ "default": None,
+ "precision": None,
+ "scale": None,
+ "max_length": None,
+ },
+ ]
+
+ # mock the database so we can compile the query
+ database = mocker.MagicMock()
+ database.compile_sqla_query = lambda query: str(
+ query.compile(dialect=SQLiteDialect())
+ )
+
+ # use SQLite dialect so we don't need the BQ dependency
+ engine = mocker.MagicMock()
+ engine.dialect = SQLiteDialect()
+
+ sql = BigQueryEngineSpec.select_star(
+ database=database,
+ table_name="my_table",
+ engine=engine,
+ schema=None,
+ limit=100,
+ show_cols=True,
+ indent=True,
+ latest_partition=False,
+ cols=cols,
+ )
+ assert (
+ sql
+ == """SELECT trailer AS trailer
+FROM my_table
+LIMIT ?
+OFFSET ?"""
+ )
+
+ # BigQuery-specific SQL
+ try:
+ from pybigquery.sqlalchemy_bigquery import BigQueryDialect
+ except ModuleNotFoundError:
+ return
+
+ database.compile_sqla_query = lambda query: str(
+ query.compile(dialect=BigQueryDialect())
+ )
+ engine.dialect = BigQueryDialect()
+
+ sql = BigQueryEngineSpec.select_star(
+ database=database,
+ table_name="my_table",
+ engine=engine,
+ schema=None,
+ limit=100,
+ show_cols=True,
+ indent=True,
+ latest_partition=False,
+ cols=cols,
+ )
+ assert (
+ sql
+ == """SELECT `trailer` AS `trailer`
+FROM `my_table`
+LIMIT :param_1"""
+ )