ktmud commented on a change in pull request #13561:
URL: https://github.com/apache/superset/pull/13561#discussion_r596271660



##########
File path: requirements/development.txt
##########
@@ -6,38 +6,85 @@
 #    pip-compile-multi
 #
 -r base.txt
--e file:.                 # via -r requirements/base.in
-boto3==1.16.10            # via tabulator
-botocore==1.19.10         # via boto3, s3transfer
-cached-property==1.5.2    # via tableschema
-certifi==2020.6.20        # via requests
-deprecated==1.2.11        # via pygithub
-et-xmlfile==1.0.1         # via openpyxl
-flask-cors==3.0.9         # via -r requirements/development.in
-future==0.18.2            # via pyhive
-ijson==3.1.2.post0        # via tabulator
-jdcal==1.4.1              # via openpyxl
-jmespath==0.10.0          # via boto3, botocore
-jsonlines==1.2.0          # via tabulator
-linear-tsv==1.1.0         # via tabulator
-mysqlclient==1.4.2.post1  # via -r requirements/development.in
-openpyxl==3.0.5           # via tabulator
-pillow==7.2.0             # via -r requirements/development.in
-psycopg2-binary==2.8.5    # via -r requirements/development.in
-pydruid==0.6.1            # via -r requirements/development.in
-pygithub==1.54.1          # via -r requirements/development.in
-pyhive[hive]==0.6.3       # via -r requirements/development.in
-requests==2.24.0          # via pydruid, pygithub, tableschema, tabulator
-rfc3986==1.4.0            # via tableschema
-s3transfer==0.3.3         # via boto3
-sasl==0.2.1               # via pyhive, thrift-sasl
-tableschema==1.20.0       # via -r requirements/development.in
-tabulator==1.52.5         # via tableschema
-thrift-sasl==0.4.2        # via pyhive
-thrift==0.13.0            # via -r requirements/development.in, pyhive, 
thrift-sasl
-unicodecsv==0.14.1        # via tableschema, tabulator
-wrapt==1.12.1             # via deprecated
-xlrd==1.2.0               # via tabulator
+-e file:.
+    # via -r requirements/base.in
+boto3==1.16.10
+    # via tabulator
+botocore==1.19.10
+    # via
+    #   boto3
+    #   s3transfer
+cached-property==1.5.2
+    # via tableschema
+certifi==2020.6.20
+    # via requests
+deprecated==1.2.11
+    # via pygithub
+et-xmlfile==1.0.1
+    # via openpyxl
+flask-cors==3.0.9
+    # via -r requirements/development.in
+future==0.18.2
+    # via pyhive
+ijson==3.1.2.post0
+    # via tabulator
+jdcal==1.4.1
+    # via openpyxl
+jmespath==0.10.0
+    # via
+    #   boto3
+    #   botocore
+jsonlines==1.2.0
+    # via tabulator
+linear-tsv==1.1.0
+    # via tabulator
+mysqlclient==1.4.2.post1
+    # via -r requirements/development.in
+openpyxl==3.0.5
+    # via tabulator
+pillow==7.2.0
+    # via -r requirements/development.in
+psycopg2-binary==2.8.5
+    # via -r requirements/development.in
+pydruid==0.6.1
+    # via -r requirements/development.in
+pygithub==1.54.1
+    # via -r requirements/development.in
+pyhive[hive]==0.6.3
+    # via -r requirements/development.in
+requests==2.24.0
+    # via
+    #   pydruid
+    #   pygithub
+    #   tableschema
+    #   tabulator
+rfc3986==1.4.0
+    # via tableschema
+s3transfer==0.3.3
+    # via boto3
+sasl==0.2.1
+    # via
+    #   pyhive
+    #   thrift-sasl
+tableschema==1.20.0
+    # via -r requirements/development.in
+tabulator==1.52.5
+    # via tableschema
+thrift-sasl==0.4.2
+    # via pyhive
+thrift==0.13.0
+    # via
+    #   -r requirements/development.in
+    #   pyhive
+    #   thrift-sasl
+unicodecsv==0.14.1
+    # via
+    #   tableschema
+    #   tabulator
+wrapt==1.12.1
+    # via deprecated
+xlrd==1.2.0
+    # via tabulator

Review comment:
       Was this updated by Prettier or an upgrade of `pip-compile`?

##########
File path: superset/utils/data.py
##########
@@ -161,5 +227,145 @@ def generate_data(columns: List[ColumnInfo], num_rows: 
int) -> List[Dict[str, An
 
 
 def generate_column_data(column: ColumnInfo, num_rows: int) -> List[Any]:
-    func = get_type_generator(column["type"])
-    return [func() for _ in range(num_rows)]
+    gen = get_type_generator(column["type"])
+    return [gen() for _ in range(num_rows)]
+
+
+def add_to_model(session: Session, model: Type[Model], count: int) -> 
List[Model]:
+    """
+    Add entities of a given model.
+
+    :param Model model: a Superset/FAB model
+    :param int count: how many entities to generate and insert
+    """
+    inspector = inspect(model)
+
+    # select samples to copy relationship values
+    relationships = inspector.relationships.items()
+    samples = session.query(model).limit(count).all() if relationships else []
+
+    entities: List[Model] = []
+    max_primary_key: Optional[int] = None
+    for i in range(count):
+        sample = samples[i % len(samples)] if samples else None
+        kwargs = {}
+        for column in inspector.columns.values():
+            # for primary keys, keep incrementing
+            if column.primary_key:
+                if max_primary_key is None:
+                    max_primary_key = (
+                        session.query(func.max(getattr(model, 
column.name))).scalar()
+                        or 0
+                    )
+                max_primary_key += 1
+                kwargs[column.name] = max_primary_key
+
+            # if the column has a foreign key, copy the value from an existing 
entity
+            elif column.foreign_keys:
+                if sample:
+                    kwargs[column.name] = getattr(sample, column.name)
+                else:
+                    kwargs[column.name] = get_valid_foreign_key(column)
+
+            # should be an enum but it's not
+            elif column.name == "datasource_type":
+                kwargs[column.name] = "table"
+
+            # otherwise, generate a random value based on the type
+            else:
+                kwargs[column.name] = generate_value(column)
+
+        entities.append(model(**kwargs))
+
+    session.add_all(entities)
+    return entities
+
+
+def get_valid_foreign_key(column: Column) -> Any:
+    foreign_key = list(column.foreign_keys)[0]
+    table_name, column_name = foreign_key.target_fullname.split(".", 1)
+    return db.engine.execute(f"SELECT {column_name} FROM {table_name} LIMIT 
1").scalar()
+
+
+def generate_value(column: Column) -> Any:
+    if hasattr(column.type, "enums"):
+        return random.choice(column.type.enums)
+
+    json_as_string = "json" in column.name.lower() and isinstance(
+        column.type, sqlalchemy.sql.sqltypes.Text
+    )
+    type_ = sqlalchemy.sql.sqltypes.JSON() if json_as_string else column.type
+    value = get_type_generator(type_)()
+    if json_as_string:
+        value = json.dumps(value)
+    return value
+
+
+def find_models(module: ModuleType) -> List[Type[Model]]:
+    """
+    Find all models in a migration script.
+    """
+    models: List[Type[Model]] = []
+    tables = extract_modified_tables(module)
+
+    # add models defined explicitly in the migration script
+    queue = list(module.__dict__.values())
+    while queue:
+        obj = queue.pop()
+        if hasattr(obj, "__tablename__"):
+            tables.add(obj.__tablename__)
+        elif isinstance(obj, list):
+            queue.extend(obj)
+        elif isinstance(obj, dict):
+            queue.extend(obj.values())
+
+    # add implicit models
+    # pylint: disable=no-member, protected-access
+    for obj in Model._decl_class_registry.values():
+        if hasattr(obj, "__table__") and obj.__table__.fullname in tables:
+            models.append(obj)
+
+    # sort topologically so we can create entities in order and
+    # maintain relationships (eg, create a database before creating
+    # a slice)
+    sorter = TopologicalSorter()
+    for model in models:
+        inspector = inspect(model)
+        dependent_tables: List[str] = []
+        for column in inspector.columns.values():
+            for foreign_key in column.foreign_keys:
+                
dependent_tables.append(foreign_key.target_fullname.split(".")[0])
+        sorter.add(model.__tablename__, *dependent_tables)
+    order = list(sorter.static_order())
+    models.sort(key=lambda model: order.index(model.__tablename__))
+
+    return models
+
+
+def import_migration_script(filepath: Path) -> ModuleType:
+    """
+    Import migration script as if it were a module.
+    """
+    spec = importlib.util.spec_from_file_location(filepath.stem, filepath)
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)  # type: ignore
+    return module

Review comment:
       Do you think it'd be better to keep `import_migration_script` and 
`extract_modified_tables`  within the benchmark script? They don't seem to be 
useful in other parts of Superset anyway.
   
   I was also a little confused by the module name `superset.utils.data`, maybe 
`superset.utils.mock_data` or `superset.utils.sample_data` would be clearer? 

##########
File path: scripts/benchmark_migration.py
##########
@@ -0,0 +1,127 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import logging
+import time
+from collections import defaultdict
+from pathlib import Path
+from typing import Dict, List, Type
+
+import click
+from flask_appbuilder import Model
+from flask_migrate import downgrade, upgrade
+from sqlalchemy.orm import Session
+
+from superset import db
+from superset.utils.data import add_to_model, find_models, 
import_migration_script
+
+logger = logging.getLogger(__name__)
+
+
[email protected]()
[email protected]("filepath")
[email protected]("--limit", default=1000, help="Maximum number of entities.")
[email protected]("--force", is_flag=True, help="Do not prompt for confirmation.")
+def main(filepath: str, limit: int = 1000, force: bool = False) -> None:
+    session = db.session()
+
+    print(f"Importing migration script: {filepath}")
+    module = import_migration_script(Path(filepath))
+
+    revision: str = getattr(module, "revision", "")
+    down_revision: str = getattr(module, "down_revision", "")
+    if not revision or not down_revision:
+        raise Exception(
+            "Not a valid migration script, couldn't find 
down_revision/revision"
+        )
+
+    print(f"Migration goes from {down_revision} to {revision}")
+    current_revision = db.engine.execute(
+        "SELECT version_num FROM alembic_version"
+    ).scalar()
+    print(f"Current version of the DB is {current_revision}")
+
+    print("\nIdentifying models used in the migration:")
+    models = find_models(module)
+    model_rows: Dict[Type[Model], int] = {}
+    for model in models:
+        rows = session.query(model).count()
+        print(f"- {model.__name__} ({rows} rows in table 
{model.__tablename__})")
+        model_rows[model] = rows
+
+    if not force:
+        click.confirm(
+            f"\nDowngrade DB to {down_revision} and start benchmark?", 
abort=True
+        )
+    downgrade(revision=down_revision)
+
+    print("Benchmarking migration")
+    results: Dict[str, float] = {}
+    start = time.time()
+    upgrade(revision=revision)
+    duration = time.time() - start
+    results["Current"] = duration
+    print(f"Migration on current DB took: {duration:.2f} seconds")
+
+    min_entities = 10
+    new_models: Dict[Type[Model], List[Model]] = defaultdict(list)
+    while min_entities <= limit:
+        downgrade(revision=down_revision)
+        print(f"Running with at least {min_entities} entities of each model")
+        for model in models:
+            missing = min_entities - model_rows[model]
+            if missing > 0:
+                print(f"- Adding {missing} entities to the {model.__name__} 
model")
+                try:
+                    new_models[model].extend(add_to_model(session, model, 
missing))

Review comment:
       It seems this will keep all created models in memory, which could 
potentially interfere with performance if the test db is also on the local 
machine. I'm wondering whether we should just find a way to always run 
benchmarking on a sample database and truncate the whole database after 
finishing, instead?

##########
File path: scripts/benchmark_migration.py
##########
@@ -0,0 +1,127 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import logging
+import time
+from collections import defaultdict
+from pathlib import Path
+from typing import Dict, List, Type
+
+import click
+from flask_appbuilder import Model
+from flask_migrate import downgrade, upgrade
+from sqlalchemy.orm import Session
+
+from superset import db
+from superset.utils.data import add_to_model, find_models, 
import_migration_script
+
+logger = logging.getLogger(__name__)
+
+
[email protected]()
[email protected]("filepath")
[email protected]("--limit", default=1000, help="Maximum number of entities.")
[email protected]("--force", is_flag=True, help="Do not prompt for confirmation.")
+def main(filepath: str, limit: int = 1000, force: bool = False) -> None:
+    session = db.session()
+
+    print(f"Importing migration script: {filepath}")
+    module = import_migration_script(Path(filepath))
+
+    revision: str = getattr(module, "revision", "")
+    down_revision: str = getattr(module, "down_revision", "")
+    if not revision or not down_revision:
+        raise Exception(
+            "Not a valid migration script, couldn't find 
down_revision/revision"
+        )
+
+    print(f"Migration goes from {down_revision} to {revision}")
+    current_revision = db.engine.execute(
+        "SELECT version_num FROM alembic_version"
+    ).scalar()
+    print(f"Current version of the DB is {current_revision}")
+
+    print("\nIdentifying models used in the migration:")
+    models = find_models(module)
+    model_rows: Dict[Type[Model], int] = {}
+    for model in models:
+        rows = session.query(model).count()
+        print(f"- {model.__name__} ({rows} rows in table 
{model.__tablename__})")
+        model_rows[model] = rows
+
+    if not force:
+        click.confirm(
+            f"\nDowngrade DB to {down_revision} and start benchmark?", 
abort=True
+        )
+    downgrade(revision=down_revision)

Review comment:
     Do we need to check whether the current version is `revision`? Otherwise 
this may end up downgrading across multiple versions. 

##########
File path: superset/utils/data.py
##########
@@ -53,24 +72,35 @@
 days_range = (MAXIMUM_DATE - MINIMUM_DATE).days
 
 
+# pylint: disable=too-many-return-statements, too-many-branches
 def get_type_generator(sqltype: sqlalchemy.sql.sqltypes) -> Callable[[], Any]:
-    if isinstance(sqltype, sqlalchemy.sql.sqltypes.INTEGER):
+    if isinstance(
+        sqltype, (sqlalchemy.sql.sqltypes.INTEGER, 
sqlalchemy.sql.sqltypes.Integer)
+    ):
         return lambda: random.randrange(2147483647)
 
     if isinstance(sqltype, sqlalchemy.sql.sqltypes.BIGINT):
         return lambda: random.randrange(sys.maxsize)
 
-    if isinstance(sqltype, sqlalchemy.sql.sqltypes.VARCHAR):
+    if isinstance(
+        sqltype, (sqlalchemy.sql.sqltypes.VARCHAR, 
sqlalchemy.sql.sqltypes.String)
+    ):
         length = random.randrange(sqltype.length or 255)
+        length = max(8, length)  # for unique values
+        length = min(100, length)  # for FAB perms
         return lambda: "".join(random.choices(string.printable, k=length))
 
-    if isinstance(sqltype, sqlalchemy.sql.sqltypes.TEXT):
+    if isinstance(
+        sqltype, (sqlalchemy.sql.sqltypes.TEXT, sqlalchemy.sql.sqltypes.Text)
+    ):
         length = random.randrange(65535)
         # "practicality beats purity"
         length = max(length, 2048)
         return lambda: "".join(random.choices(string.printable, k=length))
 
-    if isinstance(sqltype, sqlalchemy.sql.sqltypes.BOOLEAN):
+    if isinstance(
+        sqltype, (sqlalchemy.sql.sqltypes.BOOLEAN, 
sqlalchemy.sql.sqltypes.Boolean)
+    ):

Review comment:
       Are these changes related to the benchmark script or are they just 
bycatch?

##########
File path: superset/utils/data.py
##########
@@ -161,5 +227,145 @@ def generate_data(columns: List[ColumnInfo], num_rows: 
int) -> List[Dict[str, An
 
 
 def generate_column_data(column: ColumnInfo, num_rows: int) -> List[Any]:
-    func = get_type_generator(column["type"])
-    return [func() for _ in range(num_rows)]
+    gen = get_type_generator(column["type"])
+    return [gen() for _ in range(num_rows)]
+
+
+def add_to_model(session: Session, model: Type[Model], count: int) -> 
List[Model]:

Review comment:
       ```suggestion
   def add_sample_rows(session: Session, model: Type[Model], count: int) -> 
List[Model]:
   ```
   
   Nit: naming suggestion, as `add_to_model` sounds like adding some attributes 
to a model instance.

##########
File path: scripts/benchmark_migration.py
##########
@@ -0,0 +1,127 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import logging
+import time
+from collections import defaultdict
+from pathlib import Path
+from typing import Dict, List, Type
+
+import click
+from flask_appbuilder import Model
+from flask_migrate import downgrade, upgrade
+from sqlalchemy.orm import Session
+
+from superset import db
+from superset.utils.data import add_to_model, find_models, 
import_migration_script
+
+logger = logging.getLogger(__name__)
+
+
[email protected]()
[email protected]("filepath")
[email protected]("--limit", default=1000, help="Maximum number of entities.")
[email protected]("--force", is_flag=True, help="Do not prompt for confirmation.")
+def main(filepath: str, limit: int = 1000, force: bool = False) -> None:
+    session = db.session()
+
+    print(f"Importing migration script: {filepath}")
+    module = import_migration_script(Path(filepath))
+
+    revision: str = getattr(module, "revision", "")
+    down_revision: str = getattr(module, "down_revision", "")
+    if not revision or not down_revision:
+        raise Exception(
+            "Not a valid migration script, couldn't find 
down_revision/revision"
+        )
+
+    print(f"Migration goes from {down_revision} to {revision}")
+    current_revision = db.engine.execute(
+        "SELECT version_num FROM alembic_version"
+    ).scalar()
+    print(f"Current version of the DB is {current_revision}")
+
+    print("\nIdentifying models used in the migration:")
+    models = find_models(module)
+    model_rows: Dict[Type[Model], int] = {}
+    for model in models:
+        rows = session.query(model).count()
+        print(f"- {model.__name__} ({rows} rows in table 
{model.__tablename__})")
+        model_rows[model] = rows
+
+    if not force:
+        click.confirm(
+            f"\nDowngrade DB to {down_revision} and start benchmark?", 
abort=True
+        )
+    downgrade(revision=down_revision)
+
+    print("Benchmarking migration")
+    results: Dict[str, float] = {}
+    start = time.time()
+    upgrade(revision=revision)
+    duration = time.time() - start
+    results["Current"] = duration
+    print(f"Migration on current DB took: {duration:.2f} seconds")

Review comment:
     I'm not sure about the benefit of running an extra round of downgrade + 
upgrade before benchmarking with potentially generated data.

##########
File path: scripts/benchmark_migration.py
##########
@@ -0,0 +1,127 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import logging
+import time
+from collections import defaultdict
+from pathlib import Path
+from typing import Dict, List, Type
+
+import click
+from flask_appbuilder import Model
+from flask_migrate import downgrade, upgrade
+from sqlalchemy.orm import Session
+
+from superset import db
+from superset.utils.data import add_to_model, find_models, 
import_migration_script
+
+logger = logging.getLogger(__name__)
+
+
[email protected]()
[email protected]("filepath")
[email protected]("--limit", default=1000, help="Maximum number of entities.")
[email protected]("--force", is_flag=True, help="Do not prompt for confirmation.")
+def main(filepath: str, limit: int = 1000, force: bool = False) -> None:
+    session = db.session()
+
+    print(f"Importing migration script: {filepath}")
+    module = import_migration_script(Path(filepath))
+
+    revision: str = getattr(module, "revision", "")
+    down_revision: str = getattr(module, "down_revision", "")
+    if not revision or not down_revision:
+        raise Exception(
+            "Not a valid migration script, couldn't find 
down_revision/revision"
+        )
+
+    print(f"Migration goes from {down_revision} to {revision}")
+    current_revision = db.engine.execute(
+        "SELECT version_num FROM alembic_version"
+    ).scalar()
+    print(f"Current version of the DB is {current_revision}")
+
+    print("\nIdentifying models used in the migration:")
+    models = find_models(module)
+    model_rows: Dict[Type[Model], int] = {}
+    for model in models:
+        rows = session.query(model).count()
+        print(f"- {model.__name__} ({rows} rows in table 
{model.__tablename__})")
+        model_rows[model] = rows
+
+    if not force:
+        click.confirm(
+            f"\nDowngrade DB to {down_revision} and start benchmark?", 
abort=True

Review comment:
       ```suggestion
               f"\nRunning benchmarking will downgrade the Superset db to 
{down_revision} and upgrade to {revision} again."
                 "There may be data loss in downgrades. Continue?",
               abort=True
   ```




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to