This is an automated email from the ASF dual-hosted git repository. yjc pushed a commit to branch perf-recent-activity in repository https://gitbox.apache.org/repos/asf/incubator-superset.git
commit b881d197c515c8da97591cd0b99fccdfd5a6da30 Author: Jesse Yang <[email protected]> AuthorDate: Wed Nov 25 19:06:00 2020 -0800 perf: improve loading speed of recent activities --- superset/extensions.py | 3 +- .../versions/debf575ce1a1_add_index_to_logs.py | 48 +++++++++++++ superset/views/core.py | 82 ++++++++-------------- 3 files changed, 80 insertions(+), 53 deletions(-) diff --git a/superset/extensions.py b/superset/extensions.py index 7011bb2..ff41370 100644 --- a/superset/extensions.py +++ b/superset/extensions.py @@ -29,6 +29,7 @@ from werkzeug.local import LocalProxy from superset.utils.cache_manager import CacheManager from superset.utils.feature_flag_manager import FeatureFlagManager +from superset.utils.log import AbstractEventLogger from superset.utils.machine_auth import MachineAuthProviderFactory @@ -101,7 +102,7 @@ cache_manager = CacheManager() celery_app = celery.Celery() csrf = CSRFProtect() db = SQLA() -_event_logger: Dict[str, Any] = {} +_event_logger: Dict[str, AbstractEventLogger] = {} event_logger = LocalProxy(lambda: _event_logger.get("event_logger")) feature_flag_manager = FeatureFlagManager() machine_auth_provider_factory = MachineAuthProviderFactory() diff --git a/superset/migrations/versions/debf575ce1a1_add_index_to_logs.py b/superset/migrations/versions/debf575ce1a1_add_index_to_logs.py new file mode 100644 index 0000000..33b923a --- /dev/null +++ b/superset/migrations/versions/debf575ce1a1_add_index_to_logs.py @@ -0,0 +1,48 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""add_index_to_logs + +Revision ID: debf575ce1a1 +Revises: a8173232b786 +Create Date: 2020-11-25 17:28:39.383290 + +""" +from alembic import op + +# revision identifiers, used by Alembic. +revision = "debf575ce1a1" +down_revision = "a8173232b786" + + +IDX_LOGS_ACTION = "ix_logs_action" +IDX_LOGS_USER_ID_DTTM = "ix_logs_user_id_dttm" + + +def upgrade(): + with op.batch_alter_table("logs") as batch_op: + batch_op.create_index( + op.f(IDX_LOGS_USER_ID_DTTM), ["user_id", "dttm"], unique=False, + ) + batch_op.create_index( + op.f(IDX_LOGS_ACTION), ["action"], unique=False, + ) + + +def downgrade(): + with op.batch_alter_table("logs") as batch_op: + batch_op.drop_index(op.f(IDX_LOGS_USER_ID_DTTM)) + batch_op.drop_index(op.f(IDX_LOGS_ACTION)) diff --git a/superset/views/core.py b/superset/views/core.py index 72cf12c..06d141f 100755 --- a/superset/views/core.py +++ b/superset/views/core.py @@ -17,8 +17,9 @@ # pylint: disable=comparison-with-callable import logging import re +import time from contextlib import closing -from datetime import datetime, timedelta +from datetime import datetime from typing import Any, Callable, cast, Dict, List, Optional, Union from urllib import parse @@ -1216,7 +1217,7 @@ class Superset(BaseSupersetView): # pylint: disable=too-many-public-methods ) -> FlaskResponse: """Recent activity (actions) for a given user""" limit = request.args.get("limit") - limit = int(limit) if limit and limit.isdigit() else 100 + limit = int(limit) if limit and limit.isdigit() else 50 actions = request.args.get("actions", "explore,dashboard").split(",") # whether to get distinct subjects distinct = request.args.get("distinct") != "false" @@ -1228,60 +1229,37 @@ class Superset(BaseSupersetView): # pylint: disable=too-many-public-methods and_(Slice.slice_name is not None, Slice.slice_name != ""), ) - if distinct: - one_year_ago = datetime.today() - timedelta(days=365) - subqry = ( - db.session.query( - Log.dashboard_id, - Log.slice_id, - Log.action, - func.max(Log.dttm).label("dttm"), - ) - .group_by(Log.dashboard_id, Log.slice_id, Log.action) - .filter( - and_( - Log.action.in_(actions), - Log.user_id == user_id, - # limit to one year of data to improve performance - Log.dttm > one_year_ago, - or_(Log.dashboard_id.isnot(None), Log.slice_id.isnot(None)), - ) - ) - .subquery() - ) - qry = ( - db.session.query( - subqry, - Dashboard.slug.label("dashboard_slug"), - Dashboard.dashboard_title, - Slice.slice_name, - ) - .outerjoin(Dashboard, Dashboard.id == subqry.c.dashboard_id) - .outerjoin(Slice, Slice.id == subqry.c.slice_id,) - .filter(has_subject_title) - .order_by(subqry.c.dttm.desc()) - .limit(limit) + qry = ( + db.session.query( + Log.dttm, + Log.action, + Log.dashboard_id, + Log.slice_id, + Dashboard.slug.label("dashboard_slug"), + Dashboard.dashboard_title, + Slice.slice_name, + ) + .outerjoin(Dashboard, Dashboard.id == Log.dashboard_id) + .outerjoin(Slice, Slice.id == Log.slice_id) + .filter( + and_(Log.action.in_(actions), Log.user_id == user_id, has_subject_title) ) + .order_by(Log.dttm.desc()) + ) + if distinct: + # stream the list and try to find enough distinct results + items = [] + seen = set() + for item in qry.limit(100 * limit).yield_per(5 * limit): + key = (item.dashboard_id, item.slice_id) + if key not in seen: + seen.add(key) + items.append(item) else: - qry = ( - db.session.query( - Log.dttm, - Log.action, - Log.dashboard_id, - Log.slice_id, - Dashboard.slug.label("dashboard_slug"), - Dashboard.dashboard_title, - Slice.slice_name, - ) - .outerjoin(Dashboard, Dashboard.id == Log.dashboard_id) - .outerjoin(Slice, Slice.id == Log.slice_id) - .filter(has_subject_title) - .order_by(Log.dttm.desc()) - .limit(limit) - ) + items = qry.limit(limit).all() payload = [] - for log in qry.all(): + for log in items: item_url = None item_title = None item_type = None
