codeant-ai-for-open-source[bot] commented on code in PR #38416:
URL: https://github.com/apache/superset/pull/38416#discussion_r2907958031


##########
superset/mcp_service/common/popularity.py:
##########
@@ -0,0 +1,359 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Popularity scoring for charts, dashboards, and datasets.
+
+Computes a composite popularity score based on views, favorites,
+relationships, certification, and recency.
+"""
+
+from __future__ import annotations
+
+import logging
+from collections.abc import Callable
+from datetime import datetime, timedelta, timezone
+from typing import Any
+
+import sqlalchemy as sa
+
+from superset.connectors.sqla.models import SqlaTable
+from superset.extensions import db
+from superset.models.core import FavStar, Log
+from superset.models.dashboard import Dashboard, dashboard_slices
+from superset.models.slice import Slice
+from superset.utils import json as json_utils
+
+logger = logging.getLogger(__name__)
+
+# Scoring weights
+VIEW_WEIGHT = 3
+FAV_WEIGHT = 5
+DASHBOARD_COUNT_WEIGHT = 2
+CHART_COUNT_WEIGHT_DASHBOARD = 1
+CHART_COUNT_WEIGHT_DATASET = 3
+CERTIFICATION_BONUS = 10
+PUBLISHED_BONUS = 3
+
+# Recency thresholds and bonuses
+RECENCY_RECENT_DAYS = 7
+RECENCY_MODERATE_DAYS = 30
+RECENCY_RECENT_BONUS = 5.0
+RECENCY_MODERATE_BONUS = 2.0
+
+# Two-pass query limit
+MAX_POPULARITY_SORT_PAGE_SIZE = 100_000
+
+
+def _recency_bonus(changed_on: datetime | None) -> float:
+    """Compute recency bonus based on changed_on timestamp."""
+    if not changed_on:
+        return 0.0
+    now = datetime.now(timezone.utc)
+    # Ensure changed_on is timezone-aware for comparison
+    if changed_on.tzinfo is None:
+        changed_on = changed_on.replace(tzinfo=timezone.utc)
+    delta = now - changed_on
+    if delta <= timedelta(days=RECENCY_RECENT_DAYS):
+        return RECENCY_RECENT_BONUS
+    if delta <= timedelta(days=RECENCY_MODERATE_DAYS):
+        return RECENCY_MODERATE_BONUS
+    return 0.0
+
+
+def _init_scores(ids: list[int]) -> dict[int, float]:
+    """Initialize a score dict with 0.0 for each ID."""
+    return {i: 0.0 for i in ids}  # noqa: C420
+
+
+def _add_view_scores(
+    scores: dict[int, float],
+    action: str,
+    id_column: sa.Column,
+    entity_ids: list[int],
+    cutoff: datetime,
+    weight: int,
+) -> None:
+    """Add view count scores from the logs table."""
+    view_counts = (
+        db.session.query(id_column, sa.func.count(Log.id).label("view_count"))
+        .filter(Log.action == action, id_column.in_(entity_ids), Log.dttm >= 
cutoff)
+        .group_by(id_column)
+        .all()
+    )
+    for row in view_counts:
+        entity_id = row[0]
+        if entity_id in scores:
+            scores[entity_id] += row.view_count * weight
+
+
+def _add_fav_scores(
+    scores: dict[int, float],
+    class_name: str,
+    entity_ids: list[int],
+    weight: int,
+) -> None:
+    """Add favorite count scores from the favstar table."""
+    fav_counts = (
+        db.session.query(FavStar.obj_id, 
sa.func.count(FavStar.id).label("fav_count"))
+        .filter(FavStar.class_name == class_name, 
FavStar.obj_id.in_(entity_ids))
+        .group_by(FavStar.obj_id)
+        .all()
+    )
+    for row in fav_counts:
+        if row.obj_id in scores:
+            scores[row.obj_id] += row.fav_count * weight
+
+
+def compute_chart_popularity(chart_ids: list[int], days: int = 30) -> 
dict[int, float]:
+    """Compute popularity scores for charts.
+
+    Formula: view_count_30d * 3 + fav_count * 5 + dashboard_count * 2
+             + is_certified * 10 + recency_bonus
+
+    Args:
+        chart_ids: List of chart IDs to score
+        days: Number of days for view count window
+
+    Returns:
+        Dict mapping chart_id -> popularity score
+    """
+    if not chart_ids:
+        return {}
+
+    scores = _init_scores(chart_ids)
+    cutoff = datetime.now(timezone.utc) - timedelta(days=days)
+
+    _add_view_scores(
+        scores, "mount_explorer", Log.slice_id, chart_ids, cutoff, VIEW_WEIGHT
+    )
+    _add_fav_scores(scores, "slice", chart_ids, FAV_WEIGHT)
+
+    # Dashboard count (how many dashboards contain each chart)
+    dash_counts = (
+        db.session.query(
+            dashboard_slices.c.slice_id,
+            sa.func.count(dashboard_slices.c.dashboard_id).label("dash_count"),
+        )
+        .filter(dashboard_slices.c.slice_id.in_(chart_ids))
+        .group_by(dashboard_slices.c.slice_id)
+        .all()
+    )
+    for row in dash_counts:
+        if row.slice_id in scores:
+            scores[row.slice_id] += row.dash_count * DASHBOARD_COUNT_WEIGHT
+
+    # Certification and recency from chart objects
+    charts = (
+        db.session.query(Slice.id, Slice.certified_by, Slice.changed_on)
+        .filter(Slice.id.in_(chart_ids))
+        .all()
+    )
+    for chart in charts:
+        if chart.id in scores:
+            if chart.certified_by:
+                scores[chart.id] += CERTIFICATION_BONUS
+            scores[chart.id] += _recency_bonus(chart.changed_on)
+
+    return scores
+
+
+def compute_dashboard_popularity(
+    dashboard_ids: list[int], days: int = 30
+) -> dict[int, float]:
+    """Compute popularity scores for dashboards.
+
+    Formula: view_count_30d * 3 + fav_count * 5 + chart_count * 1
+             + is_published * 3 + is_certified * 10 + recency_bonus
+
+    Args:
+        dashboard_ids: List of dashboard IDs to score
+        days: Number of days for view count window
+
+    Returns:
+        Dict mapping dashboard_id -> popularity score
+    """
+    if not dashboard_ids:
+        return {}
+
+    scores = _init_scores(dashboard_ids)
+    cutoff = datetime.now(timezone.utc) - timedelta(days=days)
+
+    _add_view_scores(
+        scores, "mount_dashboard", Log.dashboard_id, dashboard_ids, cutoff, 
VIEW_WEIGHT
+    )
+    _add_fav_scores(scores, "Dashboard", dashboard_ids, FAV_WEIGHT)
+
+    # Chart count per dashboard
+    chart_counts = (
+        db.session.query(
+            dashboard_slices.c.dashboard_id,
+            sa.func.count(dashboard_slices.c.slice_id).label("chart_count"),
+        )
+        .filter(dashboard_slices.c.dashboard_id.in_(dashboard_ids))
+        .group_by(dashboard_slices.c.dashboard_id)
+        .all()
+    )
+    for row in chart_counts:
+        if row.dashboard_id in scores:
+            scores[row.dashboard_id] += row.chart_count * 
CHART_COUNT_WEIGHT_DASHBOARD
+
+    # Published, certification, and recency from dashboard objects
+    dashboards = (
+        db.session.query(
+            Dashboard.id,
+            Dashboard.published,
+            Dashboard.certified_by,
+            Dashboard.changed_on,
+        )
+        .filter(Dashboard.id.in_(dashboard_ids))
+        .all()
+    )
+    for dash in dashboards:
+        if dash.id in scores:
+            if dash.published:
+                scores[dash.id] += PUBLISHED_BONUS
+            if dash.certified_by:
+                scores[dash.id] += CERTIFICATION_BONUS
+            scores[dash.id] += _recency_bonus(dash.changed_on)
+
+    return scores
+
+
+def compute_dataset_popularity(
+    dataset_ids: list[int], days: int = 30
+) -> dict[int, float]:
+    """Compute popularity scores for datasets.
+
+    Formula: chart_count * 3 + is_certified * 10 + recency_bonus
+
+    Args:
+        dataset_ids: List of dataset IDs to score
+        days: Number of days (unused for datasets, kept for API consistency)
+
+    Returns:
+        Dict mapping dataset_id -> popularity score
+    """
+    if not dataset_ids:
+        return {}
+
+    scores = _init_scores(dataset_ids)
+
+    # 1. Chart count per dataset (how many charts use each dataset)
+    chart_counts = (
+        db.session.query(
+            Slice.datasource_id,
+            sa.func.count(Slice.id).label("chart_count"),
+        )
+        .filter(
+            Slice.datasource_id.in_(dataset_ids),
+            Slice.datasource_type == "table",
+        )
+        .group_by(Slice.datasource_id)
+        .all()
+    )
+    for row in chart_counts:
+        if row.datasource_id in scores:
+            scores[row.datasource_id] += row.chart_count * 
CHART_COUNT_WEIGHT_DATASET
+
+    # 2. Certification and recency from dataset objects
+    # Datasets use CertificationMixin where certification is in the extra JSON
+    datasets = (
+        db.session.query(SqlaTable.id, SqlaTable.extra, SqlaTable.changed_on)
+        .filter(SqlaTable.id.in_(dataset_ids))  # type: 
ignore[attr-defined,unused-ignore]
+        .all()
+    )
+    for ds in datasets:
+        if ds.id in scores:
+            # Check certification from extra JSON
+            if ds.extra:
+                try:
+                    extra_dict = json_utils.loads(ds.extra)
+                    if extra_dict.get("certification"):
+                        scores[ds.id] += CERTIFICATION_BONUS
+                except (TypeError, ValueError):
+                    pass
+            scores[ds.id] += _recency_bonus(ds.changed_on)
+
+    return scores
+
+
+def get_popularity_sorted_ids(
+    compute_fn: Callable[..., dict[int, float]],
+    dao_class: Any,
+    filters: list[Any] | None,
+    search: str | None,
+    search_columns: list[str],
+    order_direction: str = "desc",
+    days: int = 30,
+) -> tuple[list[int], dict[int, float], int]:
+    """Fetch all matching IDs, compute popularity scores, return sorted IDs.
+
+    This is the "two-pass" approach: first get all matching IDs (lightweight),
+    then compute scores and sort by popularity.
+
+    Args:
+        compute_fn: One of compute_chart/dashboard/dataset_popularity
+        dao_class: DAO class to query
+        filters: Column operator filters
+        search: Search string
+        search_columns: Columns to search
+        order_direction: "asc" or "desc"
+        days: Days window for view counts
+
+    Returns:
+        Tuple of (sorted_ids, scores_dict, total_count)
+    """
+    # Use a large page_size to get all matching IDs in one query
+    all_items, total_count = dao_class.list(
+        column_operators=filters,
+        order_column="changed_on",
+        order_direction="desc",
+        page=0,
+        page_size=MAX_POPULARITY_SORT_PAGE_SIZE,
+        search=search,
+        search_columns=search_columns,
+        columns=["id"],
+    )
+
+    if not all_items:
+        return [], {}, 0
+
+    all_ids: list[int] = [
+        item_id
+        for item in all_items
+        if (item_id := getattr(item, "id", None)) is not None
+    ]
+
+    if not all_ids:
+        return [], {}, total_count
+
+    # Compute popularity scores
+    scores = compute_fn(all_ids, days=days)
+
+    # Sort by score
+    reverse = order_direction == "desc"
+    sorted_ids = sorted(all_ids, key=lambda x: scores.get(x, 0.0), 
reverse=reverse)
+
+    return sorted_ids, scores, total_count

Review Comment:
   **Suggestion:** When there are more matching rows than 
MAX_POPULARITY_SORT_PAGE_SIZE, dao_class.list returns only the first N items 
but total_count still reflects the full count; get_popularity_sorted_ids then 
computes scores and sorted_ids only for this truncated subset while returning 
the full total_count, which makes downstream pagination report additional pages 
that can never return results and misrepresents how many items are actually 
sortable by popularity. [logic error]
   
   <details>
   <summary><b>Severity Level:</b> Major ⚠️</summary>
   
   ```mdx
   - ⚠️ Popularity-sorted MCP list tools misreport total result counts.
   - ⚠️ Users see empty pages when paginating beyond returned subset.
   - ⚠️ Popularity sorting unreliable for large chart/dashboard/dataset 
collections.
   ```
   </details>
   
   ```suggestion
       all_items, total_count = dao_class.list(
           column_operators=filters,
           order_column="changed_on",
           order_direction="desc",
           page=0,
           page_size=MAX_POPULARITY_SORT_PAGE_SIZE,
           search=search,
           search_columns=search_columns,
           columns=["id"],
       )
   
       if not all_items:
           return [], {}, 0
   
       all_ids: list[int] = [
           item_id
           for item in all_items
           if (item_id := getattr(item, "id", None)) is not None
       ]
   
       # Cap total_count to the number of IDs actually loaded to keep 
pagination consistent
       total_count = len(all_ids)
   
       if not all_ids:
           return [], {}, total_count
   
       # Compute popularity scores
       scores = compute_fn(all_ids, days=days)
   
       # Sort by score
       reverse = order_direction == "desc"
       sorted_ids = sorted(all_ids, key=lambda x: scores.get(x, 0.0), 
reverse=reverse)
   
       return sorted_ids, scores, total_count
   ```
   <details>
   <summary><b>Steps of Reproduction ✅ </b></summary>
   
   ```mdx
   1. Configure a DAO class whose `list` method is used with popularity 
sorting, and ensure
   it can return more than `MAX_POPULARITY_SORT_PAGE_SIZE` matching rows (e.g., 
>100,000
   dashboards) while still reporting the full total count; this DAO is passed 
as `dao_class`
   into `get_popularity_sorted_ids` in 
`superset/mcp_service/common/popularity.py:294-352`.
   
   2. Call `get_popularity_sorted_ids` (in
   `superset/mcp_service/common/popularity.py:294-352`) with that `dao_class`,
   `order_direction="desc"`, and filters/search that match all rows; 
`dao_class.list` returns
   `all_items` truncated to `MAX_POPULARITY_SORT_PAGE_SIZE` plus `total_count` 
equal to the
   full row count.
   
   3. Observe in `get_popularity_sorted_ids` that `all_ids` is built only from 
`all_items`
   (lines 337-341) so `sorted_ids` contains at most 
`MAX_POPULARITY_SORT_PAGE_SIZE` IDs, but
   the function returns the unmodified `total_count` from `dao_class.list` 
(line 352), which
   is larger than `len(sorted_ids)`.
   
   4. Any caller that uses the returned `total_count` for pagination (e.g., an 
MCP list tool
   that exposes `order_column="popularity_score"` and computes number of pages 
from
   `total_count`) will request pages beyond `ceil(len(sorted_ids)/page_size)`; 
those later
   page requests slice past the end of `sorted_ids` and yield empty result sets 
even though
   the reported `total_count` suggests more data is available.
   ```
   </details>
   <details>
   <summary><b>Prompt for AI Agent 🤖 </b></summary>
   
   ```mdx
   This is a comment left during a code review.
   
   **Path:** superset/mcp_service/common/popularity.py
   **Line:** 321:352
   **Comment:**
        *Logic Error: When there are more matching rows than 
MAX_POPULARITY_SORT_PAGE_SIZE, dao_class.list returns only the first N items 
but total_count still reflects the full count; get_popularity_sorted_ids then 
computes scores and sorted_ids only for this truncated subset while returning 
the full total_count, which makes downstream pagination report additional pages 
that can never return results and misrepresents how many items are actually 
sortable by popularity.
   
   Validate the correctness of the flagged issue. If correct, How can I resolve 
this? If you propose a fix, implement it and please make it concise.
   ```
   </details>
   <a 
href='https://app.codeant.ai/feedback?pr_url=https%3A%2F%2Fgithub.com%2Fapache%2Fsuperset%2Fpull%2F38416&comment_hash=56f3f619132f772740d3e417fad57e6d92d4ef6452fded7314020dd56969b824&reaction=like'>👍</a>
 | <a 
href='https://app.codeant.ai/feedback?pr_url=https%3A%2F%2Fgithub.com%2Fapache%2Fsuperset%2Fpull%2F38416&comment_hash=56f3f619132f772740d3e417fad57e6d92d4ef6452fded7314020dd56969b824&reaction=dislike'>👎</a>



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to