bito-code-review[bot] commented on code in PR #39509:
URL: https://github.com/apache/superset/pull/39509#discussion_r3347470580


##########
tests/unit_tests/models/test_helpers_offset.py:
##########
@@ -0,0 +1,88 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import ast
+from pathlib import Path
+
+HELPERS_PATH = (
+    Path(__file__).resolve().parents[3] / "superset" / "models" / "helpers.py"
+)
+
+
+def _uses_supports_offset(node: ast.AST) -> bool:
+    """True if any attribute access on `node` references 'supports_offset'."""
+    return any(
+        isinstance(child, ast.Attribute) and child.attr == "supports_offset"
+        for child in ast.walk(node)
+    )
+
+
+def _is_qry_offset_assignment(stmt: ast.AST) -> bool:
+    """True if stmt is `qry = qry.offset(...)` (any LHS, call to `.offset`)."""
+    if not isinstance(stmt, ast.Assign):
+        return False
+    call = stmt.value
+    if not isinstance(call, ast.Call):
+        return False
+    func = call.func
+    return isinstance(func, ast.Attribute) and func.attr == "offset"
+
+
+def test_helpers_guards_offset_with_supports_offset_flag() -> None:
+    """
+    Regression guard: the `.offset()` call in get_sqla_query must be wrapped
+    in an `if` that checks `supports_offset`. Without this guard,
+    engines that do not support OFFSET (Elasticsearch SQL) crash drill-
+    to-detail on page 2+.
+
+    We parse the AST rather than grep the source so the test survives
+    Black-style reformatting and trivial refactors.
+    """
+    source = HELPERS_PATH.read_text()
+    assert "supports_offset" in source, (

Review Comment:
   <!-- Bito Reply -->
   The reviewer's suggestion to replace `assert` with `pytest.fail()` follows a
general best practice for production code: `assert` statements are stripped
when Python runs with the `-O` optimization flag, so they should not guard
runtime behavior. However, in a unit-test module, where `pytest` is the
standard runner and `assert` is the idiomatic way to verify conditions, that
concern does not apply, and your assessment that this is a false positive is
correct. You are not required to apply this suggestion.
   
   **tests/unit_tests/models/test_helpers_offset.py**
   ```
   -    assert "supports_offset" in source, (
   +    if "supports_offset" not in source:
   +        pytest.fail(
             "helpers.py no longer references supports_offset; the OFFSET "
             "guard is gone — Elasticsearch drill-to-detail will crash on page 2+."
   -    )
   +        )
   ```



##########
superset/db_engine_specs/elasticsearch.py:
##########
@@ -29,16 +32,108 @@
     SupersetDBAPIProgrammingError,
 )
 
+if TYPE_CHECKING:
+    from superset.models.core import Database
+
 logger = logging.getLogger()
 
 
+def _fetch_page_via_cursor(
+    database: Database,
+    sql: str,
+    page_index: int,
+    page_size: int,
+    sql_path: str,
+    close_path: str,
+) -> tuple[list[list[Any]], list[str]]:
+    """
+    Iterate Elasticsearch/OpenSearch SQL cursor pagination to return a single
+    page of results.
+
+    Executes ``sql`` with ``fetch_size = page_size``, then sends cursor
+    follow-up requests ``page_index`` times to skip earlier pages. Closes the
+    cursor when done to release server-side state. Returns
+    ``(rows, columns)``.
+
+    If the dataset is exhausted before reaching ``page_index``, returns an
+    empty rows list with the column names from the initial request.
+
+    Note: the Elasticsearch SQL cursor is forward-only, so cost is linear in
+    ``page_index`` — reaching page N issues N round trips to the cluster.
+    Deep pagination (hundreds of pages) will therefore be noticeably slower
+    than on ``OFFSET``-capable engines. This is a protocol limitation, not
+    an implementation choice.
+    """
+    # The Elasticsearch SQL API rejects trailing semicolons, and any LIMIT
+    # in the submitted statement caps the result set before the cursor can
+    # page through it. ``fetch_size`` drives pagination instead.
+    # Assumption: Superset only appends a trailing ``LIMIT N`` for engines
+    # with ``supports_offset=False``. If that ever changes (e.g.
+    # ``FETCH FIRST N ROWS`` or ``TOP N``), extend this sanitizer to match.
+    sanitized_sql = sql.strip().rstrip(";").strip()
+    sanitized_sql = re.sub(
+        r"\s+LIMIT\s+\d+\s*$", "", sanitized_sql, flags=re.IGNORECASE
+    )
+
+    # The raw transport does not auto-set Content-Type the way the Python
+    # DB-API driver does; ES rejects POSTs without a JSON content type.
+    json_headers = {"Content-Type": "application/json"}
+    with database.get_raw_connection() as conn:
+        transport = conn.es.transport
+        response = transport.perform_request(
+            "POST",
+            sql_path,
+            headers=json_headers,
+            body={"query": sanitized_sql, "fetch_size": page_size},
+        )
+        columns = [col["name"] for col in response.get("columns", [])]
+        rows = response.get("rows", [])
+        cursor = response.get("cursor")
+
+        try:
+            for _ in range(page_index):
+                if not cursor:
+                    # Dataset exhausted before reaching the target page —
+                    # no cursor to close (ES returns no cursor on the final
+                    # page). Return immediately with empty rows.
+                    return [], columns
+                response = transport.perform_request(
+                    "POST",
+                    sql_path,
+                    headers=json_headers,
+                    body={"cursor": cursor},
+                )
+                rows = response.get("rows", [])
+                cursor = response.get("cursor")
+
+            return rows, columns
+        finally:
+            if cursor:
+                # Best-effort cleanup. If close itself fails we don't want
+                # to mask the original error (if any) — swallow and log.
+                try:
+                    transport.perform_request(
+                        "POST",
+                        close_path,
+                        headers=json_headers,
+                        body={"cursor": cursor},
+                    )
+                except Exception:  # pylint: disable=broad-except

Review Comment:
   <!-- Bito Reply -->
   The suggestion to replace the broad `Exception` catch with specific
exceptions is intended to improve error-handling precision. However, this
catch sits in a `finally` block that performs best-effort cursor cleanup, and
it is deliberately broad: a failure while closing the cursor is swallowed and
logged so that it cannot mask the original error, and the pylint disable
comment documents that intent. Maintaining the current implementation is
therefore acceptable, and you may ignore this suggestion.
   
   **superset/db_engine_specs/elasticsearch.py**
   ```
   except (ConnectionError, TimeoutError, Exception):  # pylint: disable=broad-except
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to