codeant-ai-for-open-source[bot] commented on code in PR #37950: URL: https://github.com/apache/superset/pull/37950#discussion_r2817910818
########## extensions/ai_assistant/backend/src/ai_assistant/tools.py: ########## @@ -0,0 +1,532 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Vambery AI Agent Database Tools. + +These tools allow the AI agent to introspect database schemas and execute +SQL queries during its reasoning process. +Uses Superset's Database model for all database operations, respecting +security permissions and row-level security. +""" + +from __future__ import annotations + +import logging +from typing import Any + +from superset.extensions import db +from superset.models.core import Database +from superset.sql.parse import Table + +logger = logging.getLogger(__name__) + + +def _extract_result_data(result: Any) -> tuple[list[dict[str, Any]], list[str]]: + """ + Extract rows (as list of dicts) and column names from a QueryResult. + + The QueryResult from database.execute() stores results in + result.statements[i].data as a Pandas DataFrame. + """ + rows: list[dict[str, Any]] = [] + columns: list[str] = [] + try: + if hasattr(result, "statements") and result.statements: + stmt = result.statements[0] + df = stmt.data + if df is not None and hasattr(df, "to_dict"): + # Pandas DataFrame + rows = df.to_dict(orient="records") + columns = list(df.columns) + elif df is not None and hasattr(df, "keys"): + # dict-like + rows = [df] if not isinstance(df, list) else df + columns = list(df.keys()) if isinstance(df, dict) else [] + except Exception as ex: + logger.debug("Could not extract result data: %s", ex) + + # Fallback: legacy result format + if not rows: + if hasattr(result, "data") and result.data: + rows = result.data if isinstance(result.data, list) else [] + elif hasattr(result, "rows") and hasattr(result, "columns"): + columns = result.columns + rows = [dict(zip(columns, row, strict=False)) for row in result.rows] + + return rows, columns + + +# -------------------------------------------------------------------------- +# OpenAI function/tool definitions (sent to the LLM) +# -------------------------------------------------------------------------- + +TOOL_DEFINITIONS: list[dict[str, Any]] = [ + { + "type": "function", + "function": { + "name": "list_schemas", + "description": ( + "List all available schemas in the connected database. " + "Use this first to understand the database structure." + ), + "parameters": { + "type": "object", + "properties": {}, + "required": [], + }, + }, + }, + { + "type": "function", + "function": { + "name": "list_tables", + "description": ( + "List all tables in a specific schema. " + "Use this to discover which tables are available." 
+ ), + "parameters": { + "type": "object", + "properties": { + "schema_name": { + "type": "string", + "description": "The schema name to list tables from", + }, + }, + "required": ["schema_name"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "get_table_columns", + "description": ( + "Get the column names, data types, and nullable info for a table. " + "Use this to understand the structure of a table before writing queries." + ), + "parameters": { + "type": "object", + "properties": { + "table_name": { + "type": "string", + "description": "Name of the table", + }, + "schema_name": { + "type": "string", + "description": "Schema the table belongs to", + }, + }, + "required": ["table_name", "schema_name"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "sample_table_data", + "description": ( + "Get a sample of rows from a table (up to 20 rows). " + "Use this to understand what kind of data a table contains." + ), + "parameters": { + "type": "object", + "properties": { + "table_name": { + "type": "string", + "description": "Name of the table", + }, + "schema_name": { + "type": "string", + "description": "Schema the table belongs to", + }, + }, + "required": ["table_name", "schema_name"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "get_distinct_values", + "description": ( + "Get distinct/unique values for a specific column in a table. " + "Use this to understand the domain of a column (e.g., categories, statuses)." + ), + "parameters": { + "type": "object", + "properties": { + "table_name": { + "type": "string", + "description": "Name of the table", + }, + "schema_name": { + "type": "string", + "description": "Schema the table belongs to", + }, + "column_name": { + "type": "string", + "description": "Name of the column to get distinct values for", + }, + }, + "required": ["table_name", "schema_name", "column_name"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "execute_sql", + "description": ( + "Execute an arbitrary SQL SELECT query and return results (max 50 rows). " + "Use this to test queries, explore data, or verify results. " + "Only SELECT statements are allowed." + ), + "parameters": { + "type": "object", + "properties": { + "sql": { + "type": "string", + "description": "The SQL SELECT query to execute", + }, + }, + "required": ["sql"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "set_editor_sql", + "description": ( + "Set the SQL query in the user's SQL editor. " + "Use this when you have the final, tested query ready for the user. " + "This replaces the content of the active SQL editor tab." 
+ ), + "parameters": { + "type": "object", + "properties": { + "sql": { + "type": "string", + "description": "The SQL query to place in the editor", + }, + }, + "required": ["sql"], + }, + }, + }, +] + + +# -------------------------------------------------------------------------- +# Tool execution functions +# -------------------------------------------------------------------------- + + +def _get_database(database_id: int) -> Database: + """Load a Database model by ID, raising ValueError if not found.""" + database = db.session.query(Database).filter_by(id=database_id).first() + if not database: + raise ValueError(f"Database with id={database_id} not found") + return database + + +def tool_list_schemas(database_id: int, catalog: str | None = None) -> dict[str, Any]: + """List all schemas in the database.""" + try: + database = _get_database(database_id) + schemas = database.get_all_schema_names(catalog=catalog) + return {"schemas": sorted(schemas)} + except Exception as ex: + logger.error("Error listing schemas for db %s: %s", database_id, ex) + return {"error": str(ex)} + + +def tool_list_tables( + database_id: int, schema_name: str, catalog: str | None = None +) -> dict[str, Any]: + """List all tables in a specific schema.""" + try: + database = _get_database(database_id) + tables = database.get_all_table_names_in_schema( + catalog=catalog, schema=schema_name + ) + # tables is a set of (table_name, schema, catalog) tuples + table_names = sorted(t[0] for t in tables) + return {"tables": table_names, "schema": schema_name} + except Exception as ex: + logger.error( + "Error listing tables for db %s, schema %s: %s", + database_id, + schema_name, + ex, + ) + return {"error": str(ex)} + + +def tool_get_table_columns( + database_id: int, + table_name: str, + schema_name: str, + catalog: str | None = None, +) -> dict[str, Any]: + """Get column metadata for a table.""" + try: + database = _get_database(database_id) + table = Table(table=table_name, schema=schema_name, catalog=catalog) + columns = database.get_columns(table) + # Format columns for readability + col_info = [] + for col in columns: + col_info.append( + { + "name": col.get("column_name") or col.get("name", "unknown"), + "type": str(col.get("type", "unknown")), + "nullable": col.get("nullable", True), + } + ) + return {"table": table_name, "schema": schema_name, "columns": col_info} + except Exception as ex: + logger.error( + "Error getting columns for %s.%s in db %s: %s", + schema_name, + table_name, + database_id, + ex, + ) + return {"error": str(ex)} + + +def tool_sample_table_data( + database_id: int, + table_name: str, + schema_name: str, + catalog: str | None = None, + max_rows: int = 20, +) -> dict[str, Any]: + """Get sample data from a table.""" + try: + database = _get_database(database_id) + # Use TOP for MSSQL, LIMIT for others + db_engine = database.db_engine_spec.__name__ if database.db_engine_spec else "" + if "mssql" in db_engine.lower() or "MSSql" in db_engine: Review Comment: **Suggestion:** In the MSSQL branch of the table sampling helper, schema and table names are interpolated directly into the SQL inside square brackets without any validation, so a crafted name containing ']' (or other special characters) could break out of the identifier and inject arbitrary SQL, which is a SQL injection vulnerability even though the function is intended only for safe introspection. [security] <details> <summary><b>Severity Level:</b> Critical 🚨</summary> ```mdx - ❌ AI assistant table sampling can execute injected MSSQL SQL. 
- ❌ Compromise of connected MSSQL database integrity possible. - ⚠️ MSSQL-backed metadata exploration becomes security-sensitive surface. ``` </details> ```suggestion # Basic identifier safety to avoid SQL injection via schema/table names if any(ch in schema_name for ch in '[];"') or any( ch in table_name for ch in '[];"' ): raise ValueError("Invalid characters in schema or table name") if "mssql" in db_engine.lower(): ``` <details> <summary><b>Steps of Reproduction ✅ </b></summary> ```mdx 1. Ensure Superset is configured with a MSSQL database entry so that `database.db_engine_spec.__name__` contains `"mssql"` (see `superset.models.core.Database`, and helper `_get_database()` at `extensions/ai_assistant/backend/src/ai_assistant/tools.py:240-245`). 2. From any backend component that uses the AI assistant tools, call `execute_tool()` in `extensions/ai_assistant/backend/src/ai_assistant/tools.py:469-532` with `tool_name="sample_table_data"`, a valid `database_id` for that MSSQL database, and `arguments={"schema_name": "foo]; DROP TABLE sensitive_table;--", "table_name": "bar"}`. 3. Inside `execute_tool`, the `"sample_table_data"` branch forwards the call to `tool_sample_table_data()` at `extensions/ai_assistant/backend/src/ai_assistant/tools.py:314-356` with the provided `schema_name` and `table_name`. 4. In `tool_sample_table_data`, the MSSQL branch at lines `324-329` builds `sql = f"SELECT TOP {max_rows} * FROM [{schema_name}].[{table_name}]"`, producing SQL like `SELECT TOP 20 * FROM [foo]; DROP TABLE sensitive_table;--].[bar]`, which is passed unmodified to `Database.execute()` at `superset/models/core.py:1295-1309`, allowing the injected `DROP TABLE` to execute if the underlying MSSQL driver permits multiple statements. ``` </details> <details> <summary><b>Prompt for AI Agent 🤖 </b></summary> ```mdx This is a comment left during a code review. **Path:** extensions/ai_assistant/backend/src/ai_assistant/tools.py **Line:** 326:326 **Comment:** *Security: In the MSSQL branch of the table sampling helper, schema and table names are interpolated directly into the SQL inside square brackets without any validation, so a crafted name containing ']' (or other special characters) could break out of the identifier and inject arbitrary SQL, which is a SQL injection vulnerability even though the function is intended only for safe introspection. Validate the correctness of the flagged issue. If correct, How can I resolve this? If you propose a fix, implement it and please make it concise. ``` </details> <a href='https://app.codeant.ai/feedback?pr_url=https%3A%2F%2Fgithub.com%2Fapache%2Fsuperset%2Fpull%2F37950&comment_hash=2f0a38a819f6fe4e41287fdcecdacebcc88e7c46427ad08c0a47cc2759ef5b13&reaction=like'>👍</a> | <a href='https://app.codeant.ai/feedback?pr_url=https%3A%2F%2Fgithub.com%2Fapache%2Fsuperset%2Fpull%2F37950&comment_hash=2f0a38a819f6fe4e41287fdcecdacebcc88e7c46427ad08c0a47cc2759ef5b13&reaction=dislike'>👎</a> ########## .devcontainer/start-local.sh: ########## @@ -0,0 +1,109 @@ +#!/bin/bash +# Start Superset for local dev container development (Cursor). +# Runs Flask backend + webpack dev server directly in the container. +# Postgres runs via docker-in-docker. +set -e + +WORKSPACE_DIR="${WORKSPACE_DIR:-$(pwd)}" +cd "$WORKSPACE_DIR" + +echo "=== Starting Superset (local dev container mode) ===" + +# -------------------------------------------------------------------------- +# 1. Source user's local env overrides (MSSQL creds, AI config, etc.) 
+# -------------------------------------------------------------------------- +if [ -f "$WORKSPACE_DIR/docker/.env-local" ]; then + echo "[env] Loading docker/.env-local ..." + set -a + # shellcheck disable=SC1091 + source "$WORKSPACE_DIR/docker/.env-local" + set +a +fi + +# -------------------------------------------------------------------------- +# 2. Make sure Postgres is running +# -------------------------------------------------------------------------- +echo "[db] Ensuring PostgreSQL is running ..." +docker compose -f "$WORKSPACE_DIR/.devcontainer/docker-compose-devcontainer.yml" up -d + +echo "[db] Waiting for Postgres ..." +for i in $(seq 1 30); do + docker compose -f "$WORKSPACE_DIR/.devcontainer/docker-compose-devcontainer.yml" exec -T db pg_isready -U superset > /dev/null 2>&1 && break + sleep 1 +done +echo "[db] PostgreSQL is ready." Review Comment: **Suggestion:** The PostgreSQL readiness loop always prints that the database is ready after 30 iterations without checking whether `pg_isready` ever succeeded, so the script can continue and fail later in less obvious ways if the database never became available. [logic error] <details> <summary><b>Severity Level:</b> Major ⚠️</summary> ```mdx - ⚠️ Script reports ready database when PostgreSQL is down. - ⚠️ Migrations fail after misleading readiness message. ``` </details> ```suggestion db_ready=false for i in $(seq 1 30); do if docker compose -f "$WORKSPACE_DIR/.devcontainer/docker-compose-devcontainer.yml" exec -T db pg_isready -U superset > /dev/null 2>&1; then db_ready=true break fi sleep 1 done if [ "$db_ready" = true ]; then echo "[db] PostgreSQL is ready." else echo "[db] ERROR: PostgreSQL did not become ready after 30 seconds." >&2 exit 1 fi ``` <details> <summary><b>Steps of Reproduction ✅ </b></summary> ```mdx 1. Misconfigure or break the Postgres service in `.devcontainer/docker-compose-devcontainer.yml` so container `db` never becomes ready (e.g. wrong image or bad config) while keeping the script `.devcontainer/start-local.sh` unchanged. 2. Run `.devcontainer/start-local.sh` from the repository root; it starts Docker with `docker compose ... up -d` at line 27. 3. The readiness loop at `.devcontainer/start-local.sh:29-33` calls `pg_isready` 30 times, each attempt failing because PostgreSQL is not accepting connections, but the loop still exits after the last iteration. 4. At `.devcontainer/start-local.sh:34`, the script unconditionally prints "[db] PostgreSQL is ready." and then proceeds to run `superset db upgrade` at line 52, which fails with connection errors despite the prior success message. ``` </details> <details> <summary><b>Prompt for AI Agent 🤖 </b></summary> ```mdx This is a comment left during a code review. **Path:** .devcontainer/start-local.sh **Line:** 30:34 **Comment:** *Logic Error: The PostgreSQL readiness loop always prints that the database is ready after 30 iterations without checking whether `pg_isready` ever succeeded, so the script can continue and fail later in less obvious ways if the database never became available. Validate the correctness of the flagged issue. If correct, How can I resolve this? If you propose a fix, implement it and please make it concise. 
``` </details> <a href='https://app.codeant.ai/feedback?pr_url=https%3A%2F%2Fgithub.com%2Fapache%2Fsuperset%2Fpull%2F37950&comment_hash=778daeb1cad443120b680880907742095ee62b81c8345c9fdc59584adc30c600&reaction=like'>👍</a> | <a href='https://app.codeant.ai/feedback?pr_url=https%3A%2F%2Fgithub.com%2Fapache%2Fsuperset%2Fpull%2F37950&comment_hash=778daeb1cad443120b680880907742095ee62b81c8345c9fdc59584adc30c600&reaction=dislike'>👎</a> ########## extensions/ai_assistant/frontend/src/ChatPanel.tsx: ########## @@ -0,0 +1,695 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/** + * Vambery AI Agent Chat Panel + * + * Renders in the SQL Lab right sidebar. Provides a chat interface + * for users to interact with the AI agent, which can inspect + * database schemas, write SQL queries, and modify the SQL editor. + */ + +import React, { useState, useRef, useEffect, useCallback, useMemo } from "react"; +import { sqlLab, authentication, useTheme, SupersetTheme } from "@apache-superset/core"; + +// -------------------------------------------------------------------------- +// Types +// -------------------------------------------------------------------------- + +interface ChatMessage { + role: "user" | "assistant" | "system"; + content: string; + timestamp: number; + steps?: AgentStep[]; + actions?: EditorAction[]; + error?: boolean; + hasRunnable?: boolean; +} + +interface AgentStep { + type: string; + tool: string; + args: Record<string, unknown>; + result_summary: string; +} + +interface EditorAction { + type: string; + sql?: string; +} + +interface ChatContext { + database_id: number | undefined; + database_name: string | undefined; + schema: string | null; + catalog: string | null; + current_sql: string; +} + +// -------------------------------------------------------------------------- +// SSE Streaming API Helper +// -------------------------------------------------------------------------- + +interface StreamEvent { + event: "step" | "action" | "response" | "error"; + data: Record<string, unknown>; +} + +/** + * Post a chat message and stream SSE events back. + * Calls onEvent for each parsed SSE event as it arrives. + */ +async function postChatStream( + messages: { role: string; content: string }[], + context: ChatContext, + onEvent: (evt: StreamEvent) => void +): Promise<void> { + const csrfToken = await authentication.getCSRFToken(); + + const response = await fetch("/api/v1/ai_assistant/chat/stream", { + method: "POST", + headers: { + "Content-Type": "application/json", + ...(csrfToken ? 
{ "X-CSRFToken": csrfToken } : {}), + }, + credentials: "same-origin", + body: JSON.stringify({ messages, context }), + }); + + if (!response.ok) { + const errData = await response.json().catch(() => ({})); + throw new Error( + (errData as Record<string, string>).error || + `HTTP ${response.status}: ${response.statusText}` + ); + } + + const reader = response.body?.getReader(); + if (!reader) throw new Error("ReadableStream not supported"); + + const decoder = new TextDecoder(); + let buffer = ""; + + while (true) { + const { done, value } = await reader.read(); + if (done) break; + + buffer += decoder.decode(value, { stream: true }); + + // SSE format: "event: <type>\ndata: <json>\n\n" + // Split on double newline to get complete events + const parts = buffer.split("\n\n"); + // Last part may be incomplete — keep it in the buffer + buffer = parts.pop() || ""; + + for (const part of parts) { + if (!part.trim()) continue; + let eventType = ""; + let eventData = ""; + + for (const line of part.split("\n")) { + if (line.startsWith("event: ")) { + eventType = line.slice(7).trim(); + } else if (line.startsWith("data: ")) { + eventData = line.slice(6); + } + } + + if (eventType && eventData) { + try { + onEvent({ Review Comment: **Suggestion:** The streaming helper calls an async event handler without awaiting it, so when the stream finishes, any pending async work (like applying editor actions and setting the runnable flag) may not have completed, causing the final assistant message to be built with an outdated `hasRunnable` value and potentially hiding the "Re-run query" button even when actions ran. [logic error] <details> <summary><b>Severity Level:</b> Major ⚠️</summary> ```mdx - ⚠️ "Re-run query" button sometimes missing after auto-run actions. - ⚠️ Assistant message state can be inconsistent with editor actions. ``` </details> ```suggestion onEvent: (evt: StreamEvent) => Promise<void> | void ): Promise<void> { const csrfToken = await authentication.getCSRFToken(); const response = await fetch("/api/v1/ai_assistant/chat/stream", { method: "POST", headers: { "Content-Type": "application/json", ...(csrfToken ? { "X-CSRFToken": csrfToken } : {}), }, credentials: "same-origin", body: JSON.stringify({ messages, context }), }); if (!response.ok) { const errData = await response.json().catch(() => ({})); throw new Error( (errData as Record<string, string>).error || `HTTP ${response.status}: ${response.statusText}` ); } const reader = response.body?.getReader(); if (!reader) throw new Error("ReadableStream not supported"); const decoder = new TextDecoder(); let buffer = ""; while (true) { const { done, value } = await reader.read(); if (done) break; buffer += decoder.decode(value, { stream: true }); // SSE format: "event: <type>\ndata: <json>\n\n" // Split on double newline to get complete events const parts = buffer.split("\n\n"); // Last part may be incomplete — keep it in the buffer buffer = parts.pop() || ""; for (const part of parts) { if (!part.trim()) continue; let eventType = ""; let eventData = ""; for (const line of part.split("\n")) { if (line.startsWith("event: ")) { eventType = line.slice(7).trim(); } else if (line.startsWith("data: ")) { eventData = line.slice(6); } } if (eventType && eventData) { try { await onEvent({ ``` <details> <summary><b>Steps of Reproduction ✅ </b></summary> ```mdx 1. Open SQL Lab and ensure the AI assistant panel mounts, rendering `ChatPanel` from `extensions/ai_assistant/frontend/src/ChatPanel.tsx`. 2. 
In `ChatPanel.handleSend` (around lines 436–528 in the same file), send a message while a database is selected so that `handleSend` calls `await postChatStream(apiMessages, context, async (evt) => { ... })`. 3. When the backend `/api/v1/ai_assistant/chat/stream` endpoint emits an `"action"` SSE event, the async callback in `handleSend` awaits `applyAction(action)` and sets `hasRunnable = true` (see `applyAction` in `ChatPanel.tsx` around lines 394–423), but `postChatStream` (lines 78–146) invokes `onEvent(...)` without `await`, so it does not wait for that Promise to resolve. 4. As soon as the stream ends, `postChatStream` resolves and `handleSend` immediately builds the assistant message using the current `hasRunnable` value (still `false` if the async handler hasn't finished), so the `"Re-run query"` button, which depends on `msg.hasRunnable`, is omitted even though the action was applied and the query was auto-run. ``` </details> <details> <summary><b>Prompt for AI Agent 🤖 </b></summary> ```mdx This is a comment left during a code review. **Path:** extensions/ai_assistant/frontend/src/ChatPanel.tsx **Line:** 81:136 **Comment:** *Logic Error: The streaming helper calls an async event handler without awaiting it, so when the stream finishes, any pending async work (like applying editor actions and setting the runnable flag) may not have completed, causing the final assistant message to be built with an outdated `hasRunnable` value and potentially hiding the "Re-run query" button even when actions ran. Validate the correctness of the flagged issue. If correct, How can I resolve this? If you propose a fix, implement it and please make it concise. ``` </details> <a href='https://app.codeant.ai/feedback?pr_url=https%3A%2F%2Fgithub.com%2Fapache%2Fsuperset%2Fpull%2F37950&comment_hash=74b11b692658bb4e68e50e9861d44cb9ddfb3362188f6df00ddb3a2348ac8f26&reaction=like'>👍</a> | <a href='https://app.codeant.ai/feedback?pr_url=https%3A%2F%2Fgithub.com%2Fapache%2Fsuperset%2Fpull%2F37950&comment_hash=74b11b692658bb4e68e50e9861d44cb9ddfb3362188f6df00ddb3a2348ac8f26&reaction=dislike'>👎</a> ########## extensions/ai_assistant/backend/src/ai_assistant/tools.py: ########## @@ -0,0 +1,532 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Vambery AI Agent Database Tools. + +These tools allow the AI agent to introspect database schemas and execute +SQL queries during its reasoning process. +Uses Superset's Database model for all database operations, respecting +security permissions and row-level security. 
+""" + +from __future__ import annotations + +import logging +from typing import Any + +from superset.extensions import db +from superset.models.core import Database +from superset.sql.parse import Table + +logger = logging.getLogger(__name__) + + +def _extract_result_data(result: Any) -> tuple[list[dict[str, Any]], list[str]]: + """ + Extract rows (as list of dicts) and column names from a QueryResult. + + The QueryResult from database.execute() stores results in + result.statements[i].data as a Pandas DataFrame. + """ + rows: list[dict[str, Any]] = [] + columns: list[str] = [] + try: + if hasattr(result, "statements") and result.statements: + stmt = result.statements[0] + df = stmt.data + if df is not None and hasattr(df, "to_dict"): + # Pandas DataFrame + rows = df.to_dict(orient="records") + columns = list(df.columns) + elif df is not None and hasattr(df, "keys"): + # dict-like + rows = [df] if not isinstance(df, list) else df + columns = list(df.keys()) if isinstance(df, dict) else [] + except Exception as ex: + logger.debug("Could not extract result data: %s", ex) + + # Fallback: legacy result format + if not rows: + if hasattr(result, "data") and result.data: + rows = result.data if isinstance(result.data, list) else [] + elif hasattr(result, "rows") and hasattr(result, "columns"): + columns = result.columns + rows = [dict(zip(columns, row, strict=False)) for row in result.rows] + + return rows, columns + + +# -------------------------------------------------------------------------- +# OpenAI function/tool definitions (sent to the LLM) +# -------------------------------------------------------------------------- + +TOOL_DEFINITIONS: list[dict[str, Any]] = [ + { + "type": "function", + "function": { + "name": "list_schemas", + "description": ( + "List all available schemas in the connected database. " + "Use this first to understand the database structure." + ), + "parameters": { + "type": "object", + "properties": {}, + "required": [], + }, + }, + }, + { + "type": "function", + "function": { + "name": "list_tables", + "description": ( + "List all tables in a specific schema. " + "Use this to discover which tables are available." + ), + "parameters": { + "type": "object", + "properties": { + "schema_name": { + "type": "string", + "description": "The schema name to list tables from", + }, + }, + "required": ["schema_name"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "get_table_columns", + "description": ( + "Get the column names, data types, and nullable info for a table. " + "Use this to understand the structure of a table before writing queries." + ), + "parameters": { + "type": "object", + "properties": { + "table_name": { + "type": "string", + "description": "Name of the table", + }, + "schema_name": { + "type": "string", + "description": "Schema the table belongs to", + }, + }, + "required": ["table_name", "schema_name"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "sample_table_data", + "description": ( + "Get a sample of rows from a table (up to 20 rows). " + "Use this to understand what kind of data a table contains." 
+ ), + "parameters": { + "type": "object", + "properties": { + "table_name": { + "type": "string", + "description": "Name of the table", + }, + "schema_name": { + "type": "string", + "description": "Schema the table belongs to", + }, + }, + "required": ["table_name", "schema_name"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "get_distinct_values", + "description": ( + "Get distinct/unique values for a specific column in a table. " + "Use this to understand the domain of a column (e.g., categories, statuses)." + ), + "parameters": { + "type": "object", + "properties": { + "table_name": { + "type": "string", + "description": "Name of the table", + }, + "schema_name": { + "type": "string", + "description": "Schema the table belongs to", + }, + "column_name": { + "type": "string", + "description": "Name of the column to get distinct values for", + }, + }, + "required": ["table_name", "schema_name", "column_name"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "execute_sql", + "description": ( + "Execute an arbitrary SQL SELECT query and return results (max 50 rows). " + "Use this to test queries, explore data, or verify results. " + "Only SELECT statements are allowed." + ), + "parameters": { + "type": "object", + "properties": { + "sql": { + "type": "string", + "description": "The SQL SELECT query to execute", + }, + }, + "required": ["sql"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "set_editor_sql", + "description": ( + "Set the SQL query in the user's SQL editor. " + "Use this when you have the final, tested query ready for the user. " + "This replaces the content of the active SQL editor tab." + ), + "parameters": { + "type": "object", + "properties": { + "sql": { + "type": "string", + "description": "The SQL query to place in the editor", + }, + }, + "required": ["sql"], + }, + }, + }, +] + + +# -------------------------------------------------------------------------- +# Tool execution functions +# -------------------------------------------------------------------------- + + +def _get_database(database_id: int) -> Database: + """Load a Database model by ID, raising ValueError if not found.""" + database = db.session.query(Database).filter_by(id=database_id).first() + if not database: + raise ValueError(f"Database with id={database_id} not found") + return database + + +def tool_list_schemas(database_id: int, catalog: str | None = None) -> dict[str, Any]: + """List all schemas in the database.""" + try: + database = _get_database(database_id) + schemas = database.get_all_schema_names(catalog=catalog) + return {"schemas": sorted(schemas)} + except Exception as ex: + logger.error("Error listing schemas for db %s: %s", database_id, ex) + return {"error": str(ex)} + + +def tool_list_tables( + database_id: int, schema_name: str, catalog: str | None = None +) -> dict[str, Any]: + """List all tables in a specific schema.""" + try: + database = _get_database(database_id) + tables = database.get_all_table_names_in_schema( + catalog=catalog, schema=schema_name + ) + # tables is a set of (table_name, schema, catalog) tuples + table_names = sorted(t[0] for t in tables) + return {"tables": table_names, "schema": schema_name} + except Exception as ex: + logger.error( + "Error listing tables for db %s, schema %s: %s", + database_id, + schema_name, + ex, + ) + return {"error": str(ex)} + + +def tool_get_table_columns( + database_id: int, + table_name: str, + schema_name: str, + catalog: str | None = None, +) -> dict[str, Any]: + 
"""Get column metadata for a table.""" + try: + database = _get_database(database_id) + table = Table(table=table_name, schema=schema_name, catalog=catalog) + columns = database.get_columns(table) + # Format columns for readability + col_info = [] + for col in columns: + col_info.append( + { + "name": col.get("column_name") or col.get("name", "unknown"), + "type": str(col.get("type", "unknown")), + "nullable": col.get("nullable", True), + } + ) + return {"table": table_name, "schema": schema_name, "columns": col_info} + except Exception as ex: + logger.error( + "Error getting columns for %s.%s in db %s: %s", + schema_name, + table_name, + database_id, + ex, + ) + return {"error": str(ex)} + + +def tool_sample_table_data( + database_id: int, + table_name: str, + schema_name: str, + catalog: str | None = None, + max_rows: int = 20, +) -> dict[str, Any]: + """Get sample data from a table.""" + try: + database = _get_database(database_id) + # Use TOP for MSSQL, LIMIT for others + db_engine = database.db_engine_spec.__name__ if database.db_engine_spec else "" + if "mssql" in db_engine.lower() or "MSSql" in db_engine: + sql = f"SELECT TOP {max_rows} * FROM [{schema_name}].[{table_name}]" + else: + sql = f'SELECT * FROM "{schema_name}"."{table_name}" LIMIT {max_rows}' + + from superset_core.api.types import QueryOptions + + options = QueryOptions( + catalog=catalog, + schema=schema_name, + limit=max_rows, + ) + result = database.execute(sql, options) + rows, _ = _extract_result_data(result) + rows = rows[:max_rows] + + return { + "table": table_name, + "schema": schema_name, + "row_count": len(rows), + "data": rows, + } + except Exception as ex: + logger.error( + "Error sampling data from %s.%s in db %s: %s", + schema_name, + table_name, + database_id, + ex, + ) + return {"error": str(ex)} + + +def tool_get_distinct_values( + database_id: int, + table_name: str, + schema_name: str, + column_name: str, + catalog: str | None = None, + max_values: int = 50, +) -> dict[str, Any]: + """Get distinct values for a specific column.""" + try: + database = _get_database(database_id) + db_engine = database.db_engine_spec.__name__ if database.db_engine_spec else "" + if "mssql" in db_engine.lower() or "MSSql" in db_engine: Review Comment: **Suggestion:** In the MSSQL branch of the distinct-values helper, schema, table, and column names are concatenated directly into the SQL inside square brackets without any validation, so a maliciously crafted identifier containing ']' (or similar) could terminate the identifier and inject arbitrary SQL despite this function being intended for safe metadata access. [security] <details> <summary><b>Severity Level:</b> Critical 🚨</summary> ```mdx - ❌ AI assistant distinct-value lookup can run injected MSSQL SQL. - ❌ Attackers may alter or drop MSSQL tables via metadata tool. - ⚠️ Compromises trust in AI-driven schema exploration features. ``` </details> ```suggestion # Basic identifier safety to avoid SQL injection via schema/table/column names if any(ch in schema_name for ch in '[];"') or any( ch in table_name for ch in '[];"' ) or any(ch in column_name for ch in '[];"'): raise ValueError("Invalid characters in schema, table, or column name") if "mssql" in db_engine.lower(): ``` <details> <summary><b>Steps of Reproduction ✅ </b></summary> ```mdx 1. Configure a MSSQL-backed `Database` in Superset and ensure it is accessible via `_get_database()` at `extensions/ai_assistant/backend/src/ai_assistant/tools.py:240-245`. 2. 
From code using the AI assistant tools, invoke `execute_tool()` at `extensions/ai_assistant/backend/src/ai_assistant/tools.py:469-532` with `tool_name="get_distinct_values"`, the MSSQL `database_id`, and `arguments={"schema_name": "public", "table_name": "users]; DROP TABLE sensitive_table;--", "column_name": "status"}` or similar crafted identifiers. 3. `execute_tool` routes this call to `tool_get_distinct_values()` at `extensions/ai_assistant/backend/src/ai_assistant/tools.py:359-416`, passing through the attacker-controlled `schema_name`, `table_name`, and `column_name`. 4. In `tool_get_distinct_values`, the MSSQL branch at lines `369-377` constructs `sql` by interpolating the identifiers inside `[...]` without validation, yielding a query such as `SELECT DISTINCT TOP 50 [status] FROM [public].[users]; DROP TABLE sensitive_table;--] WHERE [status] IS NOT NULL ORDER BY [status]`, which is then executed via `Database.execute()` at `superset/models/core.py:1295-1309`, allowing the injected statement to run if the MSSQL driver permits multiple statements. ``` </details> <details> <summary><b>Prompt for AI Agent 🤖 </b></summary> ```mdx This is a comment left during a code review. **Path:** extensions/ai_assistant/backend/src/ai_assistant/tools.py **Line:** 371:371 **Comment:** *Security: In the MSSQL branch of the distinct-values helper, schema, table, and column names are concatenated directly into the SQL inside square brackets without any validation, so a maliciously crafted identifier containing ']' (or similar) could terminate the identifier and inject arbitrary SQL despite this function being intended for safe metadata access. Validate the correctness of the flagged issue. If correct, How can I resolve this? If you propose a fix, implement it and please make it concise. ``` </details> <a href='https://app.codeant.ai/feedback?pr_url=https%3A%2F%2Fgithub.com%2Fapache%2Fsuperset%2Fpull%2F37950&comment_hash=0aeea65a62d6ddef4d8d0cbdc49499141df7a7b04da1ae1c9b848fddc9352847&reaction=like'>👍</a> | <a href='https://app.codeant.ai/feedback?pr_url=https%3A%2F%2Fgithub.com%2Fapache%2Fsuperset%2Fpull%2F37950&comment_hash=0aeea65a62d6ddef4d8d0cbdc49499141df7a7b04da1ae1c9b848fddc9352847&reaction=dislike'>👎</a> ########## extensions/ai_assistant/backend/src/ai_assistant/tools.py: ########## @@ -0,0 +1,532 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Vambery AI Agent Database Tools. + +These tools allow the AI agent to introspect database schemas and execute +SQL queries during its reasoning process. +Uses Superset's Database model for all database operations, respecting +security permissions and row-level security. 
+""" + +from __future__ import annotations + +import logging +from typing import Any + +from superset.extensions import db +from superset.models.core import Database +from superset.sql.parse import Table + +logger = logging.getLogger(__name__) + + +def _extract_result_data(result: Any) -> tuple[list[dict[str, Any]], list[str]]: + """ + Extract rows (as list of dicts) and column names from a QueryResult. + + The QueryResult from database.execute() stores results in + result.statements[i].data as a Pandas DataFrame. + """ + rows: list[dict[str, Any]] = [] + columns: list[str] = [] + try: + if hasattr(result, "statements") and result.statements: + stmt = result.statements[0] + df = stmt.data + if df is not None and hasattr(df, "to_dict"): + # Pandas DataFrame + rows = df.to_dict(orient="records") + columns = list(df.columns) + elif df is not None and hasattr(df, "keys"): + # dict-like + rows = [df] if not isinstance(df, list) else df + columns = list(df.keys()) if isinstance(df, dict) else [] + except Exception as ex: + logger.debug("Could not extract result data: %s", ex) + + # Fallback: legacy result format + if not rows: + if hasattr(result, "data") and result.data: + rows = result.data if isinstance(result.data, list) else [] + elif hasattr(result, "rows") and hasattr(result, "columns"): + columns = result.columns + rows = [dict(zip(columns, row, strict=False)) for row in result.rows] + + return rows, columns + + +# -------------------------------------------------------------------------- +# OpenAI function/tool definitions (sent to the LLM) +# -------------------------------------------------------------------------- + +TOOL_DEFINITIONS: list[dict[str, Any]] = [ + { + "type": "function", + "function": { + "name": "list_schemas", + "description": ( + "List all available schemas in the connected database. " + "Use this first to understand the database structure." + ), + "parameters": { + "type": "object", + "properties": {}, + "required": [], + }, + }, + }, + { + "type": "function", + "function": { + "name": "list_tables", + "description": ( + "List all tables in a specific schema. " + "Use this to discover which tables are available." + ), + "parameters": { + "type": "object", + "properties": { + "schema_name": { + "type": "string", + "description": "The schema name to list tables from", + }, + }, + "required": ["schema_name"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "get_table_columns", + "description": ( + "Get the column names, data types, and nullable info for a table. " + "Use this to understand the structure of a table before writing queries." + ), + "parameters": { + "type": "object", + "properties": { + "table_name": { + "type": "string", + "description": "Name of the table", + }, + "schema_name": { + "type": "string", + "description": "Schema the table belongs to", + }, + }, + "required": ["table_name", "schema_name"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "sample_table_data", + "description": ( + "Get a sample of rows from a table (up to 20 rows). " + "Use this to understand what kind of data a table contains." 
+ ), + "parameters": { + "type": "object", + "properties": { + "table_name": { + "type": "string", + "description": "Name of the table", + }, + "schema_name": { + "type": "string", + "description": "Schema the table belongs to", + }, + }, + "required": ["table_name", "schema_name"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "get_distinct_values", + "description": ( + "Get distinct/unique values for a specific column in a table. " + "Use this to understand the domain of a column (e.g., categories, statuses)." + ), + "parameters": { + "type": "object", + "properties": { + "table_name": { + "type": "string", + "description": "Name of the table", + }, + "schema_name": { + "type": "string", + "description": "Schema the table belongs to", + }, + "column_name": { + "type": "string", + "description": "Name of the column to get distinct values for", + }, + }, + "required": ["table_name", "schema_name", "column_name"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "execute_sql", + "description": ( + "Execute an arbitrary SQL SELECT query and return results (max 50 rows). " + "Use this to test queries, explore data, or verify results. " + "Only SELECT statements are allowed." + ), + "parameters": { + "type": "object", + "properties": { + "sql": { + "type": "string", + "description": "The SQL SELECT query to execute", + }, + }, + "required": ["sql"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "set_editor_sql", + "description": ( + "Set the SQL query in the user's SQL editor. " + "Use this when you have the final, tested query ready for the user. " + "This replaces the content of the active SQL editor tab." + ), + "parameters": { + "type": "object", + "properties": { + "sql": { + "type": "string", + "description": "The SQL query to place in the editor", + }, + }, + "required": ["sql"], + }, + }, + }, +] + + +# -------------------------------------------------------------------------- +# Tool execution functions +# -------------------------------------------------------------------------- + + +def _get_database(database_id: int) -> Database: + """Load a Database model by ID, raising ValueError if not found.""" + database = db.session.query(Database).filter_by(id=database_id).first() + if not database: + raise ValueError(f"Database with id={database_id} not found") + return database + + +def tool_list_schemas(database_id: int, catalog: str | None = None) -> dict[str, Any]: + """List all schemas in the database.""" + try: + database = _get_database(database_id) + schemas = database.get_all_schema_names(catalog=catalog) + return {"schemas": sorted(schemas)} + except Exception as ex: + logger.error("Error listing schemas for db %s: %s", database_id, ex) + return {"error": str(ex)} + + +def tool_list_tables( + database_id: int, schema_name: str, catalog: str | None = None +) -> dict[str, Any]: + """List all tables in a specific schema.""" + try: + database = _get_database(database_id) + tables = database.get_all_table_names_in_schema( + catalog=catalog, schema=schema_name + ) + # tables is a set of (table_name, schema, catalog) tuples + table_names = sorted(t[0] for t in tables) + return {"tables": table_names, "schema": schema_name} + except Exception as ex: + logger.error( + "Error listing tables for db %s, schema %s: %s", + database_id, + schema_name, + ex, + ) + return {"error": str(ex)} + + +def tool_get_table_columns( + database_id: int, + table_name: str, + schema_name: str, + catalog: str | None = None, +) -> dict[str, Any]: + 
"""Get column metadata for a table.""" + try: + database = _get_database(database_id) + table = Table(table=table_name, schema=schema_name, catalog=catalog) + columns = database.get_columns(table) + # Format columns for readability + col_info = [] + for col in columns: + col_info.append( + { + "name": col.get("column_name") or col.get("name", "unknown"), + "type": str(col.get("type", "unknown")), + "nullable": col.get("nullable", True), + } + ) + return {"table": table_name, "schema": schema_name, "columns": col_info} + except Exception as ex: + logger.error( + "Error getting columns for %s.%s in db %s: %s", + schema_name, + table_name, + database_id, + ex, + ) + return {"error": str(ex)} + + +def tool_sample_table_data( + database_id: int, + table_name: str, + schema_name: str, + catalog: str | None = None, + max_rows: int = 20, +) -> dict[str, Any]: + """Get sample data from a table.""" + try: + database = _get_database(database_id) + # Use TOP for MSSQL, LIMIT for others + db_engine = database.db_engine_spec.__name__ if database.db_engine_spec else "" + if "mssql" in db_engine.lower() or "MSSql" in db_engine: + sql = f"SELECT TOP {max_rows} * FROM [{schema_name}].[{table_name}]" + else: + sql = f'SELECT * FROM "{schema_name}"."{table_name}" LIMIT {max_rows}' + + from superset_core.api.types import QueryOptions + + options = QueryOptions( + catalog=catalog, + schema=schema_name, + limit=max_rows, + ) + result = database.execute(sql, options) + rows, _ = _extract_result_data(result) + rows = rows[:max_rows] + + return { + "table": table_name, + "schema": schema_name, + "row_count": len(rows), + "data": rows, + } + except Exception as ex: + logger.error( + "Error sampling data from %s.%s in db %s: %s", + schema_name, + table_name, + database_id, + ex, + ) + return {"error": str(ex)} + + +def tool_get_distinct_values( + database_id: int, + table_name: str, + schema_name: str, + column_name: str, + catalog: str | None = None, + max_values: int = 50, +) -> dict[str, Any]: + """Get distinct values for a specific column.""" + try: + database = _get_database(database_id) + db_engine = database.db_engine_spec.__name__ if database.db_engine_spec else "" + if "mssql" in db_engine.lower() or "MSSql" in db_engine: + sql = ( + f"SELECT DISTINCT TOP {max_values} [{column_name}] " + f"FROM [{schema_name}].[{table_name}] " + f"WHERE [{column_name}] IS NOT NULL " + f"ORDER BY [{column_name}]" + ) + else: + sql = ( + f'SELECT DISTINCT "{column_name}" ' + f'FROM "{schema_name}"."{table_name}" ' + f'WHERE "{column_name}" IS NOT NULL ' + f'ORDER BY "{column_name}" LIMIT {max_values}' + ) + + from superset_core.api.types import QueryOptions + + options = QueryOptions( + catalog=catalog, + schema=schema_name, + limit=max_values, + ) + result = database.execute(sql, options) + + rows, _ = _extract_result_data(result) + values = [] + for row in rows[:max_values]: + val = list(row.values())[0] if isinstance(row, dict) else row + values.append(str(val) if val is not None else None) + + return { + "table": table_name, + "column": column_name, + "distinct_count": len(values), + "values": values, + } + except Exception as ex: + logger.error( + "Error getting distinct values for %s.%s.%s in db %s: %s", + schema_name, + table_name, + column_name, + database_id, + ex, + ) + return {"error": str(ex)} + + +def tool_execute_sql( + database_id: int, + sql: str, + schema_name: str | None = None, + catalog: str | None = None, + max_rows: int = 50, +) -> dict[str, Any]: + """Execute an arbitrary SQL query.""" + try: + # Basic 
safety check - only allow SELECT + sql_stripped = sql.strip().upper() + if not sql_stripped.startswith("SELECT") and not sql_stripped.startswith( + "WITH" + ): + return { + "error": "Only SELECT and WITH (CTE) queries are allowed for safety" + } + + database = _get_database(database_id) + + from superset_core.api.types import QueryOptions Review Comment: **Suggestion:** The safety check in the generic SQL execution helper only verifies that the query string starts with SELECT or WITH, but does not prevent multiple statements (e.g., `WITH ...; DELETE ...`), so a user-supplied query could still execute non-SELECT statements and violate the "read-only" guarantee. [security] <details> <summary><b>Severity Level:</b> Critical 🚨</summary> ```mdx - ❌ AI assistant `execute_sql` tool not truly read-only. - ❌ Malicious multi-statement queries can modify or delete data. - ⚠️ Breaks safety expectations for AI-generated exploratory queries. ``` </details> ```suggestion # Basic safety check - only allow a single SELECT/CTE statement sql_stripped = sql.strip() sql_upper = sql_stripped.upper() if not sql_upper.startswith("SELECT") and not sql_upper.startswith("WITH"): return { "error": "Only SELECT and WITH (CTE) queries are allowed for safety" } # Disallow multiple statements separated by semicolons if ";" in sql_stripped.rstrip(";"): return {"error": "Multiple SQL statements are not allowed"} ``` <details> <summary><b>Steps of Reproduction ✅ </b></summary> ```mdx 1. Use any backend path that leverages `execute_tool()` at `extensions/ai_assistant/backend/src/ai_assistant/tools.py:469-532` for executing SQL via the AI assistant, passing `tool_name="execute_sql"` and a valid `database_id`. 2. Supply an attacker-controlled SQL string in `arguments["sql"]`, such as `"SELECT 1; DELETE FROM sensitive_table"`, which begins with `SELECT` but includes a second statement after a semicolon. 3. `execute_tool` dispatches this request to `tool_execute_sql()` at `extensions/ai_assistant/backend/src/ai_assistant/tools.py:419-457` with the provided `sql`. 4. Inside `tool_execute_sql`, the safety check at lines `432-439` converts the SQL to upper case, verifies it starts with `SELECT` (or `WITH`), and does not inspect for additional statements; it then calls `Database.execute(sql, options)` at `superset/models/core.py:1295-1309`, allowing any subsequent non-SELECT statement in the same string to execute if the database/driver accepts multi-statement execution. ``` </details> <details> <summary><b>Prompt for AI Agent 🤖 </b></summary> ```mdx This is a comment left during a code review. **Path:** extensions/ai_assistant/backend/src/ai_assistant/tools.py **Line:** 432:439 **Comment:** *Security: The safety check in the generic SQL execution helper only verifies that the query string starts with SELECT or WITH, but does not prevent multiple statements (e.g., `WITH ...; DELETE ...`), so a user-supplied query could still execute non-SELECT statements and violate the "read-only" guarantee. Validate the correctness of the flagged issue. If correct, How can I resolve this? If you propose a fix, implement it and please make it concise.
``` </details> <a href='https://app.codeant.ai/feedback?pr_url=https%3A%2F%2Fgithub.com%2Fapache%2Fsuperset%2Fpull%2F37950&comment_hash=3efe2cb4539af93c794fb6ef2f014fe87bc4207fcae23037885c07b4bd332af2&reaction=like'>👍</a> | <a href='https://app.codeant.ai/feedback?pr_url=https%3A%2F%2Fgithub.com%2Fapache%2Fsuperset%2Fpull%2F37950&comment_hash=3efe2cb4539af93c794fb6ef2f014fe87bc4207fcae23037885c07b4bd332af2&reaction=dislike'>👎</a>
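A note on the two MSSQL identifier-injection findings above: the character blacklist in the suggested fixes is a workable stopgap, but the more robust pattern is to escape identifiers the way the target engine expects: SQL Server doubles `]` inside bracket quoting, and ANSI engines double `"` inside double quotes. A minimal sketch of that approach (the helper names below are illustrative, not part of the PR):

```python
def quote_mssql_identifier(name: str) -> str:
    """Bracket-quote a SQL Server identifier, doubling any closing bracket."""
    if "\x00" in name:
        raise ValueError("NUL byte in identifier")
    return "[" + name.replace("]", "]]") + "]"


def quote_ansi_identifier(name: str) -> str:
    """Double-quote an ANSI SQL identifier, doubling any embedded quote."""
    if "\x00" in name:
        raise ValueError("NUL byte in identifier")
    return '"' + name.replace('"', '""') + '"'


# Example: the hostile table name from the reproduction steps can no longer
# terminate the identifier, because the embedded "]" is escaped as "]]".
table_name = "users]; DROP TABLE sensitive_table;--"
sql = f"SELECT TOP 20 * FROM {quote_mssql_identifier('dbo')}.{quote_mssql_identifier(table_name)}"
# -> SELECT TOP 20 * FROM [dbo].[users]]; DROP TABLE sensitive_table;--]
```

Unlike a blacklist, this accepts any legitimate identifier (including ones containing brackets or quotes) while keeping the whole value inside a single quoted identifier.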
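Likewise for the `execute_sql` read-only guard flagged in the last comment: the suggested semicolon check blocks the obvious multi-statement payload, but a parser-based guard also survives semicolons inside string literals and leading comments. A rough sketch, assuming `sqlparse` is importable (it has long been a Superset dependency); `ensure_single_select` is a hypothetical helper, not part of the PR:

```python
import sqlparse


def ensure_single_select(sql: str) -> str:
    """Best-effort read-only guard: exactly one statement, SELECT or WITH."""
    # Strip comments so "/* ... */ DELETE ..." cannot slip past a prefix check.
    cleaned = sqlparse.format(sql, strip_comments=True).strip()
    # Let the parser split statements; a ";" inside a string literal does not count.
    statements = [s.strip() for s in sqlparse.split(cleaned) if s.strip()]
    if len(statements) != 1:
        raise ValueError("Exactly one SQL statement is allowed")
    first_keyword = statements[0].split(None, 1)[0].upper()
    if first_keyword not in {"SELECT", "WITH"}:
        raise ValueError("Only SELECT and WITH (CTE) queries are allowed")
    return statements[0]
```

Even this is not airtight: PostgreSQL, for one, accepts data-modifying CTEs such as `WITH t AS (DELETE FROM x RETURNING *) SELECT * FROM t`, so the only reliable read-only guarantee is connecting the tool through a database role without write privileges.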
