Re: [PR] feat(mcp): add create_dataset tool to register physical tables as datasets [superset]

via GitHub Tue, 02 Jun 2026 14:23:28 -0700


bito-code-review[bot] commented on code in PR #40340:
URL: https://github.com/apache/superset/pull/40340#discussion_r3344510459



##########
tests/unit_tests/mcp_service/dataset/tool/test_create_dataset.py:
##########
@@ -0,0 +1,544 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Unit tests for create_dataset MCP tool."""
+
+import logging
+from unittest.mock import MagicMock, Mock, patch
+
+import pytest
+from fastmcp import Client
+from fastmcp.exceptions import ToolError
+
+from superset.commands.dataset.exceptions import (
+    DatasetExistsValidationError,
+    DatasetInvalidError,
+    TableNotFoundValidationError,
+)
+from superset.exceptions import SupersetSecurityException
+from superset.mcp_service.app import mcp
+from superset.sql.parse import Table
+from superset.utils import json
+
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger(__name__)
+
+
+def _make_mock_dataset(
+    dataset_id: int = 42,
+    table_name: str = "orders",
+    schema: str = "public",
+    database_name: str = "main_db",
+) -> MagicMock:
+    dataset = MagicMock()
+    dataset.id = dataset_id
+    dataset.table_name = table_name
+    dataset.schema = schema
+    dataset.description = None
+    dataset.certified_by = None
+    dataset.certification_details = None
+    dataset.changed_by_name = "admin"
+    dataset.changed_on = None
+    dataset.changed_on_humanized = None
+    dataset.created_by_name = "admin"
+    dataset.created_on = None
+    dataset.created_on_humanized = None
+    dataset.tags = []
+    dataset.owners = []
+    dataset.is_virtual = False
+    dataset.is_favorite = None
+    dataset.database_id = 1
+    dataset.schema_perm = f"[{database_name}].[{schema}]"
+    dataset.url = f"/tablemodelview/edit/{dataset_id}"
+    dataset.database = MagicMock()
+    dataset.database.database_name = database_name
+    dataset.sql = None
+    dataset.main_dttm_col = None
+    dataset.offset = 0
+    dataset.cache_timeout = 0
+    dataset.params = {}
+    dataset.template_params = {}
+    dataset.extra = {}
+    dataset.uuid = f"dataset-uuid-{dataset_id}"
+    dataset.columns = []
+    dataset.metrics = []
+    return dataset
+
+
[email protected]
+def mcp_server():
+    return mcp
+
+
[email protected](autouse=True)
+def mock_auth():
+    with patch("superset.mcp_service.auth.get_user_from_request") as 
mock_get_user:
+        mock_user = Mock()
+        mock_user.id = 1
+        mock_user.username = "admin"
+        mock_get_user.return_value = mock_user
+        yield mock_get_user
+
+
+class TestCreateDataset:
+    """Tests for the create_dataset MCP tool."""
+
+    @pytest.fixture(autouse=True)
+    def mock_database_access(self):
+        """Provide a mock database and allow all table access by default."""
+        mock_db = MagicMock()
+        with (
+            patch(
+                "superset.daos.database.DatabaseDAO.find_by_id", 
return_value=mock_db
+            ),
+            patch("superset.extensions.security_manager.raise_for_access"),
+        ):
+            yield mock_db
+
+    
@patch("superset.mcp_service.dataset.tool.create_dataset.CreateDatasetCommand")
+    @pytest.mark.asyncio
+    async def test_create_dataset_success(self, mock_command_class, 
mcp_server) -> None:
+        """Happy path: tool creates dataset and returns DatasetInfo."""
+        mock_dataset = _make_mock_dataset()
+        mock_command = MagicMock()
+        mock_command.run.return_value = mock_dataset
+        mock_command_class.return_value = mock_command
+
+        async with Client(mcp_server) as client:
+            result = await client.call_tool(
+                "create_dataset",
+                {
+                    "request": {
+                        "database_id": 1,
+                        "schema": "public",
+                        "table_name": "orders",
+                    }
+                },
+            )
+
+        assert result.content is not None
+        data = json.loads(result.content[0].text)
+        assert data["id"] == 42
+        assert data["table_name"] == "orders"
+        assert data["schema"] == "public"
+
+        # Verify the command was called with the right properties
+        call_kwargs = mock_command_class.call_args[0][0]
+        assert call_kwargs["database"] == 1
+        assert call_kwargs["schema"] == "public"
+        assert call_kwargs["table_name"] == "orders"
+        assert "owners" not in call_kwargs
+
+    
@patch("superset.mcp_service.dataset.tool.create_dataset.CreateDatasetCommand")
+    @pytest.mark.asyncio
+    async def test_create_dataset_with_owners(
+        self, mock_command_class, mcp_server
+    ) -> None:
+        """Owners list is forwarded to the command when supplied."""
+        mock_dataset = _make_mock_dataset()
+        mock_command = MagicMock()
+        mock_command.run.return_value = mock_dataset
+        mock_command_class.return_value = mock_command
+
+        async with Client(mcp_server) as client:
+            result = await client.call_tool(
+                "create_dataset",
+                {
+                    "request": {
+                        "database_id": 2,
+                        "schema": "sales",
+                        "table_name": "transactions",
+                        "owners": [5, 10],
+                    }
+                },
+            )
+
+        data = json.loads(result.content[0].text)
+        assert data["id"] == 42
+
+        call_kwargs = mock_command_class.call_args[0][0]
+        assert call_kwargs["owners"] == [5, 10]
+
+    
@patch("superset.mcp_service.dataset.tool.create_dataset.CreateDatasetCommand")
+    @pytest.mark.asyncio
+    async def test_create_dataset_already_exists(
+        self, mock_command_class, mcp_server
+    ) -> None:
+        """Returns DatasetExistsError when the table is already registered.
+
+        CreateDatasetCommand.validate() wraps DatasetExistsValidationError 
inside
+        DatasetInvalidError.  The tool must inspect get_list_classnames() to 
surface
+        the typed error response.
+        """
+        mock_command = MagicMock()
+        mock_command.run.side_effect = DatasetInvalidError(
+            exceptions=[DatasetExistsValidationError(Table("orders", "public", 
None))]
+        )
+        mock_command_class.return_value = mock_command
+
+        async with Client(mcp_server) as client:
+            result = await client.call_tool(
+                "create_dataset",
+                {
+                    "request": {
+                        "database_id": 1,
+                        "schema": "public",
+                        "table_name": "orders",
+                    }
+                },
+            )
+
+        data = json.loads(result.content[0].text)
+        assert data["error_type"] == "DatasetExistsError"
+        assert "error" in data
+
+    
@patch("superset.mcp_service.dataset.tool.create_dataset.CreateDatasetCommand")
+    @pytest.mark.asyncio
+    async def test_create_dataset_table_not_found(
+        self, mock_command_class, mcp_server
+    ) -> None:
+        """Returns TableNotFoundError when the physical table does not exist 
in the DB.
+
+        CreateDatasetCommand.validate() wraps TableNotFoundValidationError 
inside
+        DatasetInvalidError.  The tool must inspect get_list_classnames() to 
surface
+        the typed error response.
+        """
+        mock_command = MagicMock()
+        mock_command.run.side_effect = DatasetInvalidError(
+            exceptions=[
+                TableNotFoundValidationError(Table("missing_table", "public", 
None))
+            ]
+        )
+        mock_command_class.return_value = mock_command
+
+        async with Client(mcp_server) as client:
+            result = await client.call_tool(
+                "create_dataset",
+                {
+                    "request": {
+                        "database_id": 1,
+                        "schema": "public",
+                        "table_name": "missing_table",
+                    }
+                },
+            )
+
+        data = json.loads(result.content[0].text)
+        assert data["error_type"] == "TableNotFoundError"
+
+    
@patch("superset.mcp_service.dataset.tool.create_dataset.CreateDatasetCommand")
+    @pytest.mark.asyncio
+    async def test_create_dataset_with_catalog(
+        self, mock_command_class, mcp_server
+    ) -> None:
+        """Catalog field is normalized and forwarded to the command when 
supplied."""
+        mock_dataset = _make_mock_dataset()
+        mock_command = MagicMock()
+        mock_command.run.return_value = mock_dataset
+        mock_command_class.return_value = mock_command
+
+        async with Client(mcp_server) as client:
+            await client.call_tool(
+                "create_dataset",
+                {
+                    "request": {
+                        "database_id": 1,
+                        "catalog": "  hive  ",
+                        "schema": "default",
+                        "table_name": "events",
+                    }
+                },
+            )
+
+        call_kwargs = mock_command_class.call_args[0][0]
+        assert call_kwargs["catalog"] == "hive"
+
+    
@patch("superset.mcp_service.dataset.tool.create_dataset.CreateDatasetCommand")
+    @pytest.mark.asyncio
+    async def test_create_dataset_invalid_error(
+        self, mock_command_class, mcp_server
+    ) -> None:
+        """DatasetInvalidError is returned as ValidationError type."""
+        mock_command = MagicMock()
+        mock_command.run.side_effect = DatasetInvalidError()
+        mock_command_class.return_value = mock_command
+
+        async with Client(mcp_server) as client:
+            result = await client.call_tool(
+                "create_dataset",
+                {
+                    "request": {
+                        "database_id": 1,
+                        "schema": "public",
+                        "table_name": "orders",
+                    }
+                },
+            )
+
+        data = json.loads(result.content[0].text)
+        assert data["error_type"] == "ValidationError"
+        assert "error" in data
+
+    @pytest.mark.asyncio
+    async def test_create_dataset_database_not_found(self, mcp_server) -> None:
+        """Returns DatabaseNotFoundError when database_id does not exist."""
+        with patch("superset.daos.database.DatabaseDAO.find_by_id", 
return_value=None):
+            async with Client(mcp_server) as client:
+                result = await client.call_tool(
+                    "create_dataset",
+                    {
+                        "request": {
+                            "database_id": 999,
+                            "table_name": "orders",
+                        }
+                    },
+                )
+
+        data = json.loads(result.content[0].text)
+        assert data["error_type"] == "DatabaseNotFoundError"
+        assert "Database not found" in data["error"]
+
+    @pytest.mark.asyncio
+    async def test_create_dataset_access_denied(self, mcp_server) -> None:
+        """Returns AccessDeniedError when caller lacks table-level access."""
+        mock_db = MagicMock()
+        with (
+            patch(
+                "superset.daos.database.DatabaseDAO.find_by_id", 
return_value=mock_db
+            ),
+            patch(
+                "superset.extensions.security_manager.raise_for_access",
+                side_effect=SupersetSecurityException(
+                    MagicMock(message="Access is Denied")
+                ),
+            ),
+        ):
+            async with Client(mcp_server) as client:
+                result = await client.call_tool(
+                    "create_dataset",
+                    {
+                        "request": {
+                            "database_id": 1,
+                            "schema": "secret",
+                            "table_name": "restricted_table",
+                        }
+                    },
+                )
+
+        data = json.loads(result.content[0].text)
+        assert data["error_type"] == "AccessDeniedError"
+        assert "Access denied" in data["error"]
+
+    
@patch("superset.mcp_service.dataset.tool.create_dataset.CreateDatasetCommand")
+    @pytest.mark.asyncio
+    async def test_create_dataset_unexpected_error(
+        self, mock_command_class, mcp_server
+    ) -> None:
+        """Unexpected exceptions are re-raised as ToolError (handled by 
middleware)."""
+        mock_command = MagicMock()
+        mock_command.run.side_effect = RuntimeError("DB connection lost")
+        mock_command_class.return_value = mock_command
+
+        async with Client(mcp_server) as client:
+            with pytest.raises(ToolError):
+                await client.call_tool(
+                    "create_dataset",
+                    {
+                        "request": {
+                            "database_id": 1,
+                            "schema": "public",
+                            "table_name": "orders",
+                        }
+                    },
+                )
+
+    @pytest.mark.asyncio
+    async def test_create_dataset_missing_required_fields(self, mcp_server) -> 
None:
+        """Missing required fields raise a validation error before the tool 
runs."""
+        async with Client(mcp_server) as client:
+            with pytest.raises(ToolError):
+                await client.call_tool(
+                    "create_dataset",
+                    {
+                        "request": {
+                            # database_id and table_name are omitted 
intentionally
+                            "schema": "public",
+                        }
+                    },
+                )
+
+    
@patch("superset.mcp_service.dataset.tool.create_dataset.CreateDatasetCommand")
+    @pytest.mark.asyncio
+    async def test_create_dataset_returns_full_dataset_info(
+        self, mock_command_class, mcp_server
+    ) -> None:
+        """The returned DatasetInfo includes columns, metrics, and all core 
fields."""
+        mock_dataset = _make_mock_dataset(
+            dataset_id=99, table_name="sales", schema="dw"
+        )
+
+        col = MagicMock()
+        col.column_name = "amount"
+        col.verbose_name = "Amount"
+        col.type = "NUMERIC"
+        col.is_dttm = False
+        col.groupby = True
+        col.filterable = True
+        col.description = "Sale amount"
+        mock_dataset.columns = [col]
+
+        metric = MagicMock()
+        metric.metric_name = "total_sales"
+        metric.verbose_name = "Total Sales"
+        metric.expression = "SUM(amount)"
+        metric.description = "Sum of amounts"
+        metric.d3format = None
+        mock_dataset.metrics = [metric]
+
+        mock_command = MagicMock()
+        mock_command.run.return_value = mock_dataset
+        mock_command_class.return_value = mock_command
+
+        async with Client(mcp_server) as client:
+            result = await client.call_tool(
+                "create_dataset",
+                {
+                    "request": {
+                        "database_id": 1,
+                        "schema": "dw",
+                        "table_name": "sales",
+                    }
+                },
+            )
+
+        data = json.loads(result.content[0].text)
+        assert data["id"] == 99
+        assert data["table_name"] == "sales"
+        assert data["schema"] == "dw"
+        assert data["is_virtual"] is False
+        assert len(data["columns"]) == 1
+        assert data["columns"][0]["column_name"] == "amount"
+        assert len(data["metrics"]) == 1
+        assert data["metrics"][0]["metric_name"] == "total_sales"
+
+    
@patch("superset.mcp_service.dataset.tool.create_dataset.CreateDatasetCommand")
+    @pytest.mark.asyncio
+    async def test_create_dataset_table_name_whitespace_normalized(
+        self, mock_command_class, mcp_server
+    ) -> None:
+        """Whitespace in table_name is stripped before forwarding to the 
command."""
+        mock_dataset = _make_mock_dataset()
+        mock_command = MagicMock()
+        mock_command.run.return_value = mock_dataset
+        mock_command_class.return_value = mock_command
+
+        async with Client(mcp_server) as client:
+            await client.call_tool(
+                "create_dataset",
+                {
+                    "request": {
+                        "database_id": 1,
+                        "schema": "public",
+                        "table_name": "  orders  ",
+                    }
+                },
+            )
+
+        call_kwargs = mock_command_class.call_args[0][0]
+        assert call_kwargs["table_name"] == "orders"
+
+    
@patch("superset.mcp_service.dataset.tool.create_dataset.CreateDatasetCommand")
+    @pytest.mark.asyncio
+    async def test_create_dataset_blank_schema_normalized_to_none(
+        self, mock_command_class, mcp_server
+    ) -> None:
+        """Blank schema string is treated as absent: not forwarded to the 
command."""
+        mock_dataset = _make_mock_dataset(schema="")
+        mock_command = MagicMock()
+        mock_command.run.return_value = mock_dataset
+        mock_command_class.return_value = mock_command
+
+        async with Client(mcp_server) as client:
+            await client.call_tool(
+                "create_dataset",
+                {
+                    "request": {
+                        "database_id": 1,
+                        "schema": "",
+                        "table_name": "orders",
+                    }
+                },
+            )
+
+        call_kwargs = mock_command_class.call_args[0][0]
+        assert "schema" not in call_kwargs
+
+    
@patch("superset.mcp_service.dataset.tool.create_dataset.CreateDatasetCommand")
+    @pytest.mark.asyncio
+    async def test_create_dataset_blank_catalog_normalized_to_none(
+        self, mock_command_class, mcp_server
+    ) -> None:
+        """Blank catalog string is treated as absent: not forwarded to the 
command."""
+        mock_dataset = _make_mock_dataset()
+        mock_command = MagicMock()
+        mock_command.run.return_value = mock_dataset
+        mock_command_class.return_value = mock_command
+
+        async with Client(mcp_server) as client:
+            await client.call_tool(
+                "create_dataset",
+                {
+                    "request": {
+                        "database_id": 1,
+                        "catalog": "",
+                        "table_name": "orders",
+                    }
+                },
+            )
+
+        call_kwargs = mock_command_class.call_args[0][0]
+        assert "catalog" not in call_kwargs
+
+    
@patch("superset.mcp_service.dataset.tool.create_dataset.CreateDatasetCommand")
+    @patch(
+        
"superset.mcp_service.dataset.tool.create_dataset.serialize_dataset_object",
+        return_value=None,
+    )
+    @pytest.mark.asyncio
+    async def test_create_dataset_serialization_error(
+        self, mock_serialize, mock_command_class, mcp_server
+    ) -> None:
+        """Returns SerializationError when serialize_dataset_object returns 
None."""
+        mock_dataset = _make_mock_dataset()
+        mock_command = MagicMock()
+        mock_command.run.return_value = mock_dataset
+        mock_command_class.return_value = mock_command
+
+        async with Client(mcp_server) as client:
+            result = await client.call_tool(
+                "create_dataset",
+                {
+                    "request": {
+                        "database_id": 1,
+                        "schema": "public",
+                        "table_name": "orders",
+                    }
+                },
+            )
+
+        data = json.loads(result.content[0].text)
+        assert data["error_type"] == "SerializationError"

Review Comment:
   <div>
   
   
   <div id="suggestion">
   <div id="issue"><b>Missing test for CreateFailedError</b></div>
   <div id="fix">
   
   Missing test for `DatasetCreateFailedError` error path (lines 187-192 of 
`create_dataset.py`). The implementation catches this exception and returns 
`DatasetError.create(error="Dataset creation failed", 
error_type="CreateFailedError")`, but there is no corresponding test. All other 
error paths (`DatasetExistsError`, `TableNotFoundError`, `ValidationError`, 
`DatabaseNotFoundError`, `AccessDeniedError`, `SerializationError`, unexpected 
`Exception`) have dedicated tests. Per adaptive rule [11730], comprehensive 
unit tests should cover all error scenarios for MCP mutation tools.
   </div>
   
   
   </div>
   
   
   
   
   <div id="suggestion">
   <div id="issue"><b>Misleading test name</b></div>
   <div id="fix">
   
   The test name `test_create_dataset_serialization_error` suggests it 
exercises an actual serialization exception, but the test mocks 
`serialize_dataset_object` to return `None` instead. The docstring correctly 
describes the scenario, but the name is misleading — it will cause future 
maintainers to search for a serialization-error-raising code path that this 
test never touches.
   </div>
   
   
   <details>
   <summary>
   <b>Code suggestion</b>
   </summary>
   <blockquote>Check the AI-generated fix before applying</blockquote>
   <div id="code">
   
   
   ```
    --- a/tests/unit_tests/mcp_service/dataset/tool/test_create_dataset.py
    +++ b/tests/unit_tests/mcp_service/dataset/tool/test_create_dataset.py
    @@ -519,7 +519,7 @@ class TestCreateDataset:
             return_value=None,
         )
         @pytest.mark.asyncio
    -    async def test_create_dataset_serialization_error(
    +    async def test_create_dataset_when_serialize_returns_none(
             self, mock_serialize, mock_command_class, mcp_server
         ) -> None:
    -        """Returns SerializationError when serialize_dataset_object 
returns None."""
    +        """Returns SerializationError when serialization returns None."""
             mock_dataset = _make_mock_dataset()
             mock_command = MagicMock()
             mock_command.run.return_value = mock_dataset
   ```
   
   </div>
   </details>
   
   
   
   </div>
   
   
   
   
   <small><i>Code Review Run #e7ae57</i></small>
   </div>
   
   ---
   Should Bito avoid suggestions like this for future reviews? (<a 
href=https://alpha.bito.ai/home/ai-agents/review-rules>Manage Rules</a>)
   - [ ] Yes, avoid them



##########
superset/mcp_service/dataset/schemas.py:
##########
@@ -324,6 +322,48 @@ class GetDatasetInfoRequest(MetadataCacheControl):
     ]
 
 
+class CreateDatasetRequest(BaseModel):
+    """Request schema for create_dataset to register a physical table as a 
dataset."""
+
+    database_id: Annotated[
+        int,
+        Field(
+            description="ID of the database connection to register the table 
against"
+        ),
+    ]
+    schema: Annotated[
+        str | None,
+        Field(
+            default=None,
+            description="Schema (namespace) where the table lives, e.g. 
'public'. "
+            "Omit or pass None for databases without schema namespaces (e.g. 
SQLite).",
+        ),
+    ]
+    catalog: Annotated[
+        str | None,
+        Field(
+            default=None,
+            description="Catalog where the table lives. Omit for databases 
without "
+            "catalog support.",
+        ),
+    ]
+    table_name: Annotated[
+        str,
+        Field(
+            min_length=1,
+            description="Name of the physical table to register as a dataset",
+        ),
+    ]
+    owners: Annotated[
+        List[int] | None,
+        Field(
+            default=None,
+            description="Optional list of owner user IDs. "
+            "Defaults to the calling user.",
+        ),
+    ]
+

Review Comment:
   <div>
   
   
   <div id="suggestion">
   <div id="issue"><b>Missing unit tests for new schema</b></div>
   <div id="fix">
   
   The new `CreateDatasetRequest` schema lacks dedicated unit tests covering 
request validation, success paths, and error handling. Per project conventions, 
all new MCP mutation tools must include unit tests per the codebase testing 
standards.
   </div>
   
   
   </div>
   
   
   
   
   <small><i>Code Review Run #e7ae57</i></small>
   </div>
   
   ---
   Should Bito avoid suggestions like this for future reviews? (<a 
href=https://alpha.bito.ai/home/ai-agents/review-rules>Manage Rules</a>)
   - [ ] Yes, avoid them



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] feat(mcp): add create_dataset tool to register physical tables as datasets [superset]

Reply via email to