This is an automated email from the ASF dual-hosted git repository. jli pushed a commit to branch fix-examples-not-loading in repository https://gitbox.apache.org/repos/asf/superset.git
commit 124a1b0510d1976c96da060817276efd9797fa19 Author: Joe Li <[email protected]> AuthorDate: Thu Jan 29 13:33:57 2026 -0800 fix(examples): preserve UUIDs from YAML configs in load_parquet_table The example loading flow has two code paths that weren't communicating UUIDs properly: 1. Data loading (load_parquet_table) created SqlaTable without UUID 2. Config import (import_dataset) looks up datasets by UUID only This caused dataset matching failures when charts/dashboards referenced datasets by UUID. Changes: - Add uuid parameter to load_parquet_table() and create_generic_loader() - Extract uuid from dataset.yaml in get_dataset_config_from_yaml() - Extract uuid from datasets/*.yaml in _get_multi_dataset_config() - Pass uuid through discover_datasets() to the loaders - Use UUID-first lookup to avoid unique constraint violations - Backfill UUID on existing datasets found by table_name - Add pylint disable comments for pre-existing transaction warnings Co-Authored-By: Claude Opus 4.5 <[email protected]> --- superset/examples/data_loading.py | 8 +- superset/examples/generic_loader.py | 57 ++++- tests/unit_tests/examples/__init__.py | 16 ++ tests/unit_tests/examples/data_loading_test.py | 129 ++++++++++ tests/unit_tests/examples/generic_loader_test.py | 287 +++++++++++++++++++++++ 5 files changed, 483 insertions(+), 14 deletions(-) diff --git a/superset/examples/data_loading.py b/superset/examples/data_loading.py index c32f0fb0ab1..b652ec79be1 100644 --- a/superset/examples/data_loading.py +++ b/superset/examples/data_loading.py @@ -35,11 +35,12 @@ logger = logging.getLogger(__name__) def get_dataset_config_from_yaml(example_dir: Path) -> Dict[str, Optional[str]]: - """Read table_name, schema, and data_file from dataset.yaml if it exists.""" + """Read table_name, schema, data_file, and uuid from dataset.yaml if it exists.""" result: Dict[str, Optional[str]] = { "table_name": None, "schema": None, "data_file": None, + "uuid": None, } dataset_yaml = example_dir / "dataset.yaml" if dataset_yaml.exists(): @@ -48,6 +49,7 @@ def get_dataset_config_from_yaml(example_dir: Path) -> Dict[str, Optional[str]]: config = yaml.safe_load(f) result["table_name"] = config.get("table_name") result["data_file"] = config.get("data_file") + result["uuid"] = config.get("uuid") schema = config.get("schema") # Treat SQLite's 'main' schema as null (use target database default) result["schema"] = None if schema == "main" else schema @@ -72,6 +74,7 @@ def _get_multi_dataset_config( "table_name": dataset_name, "schema": None, "data_file": data_file, + "uuid": None, } if not datasets_yaml.exists(): @@ -81,6 +84,7 @@ def _get_multi_dataset_config( with open(datasets_yaml) as f: yaml_config = yaml.safe_load(f) result["table_name"] = yaml_config.get("table_name") or dataset_name + result["uuid"] = yaml_config.get("uuid") raw_schema = yaml_config.get("schema") result["schema"] = None if raw_schema == "main" else raw_schema @@ -142,6 +146,7 @@ def discover_datasets() -> Dict[str, Callable[..., None]]: table_name=table_name, schema=config["schema"], data_file=resolved_file, + uuid=config.get("uuid"), ) # Discover multiple parquet files in data/ folders (complex examples) @@ -160,6 +165,7 @@ def discover_datasets() -> Dict[str, Callable[..., None]]: table_name=config["table_name"], schema=config["schema"], data_file=config["data_file"], + uuid=config.get("uuid"), ) return loaders diff --git a/superset/examples/generic_loader.py b/superset/examples/generic_loader.py index 220e6939524..d2a1ba81f12 100644 --- a/superset/examples/generic_loader.py +++ b/superset/examples/generic_loader.py @@ -57,6 +57,7 @@ def load_parquet_table( # noqa: C901 sample_rows: Optional[int] = None, data_file: Optional[Any] = None, schema: Optional[str] = None, + uuid: Optional[str] = None, ) -> SqlaTable: """Load a Parquet file into the example database. @@ -69,6 +70,7 @@ def load_parquet_table( # noqa: C901 sample_rows: If specified, only load this many rows data_file: Optional specific file path (Path object) to load from schema: Schema to load into (defaults to database default schema) + uuid: UUID for the dataset (from YAML config) for import flow matching Returns: The created SqlaTable object @@ -90,12 +92,22 @@ def load_parquet_table( # noqa: C901 table_exists = database.has_table(Table(table_name, schema=schema)) if table_exists and not force: logger.info("Table %s already exists, skipping data load", table_name) - tbl = ( - db.session.query(SqlaTable) - .filter_by(table_name=table_name, database_id=database.id) - .first() - ) + # Use UUID-first lookup to avoid collisions, then fall back to table_name + tbl = None + if uuid: + tbl = db.session.query(SqlaTable).filter_by(uuid=uuid).first() + if not tbl: + tbl = ( + db.session.query(SqlaTable) + .filter_by(table_name=table_name, database_id=database.id) + .first() + ) if tbl: + # Backfill UUID if found by table_name and UUID not set + if uuid and not tbl.uuid: + tbl.uuid = uuid + db.session.merge(tbl) + db.session.commit() # pylint: disable=consider-using-transaction return tbl # Load data if not metadata only @@ -164,17 +176,33 @@ def load_parquet_table( # noqa: C901 logger.info("Loaded %d rows into %s", len(pdf), table_name) # Create or update SqlaTable metadata - tbl = ( - db.session.query(SqlaTable) - .filter_by(table_name=table_name, database_id=database.id) - .first() - ) + # If UUID is provided, look up by UUID first to avoid unique constraint violations + # (a prior broken run may have created a duplicate with this UUID) + tbl = None + found_by_uuid = False + + if uuid: + tbl = db.session.query(SqlaTable).filter_by(uuid=uuid).first() + if tbl: + found_by_uuid = True + + # Fall back to table_name + database_id lookup + if not tbl: + tbl = ( + db.session.query(SqlaTable) + .filter_by(table_name=table_name, database_id=database.id) + .first() + ) if not tbl: tbl = SqlaTable(table_name=table_name, database_id=database.id) - # Set the database reference tbl.database = database + # Set UUID if provided and not already set (backfill for existing datasets) + # Only do this if we found by table_name, not by UUID (to avoid collisions) + if uuid and not tbl.uuid and not found_by_uuid: + tbl.uuid = uuid + if not only_metadata: # Ensure database reference is set before fetching metadata if not tbl.database: @@ -182,7 +210,7 @@ def load_parquet_table( # noqa: C901 tbl.fetch_metadata() db.session.merge(tbl) - db.session.commit() + db.session.commit() # pylint: disable=consider-using-transaction return tbl @@ -194,6 +222,7 @@ def create_generic_loader( sample_rows: Optional[int] = None, data_file: Optional[Any] = None, schema: Optional[str] = None, + uuid: Optional[str] = None, ) -> Callable[[Database, SqlaTable], None]: """Create a loader function for a specific Parquet file. @@ -207,6 +236,7 @@ def create_generic_loader( sample_rows: Default number of rows to sample data_file: Optional specific file path (Path object) for data/ folder pattern schema: Schema to load into (defaults to database default schema) + uuid: UUID for the dataset (from YAML config) for import flow matching Returns: A loader function with the standard signature @@ -230,12 +260,13 @@ def create_generic_loader( sample_rows=rows, data_file=data_file, schema=schema, + uuid=uuid, ) if description and tbl: tbl.description = description db.session.merge(tbl) - db.session.commit() + db.session.commit() # pylint: disable=consider-using-transaction # Set function name and docstring loader.__name__ = f"load_{parquet_file}" diff --git a/tests/unit_tests/examples/__init__.py b/tests/unit_tests/examples/__init__.py new file mode 100644 index 00000000000..13a83393a91 --- /dev/null +++ b/tests/unit_tests/examples/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/tests/unit_tests/examples/data_loading_test.py b/tests/unit_tests/examples/data_loading_test.py new file mode 100644 index 00000000000..f8d934d11d2 --- /dev/null +++ b/tests/unit_tests/examples/data_loading_test.py @@ -0,0 +1,129 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Tests for the data_loading module, specifically UUID extraction from YAML.""" + +from pathlib import Path + + +def test_get_dataset_config_from_yaml_extracts_uuid(tmp_path: Path) -> None: + """Test that get_dataset_config_from_yaml extracts UUID from YAML.""" + from superset.examples.data_loading import get_dataset_config_from_yaml + + # Create a temporary dataset.yaml with UUID + yaml_content = """ +table_name: birth_names +schema: public +data_file: data.parquet +uuid: 14f48794-ebfa-4f60-a26a-582c49132f1b +""" + dataset_yaml = tmp_path / "dataset.yaml" + dataset_yaml.write_text(yaml_content) + + result = get_dataset_config_from_yaml(tmp_path) + + assert result["uuid"] == "14f48794-ebfa-4f60-a26a-582c49132f1b" + assert result["table_name"] == "birth_names" + assert result["schema"] == "public" + + +def test_get_dataset_config_from_yaml_handles_missing_uuid(tmp_path: Path) -> None: + """Test that missing UUID returns None.""" + from superset.examples.data_loading import get_dataset_config_from_yaml + + # Create a temporary dataset.yaml without UUID + yaml_content = """ +table_name: birth_names +schema: public +""" + dataset_yaml = tmp_path / "dataset.yaml" + dataset_yaml.write_text(yaml_content) + + result = get_dataset_config_from_yaml(tmp_path) + + assert result["uuid"] is None + assert result["table_name"] == "birth_names" + + +def test_get_dataset_config_from_yaml_handles_missing_file(tmp_path: Path) -> None: + """Test that missing dataset.yaml returns None for all fields.""" + from superset.examples.data_loading import get_dataset_config_from_yaml + + result = get_dataset_config_from_yaml(tmp_path) + + assert result["uuid"] is None + assert result["table_name"] is None + assert result["schema"] is None + + +def test_get_multi_dataset_config_extracts_uuid(tmp_path: Path) -> None: + """Test that _get_multi_dataset_config extracts UUID from datasets/*.yaml.""" + from superset.examples.data_loading import _get_multi_dataset_config + + # Create datasets directory and YAML file + datasets_dir = tmp_path / "datasets" + datasets_dir.mkdir() + + yaml_content = """ +table_name: cleaned_sales_data +schema: null +uuid: e8623bb9-5e00-f531-506a-19607f5f8005 +""" + dataset_yaml = datasets_dir / "cleaned_sales_data.yaml" + dataset_yaml.write_text(yaml_content) + + data_file = tmp_path / "data" / "cleaned_sales_data.parquet" + + result = _get_multi_dataset_config(tmp_path, "cleaned_sales_data", data_file) + + assert result["uuid"] == "e8623bb9-5e00-f531-506a-19607f5f8005" + assert result["table_name"] == "cleaned_sales_data" + + +def test_get_multi_dataset_config_handles_missing_uuid(tmp_path: Path) -> None: + """Test that missing UUID in multi-dataset config returns None.""" + from superset.examples.data_loading import _get_multi_dataset_config + + # Create datasets directory and YAML file without UUID + datasets_dir = tmp_path / "datasets" + datasets_dir.mkdir() + + yaml_content = """ +table_name: my_dataset +schema: null +""" + dataset_yaml = datasets_dir / "my_dataset.yaml" + dataset_yaml.write_text(yaml_content) + + data_file = tmp_path / "data" / "my_dataset.parquet" + + result = _get_multi_dataset_config(tmp_path, "my_dataset", data_file) + + assert result["uuid"] is None + assert result["table_name"] == "my_dataset" + + +def test_get_multi_dataset_config_handles_missing_file(tmp_path: Path) -> None: + """Test that missing datasets/*.yaml returns None for UUID.""" + from superset.examples.data_loading import _get_multi_dataset_config + + data_file = tmp_path / "data" / "my_dataset.parquet" + + result = _get_multi_dataset_config(tmp_path, "my_dataset", data_file) + + assert result["uuid"] is None + # Falls back to dataset_name when no YAML + assert result["table_name"] == "my_dataset" diff --git a/tests/unit_tests/examples/generic_loader_test.py b/tests/unit_tests/examples/generic_loader_test.py new file mode 100644 index 00000000000..464e7ee55b9 --- /dev/null +++ b/tests/unit_tests/examples/generic_loader_test.py @@ -0,0 +1,287 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Tests for the generic_loader module, specifically UUID handling.""" + +from unittest.mock import MagicMock, patch + +import pandas as pd + + +def _setup_database_mocks( + mock_get_db: MagicMock, mock_database: MagicMock, has_table: bool = False +) -> MagicMock: + """Helper to set up common database mocks.""" + mock_database.id = 1 + mock_database.has_table.return_value = has_table + mock_get_db.return_value = mock_database + + mock_engine = MagicMock() + mock_inspector = MagicMock() + mock_inspector.default_schema_name = "public" + mock_database.get_sqla_engine.return_value.__enter__ = MagicMock( + return_value=mock_engine + ) + mock_database.get_sqla_engine.return_value.__exit__ = MagicMock(return_value=False) + + return mock_inspector + + +@patch("superset.examples.generic_loader.db") +@patch("superset.examples.generic_loader.get_example_database") +@patch("superset.examples.generic_loader.read_example_data") +def test_load_parquet_table_sets_uuid_on_new_table( + mock_read_data: MagicMock, + mock_get_db: MagicMock, + mock_db: MagicMock, +) -> None: + """Test that load_parquet_table sets UUID when creating a new SqlaTable.""" + from superset.examples.generic_loader import load_parquet_table + + mock_database = MagicMock() + mock_inspector = _setup_database_mocks(mock_get_db, mock_database, has_table=False) + + with patch("superset.examples.generic_loader.inspect") as mock_inspect: + mock_inspect.return_value = mock_inspector + + # No existing table by UUID or table_name + mock_db.session.query.return_value.filter_by.return_value.first.return_value = ( + None + ) + + mock_read_data.return_value = pd.DataFrame({"col1": [1, 2, 3]}) + + test_uuid = "14f48794-ebfa-4f60-a26a-582c49132f1b" + + result = load_parquet_table( + parquet_file="test_data", + table_name="test_table", + database=mock_database, + only_metadata=True, + uuid=test_uuid, + ) + + assert result.uuid == test_uuid + + +@patch("superset.examples.generic_loader.db") +@patch("superset.examples.generic_loader.get_example_database") +def test_load_parquet_table_finds_existing_by_uuid_first( + mock_get_db: MagicMock, + mock_db: MagicMock, +) -> None: + """Test that load_parquet_table looks up by UUID first when provided.""" + from superset.examples.generic_loader import load_parquet_table + + mock_database = MagicMock() + mock_inspector = _setup_database_mocks(mock_get_db, mock_database, has_table=True) + + with patch("superset.examples.generic_loader.inspect") as mock_inspect: + mock_inspect.return_value = mock_inspector + + # Existing table found by UUID + test_uuid = "existing-uuid-1234" + mock_existing_table = MagicMock() + mock_existing_table.uuid = test_uuid + mock_existing_table.table_name = "test_table" + + # First call (by uuid) returns the table, second call (by table_name) not needed + mock_db.session.query.return_value.filter_by.return_value.first.return_value = ( + mock_existing_table + ) + + result = load_parquet_table( + parquet_file="test_data", + table_name="test_table", + database=mock_database, + only_metadata=True, + uuid=test_uuid, + ) + + # Should return the existing table found by UUID + assert result.uuid == test_uuid + assert result is mock_existing_table + + +@patch("superset.examples.generic_loader.db") +@patch("superset.examples.generic_loader.get_example_database") +def test_load_parquet_table_backfills_uuid_on_existing_table( + mock_get_db: MagicMock, + mock_db: MagicMock, +) -> None: + """Test that existing dataset with uuid=None gets UUID backfilled.""" + from superset.examples.generic_loader import load_parquet_table + + mock_database = MagicMock() + mock_inspector = _setup_database_mocks(mock_get_db, mock_database, has_table=True) + + with patch("superset.examples.generic_loader.inspect") as mock_inspect: + mock_inspect.return_value = mock_inspector + + # Existing table with NO UUID (needs backfill) + mock_existing_table = MagicMock() + mock_existing_table.uuid = None + mock_existing_table.table_name = "test_table" + + # UUID lookup returns None, table_name lookup returns the table + def filter_by_side_effect(**kwargs): + mock_result = MagicMock() + if "uuid" in kwargs: + mock_result.first.return_value = None + else: + mock_result.first.return_value = mock_existing_table + return mock_result + + mock_db.session.query.return_value.filter_by.side_effect = filter_by_side_effect + + new_uuid = "new-uuid-5678" + + result = load_parquet_table( + parquet_file="test_data", + table_name="test_table", + database=mock_database, + only_metadata=True, + uuid=new_uuid, + ) + + # UUID should be backfilled + assert result.uuid == new_uuid + + +@patch("superset.examples.generic_loader.db") +@patch("superset.examples.generic_loader.get_example_database") +def test_load_parquet_table_avoids_uuid_collision( + mock_get_db: MagicMock, + mock_db: MagicMock, +) -> None: + """Test that finding by UUID doesn't try to re-set UUID (avoids collision).""" + from superset.examples.generic_loader import load_parquet_table + + mock_database = MagicMock() + mock_inspector = _setup_database_mocks(mock_get_db, mock_database, has_table=True) + + with patch("superset.examples.generic_loader.inspect") as mock_inspect: + mock_inspect.return_value = mock_inspector + + # Table already has the UUID we're looking for + test_uuid = "existing-uuid-1234" + mock_existing_table = MagicMock() + mock_existing_table.uuid = test_uuid + + # UUID lookup finds the table + mock_db.session.query.return_value.filter_by.return_value.first.return_value = ( + mock_existing_table + ) + + result = load_parquet_table( + parquet_file="test_data", + table_name="test_table", + database=mock_database, + only_metadata=True, + uuid=test_uuid, + ) + + # UUID should remain unchanged (not re-assigned) + assert result.uuid == test_uuid + + +@patch("superset.examples.generic_loader.db") +@patch("superset.examples.generic_loader.get_example_database") +def test_load_parquet_table_preserves_existing_different_uuid( + mock_get_db: MagicMock, + mock_db: MagicMock, +) -> None: + """Test that if table has different UUID, we find it by UUID lookup first.""" + from superset.examples.generic_loader import load_parquet_table + + mock_database = MagicMock() + mock_inspector = _setup_database_mocks(mock_get_db, mock_database, has_table=True) + + with patch("superset.examples.generic_loader.inspect") as mock_inspect: + mock_inspect.return_value = mock_inspector + + # A table exists with the target UUID + target_uuid = "target-uuid-1234" + mock_uuid_table = MagicMock() + mock_uuid_table.uuid = target_uuid + + # UUID lookup finds the UUID-matching table + mock_db.session.query.return_value.filter_by.return_value.first.return_value = ( + mock_uuid_table + ) + + result = load_parquet_table( + parquet_file="test_data", + table_name="different_table_name", + database=mock_database, + only_metadata=True, + uuid=target_uuid, + ) + + # Should return the table found by UUID, not create new one + assert result is mock_uuid_table + assert result.uuid == target_uuid + + +@patch("superset.examples.generic_loader.db") +@patch("superset.examples.generic_loader.get_example_database") +@patch("superset.examples.generic_loader.read_example_data") +def test_load_parquet_table_works_without_uuid( + mock_read_data: MagicMock, + mock_get_db: MagicMock, + mock_db: MagicMock, +) -> None: + """Test that load_parquet_table still works when no UUID is provided.""" + from superset.examples.generic_loader import load_parquet_table + + mock_database = MagicMock() + mock_inspector = _setup_database_mocks(mock_get_db, mock_database, has_table=False) + + with patch("superset.examples.generic_loader.inspect") as mock_inspect: + mock_inspect.return_value = mock_inspector + + # No existing table + mock_db.session.query.return_value.filter_by.return_value.first.return_value = ( + None + ) + + mock_read_data.return_value = pd.DataFrame({"col1": [1, 2, 3]}) + + result = load_parquet_table( + parquet_file="test_data", + table_name="test_table", + database=mock_database, + only_metadata=True, + ) + + assert result is not None + assert result.table_name == "test_table" + + +def test_create_generic_loader_passes_uuid() -> None: + """Test that create_generic_loader passes UUID to load_parquet_table.""" + from superset.examples.generic_loader import create_generic_loader + + test_uuid = "test-uuid-1234" + loader = create_generic_loader( + parquet_file="test_data", + table_name="test_table", + uuid=test_uuid, + ) + + assert loader is not None + assert callable(loader) + assert loader.__name__ == "load_test_data"
