This is an automated email from the ASF dual-hosted git repository. jli pushed a commit to branch fix-examples-not-loading in repository https://gitbox.apache.org/repos/asf/superset.git
commit cbff869c2d8495c99b4315b9c00e463c74bf2949 Author: Joe Li <[email protected]> AuthorDate: Thu Jan 29 14:03:56 2026 -0800 fix(examples): extract _find_dataset helper for UUID-first lookup Refactor the UUID-first lookup logic into a reusable helper function that is used in both the early-return path (table exists) and the post-load metadata path (table missing/force). This ensures consistent behavior and avoids potential unique constraint violations when duplicate metadata rows exist and the physical table was dropped. Changes: - Add _find_dataset() helper for UUID-first, table_name fallback lookup - Refactor both code paths to use the helper - Add tests for the helper function - Add test for duplicate rows + missing table edge case Co-Authored-By: Claude Opus 4.5 <[email protected]> --- superset/examples/generic_loader.py | 73 +++++++++------- tests/unit_tests/examples/generic_loader_test.py | 107 +++++++++++++++++++++++ 2 files changed, 148 insertions(+), 32 deletions(-) diff --git a/superset/examples/generic_loader.py b/superset/examples/generic_loader.py index d2a1ba81f12..dea2432998a 100644 --- a/superset/examples/generic_loader.py +++ b/superset/examples/generic_loader.py @@ -34,6 +34,41 @@ from superset.utils.database import get_example_database logger = logging.getLogger(__name__) +def _find_dataset( + table_name: str, + database_id: int, + uuid: Optional[str] = None, +) -> tuple[Optional[SqlaTable], bool]: + """Find a dataset by UUID first, then fall back to table_name + database_id. + + This avoids unique constraint violations when a duplicate row exists. + + Args: + table_name: The table name to look up + database_id: The database ID + uuid: Optional UUID to look up first + + Returns: + A tuple of (dataset or None, found_by_uuid bool) + """ + tbl = None + found_by_uuid = False + + if uuid: + tbl = db.session.query(SqlaTable).filter_by(uuid=uuid).first() + if tbl: + found_by_uuid = True + + if not tbl: + tbl = ( + db.session.query(SqlaTable) + .filter_by(table_name=table_name, database_id=database_id) + .first() + ) + + return tbl, found_by_uuid + + def serialize_numpy_arrays(obj: Any) -> Any: # noqa: C901 """Convert numpy arrays to JSON-serializable format.""" if isinstance(obj, np.ndarray): @@ -92,19 +127,10 @@ def load_parquet_table( # noqa: C901 table_exists = database.has_table(Table(table_name, schema=schema)) if table_exists and not force: logger.info("Table %s already exists, skipping data load", table_name) - # Use UUID-first lookup to avoid collisions, then fall back to table_name - tbl = None - if uuid: - tbl = db.session.query(SqlaTable).filter_by(uuid=uuid).first() - if not tbl: - tbl = ( - db.session.query(SqlaTable) - .filter_by(table_name=table_name, database_id=database.id) - .first() - ) + tbl, found_by_uuid = _find_dataset(table_name, database.id, uuid) if tbl: - # Backfill UUID if found by table_name and UUID not set - if uuid and not tbl.uuid: + # Backfill UUID if found by table_name (not UUID) and UUID not set + if uuid and not tbl.uuid and not found_by_uuid: tbl.uuid = uuid db.session.merge(tbl) db.session.commit() # pylint: disable=consider-using-transaction @@ -175,31 +201,14 @@ def load_parquet_table( # noqa: C901 logger.info("Loaded %d rows into %s", len(pdf), table_name) - # Create or update SqlaTable metadata - # If UUID is provided, look up by UUID first to avoid unique constraint violations - # (a prior broken run may have created a duplicate with this UUID) - tbl = None - found_by_uuid = False - - if uuid: - tbl = db.session.query(SqlaTable).filter_by(uuid=uuid).first() - if tbl: - found_by_uuid = True - - # Fall back to table_name + database_id lookup - if not tbl: - tbl = ( - db.session.query(SqlaTable) - .filter_by(table_name=table_name, database_id=database.id) - .first() - ) + # Create or update SqlaTable metadata using UUID-first lookup + tbl, found_by_uuid = _find_dataset(table_name, database.id, uuid) if not tbl: tbl = SqlaTable(table_name=table_name, database_id=database.id) tbl.database = database - # Set UUID if provided and not already set (backfill for existing datasets) - # Only do this if we found by table_name, not by UUID (to avoid collisions) + # Backfill UUID if found by table_name (not UUID) and UUID not set if uuid and not tbl.uuid and not found_by_uuid: tbl.uuid = uuid diff --git a/tests/unit_tests/examples/generic_loader_test.py b/tests/unit_tests/examples/generic_loader_test.py index 464e7ee55b9..74c3ddaa6db 100644 --- a/tests/unit_tests/examples/generic_loader_test.py +++ b/tests/unit_tests/examples/generic_loader_test.py @@ -285,3 +285,110 @@ def test_create_generic_loader_passes_uuid() -> None: assert loader is not None assert callable(loader) assert loader.__name__ == "load_test_data" + + +@patch("superset.examples.generic_loader.db") +def test_find_dataset_returns_uuid_match_first(mock_db: MagicMock) -> None: + """Test that _find_dataset returns UUID match over table_name match.""" + from superset.examples.generic_loader import _find_dataset + + # Row with UUID (should be found first) + uuid_row = MagicMock() + uuid_row.uuid = "target-uuid" + uuid_row.table_name = "table_a" + + # Row without UUID (same table_name as query) + tablename_row = MagicMock() + tablename_row.uuid = None + tablename_row.table_name = "test_table" + + # UUID lookup returns uuid_row + mock_db.session.query.return_value.filter_by.return_value.first.return_value = ( + uuid_row + ) + + result, found_by_uuid = _find_dataset("test_table", 1, "target-uuid") + + assert result is uuid_row + assert found_by_uuid is True + + +@patch("superset.examples.generic_loader.db") +def test_find_dataset_falls_back_to_table_name(mock_db: MagicMock) -> None: + """Test that _find_dataset falls back to table_name when UUID not found.""" + from superset.examples.generic_loader import _find_dataset + + tablename_row = MagicMock() + tablename_row.uuid = None + tablename_row.table_name = "test_table" + + # UUID lookup returns None, table_name lookup returns the row + def filter_by_side_effect(**kwargs): + mock_result = MagicMock() + if "uuid" in kwargs: + mock_result.first.return_value = None + else: + mock_result.first.return_value = tablename_row + return mock_result + + mock_db.session.query.return_value.filter_by.side_effect = filter_by_side_effect + + result, found_by_uuid = _find_dataset("test_table", 1, "nonexistent-uuid") + + assert result is tablename_row + assert found_by_uuid is False + + +@patch("superset.examples.generic_loader.db") +@patch("superset.examples.generic_loader.get_example_database") +@patch("superset.examples.generic_loader.read_example_data") +def test_load_parquet_table_duplicate_rows_table_missing( + mock_read_data: MagicMock, + mock_get_db: MagicMock, + mock_db: MagicMock, +) -> None: + """Test UUID-first lookup when duplicate rows exist and physical table is missing. + + Scenario: + - Row A: table_name="test_table", uuid="target-uuid" (correct row) + - Row B: table_name="test_table", uuid=None (duplicate from broken run) + - Physical table was dropped + + The UUID-first lookup should find Row A and avoid collision. + """ + from superset.examples.generic_loader import load_parquet_table + + mock_database = MagicMock() + mock_inspector = _setup_database_mocks( + mock_get_db, + mock_database, + has_table=False, # Table was dropped + ) + + with patch("superset.examples.generic_loader.inspect") as mock_inspect: + mock_inspect.return_value = mock_inspector + + # Row with UUID (should be found by UUID lookup) + uuid_row = MagicMock() + uuid_row.uuid = "target-uuid" + uuid_row.table_name = "test_table" + uuid_row.database = mock_database + + # UUID lookup finds the correct row + mock_db.session.query.return_value.filter_by.return_value.first.return_value = ( + uuid_row + ) + + mock_read_data.return_value = pd.DataFrame({"col1": [1, 2, 3]}) + + result = load_parquet_table( + parquet_file="test_data", + table_name="test_table", + database=mock_database, + only_metadata=True, + uuid="target-uuid", + ) + + # Should return the UUID row, not try to backfill (which would collide) + assert result is uuid_row + assert result.uuid == "target-uuid"
