This is an automated email from the ASF dual-hosted git repository.

jli pushed a commit to branch fix-examples-not-loading
in repository https://gitbox.apache.org/repos/asf/superset.git

commit cbff869c2d8495c99b4315b9c00e463c74bf2949
Author: Joe Li <[email protected]>
AuthorDate: Thu Jan 29 14:03:56 2026 -0800

    fix(examples): extract _find_dataset helper for UUID-first lookup
    
    Refactor the UUID-first lookup logic into a reusable helper function
    that is used in both the early-return path (table exists) and the
    post-load metadata path (table missing/force).
    
    This ensures consistent behavior and avoids potential unique constraint
    violations when duplicate metadata rows exist and the physical table
    was dropped.
    
    Changes:
    - Add _find_dataset() helper for UUID-first, table_name fallback lookup
    - Refactor both code paths to use the helper
    - Add tests for the helper function
    - Add test for duplicate rows + missing table edge case
    
    Co-Authored-By: Claude Opus 4.5 <[email protected]>
---
 superset/examples/generic_loader.py              |  73 +++++++++-------
 tests/unit_tests/examples/generic_loader_test.py | 107 +++++++++++++++++++++++
 2 files changed, 148 insertions(+), 32 deletions(-)
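
The snippet below is a minimal illustrative sketch, not part of the commit: it
restates the helper's contract under the duplicate-row scenario described in
the commit message, using the names introduced in the diff that follows. It
assumes an active Superset app context and two pre-existing metadata rows; the
literal values are made up for illustration.

    from superset.examples.generic_loader import _find_dataset
    from superset.utils.database import get_example_database

    database = get_example_database()

    # Assume two metadata rows already exist for the same table:
    #   Row A: table_name="test_table", uuid="target-uuid"  (correct row)
    #   Row B: table_name="test_table", uuid=None           (stale duplicate)

    # UUID supplied: the UUID lookup wins and returns Row A.
    tbl, found_by_uuid = _find_dataset("test_table", database.id, "target-uuid")
    # found_by_uuid is True, so the caller's guard
    # ("if uuid and not tbl.uuid and not found_by_uuid") skips the backfill and
    # never writes "target-uuid" onto Row B, which would hit the unique
    # constraint on uuid.

    # No UUID supplied: fall back to the table_name + database_id lookup.
    tbl, found_by_uuid = _find_dataset("test_table", database.id, None)
    # Returns whichever matching row the query finds, with found_by_uuid=False,
    # so a backfill is allowed when tbl.uuid is unset.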

diff --git a/superset/examples/generic_loader.py b/superset/examples/generic_loader.py
index d2a1ba81f12..dea2432998a 100644
--- a/superset/examples/generic_loader.py
+++ b/superset/examples/generic_loader.py
@@ -34,6 +34,41 @@ from superset.utils.database import get_example_database
 logger = logging.getLogger(__name__)
 
 
+def _find_dataset(
+    table_name: str,
+    database_id: int,
+    uuid: Optional[str] = None,
+) -> tuple[Optional[SqlaTable], bool]:
+    """Find a dataset by UUID first, then fall back to table_name + 
database_id.
+
+    This avoids unique constraint violations when a duplicate row exists.
+
+    Args:
+        table_name: The table name to look up
+        database_id: The database ID
+        uuid: Optional UUID to look up first
+
+    Returns:
+        A tuple of (dataset or None, found_by_uuid bool)
+    """
+    tbl = None
+    found_by_uuid = False
+
+    if uuid:
+        tbl = db.session.query(SqlaTable).filter_by(uuid=uuid).first()
+        if tbl:
+            found_by_uuid = True
+
+    if not tbl:
+        tbl = (
+            db.session.query(SqlaTable)
+            .filter_by(table_name=table_name, database_id=database_id)
+            .first()
+        )
+
+    return tbl, found_by_uuid
+
+
 def serialize_numpy_arrays(obj: Any) -> Any:  # noqa: C901
     """Convert numpy arrays to JSON-serializable format."""
     if isinstance(obj, np.ndarray):
@@ -92,19 +127,10 @@ def load_parquet_table(  # noqa: C901
     table_exists = database.has_table(Table(table_name, schema=schema))
     if table_exists and not force:
         logger.info("Table %s already exists, skipping data load", table_name)
-        # Use UUID-first lookup to avoid collisions, then fall back to table_name
-        tbl = None
-        if uuid:
-            tbl = db.session.query(SqlaTable).filter_by(uuid=uuid).first()
-        if not tbl:
-            tbl = (
-                db.session.query(SqlaTable)
-                .filter_by(table_name=table_name, database_id=database.id)
-                .first()
-            )
+        tbl, found_by_uuid = _find_dataset(table_name, database.id, uuid)
         if tbl:
-            # Backfill UUID if found by table_name and UUID not set
-            if uuid and not tbl.uuid:
+            # Backfill UUID if found by table_name (not UUID) and UUID not set
+            if uuid and not tbl.uuid and not found_by_uuid:
                 tbl.uuid = uuid
                 db.session.merge(tbl)
                 db.session.commit()  # pylint: disable=consider-using-transaction
@@ -175,31 +201,14 @@ def load_parquet_table(  # noqa: C901
 
         logger.info("Loaded %d rows into %s", len(pdf), table_name)
 
-    # Create or update SqlaTable metadata
-    # If UUID is provided, look up by UUID first to avoid unique constraint violations
-    # (a prior broken run may have created a duplicate with this UUID)
-    tbl = None
-    found_by_uuid = False
-
-    if uuid:
-        tbl = db.session.query(SqlaTable).filter_by(uuid=uuid).first()
-        if tbl:
-            found_by_uuid = True
-
-    # Fall back to table_name + database_id lookup
-    if not tbl:
-        tbl = (
-            db.session.query(SqlaTable)
-            .filter_by(table_name=table_name, database_id=database.id)
-            .first()
-        )
+    # Create or update SqlaTable metadata using UUID-first lookup
+    tbl, found_by_uuid = _find_dataset(table_name, database.id, uuid)
 
     if not tbl:
         tbl = SqlaTable(table_name=table_name, database_id=database.id)
         tbl.database = database
 
-    # Set UUID if provided and not already set (backfill for existing datasets)
-    # Only do this if we found by table_name, not by UUID (to avoid collisions)
+    # Backfill UUID if found by table_name (not UUID) and UUID not set
     if uuid and not tbl.uuid and not found_by_uuid:
         tbl.uuid = uuid
 
diff --git a/tests/unit_tests/examples/generic_loader_test.py b/tests/unit_tests/examples/generic_loader_test.py
index 464e7ee55b9..74c3ddaa6db 100644
--- a/tests/unit_tests/examples/generic_loader_test.py
+++ b/tests/unit_tests/examples/generic_loader_test.py
@@ -285,3 +285,110 @@ def test_create_generic_loader_passes_uuid() -> None:
     assert loader is not None
     assert callable(loader)
     assert loader.__name__ == "load_test_data"
+
+
+@patch("superset.examples.generic_loader.db")
+def test_find_dataset_returns_uuid_match_first(mock_db: MagicMock) -> None:
+    """Test that _find_dataset returns UUID match over table_name match."""
+    from superset.examples.generic_loader import _find_dataset
+
+    # Row with UUID (should be found first)
+    uuid_row = MagicMock()
+    uuid_row.uuid = "target-uuid"
+    uuid_row.table_name = "table_a"
+
+    # Row without UUID (same table_name as query)
+    tablename_row = MagicMock()
+    tablename_row.uuid = None
+    tablename_row.table_name = "test_table"
+
+    # UUID lookup returns uuid_row
+    mock_db.session.query.return_value.filter_by.return_value.first.return_value = (
+        uuid_row
+    )
+
+    result, found_by_uuid = _find_dataset("test_table", 1, "target-uuid")
+
+    assert result is uuid_row
+    assert found_by_uuid is True
+
+
+@patch("superset.examples.generic_loader.db")
+def test_find_dataset_falls_back_to_table_name(mock_db: MagicMock) -> None:
+    """Test that _find_dataset falls back to table_name when UUID not found."""
+    from superset.examples.generic_loader import _find_dataset
+
+    tablename_row = MagicMock()
+    tablename_row.uuid = None
+    tablename_row.table_name = "test_table"
+
+    # UUID lookup returns None, table_name lookup returns the row
+    def filter_by_side_effect(**kwargs):
+        mock_result = MagicMock()
+        if "uuid" in kwargs:
+            mock_result.first.return_value = None
+        else:
+            mock_result.first.return_value = tablename_row
+        return mock_result
+
+    mock_db.session.query.return_value.filter_by.side_effect = filter_by_side_effect
+
+    result, found_by_uuid = _find_dataset("test_table", 1, "nonexistent-uuid")
+
+    assert result is tablename_row
+    assert found_by_uuid is False
+
+
+@patch("superset.examples.generic_loader.db")
+@patch("superset.examples.generic_loader.get_example_database")
+@patch("superset.examples.generic_loader.read_example_data")
+def test_load_parquet_table_duplicate_rows_table_missing(
+    mock_read_data: MagicMock,
+    mock_get_db: MagicMock,
+    mock_db: MagicMock,
+) -> None:
+    """Test UUID-first lookup when duplicate rows exist and physical table is 
missing.
+
+    Scenario:
+    - Row A: table_name="test_table", uuid="target-uuid" (correct row)
+    - Row B: table_name="test_table", uuid=None (duplicate from broken run)
+    - Physical table was dropped
+
+    The UUID-first lookup should find Row A and avoid collision.
+    """
+    from superset.examples.generic_loader import load_parquet_table
+
+    mock_database = MagicMock()
+    mock_inspector = _setup_database_mocks(
+        mock_get_db,
+        mock_database,
+        has_table=False,  # Table was dropped
+    )
+
+    with patch("superset.examples.generic_loader.inspect") as mock_inspect:
+        mock_inspect.return_value = mock_inspector
+
+        # Row with UUID (should be found by UUID lookup)
+        uuid_row = MagicMock()
+        uuid_row.uuid = "target-uuid"
+        uuid_row.table_name = "test_table"
+        uuid_row.database = mock_database
+
+        # UUID lookup finds the correct row
+        mock_db.session.query.return_value.filter_by.return_value.first.return_value = (
+            uuid_row
+        )
+
+        mock_read_data.return_value = pd.DataFrame({"col1": [1, 2, 3]})
+
+        result = load_parquet_table(
+            parquet_file="test_data",
+            table_name="test_table",
+            database=mock_database,
+            only_metadata=True,
+            uuid="target-uuid",
+        )
+
+        # Should return the UUID row, not try to backfill (which would collide)
+        assert result is uuid_row
+        assert result.uuid == "target-uuid"
