Copilot commented on code in PR #37907:
URL: https://github.com/apache/superset/pull/37907#discussion_r2795720565
##########
superset/commands/dataset/export.py:
##########
@@ -45,8 +45,11 @@ def _file_name(model: SqlaTable) -> str:
db_file_name = get_filename(
model.database.database_name, model.database.id, skip_id=True
)
+ # Include UUID in filename to ensure uniqueness across environments
+ # while maintaining readability with table name and numeric ID
ds_file_name = get_filename(model.table_name, model.id)
- return f"datasets/{db_file_name}/{ds_file_name}.yaml"
+ uuid_suffix = str(model.uuid)[:8] # Use first 8 chars of UUID for brevity
+ return f"datasets/{db_file_name}/{ds_file_name}_{uuid_suffix}.yaml"
Review Comment:
The filename only includes the first 8 characters of the UUID. A truncated
prefix lowers the collision risk but does not guarantee uniqueness, so the
comment “ensure uniqueness across environments” is inaccurate and a rare
collision is still possible. Consider using the full UUID in the filename
(or at least a longer prefix), or adjust the comment to reflect that this is
best-effort disambiguation.
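For illustration, a minimal sketch of the full-UUID variant, reusing the
names from the hunk above (not a committable suggestion):

```python
    # Include the full dataset UUID in the filename so exports from different
    # environments cannot produce the same path, while keeping the table name
    # and numeric ID for readability.
    ds_file_name = get_filename(model.table_name, model.id)
    return f"datasets/{db_file_name}/{ds_file_name}_{model.uuid}.yaml"
```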
##########
tests/integration_tests/datasets/commands_tests.py:
##########
@@ -268,10 +275,178 @@ def test_export_dataset_command_no_related(self, mock_g):
command = ExportDatasetsCommand([example_dataset.id], export_related=False)
contents = dict(command.run())
+ uuid_suffix = str(example_dataset.uuid)[:8]
+ expected_filename = f"datasets/examples/energy_usage_{example_dataset.id}_{uuid_suffix}.yaml"
+
assert list(contents.keys()) == [
"metadata.yaml",
- f"datasets/examples/energy_usage_{example_dataset.id}.yaml",
+ expected_filename,
+ ]
+
+ @patch("superset.security.manager.g")
+ @pytest.mark.usefixtures("load_energy_table_with_slice")
+ def test_export_dataset_uuid_in_filename(self, mock_g):
+ """
+ Test that dataset export includes UUID in filename for disambiguation.
+ """
+ mock_g.user = security_manager.find_user("admin")
+
+ example_db = get_example_database()
+ example_dataset = _get_table_from_list_by_name(
+ "energy_usage", example_db.tables
+ )
+ command = ExportDatasetsCommand([example_dataset.id])
+ contents = dict(command.run())
+
+ # Verify UUID is in the filename
+ dataset_files = [
+ key for key in contents.keys() if key.startswith("datasets/")
]
+ assert len(dataset_files) == 1
+ dataset_file = dataset_files[0]
+
+ # Filename should contain UUID suffix (first 8 chars)
+ uuid_suffix = str(example_dataset.uuid)[:8]
+ assert uuid_suffix in dataset_file
+ assert dataset_file.endswith(f"_{uuid_suffix}.yaml")
+
+ @patch("superset.security.manager.g")
+ def test_export_datasets_with_same_name_different_uuid(self, mock_g):
+ """
+ Test that two datasets with the same table name but different UUIDs
+ export to different files without collision.
+ """
+ mock_g.user = security_manager.find_user("admin")
+ admin = security_manager.find_user("admin")
+
+ example_db = get_example_database()
+
+ # Create two datasets with the same table name
+ from uuid import uuid4
+
+ dataset1 = SqlaTable(
+ table_name="test_disambiguation",
+ schema=example_db.default_schema_name,
+ database=example_db,
+ owners=[admin],
+ )
+ dataset1.uuid = uuid4()
+ db.session.add(dataset1)
+ db.session.flush()
+
+ dataset2 = SqlaTable(
+ table_name="test_disambiguation", # Same name
+ schema=example_db.default_schema_name,
+ database=example_db,
+ owners=[admin],
+ )
+ dataset2.uuid = uuid4() # Different UUID
+ db.session.add(dataset2)
+ db.session.flush()
+
+ try:
+ # Export both datasets
+ command = ExportDatasetsCommand([dataset1.id, dataset2.id])
+ contents = dict(command.run())
+
+ dataset_files = [
+ key for key in contents.keys() if key.startswith("datasets/")
+ ]
+
+ # Should have 2 distinct files
+ assert len(dataset_files) == 2
+
+ # Verify both UUIDs are represented in filenames
+ uuid1_suffix = str(dataset1.uuid)[:8]
+ uuid2_suffix = str(dataset2.uuid)[:8]
+
+ filenames_str = " ".join(dataset_files)
+ assert uuid1_suffix in filenames_str
+ assert uuid2_suffix in filenames_str
+
+ # Files should have different names
+ assert dataset_files[0] != dataset_files[1]
+
+ # Verify each YAML contains the correct UUID in content
+ for file_key in dataset_files:
+ content = yaml.safe_load(contents[file_key]())
+ assert "uuid" in content
+ assert content["uuid"] in [str(dataset1.uuid), str(dataset2.uuid)]
+
+ finally:
+ db.session.delete(dataset1)
+ db.session.delete(dataset2)
+ db.session.commit()
+
+ @patch("superset.security.manager.g")
+ def test_export_import_dataset_uuid_resolution(self, mock_g):
+ """
+ Test that dataset import correctly resolves by UUID even with
+ filename disambiguation.
+ """
+ mock_g.user = security_manager.find_user("admin")
+ admin = security_manager.find_user("admin")
+
+ example_db = get_example_database()
+
+ # Create a dataset
+ from uuid import uuid4
+
+ original_uuid = uuid4()
+ dataset = SqlaTable(
+ table_name="test_uuid_resolution",
+ schema=example_db.default_schema_name,
+ database=example_db,
+ owners=[admin],
+ )
+ dataset.uuid = original_uuid
+ db.session.add(dataset)
+ db.session.flush()
+ original_id = dataset.id
+
+ try:
+ # Export the dataset
+ command = ExportDatasetsCommand([dataset.id])
+ contents = dict(command.run())
+
+ # Delete the dataset
+ db.session.delete(dataset)
+ db.session.commit()
+
+ # Verify dataset is gone
+ assert (
+ db.session.query(SqlaTable).filter_by(uuid=original_uuid).first()
+ is None
+ )
+
+ # Import it back
+ from superset.commands.dataset.importers.dispatcher import (
+ ImportDatasetsCommand,
+ )
+
+ ImportDatasetsCommand(contents).run()
+
Review Comment:
`contents` from `ExportDatasetsCommand.run()` is a mapping of filenames to
callables (lazy content generators). Passing it directly to
`ImportDatasetsCommand` will fail YAML loading because the importer expects
`dict[str, str]`. Convert it first (e.g., `{name: thunk() for name, thunk in
contents.items()}`) before calling the importer.
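For example, a minimal sketch of the fix inside this test, assuming the rest
of the test stays as in the hunk above (the `materialized` name is only
illustrative):

```python
# Export yields a mapping of filename -> callable; materialize the YAML
# strings before handing the payload to the importer.
materialized = {name: thunk() for name, thunk in contents.items()}
ImportDatasetsCommand(materialized).run()
```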
##########
tests/integration_tests/datasets/commands_tests.py:
##########
@@ -268,10 +275,178 @@ def test_export_dataset_command_no_related(self, mock_g):
command = ExportDatasetsCommand([example_dataset.id], export_related=False)
contents = dict(command.run())
+ uuid_suffix = str(example_dataset.uuid)[:8]
+ expected_filename = f"datasets/examples/energy_usage_{example_dataset.id}_{uuid_suffix}.yaml"
+
assert list(contents.keys()) == [
"metadata.yaml",
- f"datasets/examples/energy_usage_{example_dataset.id}.yaml",
+ expected_filename,
+ ]
+
+ @patch("superset.security.manager.g")
+ @pytest.mark.usefixtures("load_energy_table_with_slice")
+ def test_export_dataset_uuid_in_filename(self, mock_g):
+ """
+ Test that dataset export includes UUID in filename for disambiguation.
+ """
+ mock_g.user = security_manager.find_user("admin")
+
+ example_db = get_example_database()
+ example_dataset = _get_table_from_list_by_name(
+ "energy_usage", example_db.tables
+ )
+ command = ExportDatasetsCommand([example_dataset.id])
+ contents = dict(command.run())
+
+ # Verify UUID is in the filename
+ dataset_files = [
+ key for key in contents.keys() if key.startswith("datasets/")
]
+ assert len(dataset_files) == 1
+ dataset_file = dataset_files[0]
+
+ # Filename should contain UUID suffix (first 8 chars)
+ uuid_suffix = str(example_dataset.uuid)[:8]
+ assert uuid_suffix in dataset_file
+ assert dataset_file.endswith(f"_{uuid_suffix}.yaml")
+
+ @patch("superset.security.manager.g")
+ def test_export_datasets_with_same_name_different_uuid(self, mock_g):
+ """
+ Test that two datasets with the same table name but different UUIDs
+ export to different files without collision.
+ """
+ mock_g.user = security_manager.find_user("admin")
+ admin = security_manager.find_user("admin")
+
+ example_db = get_example_database()
+
+ # Create two datasets with the same table name
+ from uuid import uuid4
+
+ dataset1 = SqlaTable(
+ table_name="test_disambiguation",
+ schema=example_db.default_schema_name,
+ database=example_db,
+ owners=[admin],
+ )
+ dataset1.uuid = uuid4()
+ db.session.add(dataset1)
+ db.session.flush()
+
+ dataset2 = SqlaTable(
+ table_name="test_disambiguation", # Same name
+ schema=example_db.default_schema_name,
+ database=example_db,
+ owners=[admin],
+ )
+ dataset2.uuid = uuid4() # Different UUID
+ db.session.add(dataset2)
+ db.session.flush()
Review Comment:
This test creates two `SqlaTable` rows with the same `(database, schema,
table_name)`. Superset migrations include a uniqueness constraint on these
fields, so the second `flush()` can raise an `IntegrityError` on some DBs. To
match the reported issue and avoid constraint failures, vary `schema` (or
`catalog`) between `dataset1` and `dataset2` while keeping `table_name` the
same.
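One way to do that, sketched with illustrative schema names (uuid assignment
and session handling as in the hunk above):

```python
# Same table_name, different schema: mimics the same table synced from two
# environments while satisfying the uniqueness constraint on
# (database, schema, table_name).
dataset1 = SqlaTable(
    table_name="test_disambiguation",
    schema="schema_a",  # illustrative
    database=example_db,
    owners=[admin],
)
dataset2 = SqlaTable(
    table_name="test_disambiguation",  # same name
    schema="schema_b",  # illustrative, differs from dataset1
    database=example_db,
    owners=[admin],
)
```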
##########
tests/integration_tests/datasets/commands_tests.py:
##########
@@ -76,14 +76,18 @@ def test_export_dataset_command(self, mock_g):
command = ExportDatasetsCommand([example_dataset.id])
contents = dict(command.run())
+ # Build expected filename with UUID suffix
+ uuid_suffix = str(example_dataset.uuid)[:8]
+ expected_filename = f"datasets/examples/energy_usage_{example_dataset.id}_{uuid_suffix}.yaml"
Review Comment:
This `expected_filename` f-string is over the configured 88-character line
length (ruff/black). Wrap it in parentheses and split across lines to avoid
formatting/lint failures.
```suggestion
expected_filename = (
    "datasets/examples/"
    f"energy_usage_{example_dataset.id}_{uuid_suffix}.yaml"
)
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.