This is an automated email from the ASF dual-hosted git repository.

maximebeauchemin pushed a commit to branch codespaces
in repository https://gitbox.apache.org/repos/asf/superset.git

commit 0b14d1e75fe4dfbb6219487f2c1215c8021feef7
Author: Maxime Beauchemin <maximebeauche...@gmail.com>
AuthorDate: Mon Jul 28 19:03:28 2025 -0700

    fix: rate limiting issues with example data hosted on github.com
---
 superset/datasets/schemas.py                           |  9 ++++++++-
 superset/examples/bart_lines.py                        |  2 +-
 superset/examples/birth_names.py                       | 10 +++++-----
 .../configs/datasets/examples/FCC_2018_Survey.yaml     |  2 +-
 .../configs/datasets/examples/channel_members.yaml     |  2 +-
 .../examples/configs/datasets/examples/channels.yaml   |  2 +-
 .../configs/datasets/examples/cleaned_sales_data.yaml  |  2 +-
 .../configs/datasets/examples/covid_vaccines.yaml      |  2 +-
 .../configs/datasets/examples/exported_stats.yaml      |  2 +-
 .../examples/configs/datasets/examples/messages.yaml   |  2 +-
 .../examples/configs/datasets/examples/threads.yaml    |  2 +-
 .../configs/datasets/examples/unicode_test.test.yaml   |  2 +-
 superset/examples/configs/datasets/examples/users.yaml |  2 +-
 .../configs/datasets/examples/users_channels.yaml      |  2 +-
 .../configs/datasets/examples/video_game_sales.yaml    |  2 +-
 superset/examples/country_map.py                       |  2 +-
 superset/examples/energy.py                            |  2 +-
 superset/examples/flights.py                           |  4 ++--
 superset/examples/helpers.py                           | 18 +++++++++++++++---
 superset/examples/long_lat.py                          |  2 +-
 superset/examples/multiformat_time_series.py           |  2 +-
 superset/examples/paris.py                             |  2 +-
 superset/examples/random_time_series.py                |  4 +++-
 superset/examples/sf_population_polygons.py            |  4 +++-
 superset/examples/world_bank.py                        |  2 +-
 25 files changed, 55 insertions(+), 32 deletions(-)

diff --git a/superset/datasets/schemas.py b/superset/datasets/schemas.py
index aba2283ef2..48ff05d1d6 100644
--- a/superset/datasets/schemas.py
+++ b/superset/datasets/schemas.py
@@ -276,6 +276,7 @@ class ImportV1DatasetSchema(Schema):
     def fix_extra(self, data: dict[str, Any], **kwargs: Any) -> dict[str, Any]:
         """
         Fix for extra initially being exported as a string.
+        Also normalize examples:// URLs for validation.
         """
         if isinstance(data.get("extra"), str):
             try:
@@ -284,6 +285,12 @@ class ImportV1DatasetSchema(Schema):
             except ValueError:
                 data["extra"] = None
 
+        # Normalize examples:// URLs before validation
+        if data.get("data") and isinstance(data["data"], str):
+            from superset.examples.helpers import normalize_example_data_url
+
+            data["data"] = normalize_example_data_url(data["data"])
+
         return data
 
     table_name = fields.String(required=True)
@@ -305,7 +312,7 @@ class ImportV1DatasetSchema(Schema):
     metrics = fields.List(fields.Nested(ImportV1MetricSchema))
     version = fields.String(required=True)
     database_uuid = fields.UUID(required=True)
-    data = fields.URL()
+    data = fields.URL(allow_none=True)
     is_managed_externally = fields.Boolean(allow_none=True, dump_default=False)
     external_url = fields.String(allow_none=True)
     normalize_columns = fields.Boolean(load_default=False)
diff --git a/superset/examples/bart_lines.py b/superset/examples/bart_lines.py
index 7d7e71d59f..4b741fdeed 100644
--- a/superset/examples/bart_lines.py
+++ b/superset/examples/bart_lines.py
@@ -38,7 +38,7 @@ def load_bart_lines(only_metadata: bool = False, force: bool 
= False) -> None:
 
         if not only_metadata and (not table_exists or force):
             df = read_example_data(
-                "bart-lines.json.gz", encoding="latin-1", compression="gzip"
+                "examples://bart-lines.json.gz", encoding="latin-1", 
compression="gzip"
             )
             df["path_json"] = df.path.map(json.dumps)
             df["polyline"] = df.path.map(polyline.encode)
diff --git a/superset/examples/birth_names.py b/superset/examples/birth_names.py
index 6f2d9a5ebf..ddd5ea50aa 100644
--- a/superset/examples/birth_names.py
+++ b/superset/examples/birth_names.py
@@ -57,7 +57,7 @@ def gen_filter(
 
 
 def load_data(tbl_name: str, database: Database, sample: bool = False) -> None:
-    pdf = read_example_data("birth_names2.json.gz", compression="gzip")
+    pdf = read_example_data("examples://birth_names2.json.gz", 
compression="gzip")
 
     # TODO(bkyryliuk): move load examples data into the pytest fixture
     if database.backend == "presto":
@@ -584,8 +584,8 @@ def create_dashboard(slices: list[Slice]) -> Dashboard:
         }
     }"""
     )
-    # pylint: disable=line-too-long
-    pos = json.loads(
+    # pylint: disable=line-too-long
+    pos = json.loads(  # noqa: TID251
         textwrap.dedent(
             """\
         {
@@ -859,11 +859,11 @@ def create_dashboard(slices: list[Slice]) -> Dashboard:
         """  # noqa: E501
         )
     )
-    # pylint: enable=line-too-long
+    # pylint: enable=line-too-long
     # dashboard v2 doesn't allow add markup slice
     dash.slices = [slc for slc in slices if slc.viz_type != "markup"]
     update_slice_ids(pos)
     dash.dashboard_title = "USA Births Names"
-    dash.position_json = json.dumps(pos, indent=4)
+    dash.position_json = json.dumps(pos, indent=4)  # noqa: TID251
     dash.slug = "births"
     return dash
diff --git a/superset/examples/configs/datasets/examples/FCC_2018_Survey.yaml 
b/superset/examples/configs/datasets/examples/FCC_2018_Survey.yaml
index 85aeb51eb9..0943c947f3 100644
--- a/superset/examples/configs/datasets/examples/FCC_2018_Survey.yaml
+++ b/superset/examples/configs/datasets/examples/FCC_2018_Survey.yaml
@@ -1490,4 +1490,4 @@ columns:
     python_date_format: null
 version: 1.0.0
 database_uuid: a2dc77af-e654-49bb-b321-40f6b559a1ee
-data: 
https://github.com/apache-superset/examples-data/raw/master/datasets/examples/fcc_survey_2018.csv.gz
+data: examples://datasets/examples/fcc_survey_2018.csv.gz
diff --git a/superset/examples/configs/datasets/examples/channel_members.yaml 
b/superset/examples/configs/datasets/examples/channel_members.yaml
index 7bdf0836c4..a528838e48 100644
--- a/superset/examples/configs/datasets/examples/channel_members.yaml
+++ b/superset/examples/configs/datasets/examples/channel_members.yaml
@@ -60,4 +60,4 @@ columns:
   python_date_format: null
 version: 1.0.0
 database_uuid: a2dc77af-e654-49bb-b321-40f6b559a1ee
-data: 
https://raw.githubusercontent.com/apache-superset/examples-data/master/datasets/examples/slack/channel_members.csv
+data: examples://datasets/examples/slack/channel_members.csv
diff --git a/superset/examples/configs/datasets/examples/channels.yaml 
b/superset/examples/configs/datasets/examples/channels.yaml
index 92ce51d210..bc1dad46ba 100644
--- a/superset/examples/configs/datasets/examples/channels.yaml
+++ b/superset/examples/configs/datasets/examples/channels.yaml
@@ -360,4 +360,4 @@ columns:
   python_date_format: null
 version: 1.0.0
 database_uuid: a2dc77af-e654-49bb-b321-40f6b559a1ee
-data: 
https://raw.githubusercontent.com/apache-superset/examples-data/master/datasets/examples/slack/channels.csv
+data: examples://datasets/examples/slack/channels.csv
diff --git 
a/superset/examples/configs/datasets/examples/cleaned_sales_data.yaml 
b/superset/examples/configs/datasets/examples/cleaned_sales_data.yaml
index d2136a9034..35bb522ef8 100644
--- a/superset/examples/configs/datasets/examples/cleaned_sales_data.yaml
+++ b/superset/examples/configs/datasets/examples/cleaned_sales_data.yaml
@@ -344,4 +344,4 @@ columns:
   extra: null
 version: 1.0.0
 database_uuid: a2dc77af-e654-49bb-b321-40f6b559a1ee
-data: 
https://raw.githubusercontent.com/apache-superset/examples-data/lowercase_columns_examples/datasets/examples/sales.csv
+data: examples://datasets/examples/sales.csv
diff --git a/superset/examples/configs/datasets/examples/covid_vaccines.yaml 
b/superset/examples/configs/datasets/examples/covid_vaccines.yaml
index 9d1431c2db..e5f5ca1b52 100644
--- a/superset/examples/configs/datasets/examples/covid_vaccines.yaml
+++ b/superset/examples/configs/datasets/examples/covid_vaccines.yaml
@@ -204,4 +204,4 @@ columns:
   python_date_format: null
 version: 1.0.0
 database_uuid: a2dc77af-e654-49bb-b321-40f6b559a1ee
-data: 
https://raw.githubusercontent.com/apache-superset/examples-data/lowercase_columns_examples/datasets/examples/covid_vaccines.csv
+data: examples://datasets/examples/covid_vaccines.csv
diff --git a/superset/examples/configs/datasets/examples/exported_stats.yaml 
b/superset/examples/configs/datasets/examples/exported_stats.yaml
index 28cd18ba73..cd71bde56f 100644
--- a/superset/examples/configs/datasets/examples/exported_stats.yaml
+++ b/superset/examples/configs/datasets/examples/exported_stats.yaml
@@ -260,4 +260,4 @@ columns:
   python_date_format: null
 version: 1.0.0
 database_uuid: a2dc77af-e654-49bb-b321-40f6b559a1ee
-data: 
https://raw.githubusercontent.com/apache-superset/examples-data/master/datasets/examples/slack/exported_stats.csv
+data: examples://datasets/examples/slack/exported_stats.csv
diff --git a/superset/examples/configs/datasets/examples/messages.yaml 
b/superset/examples/configs/datasets/examples/messages.yaml
index 27ca367009..f34f98dda4 100644
--- a/superset/examples/configs/datasets/examples/messages.yaml
+++ b/superset/examples/configs/datasets/examples/messages.yaml
@@ -480,4 +480,4 @@ columns:
   python_date_format: null
 version: 1.0.0
 database_uuid: a2dc77af-e654-49bb-b321-40f6b559a1ee
-data: 
https://raw.githubusercontent.com/apache-superset/examples-data/master/datasets/examples/slack/messages.csv
+data: examples://datasets/examples/slack/messages.csv
diff --git a/superset/examples/configs/datasets/examples/threads.yaml 
b/superset/examples/configs/datasets/examples/threads.yaml
index 0d6a2ab8a5..c85d124b7d 100644
--- a/superset/examples/configs/datasets/examples/threads.yaml
+++ b/superset/examples/configs/datasets/examples/threads.yaml
@@ -180,4 +180,4 @@ columns:
   python_date_format: null
 version: 1.0.0
 database_uuid: a2dc77af-e654-49bb-b321-40f6b559a1ee
-data: 
https://raw.githubusercontent.com/apache-superset/examples-data/master/datasets/examples/slack/threads.csv
+data: examples://datasets/examples/slack/threads.csv
diff --git a/superset/examples/configs/datasets/examples/unicode_test.test.yaml 
b/superset/examples/configs/datasets/examples/unicode_test.test.yaml
index f95cf456be..66968532a5 100644
--- a/superset/examples/configs/datasets/examples/unicode_test.test.yaml
+++ b/superset/examples/configs/datasets/examples/unicode_test.test.yaml
@@ -90,4 +90,4 @@ columns:
   python_date_format: null
 version: 1.0.0
 database_uuid: a2dc77af-e654-49bb-b321-40f6b559a1ee
-data: 
https://raw.githubusercontent.com/apache-superset/examples-data/master/datasets/examples/unicode_test.csv
+data: examples://datasets/examples/unicode_test.csv
diff --git a/superset/examples/configs/datasets/examples/users.yaml 
b/superset/examples/configs/datasets/examples/users.yaml
index c26b96b935..dd8b49fb40 100644
--- a/superset/examples/configs/datasets/examples/users.yaml
+++ b/superset/examples/configs/datasets/examples/users.yaml
@@ -220,4 +220,4 @@ columns:
     python_date_format: null
 version: 1.0.0
 database_uuid: a2dc77af-e654-49bb-b321-40f6b559a1ee
-data: 
https://raw.githubusercontent.com/apache-superset/examples-data/master/datasets/examples/slack/users.csv
+data: examples://datasets/examples/slack/users.csv
diff --git a/superset/examples/configs/datasets/examples/users_channels.yaml 
b/superset/examples/configs/datasets/examples/users_channels.yaml
index a69db421d9..0b558094be 100644
--- a/superset/examples/configs/datasets/examples/users_channels.yaml
+++ b/superset/examples/configs/datasets/examples/users_channels.yaml
@@ -60,4 +60,4 @@ columns:
     python_date_format: null
 version: 1.0.0
 database_uuid: a2dc77af-e654-49bb-b321-40f6b559a1ee
-data: 
https://raw.githubusercontent.com/apache-superset/examples-data/master/datasets/examples/slack/users_channels.csv
+data: examples://datasets/examples/slack/users_channels.csv
diff --git a/superset/examples/configs/datasets/examples/video_game_sales.yaml 
b/superset/examples/configs/datasets/examples/video_game_sales.yaml
index 7748f6af8c..ecf88108fe 100644
--- a/superset/examples/configs/datasets/examples/video_game_sales.yaml
+++ b/superset/examples/configs/datasets/examples/video_game_sales.yaml
@@ -153,4 +153,4 @@ columns:
     python_date_format: null
 version: 1.0.0
 database_uuid: a2dc77af-e654-49bb-b321-40f6b559a1ee
-data: 
https://github.com/apache-superset/examples-data/raw/lowercase_columns_examples/datasets/examples/video_game_sales.csv
+data: examples://datasets/examples/video_game_sales.csv
diff --git a/superset/examples/country_map.py b/superset/examples/country_map.py
index 1abbf4b524..06eec473cb 100644
--- a/superset/examples/country_map.py
+++ b/superset/examples/country_map.py
@@ -49,7 +49,7 @@ def load_country_map_data(only_metadata: bool = False, force: 
bool = False) -> N
 
         if not only_metadata and (not table_exists or force):
             data = read_example_data(
-                "birth_france_data_for_country_map.csv", encoding="utf-8"
+                "examples://birth_france_data_for_country_map.csv", 
encoding="utf-8"
             )
             data["dttm"] = datetime.datetime.now().date()
             data.to_sql(
diff --git a/superset/examples/energy.py b/superset/examples/energy.py
index 3b47ca2972..67d6cc5854 100644
--- a/superset/examples/energy.py
+++ b/superset/examples/energy.py
@@ -50,7 +50,7 @@ def load_energy(
         table_exists = database.has_table(Table(tbl_name, schema))
 
         if not only_metadata and (not table_exists or force):
-            pdf = read_example_data("energy.json.gz", compression="gzip")
+            pdf = read_example_data("examples://energy.json.gz", 
compression="gzip")
             pdf = pdf.head(100) if sample else pdf
             pdf.to_sql(
                 tbl_name,
diff --git a/superset/examples/flights.py b/superset/examples/flights.py
index cf3ab34d0a..db00ff5b3a 100644
--- a/superset/examples/flights.py
+++ b/superset/examples/flights.py
@@ -38,12 +38,12 @@ def load_flights(only_metadata: bool = False, force: bool = 
False) -> None:
 
         if not only_metadata and (not table_exists or force):
             pdf = read_example_data(
-                "flight_data.csv.gz", encoding="latin-1", compression="gzip"
+                "examples://flight_data.csv.gz", encoding="latin-1", 
compression="gzip"
             )
 
             # Loading airports info to join and get lat/long
             airports = read_example_data(
-                "airports.csv.gz", encoding="latin-1", compression="gzip"
+                "examples://airports.csv.gz", encoding="latin-1", 
compression="gzip"
             )
             airports = airports.set_index("IATA_CODE")
 
diff --git a/superset/examples/helpers.py b/superset/examples/helpers.py
index 57b6bc6316..036f7f0aef 100644
--- a/superset/examples/helpers.py
+++ b/superset/examples/helpers.py
@@ -125,6 +125,20 @@ def get_example_url(filepath: str) -> str:
     return f"{BASE_URL}{filepath}"
 
 
+def normalize_example_data_url(url: str) -> str:
+    """Convert example data URLs to use the configured CDN.
+
+    Transforms examples:// URLs to the configured CDN URL.
+    Non-example URLs are returned unchanged.
+    """
+    if url.startswith("examples://"):
+        relative_path = url[11:]  # Remove 'examples://'
+        return get_example_url(relative_path)
+
+    # Not an examples URL, return unchanged
+    return url
+
+
 def read_example_data(
     filepath: str,
     max_attempts: int = 5,
@@ -132,9 +146,7 @@ def read_example_data(
     **kwargs: Any,
 ) -> pd.DataFrame:
     """Load CSV or JSON from example data mirror with retry/backoff."""
-    from superset.examples.helpers import get_example_url
-
-    url = get_example_url(filepath)
+    url = normalize_example_data_url(filepath)
     is_json = filepath.endswith(".json") or filepath.endswith(".json.gz")
 
     for attempt in range(1, max_attempts + 1):
diff --git a/superset/examples/long_lat.py b/superset/examples/long_lat.py
index 7c61e845df..c201535ea4 100644
--- a/superset/examples/long_lat.py
+++ b/superset/examples/long_lat.py
@@ -48,7 +48,7 @@ def load_long_lat_data(only_metadata: bool = False, force: 
bool = False) -> None
 
         if not only_metadata and (not table_exists or force):
             pdf = read_example_data(
-                "san_francisco.csv.gz", encoding="utf-8", compression="gzip"
+                "examples://san_francisco.csv.gz", encoding="utf-8", 
compression="gzip"
             )
             start = datetime.datetime.now().replace(
                 hour=0, minute=0, second=0, microsecond=0
diff --git a/superset/examples/multiformat_time_series.py 
b/superset/examples/multiformat_time_series.py
index 1ed46a76db..a86ecdbdc5 100644
--- a/superset/examples/multiformat_time_series.py
+++ b/superset/examples/multiformat_time_series.py
@@ -49,7 +49,7 @@ def load_multiformat_time_series(  # pylint: 
disable=too-many-locals
 
         if not only_metadata and (not table_exists or force):
             pdf = read_example_data(
-                "multiformat_time_series.json.gz", compression="gzip"
+                "examples://multiformat_time_series.json.gz", 
compression="gzip"
             )
 
             # TODO(bkyryliuk): move load examples data into the pytest fixture
diff --git a/superset/examples/paris.py b/superset/examples/paris.py
index 392fe5a710..1b5f40e792 100644
--- a/superset/examples/paris.py
+++ b/superset/examples/paris.py
@@ -37,7 +37,7 @@ def load_paris_iris_geojson(only_metadata: bool = False, 
force: bool = False) ->
         table_exists = database.has_table(Table(tbl_name, schema))
 
         if not only_metadata and (not table_exists or force):
-            df = read_example_data("paris_iris.json.gz", compression="gzip")
+            df = read_example_data("examples://paris_iris.json.gz", 
compression="gzip")
             df["features"] = df.features.map(json.dumps)
 
             df.to_sql(
diff --git a/superset/examples/random_time_series.py 
b/superset/examples/random_time_series.py
index c67125803b..c92cd61761 100644
--- a/superset/examples/random_time_series.py
+++ b/superset/examples/random_time_series.py
@@ -46,7 +46,9 @@ def load_random_time_series_data(
         table_exists = database.has_table(Table(tbl_name, schema))
 
         if not only_metadata and (not table_exists or force):
-            pdf = read_example_data("random_time_series.json.gz", 
compression="gzip")
+            pdf = read_example_data(
+                "examples://random_time_series.json.gz", compression="gzip"
+            )
             if database.backend == "presto":
                 pdf.ds = pd.to_datetime(pdf.ds, unit="s")
                 pdf.ds = pdf.ds.dt.strftime("%Y-%m-%d %H:%M%:%S")
diff --git a/superset/examples/sf_population_polygons.py 
b/superset/examples/sf_population_polygons.py
index 5cbba12866..f1612e2c1d 100644
--- a/superset/examples/sf_population_polygons.py
+++ b/superset/examples/sf_population_polygons.py
@@ -39,7 +39,9 @@ def load_sf_population_polygons(
         table_exists = database.has_table(Table(tbl_name, schema))
 
         if not only_metadata and (not table_exists or force):
-            df = read_example_data("sf_population.json.gz", compression="gzip")
+            df = read_example_data(
+                "examples://sf_population.json.gz", compression="gzip"
+            )
             df["contour"] = df.contour.map(json.dumps)
 
             df.to_sql(
diff --git a/superset/examples/world_bank.py b/superset/examples/world_bank.py
index 784c5a7d31..e1e9363fbd 100644
--- a/superset/examples/world_bank.py
+++ b/superset/examples/world_bank.py
@@ -55,7 +55,7 @@ def load_world_bank_health_n_pop(  # pylint: 
disable=too-many-locals
         table_exists = database.has_table(Table(tbl_name, schema))
 
         if not only_metadata and (not table_exists or force):
-            pdf = read_example_data("countries.json.gz", compression="gzip")
+            pdf = read_example_data("examples://countries.json.gz", 
compression="gzip")
             pdf.columns = [col.replace(".", "_") for col in pdf.columns]
             if database.backend == "presto":
                 pdf.year = pd.to_datetime(pdf.year)

Reply via email to