This is an automated email from the ASF dual-hosted git repository. maximebeauchemin pushed a commit to branch codespaces in repository https://gitbox.apache.org/repos/asf/superset.git
commit 0b14d1e75fe4dfbb6219487f2c1215c8021feef7 Author: Maxime Beauchemin <maximebeauche...@gmail.com> AuthorDate: Mon Jul 28 19:03:28 2025 -0700 fix: rate limiting issues with example data hosted on github.com --- superset/datasets/schemas.py | 9 ++++++++- superset/examples/bart_lines.py | 2 +- superset/examples/birth_names.py | 10 +++++----- .../configs/datasets/examples/FCC_2018_Survey.yaml | 2 +- .../configs/datasets/examples/channel_members.yaml | 2 +- .../examples/configs/datasets/examples/channels.yaml | 2 +- .../configs/datasets/examples/cleaned_sales_data.yaml | 2 +- .../configs/datasets/examples/covid_vaccines.yaml | 2 +- .../configs/datasets/examples/exported_stats.yaml | 2 +- .../examples/configs/datasets/examples/messages.yaml | 2 +- .../examples/configs/datasets/examples/threads.yaml | 2 +- .../configs/datasets/examples/unicode_test.test.yaml | 2 +- superset/examples/configs/datasets/examples/users.yaml | 2 +- .../configs/datasets/examples/users_channels.yaml | 2 +- .../configs/datasets/examples/video_game_sales.yaml | 2 +- superset/examples/country_map.py | 2 +- superset/examples/energy.py | 2 +- superset/examples/flights.py | 4 ++-- superset/examples/helpers.py | 18 +++++++++++++++--- superset/examples/long_lat.py | 2 +- superset/examples/multiformat_time_series.py | 2 +- superset/examples/paris.py | 2 +- superset/examples/random_time_series.py | 4 +++- superset/examples/sf_population_polygons.py | 4 +++- superset/examples/world_bank.py | 2 +- 25 files changed, 55 insertions(+), 32 deletions(-) diff --git a/superset/datasets/schemas.py b/superset/datasets/schemas.py index aba2283ef2..48ff05d1d6 100644 --- a/superset/datasets/schemas.py +++ b/superset/datasets/schemas.py @@ -276,6 +276,7 @@ class ImportV1DatasetSchema(Schema): def fix_extra(self, data: dict[str, Any], **kwargs: Any) -> dict[str, Any]: """ Fix for extra initially being exported as a string. + Also normalize examples:// URLs for validation. """ if isinstance(data.get("extra"), str): try: @@ -284,6 +285,12 @@ class ImportV1DatasetSchema(Schema): except ValueError: data["extra"] = None + # Normalize examples:// URLs before validation + if data.get("data") and isinstance(data["data"], str): + from superset.examples.helpers import normalize_example_data_url + + data["data"] = normalize_example_data_url(data["data"]) + return data table_name = fields.String(required=True) @@ -305,7 +312,7 @@ class ImportV1DatasetSchema(Schema): metrics = fields.List(fields.Nested(ImportV1MetricSchema)) version = fields.String(required=True) database_uuid = fields.UUID(required=True) - data = fields.URL() + data = fields.URL(allow_none=True) is_managed_externally = fields.Boolean(allow_none=True, dump_default=False) external_url = fields.String(allow_none=True) normalize_columns = fields.Boolean(load_default=False) diff --git a/superset/examples/bart_lines.py b/superset/examples/bart_lines.py index 7d7e71d59f..4b741fdeed 100644 --- a/superset/examples/bart_lines.py +++ b/superset/examples/bart_lines.py @@ -38,7 +38,7 @@ def load_bart_lines(only_metadata: bool = False, force: bool = False) -> None: if not only_metadata and (not table_exists or force): df = read_example_data( - "bart-lines.json.gz", encoding="latin-1", compression="gzip" + "examples://bart-lines.json.gz", encoding="latin-1", compression="gzip" ) df["path_json"] = df.path.map(json.dumps) df["polyline"] = df.path.map(polyline.encode) diff --git a/superset/examples/birth_names.py b/superset/examples/birth_names.py index 6f2d9a5ebf..ddd5ea50aa 100644 --- a/superset/examples/birth_names.py +++ b/superset/examples/birth_names.py @@ -57,7 +57,7 @@ def gen_filter( def load_data(tbl_name: str, database: Database, sample: bool = False) -> None: - pdf = read_example_data("birth_names2.json.gz", compression="gzip") + pdf = read_example_data("examples://birth_names2.json.gz", compression="gzip") # TODO(bkyryliuk): move load examples data into the pytest fixture if database.backend == "presto": @@ -584,8 +584,8 @@ def create_dashboard(slices: list[Slice]) -> Dashboard: } }""" ) - # pylint: disable=echarts_timeseries_line-too-long - pos = json.loads( + # pylint: disable=line-too-long + pos = json.loads( # noqa: TID251 textwrap.dedent( """\ { @@ -859,11 +859,11 @@ def create_dashboard(slices: list[Slice]) -> Dashboard: """ # noqa: E501 ) ) - # pylint: enable=echarts_timeseries_line-too-long + # pylint: enable=line-too-long # dashboard v2 doesn't allow add markup slice dash.slices = [slc for slc in slices if slc.viz_type != "markup"] update_slice_ids(pos) dash.dashboard_title = "USA Births Names" - dash.position_json = json.dumps(pos, indent=4) + dash.position_json = json.dumps(pos, indent=4) # noqa: TID251 dash.slug = "births" return dash diff --git a/superset/examples/configs/datasets/examples/FCC_2018_Survey.yaml b/superset/examples/configs/datasets/examples/FCC_2018_Survey.yaml index 85aeb51eb9..0943c947f3 100644 --- a/superset/examples/configs/datasets/examples/FCC_2018_Survey.yaml +++ b/superset/examples/configs/datasets/examples/FCC_2018_Survey.yaml @@ -1490,4 +1490,4 @@ columns: python_date_format: null version: 1.0.0 database_uuid: a2dc77af-e654-49bb-b321-40f6b559a1ee -data: https://github.com/apache-superset/examples-data/raw/master/datasets/examples/fcc_survey_2018.csv.gz +data: examples://datasets/examples/fcc_survey_2018.csv.gz diff --git a/superset/examples/configs/datasets/examples/channel_members.yaml b/superset/examples/configs/datasets/examples/channel_members.yaml index 7bdf0836c4..a528838e48 100644 --- a/superset/examples/configs/datasets/examples/channel_members.yaml +++ b/superset/examples/configs/datasets/examples/channel_members.yaml @@ -60,4 +60,4 @@ columns: python_date_format: null version: 1.0.0 database_uuid: a2dc77af-e654-49bb-b321-40f6b559a1ee -data: https://raw.githubusercontent.com/apache-superset/examples-data/master/datasets/examples/slack/channel_members.csv +data: examples://datasets/examples/slack/channel_members.csv diff --git a/superset/examples/configs/datasets/examples/channels.yaml b/superset/examples/configs/datasets/examples/channels.yaml index 92ce51d210..bc1dad46ba 100644 --- a/superset/examples/configs/datasets/examples/channels.yaml +++ b/superset/examples/configs/datasets/examples/channels.yaml @@ -360,4 +360,4 @@ columns: python_date_format: null version: 1.0.0 database_uuid: a2dc77af-e654-49bb-b321-40f6b559a1ee -data: https://raw.githubusercontent.com/apache-superset/examples-data/master/datasets/examples/slack/channels.csv +data: examples://datasets/examples/slack/channels.csv diff --git a/superset/examples/configs/datasets/examples/cleaned_sales_data.yaml b/superset/examples/configs/datasets/examples/cleaned_sales_data.yaml index d2136a9034..35bb522ef8 100644 --- a/superset/examples/configs/datasets/examples/cleaned_sales_data.yaml +++ b/superset/examples/configs/datasets/examples/cleaned_sales_data.yaml @@ -344,4 +344,4 @@ columns: extra: null version: 1.0.0 database_uuid: a2dc77af-e654-49bb-b321-40f6b559a1ee -data: https://raw.githubusercontent.com/apache-superset/examples-data/lowercase_columns_examples/datasets/examples/sales.csv +data: examples://datasets/examples/sales.csv diff --git a/superset/examples/configs/datasets/examples/covid_vaccines.yaml b/superset/examples/configs/datasets/examples/covid_vaccines.yaml index 9d1431c2db..e5f5ca1b52 100644 --- a/superset/examples/configs/datasets/examples/covid_vaccines.yaml +++ b/superset/examples/configs/datasets/examples/covid_vaccines.yaml @@ -204,4 +204,4 @@ columns: python_date_format: null version: 1.0.0 database_uuid: a2dc77af-e654-49bb-b321-40f6b559a1ee -data: https://raw.githubusercontent.com/apache-superset/examples-data/lowercase_columns_examples/datasets/examples/covid_vaccines.csv +data: examples://datasets/examples/covid_vaccines.csv diff --git a/superset/examples/configs/datasets/examples/exported_stats.yaml b/superset/examples/configs/datasets/examples/exported_stats.yaml index 28cd18ba73..cd71bde56f 100644 --- a/superset/examples/configs/datasets/examples/exported_stats.yaml +++ b/superset/examples/configs/datasets/examples/exported_stats.yaml @@ -260,4 +260,4 @@ columns: python_date_format: null version: 1.0.0 database_uuid: a2dc77af-e654-49bb-b321-40f6b559a1ee -data: https://raw.githubusercontent.com/apache-superset/examples-data/master/datasets/examples/slack/exported_stats.csv +data: examples://datasets/examples/slack/exported_stats.csv diff --git a/superset/examples/configs/datasets/examples/messages.yaml b/superset/examples/configs/datasets/examples/messages.yaml index 27ca367009..f34f98dda4 100644 --- a/superset/examples/configs/datasets/examples/messages.yaml +++ b/superset/examples/configs/datasets/examples/messages.yaml @@ -480,4 +480,4 @@ columns: python_date_format: null version: 1.0.0 database_uuid: a2dc77af-e654-49bb-b321-40f6b559a1ee -data: https://raw.githubusercontent.com/apache-superset/examples-data/master/datasets/examples/slack/messages.csv +data: examples://datasets/examples/slack/messages.csv diff --git a/superset/examples/configs/datasets/examples/threads.yaml b/superset/examples/configs/datasets/examples/threads.yaml index 0d6a2ab8a5..c85d124b7d 100644 --- a/superset/examples/configs/datasets/examples/threads.yaml +++ b/superset/examples/configs/datasets/examples/threads.yaml @@ -180,4 +180,4 @@ columns: python_date_format: null version: 1.0.0 database_uuid: a2dc77af-e654-49bb-b321-40f6b559a1ee -data: https://raw.githubusercontent.com/apache-superset/examples-data/master/datasets/examples/slack/threads.csv +data: examples://datasets/examples/slack/threads.csv diff --git a/superset/examples/configs/datasets/examples/unicode_test.test.yaml b/superset/examples/configs/datasets/examples/unicode_test.test.yaml index f95cf456be..66968532a5 100644 --- a/superset/examples/configs/datasets/examples/unicode_test.test.yaml +++ b/superset/examples/configs/datasets/examples/unicode_test.test.yaml @@ -90,4 +90,4 @@ columns: python_date_format: null version: 1.0.0 database_uuid: a2dc77af-e654-49bb-b321-40f6b559a1ee -data: https://raw.githubusercontent.com/apache-superset/examples-data/master/datasets/examples/unicode_test.csv +data: examples://datasets/examples/unicode_test.csv diff --git a/superset/examples/configs/datasets/examples/users.yaml b/superset/examples/configs/datasets/examples/users.yaml index c26b96b935..dd8b49fb40 100644 --- a/superset/examples/configs/datasets/examples/users.yaml +++ b/superset/examples/configs/datasets/examples/users.yaml @@ -220,4 +220,4 @@ columns: python_date_format: null version: 1.0.0 database_uuid: a2dc77af-e654-49bb-b321-40f6b559a1ee -data: https://raw.githubusercontent.com/apache-superset/examples-data/master/datasets/examples/slack/users.csv +data: examples://datasets/examples/slack/users.csv diff --git a/superset/examples/configs/datasets/examples/users_channels.yaml b/superset/examples/configs/datasets/examples/users_channels.yaml index a69db421d9..0b558094be 100644 --- a/superset/examples/configs/datasets/examples/users_channels.yaml +++ b/superset/examples/configs/datasets/examples/users_channels.yaml @@ -60,4 +60,4 @@ columns: python_date_format: null version: 1.0.0 database_uuid: a2dc77af-e654-49bb-b321-40f6b559a1ee -data: https://raw.githubusercontent.com/apache-superset/examples-data/master/datasets/examples/slack/users_channels.csv +data: examples://datasets/examples/slack/users_channels.csv diff --git a/superset/examples/configs/datasets/examples/video_game_sales.yaml b/superset/examples/configs/datasets/examples/video_game_sales.yaml index 7748f6af8c..ecf88108fe 100644 --- a/superset/examples/configs/datasets/examples/video_game_sales.yaml +++ b/superset/examples/configs/datasets/examples/video_game_sales.yaml @@ -153,4 +153,4 @@ columns: python_date_format: null version: 1.0.0 database_uuid: a2dc77af-e654-49bb-b321-40f6b559a1ee -data: https://github.com/apache-superset/examples-data/raw/lowercase_columns_examples/datasets/examples/video_game_sales.csv +data: examples://datasets/examples/video_game_sales.csv diff --git a/superset/examples/country_map.py b/superset/examples/country_map.py index 1abbf4b524..06eec473cb 100644 --- a/superset/examples/country_map.py +++ b/superset/examples/country_map.py @@ -49,7 +49,7 @@ def load_country_map_data(only_metadata: bool = False, force: bool = False) -> N if not only_metadata and (not table_exists or force): data = read_example_data( - "birth_france_data_for_country_map.csv", encoding="utf-8" + "examples://birth_france_data_for_country_map.csv", encoding="utf-8" ) data["dttm"] = datetime.datetime.now().date() data.to_sql( diff --git a/superset/examples/energy.py b/superset/examples/energy.py index 3b47ca2972..67d6cc5854 100644 --- a/superset/examples/energy.py +++ b/superset/examples/energy.py @@ -50,7 +50,7 @@ def load_energy( table_exists = database.has_table(Table(tbl_name, schema)) if not only_metadata and (not table_exists or force): - pdf = read_example_data("energy.json.gz", compression="gzip") + pdf = read_example_data("examples://energy.json.gz", compression="gzip") pdf = pdf.head(100) if sample else pdf pdf.to_sql( tbl_name, diff --git a/superset/examples/flights.py b/superset/examples/flights.py index cf3ab34d0a..db00ff5b3a 100644 --- a/superset/examples/flights.py +++ b/superset/examples/flights.py @@ -38,12 +38,12 @@ def load_flights(only_metadata: bool = False, force: bool = False) -> None: if not only_metadata and (not table_exists or force): pdf = read_example_data( - "flight_data.csv.gz", encoding="latin-1", compression="gzip" + "examples://flight_data.csv.gz", encoding="latin-1", compression="gzip" ) # Loading airports info to join and get lat/long airports = read_example_data( - "airports.csv.gz", encoding="latin-1", compression="gzip" + "examples://airports.csv.gz", encoding="latin-1", compression="gzip" ) airports = airports.set_index("IATA_CODE") diff --git a/superset/examples/helpers.py b/superset/examples/helpers.py index 57b6bc6316..036f7f0aef 100644 --- a/superset/examples/helpers.py +++ b/superset/examples/helpers.py @@ -125,6 +125,20 @@ def get_example_url(filepath: str) -> str: return f"{BASE_URL}{filepath}" +def normalize_example_data_url(url: str) -> str: + """Convert example data URLs to use the configured CDN. + + Transforms examples:// URLs to the configured CDN URL. + Non-example URLs are returned unchanged. + """ + if url.startswith("examples://"): + relative_path = url[11:] # Remove 'examples://' + return get_example_url(relative_path) + + # Not an examples URL, return unchanged + return url + + def read_example_data( filepath: str, max_attempts: int = 5, @@ -132,9 +146,7 @@ def read_example_data( **kwargs: Any, ) -> pd.DataFrame: """Load CSV or JSON from example data mirror with retry/backoff.""" - from superset.examples.helpers import get_example_url - - url = get_example_url(filepath) + url = normalize_example_data_url(filepath) is_json = filepath.endswith(".json") or filepath.endswith(".json.gz") for attempt in range(1, max_attempts + 1): diff --git a/superset/examples/long_lat.py b/superset/examples/long_lat.py index 7c61e845df..c201535ea4 100644 --- a/superset/examples/long_lat.py +++ b/superset/examples/long_lat.py @@ -48,7 +48,7 @@ def load_long_lat_data(only_metadata: bool = False, force: bool = False) -> None if not only_metadata and (not table_exists or force): pdf = read_example_data( - "san_francisco.csv.gz", encoding="utf-8", compression="gzip" + "examples://san_francisco.csv.gz", encoding="utf-8", compression="gzip" ) start = datetime.datetime.now().replace( hour=0, minute=0, second=0, microsecond=0 diff --git a/superset/examples/multiformat_time_series.py b/superset/examples/multiformat_time_series.py index 1ed46a76db..a86ecdbdc5 100644 --- a/superset/examples/multiformat_time_series.py +++ b/superset/examples/multiformat_time_series.py @@ -49,7 +49,7 @@ def load_multiformat_time_series( # pylint: disable=too-many-locals if not only_metadata and (not table_exists or force): pdf = read_example_data( - "multiformat_time_series.json.gz", compression="gzip" + "examples://multiformat_time_series.json.gz", compression="gzip" ) # TODO(bkyryliuk): move load examples data into the pytest fixture diff --git a/superset/examples/paris.py b/superset/examples/paris.py index 392fe5a710..1b5f40e792 100644 --- a/superset/examples/paris.py +++ b/superset/examples/paris.py @@ -37,7 +37,7 @@ def load_paris_iris_geojson(only_metadata: bool = False, force: bool = False) -> table_exists = database.has_table(Table(tbl_name, schema)) if not only_metadata and (not table_exists or force): - df = read_example_data("paris_iris.json.gz", compression="gzip") + df = read_example_data("examples://paris_iris.json.gz", compression="gzip") df["features"] = df.features.map(json.dumps) df.to_sql( diff --git a/superset/examples/random_time_series.py b/superset/examples/random_time_series.py index c67125803b..c92cd61761 100644 --- a/superset/examples/random_time_series.py +++ b/superset/examples/random_time_series.py @@ -46,7 +46,9 @@ def load_random_time_series_data( table_exists = database.has_table(Table(tbl_name, schema)) if not only_metadata and (not table_exists or force): - pdf = read_example_data("random_time_series.json.gz", compression="gzip") + pdf = read_example_data( + "examples://random_time_series.json.gz", compression="gzip" + ) if database.backend == "presto": pdf.ds = pd.to_datetime(pdf.ds, unit="s") pdf.ds = pdf.ds.dt.strftime("%Y-%m-%d %H:%M%:%S") diff --git a/superset/examples/sf_population_polygons.py b/superset/examples/sf_population_polygons.py index 5cbba12866..f1612e2c1d 100644 --- a/superset/examples/sf_population_polygons.py +++ b/superset/examples/sf_population_polygons.py @@ -39,7 +39,9 @@ def load_sf_population_polygons( table_exists = database.has_table(Table(tbl_name, schema)) if not only_metadata and (not table_exists or force): - df = read_example_data("sf_population.json.gz", compression="gzip") + df = read_example_data( + "examples://sf_population.json.gz", compression="gzip" + ) df["contour"] = df.contour.map(json.dumps) df.to_sql( diff --git a/superset/examples/world_bank.py b/superset/examples/world_bank.py index 784c5a7d31..e1e9363fbd 100644 --- a/superset/examples/world_bank.py +++ b/superset/examples/world_bank.py @@ -55,7 +55,7 @@ def load_world_bank_health_n_pop( # pylint: disable=too-many-locals table_exists = database.has_table(Table(tbl_name, schema)) if not only_metadata and (not table_exists or force): - pdf = read_example_data("countries.json.gz", compression="gzip") + pdf = read_example_data("examples://countries.json.gz", compression="gzip") pdf.columns = [col.replace(".", "_") for col in pdf.columns] if database.backend == "presto": pdf.year = pd.to_datetime(pdf.year)