This is an automated email from the ASF dual-hosted git repository. prantogg pushed a commit to branch use-huggingface-source in repository https://gitbox.apache.org/repos/asf/sedona-spatialbench.git
commit a1b9fa76281778da5246477ad3579d88eb1ba113 Author: Pranav Toggi <[email protected]> AuthorDate: Fri Oct 17 15:15:05 2025 -0700 load from HF over https --- spatialbench-cli/Cargo.toml | 2 +- spatialbench-cli/src/zone_df.rs | 52 +++++++++++++++++++---------------------- 2 files changed, 25 insertions(+), 29 deletions(-) diff --git a/spatialbench-cli/Cargo.toml b/spatialbench-cli/Cargo.toml index 1180f39..3f2b809 100644 --- a/spatialbench-cli/Cargo.toml +++ b/spatialbench-cli/Cargo.toml @@ -24,7 +24,7 @@ serde = { version = "1.0.219", features = ["derive"] } anyhow = "1.0.99" serde_yaml = "0.9.33" datafusion = "47.0.0" -object_store = { version = "0.12.4", features = ["aws"] } +object_store = { version = "0.12.4", features = ["http"] } arrow-array = "55.2.0" arrow-schema = "55.2.0" url = "2.5.7" diff --git a/spatialbench-cli/src/zone_df.rs b/spatialbench-cli/src/zone_df.rs index 464b61d..b486cb0 100644 --- a/spatialbench-cli/src/zone_df.rs +++ b/spatialbench-cli/src/zone_df.rs @@ -11,8 +11,7 @@ use datafusion::{ use crate::plan::DEFAULT_PARQUET_ROW_GROUP_BYTES; use datafusion::execution::runtime_env::RuntimeEnv; use log::{debug, info}; -use object_store::aws::AmazonS3Builder; -use object_store::ObjectStore; +use object_store::http::HttpBuilder; use parquet::{ arrow::ArrowWriter, basic::Compression as ParquetCompression, file::properties::WriterProperties, @@ -20,15 +19,7 @@ use parquet::{ use url::Url; const OVERTURE_RELEASE_DATE: &str = "2025-08-20.1"; -const OVERTURE_S3_BUCKET: &str = "overturemaps-us-west-2"; -const OVERTURE_S3_PREFIX: &str = "release"; - -fn zones_parquet_url() -> String { - format!( - "s3://{}/{}/{}/theme=divisions/type=division_area/", - OVERTURE_S3_BUCKET, OVERTURE_S3_PREFIX, OVERTURE_RELEASE_DATE - ) -} +const HUGGINGFACE_URL: &str = "https://huggingface.co"; fn subtypes_for_scale_factor(sf: f64) -> Vec<&'static str> { let mut v = vec!["microhood", "macrohood", "county"]; @@ -200,29 +191,34 @@ pub async fn generate_zone_parquet(args: 
ZoneDfArgs) -> Result<()> { let rt: Arc<RuntimeEnv> = Arc::new(RuntimeEnvBuilder::new().build()?); debug!("Built DataFusion runtime environment"); - // Register S3 store for Overture bucket - let bucket = OVERTURE_S3_BUCKET; - info!("Registering S3 store for bucket: {}", bucket); - let s3 = AmazonS3Builder::new() - .with_bucket_name(bucket) - .with_skip_signature(true) - .with_region("us-west-2") - .build()?; - - let s3_url = Url::parse(&format!("s3://{bucket}"))?; - let s3_store: Arc<dyn ObjectStore> = Arc::new(s3); - rt.register_object_store(&s3_url, s3_store); - debug!("Successfully registered S3 object store"); + // Register HTTPS object store for Hugging Face + let hf_store = HttpBuilder::new().with_url(HUGGINGFACE_URL).build()?; + let hf_url = Url::parse(HUGGINGFACE_URL)?; + rt.register_object_store(&hf_url, Arc::new(hf_store)); + debug!("Registered HTTPS object store for huggingface.co"); let ctx = SessionContext::new_with_config_rt(SessionConfig::from(cfg), rt); debug!("Created DataFusion session context"); - let url = zones_parquet_url(); - info!("Reading parquet data from: {}", url); + // 4 Parquet parts from Hugging Face + let parquet_urls = vec![ + format!("https://huggingface.co/datasets/apache-sedona/spatialbench/resolve/main/omf-division-area-{}/part-00000-c998b093-fa14-440c-98f0-bbdb2126ed22-c000.zstd.parquet", OVERTURE_RELEASE_DATE), + format!("https://huggingface.co/datasets/apache-sedona/spatialbench/resolve/main/omf-division-area-{}/part-00001-c998b093-fa14-440c-98f0-bbdb2126ed22-c000.zstd.parquet", OVERTURE_RELEASE_DATE), + format!("https://huggingface.co/datasets/apache-sedona/spatialbench/resolve/main/omf-division-area-{}/part-00002-c998b093-fa14-440c-98f0-bbdb2126ed22-c000.zstd.parquet", OVERTURE_RELEASE_DATE), + format!("https://huggingface.co/datasets/apache-sedona/spatialbench/resolve/main/omf-division-area-{}/part-00003-c998b093-fa14-440c-98f0-bbdb2126ed22-c000.zstd.parquet", OVERTURE_RELEASE_DATE), + ]; + + info!( + "Reading {} Parquet 
parts from Hugging Face…", + parquet_urls.len() + ); + let t_read_start = Instant::now(); - let mut df = ctx.read_parquet(url, ParquetReadOptions::default()).await?; + let mut df = ctx + .read_parquet(parquet_urls, ParquetReadOptions::default()) + .await?; let read_dur = t_read_start.elapsed(); - info!("Successfully read parquet data in {:?}", read_dur); + info!("Successfully read HF parquet data in {:?}", read_dur); // Build filter predicate debug!("Building filter predicate for subtypes: {:?}", subtypes);
