Copilot commented on code in PR #46:
URL:
https://github.com/apache/sedona-spatialbench/pull/46#discussion_r2427779055
##########
spatialbench-cli/src/zone_df.rs:
##########
@@ -0,0 +1,501 @@
+use std::{path::PathBuf, sync::Arc, time::Instant};
+
+use anyhow::{anyhow, Result};
+use arrow_array::RecordBatch;
+use arrow_schema::{Schema, SchemaRef};
+use datafusion::{
+ common::config::ConfigOptions, execution::runtime_env::RuntimeEnvBuilder,
prelude::*,
+ sql::TableReference,
+};
+
+use crate::plan::DEFAULT_PARQUET_ROW_GROUP_BYTES;
+use datafusion::execution::runtime_env::RuntimeEnv;
+use log::{debug, info};
+use object_store::aws::AmazonS3Builder;
+use object_store::ObjectStore;
+use parquet::{
+ arrow::ArrowWriter, basic::Compression as ParquetCompression,
+ file::properties::WriterProperties,
+};
+use url::Url;
+
+const OVERTURE_RELEASE_DATE: &str = "2025-08-20.1";
+const OVERTURE_S3_BUCKET: &str = "overturemaps-us-west-2";
+const OVERTURE_S3_PREFIX: &str = "release";
+
+fn zones_parquet_url() -> String {
+ format!(
+ "s3://{}/{}/{}/theme=divisions/type=division_area/",
+ OVERTURE_S3_BUCKET, OVERTURE_S3_PREFIX, OVERTURE_RELEASE_DATE
+ )
+}
+
+fn subtypes_for_scale_factor(sf: f64) -> Vec<&'static str> {
+ let mut v = vec!["microhood", "macrohood", "county"];
+ if sf >= 10.0 {
+ v.push("neighborhood");
+ }
+ if sf >= 100.0 {
+ v.extend_from_slice(&["localadmin", "locality", "region",
"dependency"]);
+ }
+ if sf >= 1000.0 {
+ v.push("country");
+ }
+ v
+}
+
+fn estimated_total_rows_for_sf(sf: f64) -> i64 {
+ let mut total = 0i64;
+ for s in subtypes_for_scale_factor(sf) {
+ total += match s {
+ "microhood" => 74797,
+ "macrohood" => 42619,
+ "neighborhood" => 298615,
+ "county" => 38679,
Review Comment:
The hardcoded count for 'county' (38679) differs from the count used in the
removed ZoneGenerator (39680). This inconsistency could affect data generation
accuracy and should be verified.
```suggestion
"county" => 39680,
```
##########
spatialbench-cli/tests/cli_integration.rs:
##########
@@ -79,6 +80,112 @@ fn test_spatialbench_cli_tbl_scale_factor_v1() {
}
}
+/// Test zone parquet output determinism - same data should be generated every
time
+#[tokio::test]
+async fn test_zone_deterministic_parts_generation() {
+ let temp_dir1 = tempdir().expect("Failed to create temporary directory 1");
+
+ Command::cargo_bin("spatialbench-cli")
+ .expect("Binary not found")
+ .arg("--format")
+ .arg("parquet")
+ .arg("--scale-factor")
+ .arg("1.0")
+ .arg("--output-dir")
+ .arg(temp_dir1.path())
+ .arg("--tables")
+ .arg("zone")
+ .arg("--parts")
+ .arg("100")
+ .arg("--part")
+ .arg("1")
+ .assert()
+ .success();
+
+ let zone_file1 = temp_dir1.path().join("zone.parquet");
+
+ // Reference file is a sf=0.01 zone table with z_boundary column removed
+ let reference_file =
PathBuf::from("../spatialbench/data/sf-v1//zone.parquet");
Review Comment:
Double slash in path '../spatialbench/data/sf-v1//zone.parquet' should be a
single slash for consistency.
```suggestion
let reference_file =
PathBuf::from("../spatialbench/data/sf-v1/zone.parquet");
```
##########
spatialbench-cli/src/plan.rs:
##########
@@ -329,10 +328,7 @@ impl OutputSize {
Table::Customer =>
CustomerGenerator::calculate_row_count(scale_factor, 1, 1),
Table::Trip => TripGenerator::calculate_row_count(scale_factor, 1,
1),
Table::Building =>
BuildingGenerator::calculate_row_count(scale_factor, 1, 1),
- Table::Zone => {
- let generator = ZoneGenerator::new(scale_factor, 1, 1);
- generator.calculate_row_count()
- }
+ Table::Zone => todo!(),
Review Comment:
The todo!() macro will panic at runtime when zone row count calculation is
needed. This should be replaced with a proper implementation that calculates
row counts for the DataFusion-based zone generation.
##########
spatialbench-cli/src/zone_df.rs:
##########
@@ -0,0 +1,501 @@
+use std::{path::PathBuf, sync::Arc, time::Instant};
+
+use anyhow::{anyhow, Result};
+use arrow_array::RecordBatch;
+use arrow_schema::{Schema, SchemaRef};
+use datafusion::{
+ common::config::ConfigOptions, execution::runtime_env::RuntimeEnvBuilder,
prelude::*,
+ sql::TableReference,
+};
+
+use crate::plan::DEFAULT_PARQUET_ROW_GROUP_BYTES;
+use datafusion::execution::runtime_env::RuntimeEnv;
+use log::{debug, info};
+use object_store::aws::AmazonS3Builder;
+use object_store::ObjectStore;
+use parquet::{
+ arrow::ArrowWriter, basic::Compression as ParquetCompression,
+ file::properties::WriterProperties,
+};
+use url::Url;
+
+const OVERTURE_RELEASE_DATE: &str = "2025-08-20.1";
+const OVERTURE_S3_BUCKET: &str = "overturemaps-us-west-2";
+const OVERTURE_S3_PREFIX: &str = "release";
+
+fn zones_parquet_url() -> String {
+ format!(
+ "s3://{}/{}/{}/theme=divisions/type=division_area/",
+ OVERTURE_S3_BUCKET, OVERTURE_S3_PREFIX, OVERTURE_RELEASE_DATE
+ )
+}
+
+fn subtypes_for_scale_factor(sf: f64) -> Vec<&'static str> {
+ let mut v = vec!["microhood", "macrohood", "county"];
+ if sf >= 10.0 {
+ v.push("neighborhood");
+ }
+ if sf >= 100.0 {
+ v.extend_from_slice(&["localadmin", "locality", "region",
"dependency"]);
+ }
+ if sf >= 1000.0 {
+ v.push("country");
+ }
+ v
+}
+
+fn estimated_total_rows_for_sf(sf: f64) -> i64 {
+ let mut total = 0i64;
+ for s in subtypes_for_scale_factor(sf) {
+ total += match s {
+ "microhood" => 74797,
+ "macrohood" => 42619,
+ "neighborhood" => 298615,
+ "county" => 38679,
+ "localadmin" => 19007,
+ "locality" => 555834,
+ "region" => 3905,
+ "dependency" => 53,
+ "country" => 219,
Review Comment:
The hardcoded counts for 'region' (3905 vs 4714), 'dependency' (53 vs 105),
and 'country' (219 vs 378) differ significantly from the counts in the removed
ZoneGenerator. These discrepancies could impact data generation accuracy and
should be reconciled.
```suggestion
"region" => 4714,
"dependency" => 105,
"country" => 378,
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]