This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/sedona-spatialbench.git
The following commit(s) were added to refs/heads/main by this push:
new cb53d9d Use Hierarchical Thomas distributions as default (#6)
cb53d9d is described below
commit cb53d9d242008ed3ba00a39008e917f21699b701
Author: Pranav Toggi <[email protected]>
AuthorDate: Tue Sep 9 22:14:24 2025 -0700
Use Hierarchical Thomas distributions as default (#6)
* change default distribution to Hierarchical Thomas
* Update integration test data
* fix generators test
* fix readme
---
README.md | 12 +++++-----
spatialbench-arrow/src/lib.rs | 20 ++++++++--------
spatialbench-cli/src/spatial_config_file.rs | 8 +++----
spatialbench-config.yml | 34 +++++++++++++++++++++------
spatialbench/data/sf-v1/building.tbl.gz | Bin 32752 -> 38908 bytes
spatialbench/data/sf-v1/trip.tbl.gz | Bin 287509 -> 286086 bytes
spatialbench/src/generators.rs | 4 ++--
spatialbench/src/lib.rs | 6 ++---
spatialbench/src/spatial/config.rs | 4 ++--
spatialbench/src/spatial/defaults.rs | 35 ++++++++++++++++++++++------
spatialbench/src/spatial/distributions.rs | 6 ++---
spatialbench/src/spatial/generator.rs | 9 ++++---
12 files changed, 91 insertions(+), 47 deletions(-)
diff --git a/README.md b/README.md
index 709e916..7afbfb8 100644
--- a/README.md
+++ b/README.md
@@ -29,12 +29,12 @@ SpatialBench defines a spatial star schema with the following tables:
The Zone table uses **scale factor–aware generation** so that zone granularity
scales with dataset size and keeps query cost realistic. At small scales, this
feels like querying ZIP-level units; at large scales, it uses coarser
administrative units.
-| Scale Factor (SF) | Zone Subtypes Included                             | Zone Cardinality |
-|-------------------|----------------------------------------------------|------------------|
-| [0, 10)           | microhood, macrohood                               | 117,416          |
-| [10, 100)         | + neighborhood, county                             | 455,711          |
-| [100, 1000)       | + localadmin, locality, region, dependency         | 1,035,371        |
-| [1000+)           | + country                                          | 1,035,749        |
+| Scale Factor (SF) | Zone Subtypes Included                     | Zone Cardinality |
+|-------------------|--------------------------------------------|------------------|
+| [0, 10)           | microhood, macrohood, county               | 156,095          |
+| [10, 100)         | + neighborhood                             | 455,711          |
+| [100, 1000)       | + localadmin, locality, region, dependency | 1,035,371        |
+| [1000+)           | + country                                  | 1,035,749        |
This tiered scaling reflects **geometry complexity** and **area
distributions** observed in the Overture `division_area` dataset which
represents administrative boundaries, release version 2025-08-20.1.
diff --git a/spatialbench-arrow/src/lib.rs b/spatialbench-arrow/src/lib.rs
index bc94641..eb8f66d 100644
--- a/spatialbench-arrow/src/lib.rs
+++ b/spatialbench-arrow/src/lib.rs
@@ -21,16 +21,16 @@
//!
"+-----------+-----------+-------------+--------------+---------------------+---------------------+---------+---------+---------------+------------+--------------------------------------------+--------------------------------------------+",
//! "| t_tripkey | t_custkey | t_driverkey | t_vehiclekey | t_pickuptime
| t_dropofftime | t_fare | t_tip | t_totalamount | t_distance |
t_pickuploc | t_dropoffloc
|",
//!
"+-----------+-----------+-------------+--------------+---------------------+---------------------+---------+---------+---------------+------------+--------------------------------------------+--------------------------------------------+",
-//! "| 1 | 215 | 1 | 1 |
1997-07-24T06:58:22 | 1997-07-24T13:59:54 | 0.00034 | 0.00002 | 0.00037 |
0.00014 | 010100000018fc05d4fe7933c0e693f96da66b2a40 |
0101000000677b99cd887d33c0cfe06bdf0e242a40 |",
-//! "| 2 | 172 | 1 | 1 |
1997-12-24T08:47:14 | 1997-12-24T09:28:57 | 0.00003 | 0.00000 | 0.00004 |
0.00001 | 01010000005a2f6fd6ecb35b40bfbd2e6d3f384540 |
010100000073b76fdecdb45b409895dc1786384540 |",
-//! "| 3 | 46 | 1 | 1 |
1993-06-27T13:27:07 | 1993-06-27T13:34:51 | 0.00000 | 0.00000 | 0.00000 |
0.00000 | 010100000039c3814e70f2534094e0003f52441540 |
0101000000b2f079ee52f2534016653e396d421540 |",
-//! "| 4 | 40 | 1 | 1 |
1996-08-02T04:14:27 | 1996-08-02T05:29:32 | 0.00005 | 0.00000 | 0.00005 |
0.00002 | 01010000008aa8333e66db54c0fffa582953ec3ac0 |
0101000000a24f9abcf7db54c0541f27d050f23ac0 |",
-//! "| 5 | 232 | 1 | 1 |
1996-08-23T12:48:20 | 1996-08-23T13:36:15 | 0.00002 | 0.00000 | 0.00003 |
0.00001 | 01010000008415c0f7eb1a54c0b75354ac03a8d8bf |
0101000000fe67073bc91b54c0449293684d1cd8bf |",
-//! "| 6 | 46 | 1 | 1 |
1994-11-16T16:39:14 | 1994-11-16T17:26:07 | 0.00003 | 0.00000 | 0.00003 |
0.00001 | 0101000000ce6b7abd2cf54040aa3efca13e7a05c0 |
0101000000924fd9fa33f4404006df0bfe445e05c0 |",
-//! "| 7 | 284 | 1 | 1 |
1996-01-20T06:18:56 | 1996-01-20T06:18:56 | 0.00000 | 0.00000 | 0.00000 |
0.00000 | 010100000063714ed8bd1c524047a536433ace4940 |
0101000000e6bc52d8bd1c5240c45932433ace4940 |",
-//! "| 8 | 233 | 1 | 1 |
1995-01-09T23:26:54 | 1995-01-10T00:16:28 | 0.00003 | 0.00000 | 0.00003 |
0.00001 | 01010000003bb9cfbfd7bc50407b16af721b8f4940 |
01010000007bcfb31fcabb5040a6a655dbea8e4940 |",
-//! "| 9 | 178 | 1 | 1 |
1993-10-13T11:07:04 | 1993-10-13T12:42:27 | 0.00005 | 0.00001 | 0.00007 |
0.00003 | 01010000009b898e89b99952c0a488490e51cc46c0 |
0101000000e768fa91879952c07112f7165ed046c0 |",
-//! "| 10 | 118 | 1 | 1 |
1994-11-08T21:05:58 | 1994-11-08T21:21:29 | 0.00001 | 0.00000 | 0.00001 |
0.00000 | 0101000000e2c7173be1a45340311fd18349d14940 |
01010000009b28f85c1ca553405b833101c3d14940 |",
+//! "| 1 | 215 | 1 | 1 |
1997-07-24T06:58:22 | 1997-07-24T13:59:54 | 0.00034 | 0.00002 | 0.00037 |
0.00014 | 01010000009f3c318dd43735405930592bc6062040 |
0101000000d408a2934a34354083fa96395d7e1f40 |",
+//! "| 2 | 172 | 1 | 1 |
1997-12-24T08:47:14 | 1997-12-24T09:28:57 | 0.00003 | 0.00000 | 0.00004 |
0.00001 | 010100000066ea0ba7209b5740dc070cd122e33d40 |
01010000007f720caf019c57407cf24d26b0e33d40 |",
+//! "| 3 | 46 | 1 | 1 |
1993-06-27T13:27:07 | 1993-06-27T13:34:51 | 0.00000 | 0.00000 | 0.00000 |
0.00000 | 010100000003cc2607066e5b40f26f32d2d4d0ff3f |
01010000009be61da7e86d5b407ac002b940c9ff3f |",
+//! "| 4 | 40 | 1 | 1 |
1996-08-02T04:14:27 | 1996-08-02T05:29:32 | 0.00005 | 0.00000 | 0.00005 |
0.00002 | 0101000000897921e03a0e4cc08816c7ebfbc745c0 |
0101000000c4f5ffdc5d0f4cc0eb6b23bffaca45c0 |",
+//! "| 5 | 232 | 1 | 1 |
1996-08-23T12:48:20 | 1996-08-23T13:36:15 | 0.00002 | 0.00000 | 0.00003 |
0.00001 | 0101000000ff3ea1a62fcb4dc014fc64fc630b2ac0 |
0101000000f3e32f2deacc4dc026e9714a06072ac0 |",
+//! "| 6 | 46 | 1 | 1 |
1994-11-16T16:39:14 | 1994-11-16T17:26:07 | 0.00003 | 0.00000 | 0.00003 |
0.00001 | 0101000000855c114bb6562440b2810ccef493d83f |
0101000000ac47af40d3522440915fa2eec173d93f |",
+//! "| 7 | 284 | 1 | 1 |
1996-01-20T06:18:56 | 1996-01-20T06:18:56 | 0.00000 | 0.00000 | 0.00000 |
0.00000 | 010100000088904b0024855a40ea87d1a6fc8b4340 |
01010000000bdc4f0024855a40f01edaa6fc8b4340 |",
+//! "| 8 | 233 | 1 | 1 |
1995-01-09T23:26:54 | 1995-01-10T00:16:28 | 0.00003 | 0.00000 | 0.00003 |
0.00001 | 010100000002f5d829e5845640f2770bfe6053f8bf |
01010000003b74b489d78356402a9b07ea7359f8bf |",
+//! "| 9 | 178 | 1 | 1 |
1993-10-13T11:07:04 | 1993-10-13T12:42:27 | 0.00005 | 0.00001 | 0.00007 |
0.00003 | 0101000000dd8b0712968e49c061d63d122c131640 |
0101000000ec67d222328e49c0d7fd9dccc3f21540 |",
+//! "| 10 | 118 | 1 | 1 |
1994-11-08T21:05:58 | 1994-11-08T21:21:29 | 0.00001 | 0.00000 | 0.00001 |
0.00000 | 0101000000df66d30c07e75940ff4f81705eca3c40 |
01010000003469ae2e42e75940c49e2c6b51cb3c40 |",
//!
"+-----------+-----------+-------------+--------------+---------------------+---------------------+---------+---------+---------------+------------+--------------------------------------------+--------------------------------------------+"
//! ]);
//! ```
diff --git a/spatialbench-cli/src/spatial_config_file.rs
b/spatialbench-cli/src/spatial_config_file.rs
index b0db4ee..9027da7 100644
--- a/spatialbench-cli/src/spatial_config_file.rs
+++ b/spatialbench-cli/src/spatial_config_file.rs
@@ -32,7 +32,7 @@ where
"bit" => Ok(DistributionType::Bit),
"sierpinski" => Ok(DistributionType::Sierpinski),
"thomas" => Ok(DistributionType::Thomas),
- "hierthomas" => Ok(DistributionType::HierThomas),
+ "hierarchicalthomas" =>
Ok(DistributionType::HierarchicalThomas),
_ => Err(E::custom(format!("unknown distribution type: {}",
value))),
}
}
@@ -121,7 +121,7 @@ pub enum InlineParams {
pareto_xm: f64, // scale (>0), typically 1.0
},
- HierThomas {
+ HierarchicalThomas {
cities: u32, // top-level “city” centers
sub_mean: f64,
sub_sd: f64,
@@ -168,7 +168,7 @@ impl InlineSpatialConfig {
pareto_alpha: *pareto_alpha,
pareto_xm: *pareto_xm,
},
- InlineParams::HierThomas {
+ InlineParams::HierarchicalThomas {
cities,
sub_mean,
sub_sd,
@@ -180,7 +180,7 @@ impl InlineSpatialConfig {
pareto_xm_city,
pareto_alpha_sub,
pareto_xm_sub,
- } => DistributionParams::HierThomas {
+ } => DistributionParams::HierarchicalThomas {
cities: *cities, // top-level “city” centers
sub_mean: *sub_mean,
sub_sd: *sub_sd,
diff --git a/spatialbench-config.yml b/spatialbench-config.yml
index 83df56c..86ce5a8 100644
--- a/spatialbench-config.yml
+++ b/spatialbench-config.yml
@@ -1,5 +1,5 @@
trip:
- dist_type: bit
+ dist_type: HierarchicalThomas
geom_type: point
dim: 2
seed: 56789
@@ -8,18 +8,38 @@ trip:
maxseg: 0
polysize: 0.0
params:
- type: bit
- probability: 0.35
- digits: 30
+ type: hierarchicalthomas
+ cities: 60000
+ sub_mean: 15
+ sub_sd: 12
+ sub_min: 2
+ sub_max: 80
+ sigma_city: 0.006
+ sigma_sub: 0.003
+ pareto_alpha_city: 0.80
+ pareto_xm_city: 1.0
+ pareto_alpha_sub: 1.00
+ pareto_xm_sub: 1.0
building:
- dist_type: sierpinski
+ dist_type: HierarchicalThomas
geom_type: polygon
dim: 2
seed: 12345
width: 0.0
height: 0.0
- maxseg: 5
+ maxseg: 7
polysize: 0.000039
params:
- type: none
\ No newline at end of file
+ type: hierarchicalthomas
+ cities: 10000
+ sub_mean: 5
+ sub_sd: 3
+ sub_min: 1
+ sub_max: 15
+ sigma_city: 0.1
+ sigma_sub: 0.01
+ pareto_alpha_city: 1.20
+ pareto_xm_city: 1.0
+ pareto_alpha_sub: 1.00
+ pareto_xm_sub: 1.0
\ No newline at end of file
diff --git a/spatialbench/data/sf-v1/building.tbl.gz
b/spatialbench/data/sf-v1/building.tbl.gz
index 5954403..297656c 100644
Binary files a/spatialbench/data/sf-v1/building.tbl.gz and
b/spatialbench/data/sf-v1/building.tbl.gz differ
diff --git a/spatialbench/data/sf-v1/trip.tbl.gz
b/spatialbench/data/sf-v1/trip.tbl.gz
index 8b5a25c..7114b91 100644
Binary files a/spatialbench/data/sf-v1/trip.tbl.gz and
b/spatialbench/data/sf-v1/trip.tbl.gz differ
diff --git a/spatialbench/src/generators.rs b/spatialbench/src/generators.rs
index 975f506..94b771b 100644
--- a/spatialbench/src/generators.rs
+++ b/spatialbench/src/generators.rs
@@ -1867,7 +1867,7 @@ mod tests {
// Check first Trip
let first = &trips[1];
assert_eq!(first.t_tripkey, 2);
- assert_eq!(first.to_string(), "2|172|1|1|1997-12-24
08:47:14|1997-12-24 09:28:57|0.03|0.00|0.04|0.01|POINT(110.811330422
42.439435623)|POINT(110.82506524 42.4415922)|");
+ assert_eq!(first.to_string(), "2|172|1|1|1997-12-24
08:47:14|1997-12-24 09:28:57|0.03|0.00|0.04|0.01|POINT(94.423867952
29.887250009)|POINT(94.43760277 29.88940658)|");
}
#[test]
@@ -1893,7 +1893,7 @@ mod tests {
// Check first Building
let first = &buildings[1];
assert_eq!(first.b_buildingkey, 2);
- assert_eq!(first.to_string(), "2|blush|POLYGON((86.384542194
50.455356051,86.380173376 50.454956856,86.379715566 50.458440835,86.382506208
50.459233429,86.385229341 50.457089194,86.384542194 50.455356051))|")
+ assert_eq!(first.to_string(), "2|blush|POLYGON((124.218033476
10.538071565,124.215762091 10.536069114,124.214352934
10.536014944,124.212486371 10.539913704,124.217919324
10.539075339,124.218033476 10.538071565))|")
}
#[test]
diff --git a/spatialbench/src/lib.rs b/spatialbench/src/lib.rs
index 2507dd7..afbdcb9 100644
--- a/spatialbench/src/lib.rs
+++ b/spatialbench/src/lib.rs
@@ -21,9 +21,9 @@
//! .collect::<Vec<_>>();
//! assert_eq!(
//! trips.join("\n"),"\
-//! 1|215|1|1|1997-07-24 06:58:22|1997-07-24
13:59:54|0.34|0.02|0.37|0.14|POINT(-19.47654462 13.210254132)|POINT(-19.4903687
13.07042597)|\n\
-//! 2|172|1|1|1997-12-24 08:47:14|1997-12-24
09:28:57|0.03|0.00|0.04|0.01|POINT(110.811330422
42.439435623)|POINT(110.82506524 42.4415922)|\n\
-//! 3|46|1|1|1993-06-27 13:27:07|1993-06-27
13:34:51|0.00|0.00|0.00|0.00|POINT(79.788104655 5.316719994)|POINT(79.78631174
5.31486978)|"
+//! 1|215|1|1|1997-07-24 06:58:22|1997-07-24
13:59:54|0.34|0.02|0.37|0.14|POINT(21.218087029 8.013230662)|POINT(21.20426295
7.8734025)|\n\
+//! 2|172|1|1|1997-12-24 08:47:14|1997-12-24
09:28:57|0.03|0.00|0.04|0.01|POINT(94.423867952 29.887250009)|POINT(94.43760277
29.88940658)|\n\
+//! 3|46|1|1|1993-06-27 13:27:07|1993-06-27
13:34:51|0.00|0.00|0.00|0.00|POINT(109.719117916 1.988484212)|POINT(109.717325
1.98663399)|"
//! );
//! ```
//!
diff --git a/spatialbench/src/spatial/config.rs
b/spatialbench/src/spatial/config.rs
index 7ee8a31..b31fd1e 100644
--- a/spatialbench/src/spatial/config.rs
+++ b/spatialbench/src/spatial/config.rs
@@ -6,7 +6,7 @@ pub enum DistributionType {
Sierpinski,
Bit,
Thomas,
- HierThomas,
+ HierarchicalThomas,
}
#[derive(Debug, Clone, Copy)]
@@ -42,7 +42,7 @@ pub enum DistributionParams {
pareto_alpha: f64,
pareto_xm: f64,
},
- HierThomas {
+ HierarchicalThomas {
cities: u32,
// variable subclusters per city (normal, clamped)
sub_mean: f64,
diff --git a/spatialbench/src/spatial/defaults.rs
b/spatialbench/src/spatial/defaults.rs
index d158eb3..4fadc96 100644
--- a/spatialbench/src/spatial/defaults.rs
+++ b/spatialbench/src/spatial/defaults.rs
@@ -24,7 +24,7 @@ impl Default for ContinentAffines {
impl SpatialDefaults {
pub fn trip_default() -> SpatialGenerator {
let config = SpatialConfig {
- dist_type: DistributionType::Bit,
+ dist_type: DistributionType::HierarchicalThomas,
geom_type: GeomType::Point,
dim: 2,
seed: 56789,
@@ -37,9 +37,18 @@ impl SpatialDefaults {
maxseg: 0,
polysize: 0.0,
- params: DistributionParams::Bit {
- probability: 0.35,
- digits: 30,
+ params: DistributionParams::HierarchicalThomas {
+ cities: 60000,
+ sub_mean: 15.0,
+ sub_sd: 12.0,
+ sub_min: 2,
+ sub_max: 80,
+ sigma_city: 0.006,
+ sigma_sub: 0.003,
+ pareto_alpha_city: 0.80,
+ pareto_xm_city: 1.0,
+ pareto_alpha_sub: 1.00,
+ pareto_xm_sub: 1.0,
},
};
SpatialGenerator::new(config, OnceLock::new(), OnceLock::new())
@@ -47,7 +56,7 @@ impl SpatialDefaults {
pub fn building_default() -> SpatialGenerator {
let config = SpatialConfig {
- dist_type: DistributionType::Sierpinski,
+ dist_type: DistributionType::HierarchicalThomas,
geom_type: GeomType::Polygon,
dim: 2,
seed: 12345,
@@ -57,10 +66,22 @@ impl SpatialDefaults {
height: 0.0,
// geometry = polygon
- maxseg: 5,
+ maxseg: 7,
polysize: 0.000039,
- params: DistributionParams::None,
+ params: DistributionParams::HierarchicalThomas {
+ cities: 10000,
+ sub_mean: 5.0,
+ sub_sd: 3.0,
+ sub_min: 1,
+ sub_max: 15,
+ sigma_city: 0.1,
+ sigma_sub: 0.01,
+ pareto_alpha_city: 1.20,
+ pareto_xm_city: 1.0,
+ pareto_alpha_sub: 1.00,
+ pareto_xm_sub: 1.0,
+ },
};
SpatialGenerator::new(config, OnceLock::new(), OnceLock::new())
}
diff --git a/spatialbench/src/spatial/distributions.rs
b/spatialbench/src/spatial/distributions.rs
index a787ce7..b80ba8a 100644
--- a/spatialbench/src/spatial/distributions.rs
+++ b/spatialbench/src/spatial/distributions.rs
@@ -77,7 +77,7 @@ pub fn generate_sierpinski(index: u64, config:
&SpatialConfig, m: &[f64; 6]) ->
let (mut x, mut y) = (0.0, 0.0);
let a = (0.0, 0.0);
let b = (1.0, 0.0);
- let c = (0.5, (3.0f64).sqrt() / 2.0);
+ let c = (0.5, 3.0f64.sqrt() / 2.0);
for _ in 0..27 {
match rng.gen_range(0..3) {
0 => {
@@ -163,7 +163,7 @@ fn pick_parent_pareto_once(u: f64, k: usize, alpha: f64,
xm: f64, seed: u64) ->
pick_from_cdf(&cdf, u)
}
-pub fn generate_hier_thomas(
+pub fn generate_hierarchical_thomas(
index: u64,
config: &SpatialConfig,
hier_cache: &OnceLock<HierThomasCache>,
@@ -171,7 +171,7 @@ pub fn generate_hier_thomas(
) -> Geometry {
let (nc, sub_mean, sub_sd, sub_min, sub_max, sigma_city, sigma_sub, a_c,
xm_c, a_s, xm_s) =
match config.params {
- DistributionParams::HierThomas {
+ DistributionParams::HierarchicalThomas {
cities,
sub_mean,
sub_sd,
diff --git a/spatialbench/src/spatial/generator.rs
b/spatialbench/src/spatial/generator.rs
index 54ae6c4..5fcd176 100644
--- a/spatialbench/src/spatial/generator.rs
+++ b/spatialbench/src/spatial/generator.rs
@@ -36,9 +36,12 @@ impl SpatialGenerator {
DistributionType::Thomas => {
generate_thomas(index, &self.config, &self.thomas_cache,
continent_affine)
}
- DistributionType::HierThomas => {
- generate_hier_thomas(index, &self.config, &self.hier_cache,
continent_affine)
- }
+ DistributionType::HierarchicalThomas =>
generate_hierarchical_thomas(
+ index,
+ &self.config,
+ &self.hier_cache,
+ continent_affine,
+ ),
}
}
}