This is an automated email from the ASF dual-hosted git repository.

jiayu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/sedona-spatialbench.git


The following commit(s) were added to refs/heads/main by this push:
     new cb53d9d  Use Hierarchical Thomas distributions as default (#6)
cb53d9d is described below

commit cb53d9d242008ed3ba00a39008e917f21699b701
Author: Pranav Toggi <[email protected]>
AuthorDate: Tue Sep 9 22:14:24 2025 -0700

    Use Hierarchical Thomas distributions as default (#6)
    
    * Change default distribution to Hierarchical Thomas
    
    * Update integration test data
    
    * Fix generators test
    
    * Fix README
---
 README.md                                   |  12 +++++-----
 spatialbench-arrow/src/lib.rs               |  20 ++++++++--------
 spatialbench-cli/src/spatial_config_file.rs |   8 +++----
 spatialbench-config.yml                     |  34 +++++++++++++++++++++------
 spatialbench/data/sf-v1/building.tbl.gz     | Bin 32752 -> 38908 bytes
 spatialbench/data/sf-v1/trip.tbl.gz         | Bin 287509 -> 286086 bytes
 spatialbench/src/generators.rs              |   4 ++--
 spatialbench/src/lib.rs                     |   6 ++---
 spatialbench/src/spatial/config.rs          |   4 ++--
 spatialbench/src/spatial/defaults.rs        |  35 ++++++++++++++++++++++------
 spatialbench/src/spatial/distributions.rs   |   6 ++---
 spatialbench/src/spatial/generator.rs       |   9 ++++---
 12 files changed, 91 insertions(+), 47 deletions(-)

diff --git a/README.md b/README.md
index 709e916..7afbfb8 100644
--- a/README.md
+++ b/README.md
@@ -29,12 +29,12 @@ SpatialBench defines a spatial star schema with the 
following tables:
 
 The Zone table uses **scale factor–aware generation** so that zone granularity 
scales with dataset size and keeps query cost realistic. At small scales, this 
feels like querying ZIP-level units; at large scales, it uses coarser 
administrative units.
 
-| Scale Factor (SF) | Zone Subtypes Included                             | 
Zone Cardinality |
-|-------------------|----------------------------------------------------|------------------|
-| [0, 10)           | microhood, macrohood                               | 
117,416          |
-| [10, 100)         | + neighborhood, county                             | 
455,711          |
-| [100, 1000)       | + localadmin, locality, region, dependency         | 
1,035,371        |
-| [1000+)           | + country                                          | 
1,035,749        |
+| Scale Factor (SF) | Zone Subtypes Included                     | Zone 
Cardinality |
+|-------------------|--------------------------------------------|------------------|
+| [0, 10)           | microhood, macrohood, county               | 156,095     
     |
+| [10, 100)         | + neighborhood                             | 455,711     
     |
+| [100, 1000)       | + localadmin, locality, region, dependency | 1,035,371   
     |
+| [1000+)           | + country                                  | 1,035,749   
     |
 
 This tiered scaling reflects **geometry complexity** and **area 
distributions** observed in the Overture `division_area` dataset which 
represents administrative boundaries, release version 2025-08-20.1.
 
diff --git a/spatialbench-arrow/src/lib.rs b/spatialbench-arrow/src/lib.rs
index bc94641..eb8f66d 100644
--- a/spatialbench-arrow/src/lib.rs
+++ b/spatialbench-arrow/src/lib.rs
@@ -21,16 +21,16 @@
 //!   
"+-----------+-----------+-------------+--------------+---------------------+---------------------+---------+---------+---------------+------------+--------------------------------------------+--------------------------------------------+",
 //!   "| t_tripkey | t_custkey | t_driverkey | t_vehiclekey | t_pickuptime     
   | t_dropofftime       | t_fare  | t_tip   | t_totalamount | t_distance | 
t_pickuploc                                | t_dropoffloc                       
        |",
 //!   
"+-----------+-----------+-------------+--------------+---------------------+---------------------+---------+---------+---------------+------------+--------------------------------------------+--------------------------------------------+",
-//!   "| 1         | 215       | 1           | 1            | 
1997-07-24T06:58:22 | 1997-07-24T13:59:54 | 0.00034 | 0.00002 | 0.00037       | 
0.00014    | 010100000018fc05d4fe7933c0e693f96da66b2a40 | 
0101000000677b99cd887d33c0cfe06bdf0e242a40 |",
-//!   "| 2         | 172       | 1           | 1            | 
1997-12-24T08:47:14 | 1997-12-24T09:28:57 | 0.00003 | 0.00000 | 0.00004       | 
0.00001    | 01010000005a2f6fd6ecb35b40bfbd2e6d3f384540 | 
010100000073b76fdecdb45b409895dc1786384540 |",
-//!   "| 3         | 46        | 1           | 1            | 
1993-06-27T13:27:07 | 1993-06-27T13:34:51 | 0.00000 | 0.00000 | 0.00000       | 
0.00000    | 010100000039c3814e70f2534094e0003f52441540 | 
0101000000b2f079ee52f2534016653e396d421540 |",
-//!   "| 4         | 40        | 1           | 1            | 
1996-08-02T04:14:27 | 1996-08-02T05:29:32 | 0.00005 | 0.00000 | 0.00005       | 
0.00002    | 01010000008aa8333e66db54c0fffa582953ec3ac0 | 
0101000000a24f9abcf7db54c0541f27d050f23ac0 |",
-//!   "| 5         | 232       | 1           | 1            | 
1996-08-23T12:48:20 | 1996-08-23T13:36:15 | 0.00002 | 0.00000 | 0.00003       | 
0.00001    | 01010000008415c0f7eb1a54c0b75354ac03a8d8bf | 
0101000000fe67073bc91b54c0449293684d1cd8bf |",
-//!   "| 6         | 46        | 1           | 1            | 
1994-11-16T16:39:14 | 1994-11-16T17:26:07 | 0.00003 | 0.00000 | 0.00003       | 
0.00001    | 0101000000ce6b7abd2cf54040aa3efca13e7a05c0 | 
0101000000924fd9fa33f4404006df0bfe445e05c0 |",
-//!   "| 7         | 284       | 1           | 1            | 
1996-01-20T06:18:56 | 1996-01-20T06:18:56 | 0.00000 | 0.00000 | 0.00000       | 
0.00000    | 010100000063714ed8bd1c524047a536433ace4940 | 
0101000000e6bc52d8bd1c5240c45932433ace4940 |",
-//!   "| 8         | 233       | 1           | 1            | 
1995-01-09T23:26:54 | 1995-01-10T00:16:28 | 0.00003 | 0.00000 | 0.00003       | 
0.00001    | 01010000003bb9cfbfd7bc50407b16af721b8f4940 | 
01010000007bcfb31fcabb5040a6a655dbea8e4940 |",
-//!   "| 9         | 178       | 1           | 1            | 
1993-10-13T11:07:04 | 1993-10-13T12:42:27 | 0.00005 | 0.00001 | 0.00007       | 
0.00003    | 01010000009b898e89b99952c0a488490e51cc46c0 | 
0101000000e768fa91879952c07112f7165ed046c0 |",
-//!   "| 10        | 118       | 1           | 1            | 
1994-11-08T21:05:58 | 1994-11-08T21:21:29 | 0.00001 | 0.00000 | 0.00001       | 
0.00000    | 0101000000e2c7173be1a45340311fd18349d14940 | 
01010000009b28f85c1ca553405b833101c3d14940 |",
+//!   "| 1         | 215       | 1           | 1            | 
1997-07-24T06:58:22 | 1997-07-24T13:59:54 | 0.00034 | 0.00002 | 0.00037       | 
0.00014    | 01010000009f3c318dd43735405930592bc6062040 | 
0101000000d408a2934a34354083fa96395d7e1f40 |",
+//!   "| 2         | 172       | 1           | 1            | 
1997-12-24T08:47:14 | 1997-12-24T09:28:57 | 0.00003 | 0.00000 | 0.00004       | 
0.00001    | 010100000066ea0ba7209b5740dc070cd122e33d40 | 
01010000007f720caf019c57407cf24d26b0e33d40 |",
+//!   "| 3         | 46        | 1           | 1            | 
1993-06-27T13:27:07 | 1993-06-27T13:34:51 | 0.00000 | 0.00000 | 0.00000       | 
0.00000    | 010100000003cc2607066e5b40f26f32d2d4d0ff3f | 
01010000009be61da7e86d5b407ac002b940c9ff3f |",
+//!   "| 4         | 40        | 1           | 1            | 
1996-08-02T04:14:27 | 1996-08-02T05:29:32 | 0.00005 | 0.00000 | 0.00005       | 
0.00002    | 0101000000897921e03a0e4cc08816c7ebfbc745c0 | 
0101000000c4f5ffdc5d0f4cc0eb6b23bffaca45c0 |",
+//!   "| 5         | 232       | 1           | 1            | 
1996-08-23T12:48:20 | 1996-08-23T13:36:15 | 0.00002 | 0.00000 | 0.00003       | 
0.00001    | 0101000000ff3ea1a62fcb4dc014fc64fc630b2ac0 | 
0101000000f3e32f2deacc4dc026e9714a06072ac0 |",
+//!   "| 6         | 46        | 1           | 1            | 
1994-11-16T16:39:14 | 1994-11-16T17:26:07 | 0.00003 | 0.00000 | 0.00003       | 
0.00001    | 0101000000855c114bb6562440b2810ccef493d83f | 
0101000000ac47af40d3522440915fa2eec173d93f |",
+//!   "| 7         | 284       | 1           | 1            | 
1996-01-20T06:18:56 | 1996-01-20T06:18:56 | 0.00000 | 0.00000 | 0.00000       | 
0.00000    | 010100000088904b0024855a40ea87d1a6fc8b4340 | 
01010000000bdc4f0024855a40f01edaa6fc8b4340 |",
+//!   "| 8         | 233       | 1           | 1            | 
1995-01-09T23:26:54 | 1995-01-10T00:16:28 | 0.00003 | 0.00000 | 0.00003       | 
0.00001    | 010100000002f5d829e5845640f2770bfe6053f8bf | 
01010000003b74b489d78356402a9b07ea7359f8bf |",
+//!   "| 9         | 178       | 1           | 1            | 
1993-10-13T11:07:04 | 1993-10-13T12:42:27 | 0.00005 | 0.00001 | 0.00007       | 
0.00003    | 0101000000dd8b0712968e49c061d63d122c131640 | 
0101000000ec67d222328e49c0d7fd9dccc3f21540 |",
+//!   "| 10        | 118       | 1           | 1            | 
1994-11-08T21:05:58 | 1994-11-08T21:21:29 | 0.00001 | 0.00000 | 0.00001       | 
0.00000    | 0101000000df66d30c07e75940ff4f81705eca3c40 | 
01010000003469ae2e42e75940c49e2c6b51cb3c40 |",
 //!   
"+-----------+-----------+-------------+--------------+---------------------+---------------------+---------+---------+---------------+------------+--------------------------------------------+--------------------------------------------+"
 //! ]);
 //! ```
diff --git a/spatialbench-cli/src/spatial_config_file.rs 
b/spatialbench-cli/src/spatial_config_file.rs
index b0db4ee..9027da7 100644
--- a/spatialbench-cli/src/spatial_config_file.rs
+++ b/spatialbench-cli/src/spatial_config_file.rs
@@ -32,7 +32,7 @@ where
                 "bit" => Ok(DistributionType::Bit),
                 "sierpinski" => Ok(DistributionType::Sierpinski),
                 "thomas" => Ok(DistributionType::Thomas),
-                "hierthomas" => Ok(DistributionType::HierThomas),
+                "hierarchicalthomas" => 
Ok(DistributionType::HierarchicalThomas),
                 _ => Err(E::custom(format!("unknown distribution type: {}", 
value))),
             }
         }
@@ -121,7 +121,7 @@ pub enum InlineParams {
         pareto_xm: f64,    // scale (>0), typically 1.0
     },
 
-    HierThomas {
+    HierarchicalThomas {
         cities: u32, // top-level “city” centers
         sub_mean: f64,
         sub_sd: f64,
@@ -168,7 +168,7 @@ impl InlineSpatialConfig {
                 pareto_alpha: *pareto_alpha,
                 pareto_xm: *pareto_xm,
             },
-            InlineParams::HierThomas {
+            InlineParams::HierarchicalThomas {
                 cities,
                 sub_mean,
                 sub_sd,
@@ -180,7 +180,7 @@ impl InlineSpatialConfig {
                 pareto_xm_city,
                 pareto_alpha_sub,
                 pareto_xm_sub,
-            } => DistributionParams::HierThomas {
+            } => DistributionParams::HierarchicalThomas {
                 cities: *cities, // top-level “city” centers
                 sub_mean: *sub_mean,
                 sub_sd: *sub_sd,
diff --git a/spatialbench-config.yml b/spatialbench-config.yml
index 83df56c..86ce5a8 100644
--- a/spatialbench-config.yml
+++ b/spatialbench-config.yml
@@ -1,5 +1,5 @@
 trip:
-  dist_type: bit
+  dist_type: HierarchicalThomas
   geom_type: point
   dim: 2
   seed: 56789
@@ -8,18 +8,38 @@ trip:
   maxseg: 0
   polysize: 0.0
   params:
-    type: bit
-    probability: 0.35
-    digits: 30
+    type: hierarchicalthomas
+    cities: 60000
+    sub_mean: 15
+    sub_sd: 12
+    sub_min: 2
+    sub_max: 80
+    sigma_city: 0.006
+    sigma_sub: 0.003
+    pareto_alpha_city: 0.80
+    pareto_xm_city: 1.0
+    pareto_alpha_sub: 1.00
+    pareto_xm_sub: 1.0
 
 building:
-  dist_type: sierpinski
+  dist_type: HierarchicalThomas
   geom_type: polygon
   dim: 2
   seed: 12345
   width: 0.0
   height: 0.0
-  maxseg: 5
+  maxseg: 7
   polysize: 0.000039
   params:
-    type: none
\ No newline at end of file
+    type: hierarchicalthomas
+    cities: 10000
+    sub_mean: 5
+    sub_sd: 3
+    sub_min: 1
+    sub_max: 15
+    sigma_city: 0.1
+    sigma_sub: 0.01
+    pareto_alpha_city: 1.20
+    pareto_xm_city: 1.0
+    pareto_alpha_sub: 1.00
+    pareto_xm_sub: 1.0
\ No newline at end of file
diff --git a/spatialbench/data/sf-v1/building.tbl.gz 
b/spatialbench/data/sf-v1/building.tbl.gz
index 5954403..297656c 100644
Binary files a/spatialbench/data/sf-v1/building.tbl.gz and 
b/spatialbench/data/sf-v1/building.tbl.gz differ
diff --git a/spatialbench/data/sf-v1/trip.tbl.gz 
b/spatialbench/data/sf-v1/trip.tbl.gz
index 8b5a25c..7114b91 100644
Binary files a/spatialbench/data/sf-v1/trip.tbl.gz and 
b/spatialbench/data/sf-v1/trip.tbl.gz differ
diff --git a/spatialbench/src/generators.rs b/spatialbench/src/generators.rs
index 975f506..94b771b 100644
--- a/spatialbench/src/generators.rs
+++ b/spatialbench/src/generators.rs
@@ -1867,7 +1867,7 @@ mod tests {
         // Check first Trip
         let first = &trips[1];
         assert_eq!(first.t_tripkey, 2);
-        assert_eq!(first.to_string(), "2|172|1|1|1997-12-24 
08:47:14|1997-12-24 09:28:57|0.03|0.00|0.04|0.01|POINT(110.811330422 
42.439435623)|POINT(110.82506524 42.4415922)|");
+        assert_eq!(first.to_string(), "2|172|1|1|1997-12-24 
08:47:14|1997-12-24 09:28:57|0.03|0.00|0.04|0.01|POINT(94.423867952 
29.887250009)|POINT(94.43760277 29.88940658)|");
     }
 
     #[test]
@@ -1893,7 +1893,7 @@ mod tests {
         // Check first Building
         let first = &buildings[1];
         assert_eq!(first.b_buildingkey, 2);
-        assert_eq!(first.to_string(), "2|blush|POLYGON((86.384542194 
50.455356051,86.380173376 50.454956856,86.379715566 50.458440835,86.382506208 
50.459233429,86.385229341 50.457089194,86.384542194 50.455356051))|")
+        assert_eq!(first.to_string(), "2|blush|POLYGON((124.218033476 
10.538071565,124.215762091 10.536069114,124.214352934 
10.536014944,124.212486371 10.539913704,124.217919324 
10.539075339,124.218033476 10.538071565))|")
     }
 
     #[test]
diff --git a/spatialbench/src/lib.rs b/spatialbench/src/lib.rs
index 2507dd7..afbdcb9 100644
--- a/spatialbench/src/lib.rs
+++ b/spatialbench/src/lib.rs
@@ -21,9 +21,9 @@
 //!    .collect::<Vec<_>>();
 //!  assert_eq!(
 //!   trips.join("\n"),"\
-//!     1|215|1|1|1997-07-24 06:58:22|1997-07-24 
13:59:54|0.34|0.02|0.37|0.14|POINT(-19.47654462 13.210254132)|POINT(-19.4903687 
13.07042597)|\n\
-//!     2|172|1|1|1997-12-24 08:47:14|1997-12-24 
09:28:57|0.03|0.00|0.04|0.01|POINT(110.811330422 
42.439435623)|POINT(110.82506524 42.4415922)|\n\
-//!     3|46|1|1|1993-06-27 13:27:07|1993-06-27 
13:34:51|0.00|0.00|0.00|0.00|POINT(79.788104655 5.316719994)|POINT(79.78631174 
5.31486978)|"
+//!     1|215|1|1|1997-07-24 06:58:22|1997-07-24 
13:59:54|0.34|0.02|0.37|0.14|POINT(21.218087029 8.013230662)|POINT(21.20426295 
7.8734025)|\n\
+//!     2|172|1|1|1997-12-24 08:47:14|1997-12-24 
09:28:57|0.03|0.00|0.04|0.01|POINT(94.423867952 29.887250009)|POINT(94.43760277 
29.88940658)|\n\
+//!     3|46|1|1|1993-06-27 13:27:07|1993-06-27 
13:34:51|0.00|0.00|0.00|0.00|POINT(109.719117916 1.988484212)|POINT(109.717325 
1.98663399)|"
 //!  );
 //! ```
 //!
diff --git a/spatialbench/src/spatial/config.rs 
b/spatialbench/src/spatial/config.rs
index 7ee8a31..b31fd1e 100644
--- a/spatialbench/src/spatial/config.rs
+++ b/spatialbench/src/spatial/config.rs
@@ -6,7 +6,7 @@ pub enum DistributionType {
     Sierpinski,
     Bit,
     Thomas,
-    HierThomas,
+    HierarchicalThomas,
 }
 
 #[derive(Debug, Clone, Copy)]
@@ -42,7 +42,7 @@ pub enum DistributionParams {
         pareto_alpha: f64,
         pareto_xm: f64,
     },
-    HierThomas {
+    HierarchicalThomas {
         cities: u32,
         // variable subclusters per city (normal, clamped)
         sub_mean: f64,
diff --git a/spatialbench/src/spatial/defaults.rs 
b/spatialbench/src/spatial/defaults.rs
index d158eb3..4fadc96 100644
--- a/spatialbench/src/spatial/defaults.rs
+++ b/spatialbench/src/spatial/defaults.rs
@@ -24,7 +24,7 @@ impl Default for ContinentAffines {
 impl SpatialDefaults {
     pub fn trip_default() -> SpatialGenerator {
         let config = SpatialConfig {
-            dist_type: DistributionType::Bit,
+            dist_type: DistributionType::HierarchicalThomas,
             geom_type: GeomType::Point,
             dim: 2,
             seed: 56789,
@@ -37,9 +37,18 @@ impl SpatialDefaults {
             maxseg: 0,
             polysize: 0.0,
 
-            params: DistributionParams::Bit {
-                probability: 0.35,
-                digits: 30,
+            params: DistributionParams::HierarchicalThomas {
+                cities: 60000,
+                sub_mean: 15.0,
+                sub_sd: 12.0,
+                sub_min: 2,
+                sub_max: 80,
+                sigma_city: 0.006,
+                sigma_sub: 0.003,
+                pareto_alpha_city: 0.80,
+                pareto_xm_city: 1.0,
+                pareto_alpha_sub: 1.00,
+                pareto_xm_sub: 1.0,
             },
         };
         SpatialGenerator::new(config, OnceLock::new(), OnceLock::new())
@@ -47,7 +56,7 @@ impl SpatialDefaults {
 
     pub fn building_default() -> SpatialGenerator {
         let config = SpatialConfig {
-            dist_type: DistributionType::Sierpinski,
+            dist_type: DistributionType::HierarchicalThomas,
             geom_type: GeomType::Polygon,
             dim: 2,
             seed: 12345,
@@ -57,10 +66,22 @@ impl SpatialDefaults {
             height: 0.0,
 
             // geometry = polygon
-            maxseg: 5,
+            maxseg: 7,
             polysize: 0.000039,
 
-            params: DistributionParams::None,
+            params: DistributionParams::HierarchicalThomas {
+                cities: 10000,
+                sub_mean: 5.0,
+                sub_sd: 3.0,
+                sub_min: 1,
+                sub_max: 15,
+                sigma_city: 0.1,
+                sigma_sub: 0.01,
+                pareto_alpha_city: 1.20,
+                pareto_xm_city: 1.0,
+                pareto_alpha_sub: 1.00,
+                pareto_xm_sub: 1.0,
+            },
         };
         SpatialGenerator::new(config, OnceLock::new(), OnceLock::new())
     }
diff --git a/spatialbench/src/spatial/distributions.rs 
b/spatialbench/src/spatial/distributions.rs
index a787ce7..b80ba8a 100644
--- a/spatialbench/src/spatial/distributions.rs
+++ b/spatialbench/src/spatial/distributions.rs
@@ -77,7 +77,7 @@ pub fn generate_sierpinski(index: u64, config: 
&SpatialConfig, m: &[f64; 6]) ->
     let (mut x, mut y) = (0.0, 0.0);
     let a = (0.0, 0.0);
     let b = (1.0, 0.0);
-    let c = (0.5, (3.0f64).sqrt() / 2.0);
+    let c = (0.5, 3.0f64.sqrt() / 2.0);
     for _ in 0..27 {
         match rng.gen_range(0..3) {
             0 => {
@@ -163,7 +163,7 @@ fn pick_parent_pareto_once(u: f64, k: usize, alpha: f64, 
xm: f64, seed: u64) ->
     pick_from_cdf(&cdf, u)
 }
 
-pub fn generate_hier_thomas(
+pub fn generate_hierarchical_thomas(
     index: u64,
     config: &SpatialConfig,
     hier_cache: &OnceLock<HierThomasCache>,
@@ -171,7 +171,7 @@ pub fn generate_hier_thomas(
 ) -> Geometry {
     let (nc, sub_mean, sub_sd, sub_min, sub_max, sigma_city, sigma_sub, a_c, 
xm_c, a_s, xm_s) =
         match config.params {
-            DistributionParams::HierThomas {
+            DistributionParams::HierarchicalThomas {
                 cities,
                 sub_mean,
                 sub_sd,
diff --git a/spatialbench/src/spatial/generator.rs 
b/spatialbench/src/spatial/generator.rs
index 54ae6c4..5fcd176 100644
--- a/spatialbench/src/spatial/generator.rs
+++ b/spatialbench/src/spatial/generator.rs
@@ -36,9 +36,12 @@ impl SpatialGenerator {
             DistributionType::Thomas => {
                 generate_thomas(index, &self.config, &self.thomas_cache, 
continent_affine)
             }
-            DistributionType::HierThomas => {
-                generate_hier_thomas(index, &self.config, &self.hier_cache, 
continent_affine)
-            }
+            DistributionType::HierarchicalThomas => 
generate_hierarchical_thomas(
+                index,
+                &self.config,
+                &self.hier_cache,
+                continent_affine,
+            ),
         }
     }
 }

Reply via email to