Skip to content

Commit 2be2f35

Browse files
authored
Improve the random benchmark data generator to generate non-identical pairs of geometries (#70)
1 parent 1193bb2 commit 2be2f35

File tree

4 files changed: +60 additions, -13 deletions

benchmarks/test_bench_base.py

Lines changed: 45 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,13 +27,13 @@ def setup_class(self):
2727
num_geoms = 100_000
2828

2929
# Setup tables
30-
for name, options in [
30+
for name, base_options in [
3131
(
3232
"segments_large",
3333
{
3434
"geom_type": "LineString",
3535
"target_rows": num_geoms,
36-
"vertices_per_linestring_range": [2, 2],
36+
"vertices_per_linestring_range": [2, 10],
3737
},
3838
),
3939
(
@@ -69,13 +69,52 @@ def setup_class(self):
6969
},
7070
),
7171
]:
72-
# Generate synthetic data
72+
# Generate synthetic data with two different geometry sets that have overlapping spatial distribution
73+
# The intersection rate between geom1 and geom2 will be around 2%.
74+
# This creates more realistic workloads for spatial predicates.
75+
76+
# Options for first geometry set (geom1) - left-leaning distribution
77+
options1 = base_options.copy()
78+
options1.update(
79+
{
80+
"seed": 42,
81+
"bounds": [0.0, 0.0, 80.0, 100.0], # Slightly left-leaning
82+
"size_range": [
83+
1.0,
84+
15.0,
85+
], # Medium-sized geometries for good intersection chance
86+
}
87+
)
88+
89+
# Options for second geometry set (geom2) - right-leaning distribution
90+
options2 = base_options.copy()
91+
options2.update(
92+
{
93+
"seed": 43,
94+
"bounds": [20.0, 0.0, 100.0, 100.0], # Slightly right-leaning
95+
"size_range": [1.0, 15.0], # Same size range for fair comparison
96+
}
97+
)
98+
7399
query = f"""
100+
WITH geom1_data AS (
101+
SELECT
102+
geometry as geom1,
103+
row_number() OVER () as id
104+
FROM sd_random_geometry('{json.dumps(options1)}')
105+
),
106+
geom2_data AS (
107+
SELECT
108+
geometry as geom2,
109+
row_number() OVER () as id
110+
FROM sd_random_geometry('{json.dumps(options2)}')
111+
)
74112
SELECT
75-
geometry as geom1,
76-
geometry as geom2,
113+
g1.geom1,
114+
g2.geom2,
77115
round(random() * 100) as integer
78-
FROM sd_random_geometry('{json.dumps(options)}')
116+
FROM geom1_data g1
117+
JOIN geom2_data g2 ON g1.id = g2.id
79118
"""
80119
tab = self.sedonadb.execute_and_collect(query)
81120

python/sedonadb/tests/functions/test_distance.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,4 +43,5 @@ def test_st_distance(eng, geom1, geom2, expected):
4343
eng.assert_query_result(
4444
f"SELECT ST_Distance({geom_or_null(geom1)}, {geom_or_null(geom2)})",
4545
expected,
46+
numeric_epsilon=1e-8,
4647
)

python/sedonadb/tests/test_sjoin.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ def test_spatial_join_geography(join_type, on):
104104
"vertices_per_linestring_range": [2, 10],
105105
"bounds": west_most_bound,
106106
"size_range": [0.1, 5],
107-
"seed": 42,
107+
"seed": 43,
108108
}
109109
)
110110
df_point = eng_sedonadb.execute_and_collect(
@@ -118,7 +118,7 @@ def test_spatial_join_geography(join_type, on):
118118
"vertices_per_linestring_range": [2, 10],
119119
"bounds": east_most_bound,
120120
"size_range": [0.1, 5],
121-
"seed": 43,
121+
"seed": 44,
122122
}
123123
)
124124
df_polygon = eng_sedonadb.execute_and_collect(

rust/sedona-testing/src/datagen.rs

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -566,7 +566,8 @@ fn generate_random_linestring<R: rand::Rng>(
566566
);
567567
// Always sample in such a way that we end up with a valid linestring
568568
let num_vertices = rng.sample(vertices_dist).max(2);
569-
let coords = generate_circular_vertices(center_x, center_y, half_size, num_vertices, false);
569+
let coords =
570+
generate_circular_vertices(rng, center_x, center_y, half_size, num_vertices, false);
570571
LineString::from(coords)
571572
}
572573
}
@@ -582,7 +583,8 @@ fn generate_random_polygon<R: rand::Rng>(rng: &mut R, options: &RandomGeometryOp
582583
);
583584
// Always sample in such a way that we end up with a valid Polygon
584585
let num_vertices = rng.sample(vertices_dist).max(3);
585-
let coords = generate_circular_vertices(center_x, center_y, half_size, num_vertices, true);
586+
let coords =
587+
generate_circular_vertices(rng, center_x, center_y, half_size, num_vertices, true);
586588
let shell = LineString::from(coords);
587589
let mut holes = Vec::new();
588590

@@ -593,7 +595,7 @@ fn generate_random_polygon<R: rand::Rng>(rng: &mut R, options: &RandomGeometryOp
593595
if add_hole {
594596
let new_size = half_size * hole_scale_factor;
595597
let mut coords =
596-
generate_circular_vertices(center_x, center_y, new_size, num_vertices, true);
598+
generate_circular_vertices(rng, center_x, center_y, new_size, num_vertices, true);
597599
coords.reverse();
598600
holes.push(LineString::from(coords));
599601
}
@@ -756,15 +758,20 @@ fn generate_non_overlapping_sub_rectangles(num_parts: usize, bounds: &Rect) -> V
756758
tiles
757759
}
758760

759-
fn generate_circular_vertices(
761+
fn generate_circular_vertices<R: rand::Rng>(
762+
rng: &mut R,
760763
center_x: f64,
761764
center_y: f64,
762765
radius: f64,
763766
num_vertices: usize,
764767
closed: bool,
765768
) -> Vec<Coord> {
766769
let mut out = Vec::new();
767-
let mut angle: f64 = 0.0;
770+
771+
// Randomize starting angle (0 to 2 * PI)
772+
let start_angle_dist = Uniform::new(0.0, 2.0 * PI);
773+
let mut angle: f64 = rng.sample(start_angle_dist);
774+
768775
let dangle = 2.0 * PI / (num_vertices as f64).max(3.0);
769776
for _ in 0..num_vertices {
770777
out.push(Coord {

0 commit comments

Comments
 (0)