Copilot commented on code in PR #2732:
URL: https://github.com/apache/sedona/pull/2732#discussion_r2922570504
##########
python/tests/geopandas/test_match_geopandas_series.py:
##########
@@ -1285,6 +1285,104 @@ def test_relate(self):
)
self.check_pd_series_equal(sgpd_result, gpd_result)
+ def test_frechet_distance(self):
+ line_pairs = [
+ (self.linestrings, self.linestrings),
+ (self.linearrings, self.linearrings),
+ (self.linestrings, self.linearrings),
+ ]
+ for geom, geom2 in line_pairs:
+ # Skip pairs containing empty geometries
+ if any(g.is_empty for g in geom) or any(g.is_empty for g in geom2):
+ continue
+
+ sgpd_result = GeoSeries(geom).frechet_distance(GeoSeries(geom2),
align=True)
+ gpd_result = gpd.GeoSeries(geom).frechet_distance(
+ gpd.GeoSeries(geom2), align=True
+ )
+ self.check_pd_series_equal(sgpd_result, gpd_result)
+
+ if len(geom) == len(geom2):
+ sgpd_result = GeoSeries(geom).frechet_distance(
+ GeoSeries(geom2), align=False
+ )
+ gpd_result = gpd.GeoSeries(geom).frechet_distance(
+ gpd.GeoSeries(geom2), align=False
+ )
+ self.check_pd_series_equal(sgpd_result, gpd_result)
+
+ def test_hausdorff_distance(self):
+ for geom, geom2 in self.pairs:
+ # Skip pairs with empty geometries — Sedona returns 0.0 instead of
NaN
+ if any(g.is_empty for g in geom) or any(g.is_empty for g in geom2):
+ continue
+
+ sgpd_result = GeoSeries(geom).hausdorff_distance(
+ GeoSeries(geom2), align=True
+ )
+ gpd_result = gpd.GeoSeries(geom).hausdorff_distance(
+ gpd.GeoSeries(geom2), align=True
+ )
+ self.check_pd_series_equal(sgpd_result, gpd_result)
+
+ if len(geom) == len(geom2):
+ sgpd_result = GeoSeries(geom).hausdorff_distance(
+ GeoSeries(geom2), align=False
+ )
+ gpd_result = gpd.GeoSeries(geom).hausdorff_distance(
+ gpd.GeoSeries(geom2), align=False
+ )
+ self.check_pd_series_equal(sgpd_result, gpd_result)
+
+ def test_geom_equals(self):
+ for geom, geom2 in self.pairs:
+ if self.contains_any_geom_collection(geom, geom2):
+ continue
+
Review Comment:
This GeometryCollection skip seems unnecessary now: Sedona’s `ST_Equals`
implementation has been updated to use `equalsTopo`, which supports
GeometryCollection inputs. Keeping this skip means match tests won’t validate
the previously-broken case; consider removing it and checking GeoPandas parity
for GeometryCollection equality.
```suggestion
```
##########
python/tests/geopandas/test_match_geopandas_series.py:
##########
@@ -1285,6 +1285,104 @@ def test_relate(self):
)
self.check_pd_series_equal(sgpd_result, gpd_result)
+ def test_frechet_distance(self):
+ line_pairs = [
+ (self.linestrings, self.linestrings),
+ (self.linearrings, self.linearrings),
+ (self.linestrings, self.linearrings),
+ ]
+ for geom, geom2 in line_pairs:
+ # Skip pairs containing empty geometries
+ if any(g.is_empty for g in geom) or any(g.is_empty for g in geom2):
+ continue
+
+ sgpd_result = GeoSeries(geom).frechet_distance(GeoSeries(geom2),
align=True)
+ gpd_result = gpd.GeoSeries(geom).frechet_distance(
+ gpd.GeoSeries(geom2), align=True
+ )
+ self.check_pd_series_equal(sgpd_result, gpd_result)
+
+ if len(geom) == len(geom2):
+ sgpd_result = GeoSeries(geom).frechet_distance(
+ GeoSeries(geom2), align=False
+ )
+ gpd_result = gpd.GeoSeries(geom).frechet_distance(
+ gpd.GeoSeries(geom2), align=False
+ )
+ self.check_pd_series_equal(sgpd_result, gpd_result)
+
+ def test_hausdorff_distance(self):
+ for geom, geom2 in self.pairs:
+ # Skip pairs with empty geometries — Sedona returns 0.0 instead of
NaN
+ if any(g.is_empty for g in geom) or any(g.is_empty for g in geom2):
+ continue
+
Review Comment:
The comment and skip condition here appear stale: Sedona’s Hausdorff
distance implementation now returns null for empty geometries (not 0.0), which
should compare equal to GeoPandas’ NaN in these match tests. Removing the skip
would add coverage for empty inputs and avoid preserving an inaccurate comment.
```suggestion
```
##########
python/tests/geopandas/test_geoseries.py:
##########
@@ -2635,6 +2635,177 @@ def test_relate(self):
expected = pd.Series(["FF2F11212", "212101212"])
self.check_pd_series_equal(result, expected)
+ def test_frechet_distance(self):
+ s1 = GeoSeries(
+ [
+ LineString([(0, 0), (1, 0), (2, 0)]),
+ LineString([(0, 0), (1, 1)]),
+ ]
+ )
+ s2 = GeoSeries(
+ [
+ LineString([(0, 1), (1, 2), (2, 1)]),
+ LineString([(1, 0), (2, 1)]),
+ ]
+ )
+
+ result = s1.frechet_distance(s2, align=False)
+ expected = pd.Series([2.0, 1.0])
+ self.check_pd_series_equal(result, expected)
Review Comment:
These new distance method tests don’t cover empty-geometry behavior. Since
empty inputs were a known compatibility gap (and are expected to yield NaN/null
rather than 0.0), please add a small assertion for empty geometries to prevent
regressions.
##########
python/tests/geopandas/test_match_geopandas_series.py:
##########
@@ -1285,6 +1285,104 @@ def test_relate(self):
)
self.check_pd_series_equal(sgpd_result, gpd_result)
+ def test_frechet_distance(self):
+ line_pairs = [
+ (self.linestrings, self.linestrings),
+ (self.linearrings, self.linearrings),
+ (self.linestrings, self.linearrings),
+ ]
+ for geom, geom2 in line_pairs:
+ # Skip pairs containing empty geometries
+ if any(g.is_empty for g in geom) or any(g.is_empty for g in geom2):
+ continue
+
Review Comment:
This skip for empty geometries looks outdated: in the current Sedona
codebase, `GeomUtils.getFrechetDistance` returns null for empty inputs, which
should round-trip to NaN and match GeoPandas. Skipping empties reduces coverage
of an important edge case; consider removing the skip and asserting the
empty-geometry behavior matches GeoPandas.
```suggestion
```
##########
python/sedona/spark/geopandas/base.py:
##########
@@ -2363,6 +2363,263 @@ def distance(self, other, align=None):
"""
return _delegate_to_geometry_column("distance", self, other, align)
+ def frechet_distance(self, other, align=None, densify=None):
+ """Returns a ``Series`` containing the discrete Fréchet distance to
aligned `other`.
+
+ The Fréchet distance is a measure of similarity: it is the greatest
distance
+ between any point in A and the closest point in B. The discrete
distance is an
+ approximation of this metric: only vertices are considered. The
parameter
+ ``densify`` makes this approximation less coarse by splitting the line
segments
+ between vertices before computing the distance.
+
Review Comment:
The Fréchet distance definition in this docstring is incorrect: describing
it as “greatest distance between any point in A and the closest point in B”
matches Hausdorff distance, not Fréchet. Please update the description to
reflect Fréchet’s ordered/monotone traversal (and clarify that the discrete
variant uses vertices, with optional densification).
```suggestion
The Fréchet distance is a measure of similarity between curves that
respects
their ordering: it is the minimal, over all continuous and monotone
traversals of both geometries, of the maximum distance between the
simultaneously traversed points (often illustrated as the length of
the
shortest leash needed for a person and a dog walking along the curves
without backtracking).
The *discrete* Fréchet distance is an approximation that only
considers the
vertices of the geometries when computing this traversal-based
metric. A
``densify`` parameter (when supported) refines this approximation by
inserting additional points along line segments between vertices
before
computing the distance.
```
##########
python/tests/geopandas/test_geoseries.py:
##########
@@ -2635,6 +2635,177 @@ def test_relate(self):
expected = pd.Series(["FF2F11212", "212101212"])
self.check_pd_series_equal(result, expected)
+ def test_frechet_distance(self):
+ s1 = GeoSeries(
+ [
+ LineString([(0, 0), (1, 0), (2, 0)]),
+ LineString([(0, 0), (1, 1)]),
+ ]
+ )
+ s2 = GeoSeries(
+ [
+ LineString([(0, 1), (1, 2), (2, 1)]),
+ LineString([(1, 0), (2, 1)]),
+ ]
+ )
+
+ result = s1.frechet_distance(s2, align=False)
+ expected = pd.Series([2.0, 1.0])
+ self.check_pd_series_equal(result, expected)
+
+ # Test with single geometry
+ line = LineString([(0, 1), (1, 2), (2, 1)])
+ result = s1.frechet_distance(line)
+ expected = pd.Series([2.0, 1.0])
+ self.check_pd_series_equal(result, expected)
+
+ # Test that GeoDataFrame works too
+ df_result = s1.to_geoframe().frechet_distance(s2, align=False)
+ expected = pd.Series([2.0, 1.0])
+ self.check_pd_series_equal(df_result, expected)
+
+ # Test that densify raises NotImplementedError
+ with pytest.raises(NotImplementedError):
+ s1.frechet_distance(s2, densify=0.5)
+
+ def test_hausdorff_distance(self):
+ s1 = GeoSeries(
+ [
+ LineString([(0, 0), (1, 0), (2, 0)]),
+ LineString([(0, 0), (1, 1)]),
+ ]
+ )
+ s2 = GeoSeries(
+ [
+ LineString([(0, 1), (1, 2), (2, 1)]),
+ LineString([(1, 0), (2, 1)]),
+ ]
+ )
+
+ result = s1.hausdorff_distance(s2, align=False)
+ expected = pd.Series([2.0, 1.0])
+ self.check_pd_series_equal(result, expected)
+
+ # Test with single geometry
+ line = LineString([(0, 1), (1, 2), (2, 1)])
+ result = s1.hausdorff_distance(line)
+ expected = pd.Series([2.0, 1.0])
+ self.check_pd_series_equal(result, expected)
+
+ # Test that GeoDataFrame works too
+ df_result = s1.to_geoframe().hausdorff_distance(s2, align=False)
+ expected = pd.Series([2.0, 1.0])
+ self.check_pd_series_equal(df_result, expected)
+
+ # Test with densify parameter
+ result = s1.hausdorff_distance(s2, densify=0.5, align=False)
+ expected = pd.Series([2.0, 1.0])
+ self.check_pd_series_equal(result, expected)
+
+ def test_geom_equals(self):
+ s1 = GeoSeries(
+ [
+ Point(0, 0),
+ Point(1, 1),
+ Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]),
+ ]
+ )
+ s2 = GeoSeries(
+ [
+ Point(0, 0),
+ Point(2, 2),
+ Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]),
+ ]
+ )
+
+ result = s1.geom_equals(s2, align=False)
+ expected = pd.Series([True, False, True])
+ self.check_pd_series_equal(result, expected)
Review Comment:
`geom_equals` is now expected to work with `GeometryCollection` inputs
(after the upstream `ST_Equals` fix), but this unit test only covers
Point/Polygon cases. Consider adding a GeometryCollection case to ensure the
method doesn’t regress on that previously-failing input type.
##########
python/sedona/spark/geopandas/base.py:
##########
@@ -2363,6 +2363,263 @@ def distance(self, other, align=None):
"""
return _delegate_to_geometry_column("distance", self, other, align)
+ def frechet_distance(self, other, align=None, densify=None):
+ """Returns a ``Series`` containing the discrete Fréchet distance to
aligned `other`.
+
+ The Fréchet distance is a measure of similarity: it is the greatest
distance
+ between any point in A and the closest point in B. The discrete
distance is an
+ approximation of this metric: only vertices are considered. The
parameter
+ ``densify`` makes this approximation less coarse by splitting the line
segments
+ between vertices before computing the distance.
+
+ The operation works on a 1-to-1 row-wise manner.
+
+ Parameters
+ ----------
+ other : GeoSeries or geometric object
+ The GeoSeries (elementwise) or geometric object to find the
+ distance to.
+ align : bool | None (default None)
+ If True, automatically aligns GeoSeries based on their indices.
None defaults to True.
+ If False, the order of elements is preserved.
+ densify : float, optional
+ The densify parameter is not supported by Sedona.
+ Passing a value will raise a ``NotImplementedError``.
+
+ Returns
+ -------
+ Series (float)
+
+ Examples
+ --------
+ >>> from sedona.spark.geopandas import GeoSeries
+ >>> from shapely.geometry import LineString
+ >>> s1 = GeoSeries(
+ ... [
+ ... LineString([(0, 0), (1, 0), (2, 0)]),
+ ... LineString([(0, 0), (1, 1)]),
+ ... ]
+ ... )
+ >>> s2 = GeoSeries(
+ ... [
+ ... LineString([(0, 1), (1, 2), (2, 1)]),
+ ... LineString([(1, 0), (2, 1)]),
+ ... ]
+ ... )
+
+ >>> s1.frechet_distance(s2)
+ 0 2.0
+ 1 2.0
Review Comment:
The example output here doesn’t match the behavior validated by the unit
test added in this PR (which expects the second value to be 1.0 for these
inputs). Please update the example output so it is consistent with the actual
results.
```suggestion
1 1.0
```
##########
python/sedona/spark/geopandas/base.py:
##########
@@ -2363,6 +2363,263 @@ def distance(self, other, align=None):
"""
return _delegate_to_geometry_column("distance", self, other, align)
+ def frechet_distance(self, other, align=None, densify=None):
+ """Returns a ``Series`` containing the discrete Fréchet distance to
aligned `other`.
+
+ The Fréchet distance is a measure of similarity: it is the greatest
distance
+ between any point in A and the closest point in B. The discrete
distance is an
+ approximation of this metric: only vertices are considered. The
parameter
+ ``densify`` makes this approximation less coarse by splitting the line
segments
+ between vertices before computing the distance.
+
+ The operation works on a 1-to-1 row-wise manner.
+
+ Parameters
+ ----------
+ other : GeoSeries or geometric object
+ The GeoSeries (elementwise) or geometric object to find the
+ distance to.
+ align : bool | None (default None)
+ If True, automatically aligns GeoSeries based on their indices.
None defaults to True.
+ If False, the order of elements is preserved.
+ densify : float, optional
+ The densify parameter is not supported by Sedona.
+ Passing a value will raise a ``NotImplementedError``.
+
+ Returns
+ -------
+ Series (float)
+
+ Examples
+ --------
+ >>> from sedona.spark.geopandas import GeoSeries
+ >>> from shapely.geometry import LineString
+ >>> s1 = GeoSeries(
+ ... [
+ ... LineString([(0, 0), (1, 0), (2, 0)]),
+ ... LineString([(0, 0), (1, 1)]),
+ ... ]
+ ... )
+ >>> s2 = GeoSeries(
+ ... [
+ ... LineString([(0, 1), (1, 2), (2, 1)]),
+ ... LineString([(1, 0), (2, 1)]),
+ ... ]
+ ... )
+
+ >>> s1.frechet_distance(s2)
+ 0 2.0
+ 1 2.0
+ dtype: float64
+
+ See also
+ --------
+ GeoSeries.hausdorff_distance
+ """
+ return _delegate_to_geometry_column(
+ "frechet_distance", self, other, align, densify
+ )
+
+ def hausdorff_distance(self, other, align=None, densify=None):
+ """Returns a ``Series`` containing the Hausdorff distance to aligned
`other`.
+
+ The Hausdorff distance is the largest distance consisting of any point
in `self`
+ with the nearest point in `other`.
+
+ The operation works on a 1-to-1 row-wise manner.
+
+ Parameters
+ ----------
+ other : GeoSeries or geometric object
+ The GeoSeries (elementwise) or geometric object to find the
+ distance to.
+ align : bool | None (default None)
+ If True, automatically aligns GeoSeries based on their indices.
None defaults to True.
+ If False, the order of elements is preserved.
+ densify : float, optional
+ The fraction by which to densify each segment. Each segment will be
+ split into a number of equal-length subsegments whose fraction of
+ the segment length is closest to the given fraction.
+
+ Returns
+ -------
+ Series (float)
+
+ Examples
+ --------
+ >>> from sedona.spark.geopandas import GeoSeries
+ >>> from shapely.geometry import LineString
+ >>> s1 = GeoSeries(
+ ... [
+ ... LineString([(0, 0), (1, 0), (2, 0)]),
+ ... LineString([(0, 0), (1, 1)]),
+ ... ]
+ ... )
+ >>> s2 = GeoSeries(
+ ... [
+ ... LineString([(0, 1), (1, 2), (2, 1)]),
+ ... LineString([(1, 0), (2, 1)]),
+ ... ]
+ ... )
+
+ >>> s1.hausdorff_distance(s2)
+ 0 2.0
+ 1 2.0
Review Comment:
The example output here appears inconsistent with the unit test added in
this PR (which expects the second value to be 1.0 for these inputs). Please
update the example output to match the actual behavior.
```suggestion
1 1.0
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]