This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git
The following commit(s) were added to refs/heads/master by this push:
new b535bc4b5c [GH-2116] Geopandas.GeoSeries: Implement `difference` and
`dwithin` (#2117)
b535bc4b5c is described below
commit b535bc4b5c3745d209910c0af9622c24f587ef70
Author: Peter Nguyen <[email protected]>
AuthorDate: Sun Jul 20 22:56:53 2025 -0700
[GH-2116] Geopandas.GeoSeries: Implement `difference` and `dwithin` (#2117)
* Implement functions
* Skip covers and covered_by tests for old versions
* Skip covered by for shapely < 2
* Implement dwithin and difference, and fix index bug in row_wise_operations
---
python/sedona/geopandas/geoseries.py | 246 ++++++++++++++++++++-
python/tests/geopandas/test_geoseries.py | 95 ++++++++
.../tests/geopandas/test_match_geopandas_series.py | 49 ++++
3 files changed, 384 insertions(+), 6 deletions(-)
diff --git a/python/sedona/geopandas/geoseries.py
b/python/sedona/geopandas/geoseries.py
index e9fe4d3795..f038b9693c 100644
--- a/python/sedona/geopandas/geoseries.py
+++ b/python/sedona/geopandas/geoseries.py
@@ -1131,6 +1131,232 @@ class GeoSeries(GeoFrame, pspd.Series):
)
)
+ def dwithin(self, other, distance, align=None):
+ """Returns a ``Series`` of ``dtype('bool')`` with value ``True`` for
+ each aligned geometry that is within a set distance from ``other``.
+
+ The operation works in a 1-to-1 row-wise manner.
+
+ Parameters
+ ----------
+ other : GeoSeries or geometric object
+ The GeoSeries (elementwise) or geometric object to test for
+ being within the given distance.
+ distance : float, np.array, pd.Series
+ Distance(s) to test if each geometry is within. A scalar distance
will be
+ applied to all geometries. An array or Series will be applied
elementwise.
+ If np.array or pd.Series are used then it must have same length as
the
+ GeoSeries.
+ align : bool | None (default None)
+ If True, automatically aligns GeoSeries based on their indices.
+ If False, the order of elements is preserved. None defaults to
True.
+
+ Returns
+ -------
+ Series (bool)
+
+ Examples
+ --------
+ >>> from sedona.geopandas import GeoSeries
+ >>> from shapely.geometry import Polygon, LineString, Point
+ >>> s = GeoSeries(
+ ... [
+ ... Polygon([(0, 0), (1, 1), (0, 1)]),
+ ... LineString([(0, 0), (0, 2)]),
+ ... LineString([(0, 0), (0, 1)]),
+ ... Point(0, 1),
+ ... ],
+ ... index=range(0, 4),
+ ... )
+ >>> s2 = GeoSeries(
+ ... [
+ ... Polygon([(1, 0), (4, 2), (2, 2)]),
+ ... Polygon([(2, 0), (3, 2), (2, 2)]),
+ ... LineString([(2, 0), (2, 2)]),
+ ... Point(1, 1),
+ ... ],
+ ... index=range(1, 5),
+ ... )
+
+ >>> s
+ 0 POLYGON ((0 0, 1 1, 0 1, 0 0))
+ 1 LINESTRING (0 0, 0 2)
+ 2 LINESTRING (0 0, 0 1)
+ 3 POINT (0 1)
+ dtype: geometry
+
+ >>> s2
+ 1 POLYGON ((1 0, 4 2, 2 2, 1 0))
+ 2 POLYGON ((2 0, 3 2, 2 2, 2 0))
+ 3 LINESTRING (2 0, 2 2)
+ 4 POINT (1 1)
+ dtype: geometry
+
+ We can check if each geometry of GeoSeries is within a set distance
+ of a single geometry:
+
+ >>> point = Point(0, 1)
+ >>> s2.dwithin(point, 1.8)
+ 1 True
+ 2 False
+ 3 False
+ 4 True
+ dtype: bool
+
+ We can also check two GeoSeries against each other, row by row.
+ The GeoSeries above have different indices. We can either align both
GeoSeries
+ based on index values and compare elements with the same index using
+ ``align=True`` or ignore index and compare elements based on their
matching
+ order using ``align=False``:
+
+ >>> s.dwithin(s2, distance=1, align=True)
+ 0 False
+ 1 True
+ 2 False
+ 3 False
+ 4 False
+ dtype: bool
+
+ >>> s.dwithin(s2, distance=1, align=False)
+ 0 True
+ 1 False
+ 2 False
+ 3 True
+ dtype: bool
+
+ Notes
+ -----
+ This method works in a row-wise manner. It does not check if an element
+ of one GeoSeries is within the set distance of *any* element of the
other one.
+
+ See Also
+ --------
+ GeoSeries.within
+ """
+
+ if not isinstance(distance, (float, int)):
+ raise NotImplementedError(
+ "Array-like distance for dwithin not implemented yet."
+ )
+
+ return self._row_wise_operation(
+ f"ST_DWithin(`L`, `R`, {distance})",
+ other,
+ align,
+ rename="dwithin",
+ returns_geom=False,
+ default_val="FALSE",
+ )
+
+ def difference(self, other, align=None) -> "GeoSeries":
+ """Returns a ``GeoSeries`` of the points in each aligned geometry that
+ are not in `other`.
+
+ The operation works in a 1-to-1 row-wise manner.
+
+ Unlike Geopandas, Sedona does not support this operation for
GeometryCollections.
+
+ Parameters
+ ----------
+ other : GeoSeries or geometric object
+ The GeoSeries (elementwise) or geometric object to find the
+ difference to.
+ align : bool | None (default None)
+ If True, automatically aligns GeoSeries based on their indices.
None defaults to True.
+ If False, the order of elements is preserved.
+
+ Returns
+ -------
+ GeoSeries
+
+ Examples
+ --------
+ >>> from sedona.geopandas import GeoSeries
+ >>> from shapely.geometry import Polygon, LineString, Point
+ >>> s = GeoSeries(
+ ... [
+ ... Polygon([(0, 0), (2, 2), (0, 2)]),
+ ... Polygon([(0, 0), (2, 2), (0, 2)]),
+ ... LineString([(0, 0), (2, 2)]),
+ ... LineString([(2, 0), (0, 2)]),
+ ... Point(0, 1),
+ ... ],
+ ... )
+ >>> s2 = GeoSeries(
+ ... [
+ ... Polygon([(0, 0), (1, 1), (0, 1)]),
+ ... LineString([(1, 0), (1, 3)]),
+ ... LineString([(2, 0), (0, 2)]),
+ ... Point(1, 1),
+ ... Point(0, 1),
+ ... ],
+ ... index=range(1, 6),
+ ... )
+
+ >>> s
+ 0 POLYGON ((0 0, 2 2, 0 2, 0 0))
+ 1 POLYGON ((0 0, 2 2, 0 2, 0 0))
+ 2 LINESTRING (0 0, 2 2)
+ 3 LINESTRING (2 0, 0 2)
+ 4 POINT (0 1)
+ dtype: geometry
+
+ >>> s2
+ 1 POLYGON ((0 0, 1 1, 0 1, 0 0))
+ 2 LINESTRING (1 0, 1 3)
+ 3 LINESTRING (2 0, 0 2)
+ 4 POINT (1 1)
+ 5 POINT (0 1)
+ dtype: geometry
+
+ We can do difference of each geometry and a single
+ shapely geometry:
+
+ >>> s.difference(Polygon([(0, 0), (1, 1), (0, 1)]))
+ 0 POLYGON ((0 2, 2 2, 1 1, 0 1, 0 2))
+ 1 POLYGON ((0 2, 2 2, 1 1, 0 1, 0 2))
+ 2 LINESTRING (1 1, 2 2)
+ 3 MULTILINESTRING ((2 0, 1 1), (1 1, 0 2))
+ 4 POINT EMPTY
+ dtype: geometry
+
+ We can also check two GeoSeries against each other, row by row.
+ The GeoSeries above have different indices. We can either align both
GeoSeries
+ based on index values and compare elements with the same index using
+ ``align=True`` or ignore index and compare elements based on their
matching
+ order using ``align=False``:
+
+ >>> s.difference(s2, align=True)
+ 0 None
+ 1 POLYGON ((0 2, 2 2, 1 1, 0 1, 0 2))
+ 2 MULTILINESTRING ((0 0, 1 1), (1 1, 2 2))
+ 3 LINESTRING EMPTY
+ 4 POINT (0 1)
+ 5 None
+ dtype: geometry
+
+ >>> s.difference(s2, align=False)
+ 0 POLYGON ((0 2, 2 2, 1 1, 0 1, 0 2))
+ 1 POLYGON ((0 0, 0 2, 1 2, 2 2, 1 1, 0 0))
+ 2 MULTILINESTRING ((0 0, 1 1), (1 1, 2 2))
+ 3 LINESTRING (2 0, 0 2)
+ 4 POINT EMPTY
+ dtype: geometry
+
+ See Also
+ --------
+ GeoSeries.symmetric_difference
+ GeoSeries.union
+ GeoSeries.intersection
+ """
+ return self._row_wise_operation(
+ "ST_Difference(`L`, `R`)",
+ other,
+ align,
+ rename="difference",
+ returns_geom=True,
+ )
+
@property
def is_simple(self) -> pspd.Series:
"""Returns a ``Series`` of ``dtype('bool')`` with value ``True`` for
@@ -2698,14 +2924,22 @@ class GeoSeries(GeoFrame, pspd.Series):
NATURAL_ORDER_COLUMN_NAME if align is False else
SPARK_DEFAULT_INDEX_NAME
)
- if isinstance(other, BaseGeometry):
- other = GeoSeries([other] * len(self))
-
- # e.g int input
if not isinstance(other, pspd.Series):
- other = pspd.Series([other] * len(self))
+ # repeat `other` once per row of self (note: this builds an in-memory list, not a generator)
+ data = [other for _ in range(len(self))]
+
+ # e.g int, Geom, etc
+ other = (
+ GeoSeries(data)
+ if isinstance(other, BaseGeometry)
+ else pspd.Series(data)
+ )
- assert isinstance(other, pspd.Series), f"Invalid type for other:
{type(other)}"
+ # To make sure the result is the same length, we set natural
column as the index
+ # in case the index is not the default range index from 0.
+ # Alternatively, we could create 'other' using the same index as
self,
+ # but that would require index=self.index.to_pandas() which is
less scalable.
+ index_col = NATURAL_ORDER_COLUMN_NAME
# This code assumes there is only one index (SPARK_DEFAULT_INDEX_NAME)
# and would need to be updated if Sedona later supports multi-index
diff --git a/python/tests/geopandas/test_geoseries.py
b/python/tests/geopandas/test_geoseries.py
index 2db3ed317d..e3621da080 100644
--- a/python/tests/geopandas/test_geoseries.py
+++ b/python/tests/geopandas/test_geoseries.py
@@ -526,6 +526,101 @@ class TestGeoSeries(TestGeopandasBase):
def test_count_interior_rings(self):
pass
+ def test_dwithin(self):
+ s = GeoSeries(
+ [
+ Polygon([(0, 0), (1, 1), (0, 1)]),
+ LineString([(0, 0), (0, 2)]),
+ LineString([(0, 0), (0, 1)]),
+ Point(0, 1),
+ ],
+ index=range(0, 4),
+ )
+ s2 = GeoSeries(
+ [
+ Polygon([(1, 0), (4, 2), (2, 2)]),
+ Polygon([(2, 0), (3, 2), (2, 2)]),
+ LineString([(2, 0), (2, 2)]),
+ Point(1, 1),
+ ],
+ index=range(1, 5),
+ )
+
+ result = s2.dwithin(Point(0, 1), 1.8)
+ expected = pd.Series([True, False, False, True], index=range(1, 5))
+ assert_series_equal(result.to_pandas(), expected)
+
+ result = s.dwithin(s2, distance=1, align=True)
+ expected = pd.Series([False, True, False, False, False])
+
+ result = s.dwithin(s2, distance=1, align=False)
+ expected = pd.Series([True, False, False, True])
+ assert_series_equal(result.to_pandas(), expected)
+
+ def test_difference(self):
+ s = GeoSeries(
+ [
+ Polygon([(0, 0), (2, 2), (0, 2)]),
+ Polygon([(0, 0), (2, 2), (0, 2)]),
+ LineString([(0, 0), (2, 2)]),
+ LineString([(2, 0), (0, 2)]),
+ Point(0, 1),
+ ],
+ )
+ s2 = GeoSeries(
+ [
+ Polygon([(0, 0), (1, 1), (0, 1)]),
+ LineString([(1, 0), (1, 3)]),
+ LineString([(2, 0), (0, 2)]),
+ Point(1, 1),
+ Point(0, 1),
+ ],
+ index=range(1, 6),
+ )
+
+ result = s.difference(Polygon([(0, 0), (1, 1), (0, 1)]))
+ expected = gpd.GeoSeries(
+ [
+ Polygon([(0, 2), (2, 2), (1, 1), (0, 1), (0, 2)]),
+ Polygon([(0, 2), (2, 2), (1, 1), (0, 1), (0, 2)]),
+ LineString([(1, 1), (2, 2)]),
+ MultiLineString(
+ [LineString([(2, 0), (1, 1)]), LineString([(1, 1), (0,
2)])]
+ ),
+ Point(),
+ ]
+ )
+ self.check_sgpd_equals_gpd(result, expected)
+
+ result = s.difference(s2, align=True)
+ expected = gpd.GeoSeries(
+ [
+ None,
+ Polygon([(0, 2), (2, 2), (1, 1), (0, 1), (0, 2)]),
+ MultiLineString(
+ [LineString([(0, 0), (1, 1)]), LineString([(1, 1), (2,
2)])]
+ ),
+ LineString(),
+ Point(0, 1),
+ None,
+ ]
+ )
+ self.check_sgpd_equals_gpd(result, expected)
+
+ result = s.difference(s2, align=False)
+ expected = gpd.GeoSeries(
+ [
+ None,
+ Polygon([(0, 2), (2, 2), (1, 1), (0, 1), (0, 2)]),
+ Polygon([(0, 0), (0, 2), (1, 2), (2, 2), (1, 1), (0, 0)]),
+ MultiLineString(
+ [LineString([(0, 0), (1, 1)]), LineString([(1, 1), (2,
2)])]
+ ),
+ LineString([(2, 0), (0, 2)]),
+ Point(),
+ ]
+ )
+
def test_is_simple(self):
s = sgpd.GeoSeries(
[
diff --git a/python/tests/geopandas/test_match_geopandas_series.py
b/python/tests/geopandas/test_match_geopandas_series.py
index cfc1b587ba..a23d9db1f4 100644
--- a/python/tests/geopandas/test_match_geopandas_series.py
+++ b/python/tests/geopandas/test_match_geopandas_series.py
@@ -478,6 +478,55 @@ class TestMatchGeopandasSeries(TestGeopandasBase):
def test_count_interior_rings(self):
pass
+ def test_dwithin(self):
+ if parse_version(gpd.__version__) < parse_version("1.0.0"):
+ pytest.skip("geopandas < 1.0.0 does not support dwithin")
+
+ for i, (_, geom) in enumerate(self.geoms):
+ for _, geom2 in self.geoms[i:]:
+ sgpd_result = GeoSeries(geom).dwithin(GeoSeries(geom2),
distance=1)
+ gpd_result = gpd.GeoSeries(geom).dwithin(
+ gpd.GeoSeries(geom2), distance=1
+ )
+ self.check_pd_series_equal(sgpd_result, gpd_result)
+
+ if len(geom) == len(geom2):
+ sgpd_result = GeoSeries(geom).dwithin(
+ GeoSeries(geom2), distance=1, align=False
+ )
+ gpd_result = gpd.GeoSeries(geom).dwithin(
+ gpd.GeoSeries(geom2), distance=1, align=False
+ )
+ self.check_pd_series_equal(sgpd_result, gpd_result)
+
+ def test_difference(self):
+ for i, (_, geom) in enumerate(self.geoms):
+ for _, geom2 in self.geoms[i:]:
+ # Sedona doesn't support difference for GeometryCollections
+ if isinstance(geom[0], GeometryCollection) or isinstance(
+ geom2[0], GeometryCollection
+ ):
+ continue
+ # Operation doesn't work on invalid geometries
+ if (
+ not gpd.GeoSeries(geom).is_valid.all()
+ or not gpd.GeoSeries(geom2).is_valid.all()
+ ):
+ continue
+
+ sgpd_result = GeoSeries(geom).difference(GeoSeries(geom2))
+ gpd_result =
gpd.GeoSeries(geom).difference(gpd.GeoSeries(geom2))
+ self.check_sgpd_equals_gpd(sgpd_result, gpd_result)
+
+ if len(geom) == len(geom2):
+ sgpd_result = GeoSeries(geom).difference(
+ GeoSeries(geom2), align=False
+ )
+ gpd_result = gpd.GeoSeries(geom).difference(
+ gpd.GeoSeries(geom2), align=False
+ )
+ self.check_sgpd_equals_gpd(sgpd_result, gpd_result)
+
def test_is_simple(self):
data = [
LineString([(0, 0), (0, 0)]),