This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git
The following commit(s) were added to refs/heads/master by this push:
new d799f5082c [GH-2004] Geopandas.GeoSeries: Implement Test Framework
(#2005)
d799f5082c is described below
commit d799f5082ca651891d656183d7851378baaf5dae
Author: Peter Nguyen <[email protected]>
AuthorDate: Wed Jun 25 16:54:02 2025 -0700
[GH-2004] Geopandas.GeoSeries: Implement Test Framework (#2005)
* Fix small nit in series __repr__()
* Add test_non_geom_fails()
* test_constructor on all different geometry types
* Change Series.area return type to pd.Series to match gpd behavior and add
area tests
* Fix GeoSeries.to_pandas() and fix refactor tests
* pre-commit
* Test if sgpd_res equals sedona result and gpd result
* Remove run_sedona_sql test
* Rename test_geoseries.py to test_match_geopandas_series.py
* Make area() return ps.Series instead of pd.Series
* Add new test_geoseries to mimic the scala tests
* Use smaller tests for test_geoseries and hard-code expected results
* Remove check_less_precise for version compatibility
---
python/sedona/geopandas/geoseries.py | 25 ++-
python/tests/geopandas/test_geoseries.py | 136 ++++--------
.../tests/geopandas/test_match_geopandas_series.py | 235 +++++++++++++++++++++
3 files changed, 292 insertions(+), 104 deletions(-)
diff --git a/python/sedona/geopandas/geoseries.py
b/python/sedona/geopandas/geoseries.py
index e17e9817a9..b9b914aa45 100644
--- a/python/sedona/geopandas/geoseries.py
+++ b/python/sedona/geopandas/geoseries.py
@@ -50,8 +50,8 @@ class GeoSeries(GeoFrame, pspd.Series):
Return a string representation of the GeoSeries in WKT format.
"""
try:
- pandas_series = self.to_geopandas()
- return gpd.GeoSeries(pandas_series).__repr__()
+ gpd_series = self.to_geopandas()
+ return gpd_series.__repr__()
except Exception as e:
# Fallback to parent's representation if conversion fails
@@ -176,7 +176,7 @@ class GeoSeries(GeoFrame, pspd.Series):
A GeoSeries with the operation applied to the geometry column.
"""
# Find the first column with BinaryType or GeometryType
- first_col = self.get_first_geometry_column()
+ first_col = self.get_first_geometry_column() # TODO: fixme
if first_col:
data_type = self._internal.spark_frame.schema[first_col].dataType
@@ -230,9 +230,16 @@ class GeoSeries(GeoFrame, pspd.Series):
return self._to_geopandas()
def _to_geopandas(self) -> gpd.GeoSeries:
- return gpd.GeoSeries(
- self._to_internal_pandas().map(lambda wkb:
shapely.wkb.loads(bytes(wkb)))
- )
+ pd_series = self._to_internal_pandas()
+ try:
+ return gpd.GeoSeries(
+ pd_series.map(lambda wkb: shapely.wkb.loads(bytes(wkb)))
+ )
+ except Exception as e:
+ return gpd.GeoSeries(pd_series)
+
+ def to_spark_pandas(self) -> pspd.Series:
+ return pspd.Series(self._to_internal_pandas())
@property
def geometry(self) -> "GeoSeries":
@@ -274,7 +281,7 @@ class GeoSeries(GeoFrame, pspd.Series):
return self
@property
- def area(self) -> "GeoSeries":
+ def area(self) -> pspd.Series:
"""
Returns a Series containing the area of each geometry in the GeoSeries
expressed in the units of the CRS.
@@ -295,7 +302,7 @@ class GeoSeries(GeoFrame, pspd.Series):
1 4.0
dtype: float64
"""
- return self._process_geometry_column("ST_Area", rename="area")
+ return self._process_geometry_column("ST_Area",
rename="area").to_spark_pandas()
@property
def crs(self):
@@ -521,7 +528,7 @@ class GeoSeries(GeoFrame, pspd.Series):
mitre_limit=5.0,
single_sided=False,
**kwargs,
- ):
+ ) -> "GeoSeries":
"""
Returns a GeoSeries of geometries representing all points within a
given distance of each geometric object.
diff --git a/python/tests/geopandas/test_geoseries.py
b/python/tests/geopandas/test_geoseries.py
index 3f560262ea..bec526ad46 100644
--- a/python/tests/geopandas/test_geoseries.py
+++ b/python/tests/geopandas/test_geoseries.py
@@ -14,109 +14,55 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
-import os
-import shutil
-import tempfile
-from geopandas.testing import assert_geoseries_equal
-from shapely.geometry import (
- Point,
- Polygon,
-)
-
-from sedona.geopandas import GeoSeries
+import pandas as pd
+import geopandas as gpd
+import sedona.geopandas as sgpd
from tests.test_base import TestBase
-import pyspark.pandas as ps
+from shapely import wkt
+from shapely.geometry import Point, LineString, Polygon, GeometryCollection
+from pandas.testing import assert_series_equal
-class TestSeries(TestBase):
+class TestGeoSeries(TestBase):
def setup_method(self):
- self.tempdir = tempfile.mkdtemp()
- self.t1 = Polygon([(0, 0), (1, 0), (1, 1)])
- self.t2 = Polygon([(0, 0), (1, 1), (0, 1)])
- self.sq = Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])
- self.g1 = GeoSeries([self.t1, self.t2])
- self.g2 = GeoSeries([self.sq, self.t1])
- self.g3 = GeoSeries([self.t1, self.t2], crs="epsg:4326")
- self.g4 = GeoSeries([self.t2, self.t1])
-
- def teardown_method(self):
- shutil.rmtree(self.tempdir)
-
- def test_constructor(self):
- s = GeoSeries([Point(x, x) for x in range(3)])
- check_geoseries_equal(s, s)
-
- def test_psdf(self):
- # this is to make sure the spark session works with pandas on spark api
- psdf = ps.DataFrame(
- {
- "a": [1, 2, 3, 4, 5, 6],
- "b": [100, 200, 300, 400, 500, 600],
- "c": ["one", "two", "three", "four", "five", "six"],
- },
- index=[10, 20, 30, 40, 50, 60],
+ self.geoseries = sgpd.GeoSeries(
+ [
+ Point(2.3, -1),
+ LineString([(0.5, 0), (0, -3)]),
+ Polygon([(-1, -1), (-0.3, 5), (1, 1.2)]),
+ GeometryCollection(
+ [
+ Point(2.3, -1),
+ LineString([(0.5, 0), (0, -3)]),
+ Polygon([(-1, -1), (-0.3, 5), (1, 1.2)]),
+ ]
+ ),
+ ]
)
- assert psdf.count().count() == 3
-
- def test_internal_st_function(self):
- # this is to make sure the spark session works with internal sedona
udfs
- baseDf = self.spark.sql(
- "SELECT ST_GeomFromWKT('POLYGON ((50 50 1, 50 80 2, 80 80 3, 80 50
2, 50 50 1))') as geom"
- )
- actual = baseDf.selectExpr("ST_AsText(ST_Expand(geom, 10))").first()[0]
- expected = "POLYGON Z((40 40 -9, 40 90 -9, 90 90 13, 90 40 13, 40 40
-9))"
- assert expected == actual
-
- def test_type(self):
- assert type(self.g1) is GeoSeries
- assert type(self.g2) is GeoSeries
- assert type(self.g3) is GeoSeries
- assert type(self.g4) is GeoSeries
- def test_copy(self):
- gc = self.g3.copy()
- assert type(gc) is GeoSeries
- assert self.g3.name == gc.name
+ def check_sgpd_equals_gpd(self, actual: sgpd.GeoSeries, expected:
gpd.GeoSeries):
+ assert isinstance(actual, sgpd.GeoSeries)
+ assert isinstance(expected, gpd.GeoSeries)
+ assert len(actual) == len(expected)
+ sgpd_result = actual.to_geopandas()
+ for a, e in zip(sgpd_result, expected):
+ self.assert_geometry_almost_equal(a, e)
def test_area(self):
- area = self.g1.area
- assert area is not None
- assert type(area) is GeoSeries
- assert area.count() == 2
+ result = self.geoseries.area.to_pandas()
+ expected = pd.Series([0.0, 0.0, 5.23, 5.23])
+ assert result.count() > 0
+ assert_series_equal(result, expected)
def test_buffer(self):
- buffer = self.g1.buffer(0.2)
- assert buffer is not None
- assert type(buffer) is GeoSeries
- assert buffer.count() == 2
-
- def test_buffer_then_area(self):
- area = self.g1.buffer(0.2).area
- assert area is not None
- assert type(area) is GeoSeries
- assert area.count() == 2
-
- def test_buffer_then_geoparquet(self):
- temp_file_path = os.path.join(
- self.tempdir, next(tempfile._get_candidate_names()) + ".parquet"
- )
- self.g1.buffer(0.2).to_parquet(temp_file_path)
- assert os.path.exists(temp_file_path)
-
-
-# -----------------------------------------------------------------------------
-# # Utils
-# -----------------------------------------------------------------------------
-
-
-def check_geoseries_equal(s1, s2):
- assert isinstance(s1, GeoSeries)
- assert isinstance(s1.geometry, GeoSeries)
- assert isinstance(s2, GeoSeries)
- assert isinstance(s2.geometry, GeoSeries)
- if isinstance(s1, GeoSeries):
- s1 = s1.to_geopandas()
- if isinstance(s2, GeoSeries):
- s2 = s2.to_geopandas()
- assert_geoseries_equal(s1, s2)
+ result = self.geoseries.buffer(1)
+ expected = [
+ "POLYGON ((3.300000000000000 -1.000000000000000, 3.280785280403230
-1.195090322016128, 3.223879532511287 -1.382683432365090, 3.131469612302545
-1.555570233019602, 3.007106781186547 -1.707106781186547, 2.855570233019602
-1.831469612302545, 2.682683432365089 -1.923879532511287, 2.495090322016128
-1.980785280403230, 2.300000000000000 -2.000000000000000, 2.104909677983872
-1.980785280403230, 1.917316567634910 -1.923879532511287, 1.744429766980398
-1.831469612302545, 1.59289321881 [...]
+ "POLYGON ((0.986393923832144 -3.164398987305357, 0.935367989801224
-3.353676015097457, 0.848396388482656 -3.529361471973156, 0.728821389740875
-3.684703864350261, 0.581238193719096 -3.813733471206735, 0.411318339874827
-3.911491757111723, 0.225591752899151 -3.974221925961374, 0.031195801372873
-3.999513292546280, -0.164398987305357 -3.986393923832144, -0.353676015097457
-3.935367989801224, -0.529361471973156 -3.848396388482656, -0.684703864350260
-3.728821389740875, -0.813733 [...]
+ "POLYGON ((-0.260059926604056 -1.672672793996312,
-0.403493516968407 -1.802608257932399, -0.569270104475049 -1.902480890158382,
-0.751180291696993 -1.968549819451744, -0.942410374326119 -1.998340340272165,
-1.135797558140999 -1.990736606370705, -1.324098251632999 -1.946023426395157,
-1.500259385009482 -1.865875595977814, -1.657682592935656 -1.753295165887471,
-1.790471365675451 -1.612498995956065, -1.893651911234561 -1.448760806607280,
-1.963359455800552 -1.268213644171327, - [...]
+ "POLYGON ((-0.844303230213814 -1.983056850984667,
-0.942410374326119 -1.998340340272165, -1.135797558140999 -1.990736606370705,
-1.324098251632999 -1.946023426395157, -1.500259385009482 -1.865875595977814,
-1.657682592935656 -1.753295165887471, -1.790471365675451 -1.612498995956065,
-1.893651911234561 -1.448760806607280, -1.963359455800552 -1.268213644171327,
-1.996983004332570 -1.077620158927971, -1.993263139087243 -0.884119300439822,
-1.293263139087243 5.115880699560178, -1 [...]
+ ]
+ expected = gpd.GeoSeries([wkt.loads(wkt_str) for wkt_str in expected])
+ assert result.count() > 0
+ self.check_sgpd_equals_gpd(result, expected)
diff --git a/python/tests/geopandas/test_match_geopandas_series.py
b/python/tests/geopandas/test_match_geopandas_series.py
new file mode 100644
index 0000000000..3b3eccc4d4
--- /dev/null
+++ b/python/tests/geopandas/test_match_geopandas_series.py
@@ -0,0 +1,235 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import os
+import shutil
+import tempfile
+import pytest
+import pandas as pd
+import geopandas as gpd
+import pyspark.pandas as ps
+import pyspark
+from pandas.testing import assert_series_equal
+
+from shapely.geometry import (
+ Point,
+ Polygon,
+ MultiPoint,
+ MultiLineString,
+ LineString,
+ MultiPolygon,
+ GeometryCollection,
+)
+
+from sedona.geopandas import GeoSeries
+from tests.test_base import TestBase
+import pyspark.pandas as ps
+
+
+class TestMatchGeopandasSeries(TestBase):
+ def setup_method(self):
+ self.tempdir = tempfile.mkdtemp()
+ self.t1 = Polygon([(0, 0), (1, 0), (1, 1)])
+ self.t2 = Polygon([(0, 0), (1, 1), (0, 1)])
+ self.sq = Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])
+ self.g1 = GeoSeries([self.t1, self.t2])
+ self.g2 = GeoSeries([self.sq, self.t1])
+ self.g3 = GeoSeries([self.t1, self.t2], crs="epsg:4326")
+ self.g4 = GeoSeries([self.t2, self.t1])
+
+ self.points = [Point(x, x + 1) for x in range(3)]
+
+ self.multipoints = [MultiPoint([(x, x + 1), (x + 2, x + 3)]) for x in
range(3)]
+
+ self.linestrings = [LineString([(x, x + 1), (x + 2, x + 3)]) for x in
range(3)]
+
+ self.multilinestrings = [
+ MultiLineString(
+ [[[x, x + 1], [x + 2, x + 3]], [[x + 4, x + 5], [x + 6, x +
7]]]
+ )
+ for x in range(3)
+ ]
+
+ self.polygons = [
+ Polygon([(x, 0), (x + 1, 0), (x + 2, 1), (x + 3, 1)]) for x in
range(3)
+ ]
+
+ self.multipolygons = [
+ MultiPolygon(
+ [
+ (
+ [(0.0, 0.0), (0.0, 1.0), (1.0, 0.0)],
+ [[(0.1, 0.1), (0.1, 0.2), (0.2, 0.1), (0.1, 0.1)]],
+ )
+ ]
+ )
+ ]
+
+ self.geomcollection = [
+ GeometryCollection(
+ [
+ MultiPoint([(0, 0), (1, 1)]),
+ MultiLineString([[(0, 0), (1, 1)], [(2, 2), (3, 3)]]),
+ MultiPolygon(
+ [
+ (
+ [(0.0, 0.0), (0.0, 1.0), (1.0, 0.0)],
+ [[(0.1, 0.1), (0.1, 0.2), (0.2, 0.1), (0.1,
0.1)]],
+ )
+ ]
+ ),
+ ]
+ )
+ ]
+
+ # (sql_table_name, geom)
+ self.geoms = [
+ ("points", self.points),
+ ("multipoints", self.multipoints),
+ ("linestrings", self.linestrings),
+ ("multilinestrings", self.multilinestrings),
+ ("polygons", self.polygons),
+ ("multipolygons", self.multipolygons),
+ ("geomcollection", self.geomcollection),
+ ]
+
+ # create the tables in sedona spark
+ for i, (table_name, geoms) in enumerate(self.geoms):
+ wkt_string = [g.wkt for g in geoms]
+ pd_df = pd.DataFrame({"id": i, "geometry": wkt_string})
+ spark_df = self.spark.createDataFrame(pd_df)
+ spark_df.createOrReplaceTempView(table_name)
+
+ def teardown_method(self):
+ shutil.rmtree(self.tempdir)
+
+ def test_constructor(self):
+ for _, geom in self.geoms:
+ gpd_series = gpd.GeoSeries(geom)
+ assert isinstance(gpd_series, gpd.GeoSeries)
+ assert isinstance(gpd_series.geometry, gpd.GeoSeries)
+
+ def test_non_geom_fails(self):
+ with pytest.raises(TypeError):
+ GeoSeries([0, 1, 2])
+ with pytest.raises(TypeError):
+ GeoSeries([0, 1, 2], crs="epsg:4326")
+ with pytest.raises(TypeError):
+ GeoSeries(["a", "b", "c"])
+
+ def test_to_geopandas(self):
+ for _, geom in self.geoms:
+ sgpd_result = GeoSeries(geom)
+ gpd_result = gpd.GeoSeries(geom)
+ self.check_sgpd_equals_gpd(sgpd_result, gpd_result)
+
+ def test_psdf(self):
+ # this is to make sure the spark session works with pandas on spark api
+ psdf = ps.DataFrame(
+ {
+ "a": [1, 2, 3, 4, 5, 6],
+ "b": [100, 200, 300, 400, 500, 600],
+ "c": ["one", "two", "three", "four", "five", "six"],
+ },
+ index=[10, 20, 30, 40, 50, 60],
+ )
+ assert psdf.count().count() == 3
+
+ def test_internal_st_function(self):
+ # this is to make sure the spark session works with internal sedona
udfs
+ baseDf = self.spark.sql(
+ "SELECT ST_GeomFromWKT('POLYGON ((50 50 1, 50 80 2, 80 80 3, 80 50
2, 50 50 1))') as geom"
+ )
+ actual = baseDf.selectExpr("ST_AsText(ST_Expand(geom, 10))").first()[0]
+ expected = "POLYGON Z((40 40 -9, 40 90 -9, 90 90 13, 90 40 13, 40 40
-9))"
+ assert expected == actual
+
+ def test_type(self):
+ assert type(self.g1) is GeoSeries
+ assert type(self.g2) is GeoSeries
+ assert type(self.g3) is GeoSeries
+ assert type(self.g4) is GeoSeries
+
+ def test_copy(self):
+ gc = self.g3.copy()
+ assert type(gc) is GeoSeries
+ assert self.g3.name == gc.name
+
+ def test_area(self):
+ area = self.g1.area
+ assert area is not None
+ assert type(area) is ps.Series
+ assert area.count() == 2
+
+ for _, geom in self.geoms:
+ sgpd_result = GeoSeries(geom).area
+ gpd_result = gpd.GeoSeries(geom).area
+ self.check_pd_series_equal(sgpd_result, gpd_result)
+
+ def test_buffer(self):
+ buffer = self.g1.buffer(0.2)
+ assert buffer is not None
+ assert type(buffer) is GeoSeries
+ assert buffer.count() == 2
+
+ for _, geom in self.geoms:
+ dist = 0.2
+ sgpd_result = GeoSeries(geom).buffer(dist)
+ gpd_result = gpd.GeoSeries(geom).buffer(dist)
+
+ self.check_sgpd_equals_gpd(sgpd_result, gpd_result)
+
+ def test_buffer_then_area(self):
+ area = self.g1.buffer(0.2).area
+ assert area is not None
+ assert type(area) is ps.Series
+ assert area.count() == 2
+
+ def test_buffer_then_geoparquet(self):
+ temp_file_path = os.path.join(
+ self.tempdir, next(tempfile._get_candidate_names()) + ".parquet"
+ )
+ self.g1.buffer(0.2).to_parquet(temp_file_path)
+ assert os.path.exists(temp_file_path)
+
+ #
-----------------------------------------------------------------------------
+ # # Utils
+ #
-----------------------------------------------------------------------------
+
+ def check_sgpd_equals_spark_df(
+ self, actual: GeoSeries, expected: pyspark.sql.DataFrame
+ ):
+ assert isinstance(actual, GeoSeries)
+ assert isinstance(expected, pyspark.sql.DataFrame)
+ expected = expected.selectExpr("ST_AsText(expected) as expected")
+ sgpd_result = actual.to_geopandas()
+ expected = expected.toPandas()["expected"]
+ for a, e in zip(sgpd_result, expected):
+ self.assert_geometry_almost_equal(a, e)
+
+ def check_sgpd_equals_gpd(self, actual: GeoSeries, expected:
gpd.GeoSeries):
+ assert isinstance(actual, GeoSeries)
+ assert isinstance(expected, gpd.GeoSeries)
+ sgpd_result = actual.to_geopandas()
+ for a, e in zip(sgpd_result, expected):
+ self.assert_geometry_almost_equal(
+ a, e, tolerance=1e-2
+ ) # increased tolerance from 1e-6
+
+ def check_pd_series_equal(self, actual: ps.Series, expected: pd.Series):
+ assert isinstance(actual, ps.Series)
+ assert isinstance(expected, pd.Series)
+ assert_series_equal(actual.to_pandas(), expected)