This is an automated email from the ASF dual-hosted git repository. jiayu pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/sedona.git
The following commit(s) were added to refs/heads/master by this push: new 652db60604 [GH-2049] Retain index information in query results (#2086) 652db60604 is described below commit 652db606045a5436d7464b98ca6d3794c52a778d Author: Peter Nguyen <petern0...@gmail.com> AuthorDate: Fri Jul 11 10:32:16 2025 -0700 [GH-2049] Retain index information in query results (#2086) * Implement retain index information in results of queries * Update comment --- python/sedona/geopandas/geoseries.py | 32 ++++++++++++++-------- python/tests/geopandas/test_geoseries.py | 18 ++++++++++-- .../tests/geopandas/test_match_geopandas_series.py | 14 ++++++++++ 3 files changed, 50 insertions(+), 14 deletions(-) diff --git a/python/sedona/geopandas/geoseries.py b/python/sedona/geopandas/geoseries.py index 31cee8f7a0..06602b27ce 100644 --- a/python/sedona/geopandas/geoseries.py +++ b/python/sedona/geopandas/geoseries.py @@ -243,10 +243,9 @@ class GeoSeries(GeoFrame, pspd.Series): """ from pyproj import CRS - tmp_df = self._process_geometry_column( - "ST_SRID", rename="crs", returns_geom=False - ) - srid = tmp_df.take([0])[0] + tmp = self._process_geometry_column("ST_SRID", rename="crs", returns_geom=False) + ps_series = tmp.take([0]) + srid = ps_series.iloc[0] # Sedona returns 0 if doesn't exist return CRS.from_user_input(srid) if srid != 0 and not pd.isna(srid) else None @@ -494,14 +493,17 @@ class GeoSeries(GeoFrame, pspd.Series): query = f"{query} as `{rename}`" - sdf = df.selectExpr(query) - internal = InternalFrame( + # We always select NATURAL_ORDER_COLUMN_NAME, to avoid having to regenerate it in the result + # We always select SPARK_DEFAULT_INDEX_NAME, to retain series index info + sdf = df.selectExpr(query, SPARK_DEFAULT_INDEX_NAME, NATURAL_ORDER_COLUMN_NAME) + + internal = self._internal.copy( spark_frame=sdf, - index_spark_columns=None, - column_labels=[self._column_label], + index_fields=[self._internal.index_fields[0]], + index_spark_columns=[scol_for(sdf, SPARK_DEFAULT_INDEX_NAME)], data_spark_columns=[scol_for(sdf, rename)], - data_fields=[self._internal.data_fields[0]], - column_label_names=self._internal.column_label_names, + data_fields=[self._internal.data_fields[0].copy(name=rename)], + column_label_names=[(rename,)], ) ps_series = first_series(PandasOnSparkDataFrame(internal)) @@ -1372,13 +1374,19 @@ class GeoSeries(GeoFrame, pspd.Series): assert isinstance(other, GeoSeries), f"Invalid type for other: {type(other)}" - # TODO: this does not yet support multi-index + # This code assumes there is only one index (SPARK_DEFAULT_INDEX_NAME) + # and would need to be updated if Sedona later supports multi-index df = self._internal.spark_frame.select( col(self.get_first_geometry_column()).alias("L"), - col(index_col), + # For the left side: + # - We always select NATURAL_ORDER_COLUMN_NAME, to avoid having to regenerate it in the result + # - We always select SPARK_DEFAULT_INDEX_NAME, to retain series index info + col(NATURAL_ORDER_COLUMN_NAME), + col(SPARK_DEFAULT_INDEX_NAME), ) other_df = other._internal.spark_frame.select( col(other.get_first_geometry_column()).alias("R"), + # for the right side, we only need the column that we are joining on col(index_col), ) joined_df = df.join(other_df, on=index_col, how="outer") diff --git a/python/tests/geopandas/test_geoseries.py b/python/tests/geopandas/test_geoseries.py index 0f89d547ab..6aabf33f25 100644 --- a/python/tests/geopandas/test_geoseries.py +++ b/python/tests/geopandas/test_geoseries.py @@ -626,6 +626,8 @@ class TestGeoSeries(TestBase): expected = pd.Series([True, True, True, True]) def test_intersection(self): + import pyspark.pandas as ps + s = sgpd.GeoSeries( [ Polygon([(0, 0), (2, 2), (0, 2)]), @@ -691,7 +693,15 @@ class TestGeoSeries(TestBase): Point(0, 1), ], index=range(1, 6), + crs=4326, ) + + # Ensure the index is preserved when crs is set (previously an issue) + expected_index = ps.Index(range(1, 6)) + ps.set_option("compute.ops_on_diff_frames", True) + assert s2.index.equals(expected_index) + ps.reset_option("compute.ops_on_diff_frames") + result = s.intersection(s2, align=True) expected = gpd.GeoSeries( [ @@ -705,7 +715,7 @@ class TestGeoSeries(TestBase): ) self.check_sgpd_equals_gpd(result, expected) - result = s.intersection(s2, align=False) + result = s2.intersection(s, align=False) expected = gpd.GeoSeries( [ Polygon([(0, 0), (0, 1), (1, 1), (0, 0)]), @@ -713,10 +723,14 @@ class TestGeoSeries(TestBase): Point(1, 1), Point(1, 1), Point(0, 1), - ] + ], + index=range(1, 6), # left's index ) self.check_sgpd_equals_gpd(result, expected) + # Ensure result of align=False retains the left's index + assert result.index.to_pandas().equals(expected.index) + def test_intersection_all(self): pass diff --git a/python/tests/geopandas/test_match_geopandas_series.py b/python/tests/geopandas/test_match_geopandas_series.py index f19c04e83a..cbb61527c5 100644 --- a/python/tests/geopandas/test_match_geopandas_series.py +++ b/python/tests/geopandas/test_match_geopandas_series.py @@ -594,6 +594,20 @@ class TestMatchGeopandasSeries(TestBase): Polygon([(2, 0), (3, 0), (3, 3), (2, 3)]), Point(0, 0), ] + + # Ensure resulting index behavior is correct for align=False (retain the left's index) + index1 = range(1, len(geometries) + 1) + index2 = range(len(geometries)) + sgpd_result = GeoSeries(geometries, index1).intersection( + GeoSeries(geometries, index2), align=False + ) + + gpd_result = gpd.GeoSeries(geometries, index1).intersection( + gpd.GeoSeries(geometries, index2), align=False + ) + self.check_sgpd_equals_gpd(sgpd_result, gpd_result) + assert sgpd_result.index.to_pandas().equals(gpd_result.index) + for g1 in geometries: for g2 in geometries: sgpd_result = GeoSeries(g1).intersection(GeoSeries(g2))