Copilot commented on code in PR #2131:
URL: https://github.com/apache/sedona/pull/2131#discussion_r2223464009
##########
python/sedona/geopandas/geoseries.py:
##########
@@ -969,16 +912,38 @@ def length(self) -> pspd.Series:
3 4.828427
dtype: float64
"""
- col = self.get_first_geometry_column()
- select = f"""
+
+ """
CASE
WHEN GeometryType(`{col}`) IN ('LINESTRING',
'MULTILINESTRING') THEN ST_Length(`{col}`)
WHEN GeometryType(`{col}`) IN ('POLYGON', 'MULTIPOLYGON') THEN
ST_Perimeter(`{col}`)
WHEN GeometryType(`{col}`) IN ('POINT', 'MULTIPOINT') THEN 0.0
WHEN GeometryType(`{col}`) IN ('GEOMETRYCOLLECTION') THEN
ST_Length(`{col}`) + ST_Perimeter(`{col}`)
- END"""
+ END"""
+
Review Comment:
Remove the commented-out SQL code block. This unassigned multi-line string
containing the old SQL logic should be deleted, as it is no longer relevant after the refactoring.
```suggestion
```
##########
python/sedona/geopandas/geoseries.py:
##########
@@ -725,46 +672,31 @@ def _query_geometry_column(
GeoSeries
A GeoSeries with the operation applied to the geometry column.
"""
- if not cols:
- raise ValueError("No valid geometry column found.")
df = self._internal.spark_frame if df is None else df
- if isinstance(cols, str):
- col = cols
- data_type = df.schema[col].dataType
- if isinstance(data_type, BinaryType):
- query = query.replace(f"`{cols}`", f"ST_GeomFromWKB(`{cols}`)")
+ rename = self.name if self.name else SPARK_DEFAULT_SERIES_NAME
- rename = col if not rename else rename
+ col_expr = spark_col.alias(rename)
- elif isinstance(cols, list):
- for col in cols:
- data_type = df.schema[col].dataType
-
- if isinstance(data_type, BinaryType):
- # the backticks here are important so we don't match
strings that happen to be the same as the column name
- query = query.replace(f"`{col}`",
f"ST_GeomFromWKB(`{col}`)")
-
- # must have rename for multiple columns since we don't know which
name to default to
- assert rename
-
- query = f"{query} as `{rename}`"
-
- exprs = [query]
+ exprs = [col_expr]
index_spark_columns = []
index_fields = []
if not is_aggr:
# We always select NATURAL_ORDER_COLUMN_NAME, to avoid having to
regenerate it in the result
# We always select SPARK_DEFAULT_INDEX_NAME, to retain series
index info
- exprs.append(SPARK_DEFAULT_INDEX_NAME)
- exprs.append(NATURAL_ORDER_COLUMN_NAME)
+
+ exprs.append(scol_for(df, SPARK_DEFAULT_INDEX_NAME))
+ exprs.append(scol_for(df, NATURAL_ORDER_COLUMN_NAME))
+
+ # exprs.append(SPARK_DEFAULT_INDEX_NAME)
+ # exprs.append(NATURAL_ORDER_COLUMN_NAME)
Review Comment:
Remove the commented-out code. These commented-out lines serve no purpose and
should be deleted to keep the code clean.
```suggestion
```
##########
python/sedona/geopandas/geoseries.py:
##########
@@ -725,46 +672,31 @@ def _query_geometry_column(
GeoSeries
A GeoSeries with the operation applied to the geometry column.
"""
- if not cols:
- raise ValueError("No valid geometry column found.")
df = self._internal.spark_frame if df is None else df
- if isinstance(cols, str):
- col = cols
- data_type = df.schema[col].dataType
- if isinstance(data_type, BinaryType):
- query = query.replace(f"`{cols}`", f"ST_GeomFromWKB(`{cols}`)")
+ rename = self.name if self.name else SPARK_DEFAULT_SERIES_NAME
- rename = col if not rename else rename
+ col_expr = spark_col.alias(rename)
- elif isinstance(cols, list):
- for col in cols:
- data_type = df.schema[col].dataType
-
- if isinstance(data_type, BinaryType):
- # the backticks here are important so we don't match
strings that happen to be the same as the column name
- query = query.replace(f"`{col}`",
f"ST_GeomFromWKB(`{col}`)")
-
- # must have rename for multiple columns since we don't know which
name to default to
- assert rename
-
- query = f"{query} as `{rename}`"
-
- exprs = [query]
+ exprs = [col_expr]
index_spark_columns = []
index_fields = []
if not is_aggr:
# We always select NATURAL_ORDER_COLUMN_NAME, to avoid having to
regenerate it in the result
# We always select SPARK_DEFAULT_INDEX_NAME, to retain series
index info
- exprs.append(SPARK_DEFAULT_INDEX_NAME)
- exprs.append(NATURAL_ORDER_COLUMN_NAME)
+
+ exprs.append(scol_for(df, SPARK_DEFAULT_INDEX_NAME))
+ exprs.append(scol_for(df, NATURAL_ORDER_COLUMN_NAME))
+
+ # exprs.append(SPARK_DEFAULT_INDEX_NAME)
+ # exprs.append(NATURAL_ORDER_COLUMN_NAME)
Review Comment:
Remove the commented-out code. These commented-out lines serve no purpose and
should be deleted to keep the code clean.
```suggestion
```
##########
python/sedona/geopandas/geoseries.py:
##########
@@ -1531,7 +1514,7 @@ def get_geometry(self, index) -> "GeoSeries":
"""
# Sedona errors on negative indexes, so we use a case statement to
handle it ourselves
- select = """
+ """
Review Comment:
Remove the commented-out SQL code block. This unassigned multi-line string
containing the old SQL logic should be deleted, as it is no longer relevant after the refactoring.
##########
python/sedona/geopandas/geoseries.py:
##########
@@ -1585,15 +1581,21 @@ def boundary(self) -> "GeoSeries":
GeoSeries.exterior : outer boundary (without interior rings)
"""
- col = self.get_first_geometry_column()
# Geopandas and shapely return NULL for GeometryCollections, so we
handle it separately
#
https://shapely.readthedocs.io/en/stable/reference/shapely.boundary.html
- select = f"""
+ """
CASE
WHEN GeometryType(`{col}`) IN ('GEOMETRYCOLLECTION') THEN NULL
ELSE ST_Boundary(`{col}`)
- END"""
- return self._query_geometry_column(select, col, rename="boundary")
+ END
+ """
Review Comment:
Remove the commented-out SQL code block. This unassigned multi-line string
containing the old SQL logic should be deleted, as it is no longer relevant after the refactoring.
##########
python/sedona/geopandas/geoseries.py:
##########
@@ -2924,55 +2990,43 @@ def _row_wise_operation(
NATURAL_ORDER_COLUMN_NAME if align is False else
SPARK_DEFAULT_INDEX_NAME
)
- if not isinstance(other, pspd.Series):
- # generator instead of a in-memory list
- data = [other for _ in range(len(self))]
-
- # e.g int, Geom, etc
- other = (
- GeoSeries(data)
- if isinstance(other, BaseGeometry)
- else pspd.Series(data)
- )
-
- # To make sure the result is the same length, we set natural
column as the index
- # in case the index is not the default range index from 0.
- # Alternatively, we could create 'other' using the same index as
self,
- # but that would require index=self.index.to_pandas() which is
less scalable.
- index_col = NATURAL_ORDER_COLUMN_NAME
-
# This code assumes there is only one index (SPARK_DEFAULT_INDEX_NAME)
# and would need to be updated if Sedona later supports multi-index
+
df = self._internal.spark_frame.select(
- col(self.get_first_geometry_column()).alias("L"),
+ self.spark.column.alias("L"),
# For the left side:
# - We always select NATURAL_ORDER_COLUMN_NAME, to avoid having to
regenerate it in the result
# - We always select SPARK_DEFAULT_INDEX_NAME, to retain series
index info
col(NATURAL_ORDER_COLUMN_NAME),
col(SPARK_DEFAULT_INDEX_NAME),
)
other_df = other._internal.spark_frame.select(
- col(_get_first_column_name(other)).alias("R"),
+ other.spark.column.alias("R"),
# for the right side, we only need the column that we are joining
on
col(index_col),
)
+
joined_df = df.join(other_df, on=index_col, how="outer")
if default_val is not None:
# ps.Series.fillna() doesn't always work for the output for some
reason
# so we manually handle the nulls here.
- select = f"""
+ spark_col = F.when(
+ F.col("L").isNull() | F.col("R").isNull(),
+ default_val,
+ ).otherwise(spark_col)
+ # The above is equivalent to the following:
+ f"""
CASE
WHEN `L` IS NULL OR `R` IS NULL THEN {default_val}
- ELSE {select}
+ ELSE {spark_col}
END
"""
Review Comment:
Remove the commented-out SQL code block. This unassigned multi-line string
containing the old SQL logic should be deleted, as it is no longer relevant after the refactoring.
##########
python/sedona/geopandas/geoseries.py:
##########
@@ -2007,16 +2018,30 @@ def crosses(self, other, align=None) -> pspd.Series:
"""
# Sedona does not support GeometryCollection (errors), so we return
NULL for now to avoid error
- select = """
+ """
CASE
WHEN GeometryType(`L`) == 'GEOMETRYCOLLECTION' OR
GeometryType(`R`) == 'GEOMETRYCOLLECTION' THEN NULL
ELSE ST_Crosses(`L`, `R`)
END
"""
+ other_series, extended = self._make_series_of_val(other)
+ align = False if extended else align
+ spark_expr = F.when(
+ (stf.GeometryType(F.col("L")) == "GEOMETRYCOLLECTION")
+ | (stf.GeometryType(F.col("R")) == "GEOMETRYCOLLECTION"),
+ None,
+ ).otherwise(stp.ST_Crosses(F.col("L"), F.col("R")))
result = self._row_wise_operation(
- select, other, align, rename="crosses", default_val="FALSE"
+ spark_expr,
+ other_series,
+ align,
+ default_val=False,
)
+
+ # result = self._row_wise_operation(
+ # select, other, align, rename="crosses", default_val="FALSE"
+ # )
Review Comment:
Remove the commented-out code block. These commented-out lines showing the old
implementation should be deleted to keep the code clean.
```suggestion
# (Commented-out code removed for cleanliness)
```
##########
python/sedona/geopandas/geodataframe.py:
##########
@@ -746,17 +752,19 @@ def rename_geometry(self, col: str, inplace: bool =
False) -> GeoDataFrame | None
if col in self.columns:
raise ValueError(f"Column named {col} already exists")
else:
+ mapper = {col: col for col in list(self.columns)}
+ mapper[geometry_col] = col
+
+ # mapper = {geometry_col: col}
Review Comment:
Remove the commented-out code. This commented-out line serves no purpose and
should be deleted to keep the code clean.
```suggestion
```
##########
python/sedona/geopandas/geoseries.py:
##########
@@ -2007,16 +2018,30 @@ def crosses(self, other, align=None) -> pspd.Series:
"""
# Sedona does not support GeometryCollection (errors), so we return
NULL for now to avoid error
- select = """
+ """
CASE
WHEN GeometryType(`L`) == 'GEOMETRYCOLLECTION' OR
GeometryType(`R`) == 'GEOMETRYCOLLECTION' THEN NULL
ELSE ST_Crosses(`L`, `R`)
END
"""
Review Comment:
Remove the commented-out SQL code block. This unassigned multi-line string
containing the old SQL logic should be deleted, as it is no longer relevant after the refactoring.
```suggestion
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]