Copilot commented on code in PR #2131:
URL: https://github.com/apache/sedona/pull/2131#discussion_r2223464009
##########
python/sedona/geopandas/geoseries.py:
##########
@@ -969,16 +912,38 @@ def length(self) -> pspd.Series:
3 4.828427
dtype: float64
"""
- col = self.get_first_geometry_column()
- select = f"""
+
+ """
CASE
WHEN GeometryType(`{col}`) IN ('LINESTRING',
'MULTILINESTRING') THEN ST_Length(`{col}`)
WHEN GeometryType(`{col}`) IN ('POLYGON', 'MULTIPOLYGON') THEN
ST_Perimeter(`{col}`)
WHEN GeometryType(`{col}`) IN ('POINT', 'MULTIPOINT') THEN 0.0
WHEN GeometryType(`{col}`) IN ('GEOMETRYCOLLECTION') THEN
ST_Length(`{col}`) + ST_Perimeter(`{col}`)
- END"""
+ END"""
+
Review Comment:
Remove the commented-out SQL code block. This unassigned multi-line string
containing the old SQL logic should be deleted, as it is no longer relevant after the refactoring.
```suggestion
```
##########
python/sedona/geopandas/geoseries.py:
##########
@@ -725,46 +672,31 @@ def _query_geometry_column(
GeoSeries
A GeoSeries with the operation applied to the geometry column.
"""
- if not cols:
- raise ValueError("No valid geometry column found.")
df = self._internal.spark_frame if df is None else df
- if isinstance(cols, str):
- col = cols
- data_type = df.schema[col].dataType
- if isinstance(data_type, BinaryType):
- query = query.replace(f"`{cols}`", f"ST_GeomFromWKB(`{cols}`)")
+ rename = self.name if self.name else SPARK_DEFAULT_SERIES_NAME
- rename = col if not rename else rename
+ col_expr = spark_col.alias(rename)
- elif isinstance(cols, list):
- for col in cols:
- data_type = df.schema[col].dataType
-
- if isinstance(data_type, BinaryType):
- # the backticks here are important so we don't match
strings that happen to be the same as the column name
- query = query.replace(f"`{col}`",
f"ST_GeomFromWKB(`{col}`)")
-
- # must have rename for multiple columns since we don't know which
name to default to
- assert rename
-
- query = f"{query} as `{rename}`"
-
- exprs = [query]
+ exprs = [col_expr]
index_spark_columns = []
index_fields = []
if not is_aggr:
# We always select NATURAL_ORDER_COLUMN_NAME, to avoid having to
regenerate it in the result
# We always select SPARK_DEFAULT_INDEX_NAME, to retain series
index info
- exprs.append(SPARK_DEFAULT_INDEX_NAME)
- exprs.append(NATURAL_ORDER_COLUMN_NAME)
+
+ exprs.append(scol_for(df, SPARK_DEFAULT_INDEX_NAME))
+ exprs.append(scol_for(df, NATURAL_ORDER_COLUMN_NAME))
+
+ # exprs.append(SPARK_DEFAULT_INDEX_NAME)
+ # exprs.append(NATURAL_ORDER_COLUMN_NAME)
Review Comment:
Remove the commented-out code. These commented-out lines serve no purpose and
should be deleted to keep the code clean.
```suggestion
```
##########
python/sedona/geopandas/geoseries.py:
##########
@@ -725,46 +672,31 @@ def _query_geometry_column(
GeoSeries
A GeoSeries with the operation applied to the geometry column.
"""
- if not cols:
- raise ValueError("No valid geometry column found.")
df = self._internal.spark_frame if df is None else df
- if isinstance(cols, str):
- col = cols
- data_type = df.schema[col].dataType
- if isinstance(data_type, BinaryType):
- query = query.replace(f"`{cols}`", f"ST_GeomFromWKB(`{cols}`)")
+ rename = self.name if self.name else SPARK_DEFAULT_SERIES_NAME
- rename = col if not rename else rename
+ col_expr = spark_col.alias(rename)
- elif isinstance(cols, list):
- for col in cols:
- data_type = df.schema[col].dataType
-
- if isinstance(data_type, BinaryType):
- # the backticks here are important so we don't match
strings that happen to be the same as the column name
- query = query.replace(f"`{col}`",
f"ST_GeomFromWKB(`{col}`)")
-
- # must have rename for multiple columns since we don't know which
name to default to
- assert rename
-
- query = f"{query} as `{rename}`"
-
- exprs = [query]
+ exprs = [col_expr]
index_spark_columns = []
index_fields = []
if not is_aggr:
# We always select NATURAL_ORDER_COLUMN_NAME, to avoid having to
regenerate it in the result
# We always select SPARK_DEFAULT_INDEX_NAME, to retain series
index info
- exprs.append(SPARK_DEFAULT_INDEX_NAME)
- exprs.append(NATURAL_ORDER_COLUMN_NAME)
+
+ exprs.append(scol_for(df, SPARK_DEFAULT_INDEX_NAME))
+ exprs.append(scol_for(df, NATURAL_ORDER_COLUMN_NAME))
+
+ # exprs.append(SPARK_DEFAULT_INDEX_NAME)
+ # exprs.append(NATURAL_ORDER_COLUMN_NAME)
Review Comment:
Remove the commented-out code. These commented-out lines serve no purpose and
should be deleted to keep the code clean.
```suggestion
```
##########
python/sedona/geopandas/geoseries.py:
##########
@@ -1531,7 +1514,7 @@ def get_geometry(self, index) -> "GeoSeries":
"""
# Sedona errors on negative indexes, so we use a case statement to
handle it ourselves
- select = """
+ """
Review Comment:
Remove the commented-out SQL code block. This unassigned multi-line string
containing the old SQL logic should be deleted, as it is no longer relevant after the refactoring.
##########
python/sedona/geopandas/geoseries.py:
##########
@@ -1585,15 +1581,21 @@ def boundary(self) -> "GeoSeries":
GeoSeries.exterior : outer boundary (without interior rings)
"""
- col = self.get_first_geometry_column()
# Geopandas and shapely return NULL for GeometryCollections, so we
handle it separately
#
https://shapely.readthedocs.io/en/stable/reference/shapely.boundary.html
- select = f"""
+ """
CASE
WHEN GeometryType(`{col}`) IN ('GEOMETRYCOLLECTION') THEN NULL
ELSE ST_Boundary(`{col}`)
- END"""
- return self._query_geometry_column(select, col, rename="boundary")
+ END
+ """
Review Comment:
Remove the commented-out SQL code block. This unassigned multi-line string
containing the old SQL logic should be deleted, as it is no longer relevant after the refactoring.
##########
python/sedona/geopandas/geoseries.py:
##########
@@ -2924,55 +2990,43 @@ def _row_wise_operation(
NATURAL_ORDER_COLUMN_NAME if align is False else
SPARK_DEFAULT_INDEX_NAME
)
- if not isinstance(other, pspd.Series):
- # generator instead of a in-memory list
- data = [other for _ in range(len(self))]
-
- # e.g int, Geom, etc
- other = (
- GeoSeries(data)
- if isinstance(other, BaseGeometry)
- else pspd.Series(data)
- )
-
- # To make sure the result is the same length, we set natural
column as the index
- # in case the index is not the default range index from 0.
- # Alternatively, we could create 'other' using the same index as
self,
- # but that would require index=self.index.to_pandas() which is
less scalable.
- index_col = NATURAL_ORDER_COLUMN_NAME
-
# This code assumes there is only one index (SPARK_DEFAULT_INDEX_NAME)
# and would need to be updated if Sedona later supports multi-index
+
df = self._internal.spark_frame.select(
- col(self.get_first_geometry_column()).alias("L"),
+ self.spark.column.alias("L"),
# For the left side:
# - We always select NATURAL_ORDER_COLUMN_NAME, to avoid having to
regenerate it in the result
# - We always select SPARK_DEFAULT_INDEX_NAME, to retain series
index info
col(NATURAL_ORDER_COLUMN_NAME),
col(SPARK_DEFAULT_INDEX_NAME),
)
other_df = other._internal.spark_frame.select(
- col(_get_first_column_name(other)).alias("R"),
+ other.spark.column.alias("R"),
# for the right side, we only need the column that we are joining
on
col(index_col),
)
+
joined_df = df.join(other_df, on=index_col, how="outer")
if default_val is not None:
# ps.Series.fillna() doesn't always work for the output for some
reason
# so we manually handle the nulls here.
- select = f"""
+ spark_col = F.when(
+ F.col("L").isNull() | F.col("R").isNull(),
+ default_val,
+ ).otherwise(spark_col)
+ # The above is equivalent to the following:
+ f"""
CASE
WHEN `L` IS NULL OR `R` IS NULL THEN {default_val}
- ELSE {select}
+ ELSE {spark_col}
END
"""
Review Comment:
Remove the commented-out SQL code block. This unassigned multi-line string
containing the old SQL logic should be deleted, as it is no longer relevant after the refactoring.
##########
python/sedona/geopandas/geoseries.py:
##########
@@ -2007,16 +2018,30 @@ def crosses(self, other, align=None) -> pspd.Series:
"""
# Sedona does not support GeometryCollection (errors), so we return
NULL for now to avoid error
- select = """
+ """
CASE
WHEN GeometryType(`L`) == 'GEOMETRYCOLLECTION' OR
GeometryType(`R`) == 'GEOMETRYCOLLECTION' THEN NULL
ELSE ST_Crosses(`L`, `R`)
END
"""
+ other_series, extended = self._make_series_of_val(other)
+ align = False if extended else align
+ spark_expr = F.when(
+ (stf.GeometryType(F.col("L")) == "GEOMETRYCOLLECTION")
+ | (stf.GeometryType(F.col("R")) == "GEOMETRYCOLLECTION"),
+ None,
+ ).otherwise(stp.ST_Crosses(F.col("L"), F.col("R")))
result = self._row_wise_operation(
- select, other, align, rename="crosses", default_val="FALSE"
+ spark_expr,
+ other_series,
+ align,
+ default_val=False,
)
+
+ # result = self._row_wise_operation(
+ # select, other, align, rename="crosses", default_val="FALSE"
+ # )
Review Comment:
Remove the commented-out code block. These commented-out lines showing the old
implementation should be deleted to keep the code clean.
```suggestion
# (Commented-out code removed for cleanliness)
```
##########
python/sedona/geopandas/geodataframe.py:
##########
@@ -746,17 +752,19 @@ def rename_geometry(self, col: str, inplace: bool =
False) -> GeoDataFrame | None
if col in self.columns:
raise ValueError(f"Column named {col} already exists")
else:
+ mapper = {col: col for col in list(self.columns)}
+ mapper[geometry_col] = col
+
+ # mapper = {geometry_col: col}
Review Comment:
Remove the commented-out code. This commented-out line serves no purpose and
should be deleted to keep the code clean.
```suggestion
```
##########
python/sedona/geopandas/geoseries.py:
##########
@@ -2007,16 +2018,30 @@ def crosses(self, other, align=None) -> pspd.Series:
"""
# Sedona does not support GeometryCollection (errors), so we return
NULL for now to avoid error
- select = """
+ """
CASE
WHEN GeometryType(`L`) == 'GEOMETRYCOLLECTION' OR
GeometryType(`R`) == 'GEOMETRYCOLLECTION' THEN NULL
ELSE ST_Crosses(`L`, `R`)
END
"""
Review Comment:
Remove the commented-out SQL code block. This unassigned multi-line string
containing the old SQL logic should be deleted, as it is no longer relevant after the refactoring.
```suggestion
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]