Copilot commented on code in PR #2203:
URL: https://github.com/apache/sedona/pull/2203#discussion_r2244721092
##########
python/sedona/geopandas/geoseries.py:
##########
@@ -505,7 +520,7 @@ def set_crs(
crs: Union[Any, None] = None,
epsg: Union[int, None] = None,
inplace: bool = False,
- allow_override: bool = False,
+ allow_override: bool = True,
Review Comment:
Changing the default value of `allow_override` from False to True is a
breaking change that could affect existing code. This should be clearly
documented in the migration guide and potentially phased in with deprecation
warnings.
```suggestion
allow_override: bool = False,
```
##########
python/sedona/geopandas/geodataframe.py:
##########
@@ -309,55 +309,22 @@ def __getitem__(self, key: Any) -> Any:
Name: value, dtype: int64
"""
- # Handle column access by name
- if isinstance(key, str):
- # Access column directly from the spark DataFrame
- column_name = key
-
- # Check if column exists
- if column_name not in self.columns:
- raise KeyError(f"Column '{column_name}' does not exist")
-
- # Here we are getting a ps.Series with the same underlying anchor
(ps.Dataframe).
- # This is important so we don't unnecessarily try to perform
operations on different dataframes
- ps_series: pspd.Series = pspd.DataFrame.__getitem__(self,
column_name)
-
+ # Here we are getting a ps.Series with the same underlying anchor
(ps.Dataframe).
+ # This is important so we don't unnecessarily try to perform
operations on different dataframes
+ item = pspd.DataFrame.__getitem__(self, key)
+
+ if isinstance(item, pspd.DataFrame):
+ # don't specify crs=self.crs here because it might not include the
geometry column
+ # if it does include the geometry column, we don't need to set crs
anyways
+ return GeoDataFrame(item)
+ elif isinstance(item, pspd.Series):
+ ps_series: pspd.Series = item
try:
- result = sgpd.GeoSeries(ps_series)
- not_null = ps_series[ps_series.notnull()]
- if len(not_null) > 0:
- first_geom = not_null.iloc[0]
- srid = shapely.get_srid(first_geom)
-
- # Shapely objects stored in the ps.Series retain their srid
- # but the GeoSeries does not, so we manually re-set it here
- if srid > 0:
- result.set_crs(srid, inplace=True)
- return result
+ return sgpd.GeoSeries(ps_series)
except TypeError:
return ps_series
-
- # Handle list of column names
- elif isinstance(key, list) and all(isinstance(k, str) for k in key):
- # Check if all columns exist
- missing_cols = [k for k in key if k not in self.columns]
- if missing_cols:
- raise KeyError(f"Columns {missing_cols} do not exist")
-
- # Select columns from the spark DataFrame
- spark_df = self._internal.spark_frame.select(*key)
- pandas_df = spark_df.toPandas()
-
- # Return as GeoDataFrame
- return GeoDataFrame(pandas_df)
-
- # Handle row selection via slice or boolean indexing
else:
- # For now, convert to pandas first for row-based operations
- # This could be optimized later for better performance
- pandas_df = self._internal.spark_frame.toPandas()
- selected_rows = pandas_df[key]
- return GeoDataFrame(selected_rows)
+ raise Exception(f"Logical Error: Unexpected type: {type(item)}")
Review Comment:
Use a more specific exception type like `TypeError` or `ValueError` instead
of the generic `Exception`. Also, the error message could be more descriptive
about what types are expected.
```suggestion
raise TypeError(f"Unexpected type for 'item': {type(item)}.
Expected types are 'pspd.DataFrame' or 'pspd.Series'.")
```
##########
python/sedona/geopandas/geodataframe.py:
##########
@@ -887,10 +857,93 @@ def crs(self):
@crs.setter
def crs(self, value):
- # Avoid trying to access the geometry column (which might be missing)
if crs is None
- if value is None:
- return
- self.geometry.crs = value
+ # Since pyspark dataframes are immutable, we can't modify in place, so
we create the new geoseries and replace it
+ self.geometry = self.geometry.set_crs(value)
+
+ def set_crs(self, crs, inplace=False, allow_override=True):
+ """
+ Set the Coordinate Reference System (CRS) of the ``GeoDataFrame``.
+
+ If there are multiple geometry columns within the GeoDataFrame, only
+ the CRS of the active geometry column is set.
+
+ Pass ``None`` to remove CRS from the active geometry column.
+
+ Notes
+ -----
+ The underlying geometries are not transformed to this CRS. To
+ transform the geometries to a new CRS, use the ``to_crs`` method.
+
+ Parameters
+ ----------
+ crs : pyproj.CRS | None, optional
+ The value can be anything accepted
+ by :meth:`pyproj.CRS.from_user_input()
<pyproj.crs.CRS.from_user_input>`,
+ such as an authority string (eg "EPSG:4326") or a WKT string.
+ epsg : int, optional
+ EPSG code specifying the projection.
+ inplace : bool, default False
+ If True, the CRS of the GeoDataFrame will be changed in place
+ (while still returning the result) instead of making a copy of
+ the GeoDataFrame.
+ allow_override : bool, default False
+ If the the GeoDataFrame already has a CRS, allow to replace the
Review Comment:
The docstring parameter description says 'default False' but the actual
default value in the method signature is True. This inconsistency should be
fixed.
```suggestion
allow_override : bool, default True
If the GeoDataFrame already has a CRS, allow to replace the
```
##########
python/sedona/geopandas/geoseries.py:
##########
@@ -361,16 +361,18 @@ def __init__(
assert not copy
assert not fastpath
- data_crs = None
- if hasattr(data, "crs"):
- data_crs = data.crs
- if data_crs is not None and crs is not None and data_crs != crs:
- raise ValueError(
- "CRS mismatch between CRS of the passed geometries "
- "and 'crs'. Use 'GeoSeries.set_crs(crs, "
- "allow_override=True)' to overwrite CRS or "
- "'GeoSeries.to_crs(crs)' to reproject geometries. "
- )
+ # We don't check crs validity to keep the operation lazy.
Review Comment:
The comment should explain why lazy evaluation is preferred here and what
the performance implications are of the commented-out CRS validation code.
```suggestion
# CRS (Coordinate Reference System) validation is skipped here
to maintain lazy evaluation.
# Lazy evaluation defers expensive computations, such as CRS
checks, until they are absolutely necessary.
# Enabling CRS validation during initialization could introduce
significant overhead, especially for large datasets.
# However, skipping CRS validation means that potential CRS
mismatches might only be caught later,
# which could lead to subtle bugs if not handled carefully.
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]