Re: [PR] [GH-2100] Enhance geopandas sjoin implementation [sedona]

via GitHub Tue, 15 Jul 2025 16:22:22 -0700


Copilot commented on code in PR #2101:
URL: https://github.com/apache/sedona/pull/2101#discussion_r2208883996



##########
python/sedona/geopandas/tools/sjoin.py:
##########
@@ -172,12 +324,36 @@ def _basic_checks(left_df, right_df, how, lsuffix, 
rsuffix, on_attribute=None):
     on_attribute : list, default None
         list of column names to merge on along with geometry
     """
-    if not isinstance(left_df, GeoSeries):
-        raise ValueError(f"'left_df' should be GeoSeries, got {type(left_df)}")
+    if not isinstance(left_df, (GeoSeries, GeoDataFrame)):
+        raise ValueError(
+            f"'left_df' should be GeoSeries or GeoDataFrame, got 
{type(left_df)}"
+        )
 
-    if not isinstance(right_df, GeoSeries):
-        raise ValueError(f"'right_df' should be GeoSeries, got 
{type(right_df)}")
+    if not isinstance(right_df, (GeoSeries, GeoDataFrame)):
+        raise ValueError(
+            f"'right_df' should be GeoSeries or GeoDataFrame, got 
{type(right_df)}"
+        )
 
-    allowed_hows = ["inner"]
+    allowed_hows = ["inner", "left", "right"]
     if how not in allowed_hows:
         raise ValueError(f'`how` was "{how}" but is expected to be in 
{allowed_hows}')
+
+    # Check if on_attribute columns exist in both datasets
+    if on_attribute:
+        for attr in on_attribute:
+            if hasattr(left_df, "columns") and attr not in left_df.columns:
+                raise ValueError(f"Column '{attr}' not found in left dataset")
+            if hasattr(right_df, "columns") and attr not in right_df.columns:
+                raise ValueError(f"Column '{attr}' not found in right dataset")
+
+    # Check for reserved column names that would conflict
+    if lsuffix == rsuffix:
+        raise ValueError("lsuffix and rsuffix cannot be the same")
+
+    # Validate suffix format (should not contain special characters that would 
break SQL)
+    import re

Review Comment:
   The regular expression module is imported inside the function, which will be 
executed on every function call. Consider moving this import to the top of the 
file for better performance.
   ```suggestion
   
   ```



##########
python/sedona/geopandas/tools/sjoin.py:
##########
@@ -172,12 +324,36 @@ def _basic_checks(left_df, right_df, how, lsuffix, 
rsuffix, on_attribute=None):
     on_attribute : list, default None
         list of column names to merge on along with geometry
     """
-    if not isinstance(left_df, GeoSeries):
-        raise ValueError(f"'left_df' should be GeoSeries, got {type(left_df)}")
+    if not isinstance(left_df, (GeoSeries, GeoDataFrame)):
+        raise ValueError(
+            f"'left_df' should be GeoSeries or GeoDataFrame, got 
{type(left_df)}"
+        )
 
-    if not isinstance(right_df, GeoSeries):
-        raise ValueError(f"'right_df' should be GeoSeries, got 
{type(right_df)}")
+    if not isinstance(right_df, (GeoSeries, GeoDataFrame)):
+        raise ValueError(
+            f"'right_df' should be GeoSeries or GeoDataFrame, got 
{type(right_df)}"
+        )
 
-    allowed_hows = ["inner"]
+    allowed_hows = ["inner", "left", "right"]
     if how not in allowed_hows:
         raise ValueError(f'`how` was "{how}" but is expected to be in 
{allowed_hows}')
+
+    # Check if on_attribute columns exist in both datasets
+    if on_attribute:
+        for attr in on_attribute:
+            if hasattr(left_df, "columns") and attr not in left_df.columns:
+                raise ValueError(f"Column '{attr}' not found in left dataset")
+            if hasattr(right_df, "columns") and attr not in right_df.columns:
+                raise ValueError(f"Column '{attr}' not found in right dataset")
+
+    # Check for reserved column names that would conflict
+    if lsuffix == rsuffix:
+        raise ValueError("lsuffix and rsuffix cannot be the same")
+
+    # Validate suffix format (should not contain special characters that would 
break SQL)
+    import re
+
+    if not re.match(r"^[a-zA-Z_][a-zA-Z0-9_]*$", lsuffix):
+        raise ValueError(f"lsuffix '{lsuffix}' contains invalid characters")
+    if not re.match(r"^[a-zA-Z_][a-zA-Z0-9_]*$", rsuffix):

Review Comment:
   The regular expression pattern is compiled on every function call. Consider 
using re.compile() to pre-compile the pattern or move the compiled pattern to 
module level for better performance.
   ```suggestion
       if not VALID_SUFFIX_REGEX.match(lsuffix):
           raise ValueError(f"lsuffix '{lsuffix}' contains invalid characters")
       if not VALID_SUFFIX_REGEX.match(rsuffix):
   ```



##########
python/tests/geopandas/test_sjoin.py:
##########
@@ -45,9 +82,189 @@ def test_sjoin_method1(self):
         assert joined.count() == 4
 
     def test_sjoin_method2(self):
+        """Test GeoSeries.sjoin method"""
         left = self.g1
         right = self.g2
         joined = left.sjoin(right)
         assert joined is not None
         assert type(joined) is GeoSeries
         assert joined.count() == 4
+
+    def test_sjoin_geodataframe_basic(self):
+        """Test basic sjoin with GeoDataFrame"""
+        joined = sjoin(self.gdf1, self.gdf2)
+        assert joined is not None
+        assert type(joined) is GeoDataFrame
+        assert "geometry" in joined.columns
+        assert "id_left" in joined.columns
+        assert "id_right" in joined.columns
+        assert "name" in joined.columns
+        assert "category" in joined.columns
+
+    def test_sjoin_geodataframe_method(self):
+        """Test GeoDataFrame.sjoin method"""
+        joined = self.gdf1.sjoin(self.gdf2)
+        assert joined is not None
+        assert type(joined) is GeoDataFrame
+        assert "geometry" in joined.columns
+
+    def test_sjoin_predicates(self):
+        """Test different spatial predicates"""
+        predicates = [
+            "intersects",
+            "contains",
+            "within",
+            "touches",
+            "crosses",
+            "overlaps",
+        ]
+
+        for predicate in predicates:
+            try:
+                joined = sjoin(self.gdf1, self.gdf2, predicate=predicate)
+                assert joined is not None
+                assert type(joined) is GeoDataFrame
+            except Exception as e:
+                # Some predicates might not return results for our test data
+                # but the function should not raise errors for valid predicates
+                if "not supported" in str(e):
+                    pytest.fail(f"Predicate '{predicate}' should be supported")
+
+    def test_sjoin_join_types(self):
+        """Test different join types"""
+        join_types = ["inner", "left", "right"]
+
+        for how in join_types:
+            joined = sjoin(self.gdf1, self.gdf2, how=how)
+            assert joined is not None
+            assert type(joined) is GeoDataFrame
+            assert "geometry" in joined.columns
+
+    def test_sjoin_column_suffixes(self):
+        """Test column suffix handling"""
+        joined = sjoin(self.gdf1, self.gdf2, lsuffix="_left", rsuffix="_right")
+        assert joined is not None
+        assert type(joined) is GeoDataFrame
+
+        # Check that suffixes are applied to overlapping columns
+        columns = joined.columns
+        if "id_left" in columns and "id_right" in columns:
+            # Both datasets have 'id' column, so suffixes should be applied
+            assert "id_left" in columns
+            assert "id_right" in columns
+            assert "id" not in columns  # Original column should not exist
+
+    def test_sjoin_dwithin_distance(self):
+        """Test dwithin predicate with distance parameter"""
+        # Test with a distance that should capture nearby points
+        joined = sjoin(self.gdf1, self.nearby_points, predicate="dwithin", 
distance=0.5)
+        assert joined is not None
+        assert type(joined) is GeoDataFrame
+
+        # Test with a very small distance that should capture fewer points
+        joined_small = sjoin(
+            self.gdf1, self.nearby_points, predicate="dwithin", distance=0.05
+        )
+        assert joined_small is not None
+        assert type(joined_small) is GeoDataFrame
+
+    def test_sjoin_on_attribute(self):
+        """Test attribute-based joining"""
+        # Create datasets with matching attribute columns
+        gdf1_attr = GeoDataFrame(
+            {"geometry": [self.t1, self.t2], "zone": ["A", "B"], "value": [1, 
2]}
+        )
+        gdf2_attr = GeoDataFrame(
+            {
+                "geometry": [self.sq, self.t1],
+                "zone": ["A", "B"],
+                "category": ["square", "triangle"],
+            }
+        )
+
+        # Test joining on attribute
+        joined = sjoin(gdf1_attr, gdf2_attr, on_attribute=["zone"])
+        assert joined is not None
+        assert type(joined) is GeoDataFrame
+
+    def test_sjoin_points_in_polygons(self):
+        """Test point-in-polygon spatial join"""
+        joined = sjoin(self.gdf_points, self.gdf1, predicate="within")
+        assert joined is not None
+        assert type(joined) is GeoDataFrame
+
+        # The first point should be within the polygon
+        # The second point should be outside
+        # Check that we have some results (at least the point inside the 
polygon)
+        assert len(joined) >= 0  # At least no errors
+
+    def test_sjoin_error_handling(self):
+        """Test error handling for invalid inputs"""
+
+        # Test invalid predicate
+        with pytest.raises(ValueError, match="not supported"):
+            sjoin(self.gdf1, self.gdf2, predicate="invalid_predicate")
+
+        # Test invalid join type
+        with pytest.raises(ValueError, match="expected to be in"):
+            sjoin(self.gdf1, self.gdf2, how="invalid_join")
+
+        # Test dwithin without distance
+        with pytest.raises(ValueError, match="Distance parameter is required"):
+            sjoin(self.gdf1, self.gdf2, predicate="dwithin")
+
+        # Test same suffixes
+        with pytest.raises(ValueError, match="cannot be the same"):
+            sjoin(self.gdf1, self.gdf2, lsuffix="same", rsuffix="same")
+
+        # Test invalid suffix characters
+        with pytest.raises(ValueError, match="invalid characters"):
+            sjoin(self.gdf1, self.gdf2, lsuffix="invalid-suffix")
+
+    def test_sjoin_empty_results(self):
+        """Test sjoin with geometries that don't intersect"""
+        # Create geometries that are far apart
+        far_gdf = GeoDataFrame(
+            {
+                "geometry": [Polygon([(10, 10), (11, 10), (11, 11), (10, 
11)])],
+                "id": [99],
+            }
+        )
+
+        joined = sjoin(self.gdf1, far_gdf)
+        assert joined is not None
+        assert type(joined) is GeoDataFrame
+        # Should have 0 rows for inner join with non-intersecting geometries
+
+    def test_sjoin_mixed_geometry_types(self):
+        """Test sjoin with mixed geometry types"""
+        # Create a dataset with mixed geometry types
+        mixed_gdf = GeoDataFrame(
+            {
+                "geometry": [self.point1, self.line1, self.sq],
+                "id": [100, 101, 102],
+                "geom_type": ["point", "line", "polygon"],
+            }
+        )
+
+        joined = sjoin(self.gdf1, mixed_gdf)
+        assert joined is not None
+        assert type(joined) is GeoDataFrame
+
+    def test_sjoin_performance_basic(self):
+        """Basic performance test with slightly larger dataset"""
+        # Create slightly larger test datasets
+        import numpy as np
+

Review Comment:
   The numpy import is inside the test function but is not used in the visible 
code. This import should be moved to the top of the file or removed if unused.
   ```suggestion
   
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] [GH-2100] Enhance geopandas sjoin implementation [sedona]

Reply via email to