This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/sedona-db.git
The following commit(s) were added to refs/heads/main by this push:
new e6a6e326 feat(python/sedonadb): Write GDAL/OGR via pyogrio (#632)
e6a6e326 is described below
commit e6a6e32647f2f05c12b4298e61a73cc8957b5fee
Author: Dewey Dunnington <[email protected]>
AuthorDate: Sat Feb 21 11:27:53 2026 -0600
feat(python/sedonadb): Write GDAL/OGR via pyogrio (#632)
Co-authored-by: Copilot <[email protected]>
---
python/sedonadb/python/sedonadb/dataframe.py | 89 ++++++++++++++++++++++
.../{test_datasource.py => io/test_pyogrio.py} | 83 ++++++++++++++++++++
2 files changed, 172 insertions(+)
diff --git a/python/sedonadb/python/sedonadb/dataframe.py
b/python/sedonadb/python/sedonadb/dataframe.py
index c32c8600..74941814 100644
--- a/python/sedonadb/python/sedonadb/dataframe.py
+++ b/python/sedonadb/python/sedonadb/dataframe.py
@@ -15,6 +15,7 @@
# specific language governing permissions and limitations
# under the License.
+import io
from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Literal,
Optional, Union
@@ -445,6 +446,94 @@ class DataFrame:
single_file_output,
)
+    def to_pyogrio(
+        self,
+        path: Union[str, Path, io.BytesIO],
+        *,
+        driver: Optional[str] = None,
+        geometry_type: Optional[str] = None,
+        geometry_name: Optional[str] = None,
+        crs: Optional[str] = None,
+        append: bool = False,
+        **kwargs: Any,
+    ) -> None:
+        """Write using GDAL/OGR via pyogrio
+
+        Writes this DataFrame batchwise to a file with GDAL/OGR via the pyogrio
+        package, the same backend used by GeoPandas. This function is a light
+        wrapper around `pyogrio.raw.write_arrow()` that fills in default values
+        using information available to the DataFrame (e.g., geometry column and CRS).
+
+        Args:
+            path: An output path or `BytesIO` output buffer.
+            driver: An explicit GDAL OGR driver. Usually inferred from `path` but
+                required if path is a `BytesIO` (which not all drivers can write to).
+            geometry_type: A GeoJSON-style geometry type or `None` to provide an
+                inferred default value (which may be `"Unknown"`). This is required
+                to write some types of output (e.g. Shapefiles) and may provide
+                files that are more efficiently read.
+            geometry_name: The column to write as the primary geometry column. If
+                `None`, the name of the geometry column will be inferred.
+            crs: An optional string overriding the CRS of `geometry_name`.
+            append: Use `True` to append to the file if the driver supports it.
+            kwargs: Extra arguments passed to `pyogrio.raw.write_arrow()`.
+
+        Raises:
+            ValueError: If `path` is a `BytesIO` and `driver` is not provided.
+
+        Examples:
+
+            >>> import tempfile
+            >>> sd = sedona.db.connect()
+            >>> td = tempfile.TemporaryDirectory()
+            >>> sd.sql("SELECT ST_Point(0, 1, 3857)").to_pyogrio(f"{td.name}/tmp.fgb")
+            >>> sd.read_pyogrio(f"{td.name}/tmp.fgb").show()
+            ┌──────────────┐
+            │ wkb_geometry │
+            │   geometry   │
+            ╞══════════════╡
+            │ POINT(0 1)   │
+            └──────────────┘
+        """
+        # Fail fast on a bad call, before inspecting the DataFrame or importing pyogrio
+        if isinstance(path, io.BytesIO) and driver is None:
+            raise ValueError("driver must be provided when path is a BytesIO")
+
+        if isinstance(path, Path):
+            path = str(path)
+
+        # There may be more endings worth special-casing here but zipped FlatGeoBuf
+        # is particularly useful and isn't automatically recognized
+        if driver is None and isinstance(path, str) and path.endswith(".fgb.zip"):
+            driver = "FlatGeoBuf"
+
+        if geometry_name is None:
+            geometry_name = self._impl.primary_geometry_column()
+
+        if crs is None and geometry_name is not None:
+            inferred_crs = self.schema.field(geometry_name).type.crs
+            crs = None if inferred_crs is None else inferred_crs.to_json()
+
+        if geometry_type is None:
+            # Required by pyogrio.raw.write_arrow(); we could try harder to infer this
+            geometry_type = "Unknown"
+
+        # Writer: pyogrio.write_arrow() via Cython ogr_write_arrow()
+        # https://github.com/geopandas/pyogrio/blob/3b2d40273b501c10ecf46cbd37c6e555754c89af/pyogrio/raw.py#L755-L897
+        # https://github.com/geopandas/pyogrio/blob/3b2d40273b501c10ecf46cbd37c6e555754c89af/pyogrio/_io.pyx#L2858-L2980
+        import pyogrio.raw
+
+        pyogrio.raw.write_arrow(
+            self,
+            path,
+            driver=driver,
+            geometry_type=geometry_type,
+            geometry_name=geometry_name,
+            crs=crs,
+            append=append,
+            **kwargs,
+        )
+
def show(
self,
limit: Optional[int] = 10,
diff --git a/python/sedonadb/tests/test_datasource.py
b/python/sedonadb/tests/io/test_pyogrio.py
similarity index 65%
rename from python/sedonadb/tests/test_datasource.py
rename to python/sedonadb/tests/io/test_pyogrio.py
index 4640adfd..297c4198 100644
--- a/python/sedonadb/tests/test_datasource.py
+++ b/python/sedonadb/tests/io/test_pyogrio.py
@@ -15,6 +15,7 @@
# specific language governing permissions and limitations
# under the License.
+import io
import tempfile
from pathlib import Path
@@ -138,3 +139,85 @@ def test_read_ogr_file_not_found(con):
sedonadb._lib.SedonaError, match="Can't infer schema for zero
objects"
):
con.read_pyogrio(Path(td) / "file_does_not_exist")
+
+
+def test_write_ogr(con):
+    with tempfile.TemporaryDirectory() as td:
+        # Basic write with defaults
+        df = con.sql("SELECT ST_Point(0, 1, 3857)")
+        expected = geopandas.GeoDataFrame(
+            {"geometry": geopandas.GeoSeries.from_wkt(["POINT (0 1)"], crs=3857)}
+        )
+
+        df.to_pyogrio(f"{td}/foofy.fgb")
+        geopandas.testing.assert_geodataframe_equal(
+            geopandas.read_file(f"{td}/foofy.fgb"), expected
+        )
+
+        # Ensure Path input works (fresh file so a write that does nothing fails)
+        df.to_pyogrio(Path(f"{td}/foofy_path.fgb"))
+        geopandas.testing.assert_geodataframe_equal(
+            geopandas.read_file(f"{td}/foofy_path.fgb"), expected
+        )
+
+        # Ensure zipped FlatGeoBuf doesn't require specifying the driver
+        df.to_pyogrio(Path(f"{td}/foofy.fgb.zip"))
+        geopandas.testing.assert_geodataframe_equal(
+            geopandas.read_file(f"{td}/foofy.fgb.zip"), expected
+        )
+
+        # Ensure inferred CRS that is None works (also overwrites the first output)
+        con.sql("SELECT ST_Point(0, 1)").to_pyogrio(f"{td}/foofy.fgb")
+        expected = geopandas.GeoDataFrame(
+            {"geometry": geopandas.GeoSeries.from_wkt(["POINT (0 1)"])}
+        )
+        geopandas.testing.assert_geodataframe_equal(
+            geopandas.read_file(f"{td}/foofy.fgb"), expected
+        )
+
+
+def test_write_ogr_buffer(con):
+    # Round-trip through an in-memory buffer. There is no file extension to
+    # inspect here, so the driver is passed explicitly.
+    point_frame = con.sql("SELECT ST_Point(0, 1, 3857)")
+    geometry = geopandas.GeoSeries.from_wkt(["POINT (0 1)"], crs=3857)
+    expected = geopandas.GeoDataFrame({"geometry": geometry})
+
+    out = io.BytesIO()
+    point_frame.to_pyogrio(out, driver="FlatGeoBuf")
+    actual = geopandas.read_file(out.getvalue(), driver="FlatGeoBuf")
+    geopandas.testing.assert_geodataframe_equal(actual, expected)
+
+    # Ensure reasonable error if driver is not specified
+    with pytest.raises(ValueError, match="driver must be provided"):
+        point_frame.to_pyogrio(out)
+
+
+def test_write_ogr_no_geometry(con):
+    # A frame without a geometry column should still be writable (here as CSV)
+    expected = pd.DataFrame({"one": ["one"]})
+    with tempfile.TemporaryDirectory() as td:
+        out_path = f"{td}/foofy.csv"
+        con.sql("SELECT 'one' as one").to_pyogrio(out_path)
+        pd.testing.assert_frame_equal(pd.read_csv(out_path), expected)
+
+
+def test_write_ogr_many_batches(con):
+    # Check with a non-trivial number of batches
+    random_geometries = con.funcs.table.sd_random_geometry(
+        "MultiLineString", 50000, seed=4837
+    )
+    random_geometries.to_view("pyogrio_test")
+    df = con.sql(
+        """
+        SELECT id, ST_SetCrs(geometry, 'EPSG:4326') AS geometry
+        FROM pyogrio_test
+        ORDER BY id
+        """
+    )
+    expected = df.to_pandas()
+
+    with tempfile.TemporaryDirectory() as td:
+        df.to_pyogrio(f"{td}/foofy.gpkg")
+        actual = geopandas.read_file(f"{td}/foofy.gpkg")
+        geopandas.testing.assert_geodataframe_equal(actual, expected)