paleolimbot commented on code in PR #44:
URL: https://github.com/apache/sedona-db/pull/44#discussion_r2337170853
##########
python/sedonadb/tests/test_context.py:
##########
@@ -63,7 +63,81 @@ def test_dynamic_object_stores():
schema = pa.schema(con.read_parquet(url))
assert schema.field("geometry").type.extension_name == "geoarrow.wkb"
- # Fresh context
+
+def test_read_parquet_options_parameter(con, geoarrow_data):
+ """Test the options parameter functionality for read_parquet()"""
+ test_file = geoarrow_data /
"quadrangles/files/quadrangles_100k_geo.parquet"
+
+ # Test 1: Backward compatibility - no options parameter
+ tab1 = con.read_parquet(test_file).to_arrow_table()
+ assert tab1["geometry"].type.extension_name == "geoarrow.wkb"
+
+ # Test 2: options=None (explicit None)
+ tab2 = con.read_parquet(test_file, options=None).to_arrow_table()
+ assert tab2["geometry"].type.extension_name == "geoarrow.wkb"
+ assert len(tab2) == len(tab1) # Should be identical
+
+ # Test 3: Empty options dictionary
+ tab3 = con.read_parquet(test_file, options={}).to_arrow_table()
+ assert tab3["geometry"].type.extension_name == "geoarrow.wkb"
+ assert len(tab3) == len(tab1) # Should be identical
+
+ # Test 4: Options with string values
+ tab4 = con.read_parquet(
+ test_file, options={"test.option": "value"}
+ ).to_arrow_table()
+ assert tab4["geometry"].type.extension_name == "geoarrow.wkb"
+ assert len(tab4) == len(
+ tab1
+ ) # Should be identical (option ignored but not errored)
+
+
+def test_read_parquet_s3_options_parameter():
+ """Test that S3 options are accepted without errors (using HTTP URL for
reliability)"""
con = sedonadb.connect()
- schema = pa.schema(con.sql(f"SELECT * FROM '{url}'"))
- assert schema.field("geometry").type.extension_name == "geoarrow.wkb"
+ url =
"https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0/example/files/example_geometry_geo.parquet"
+
+ # Test S3 option keys are accepted and processed correctly
+ s3_options = {
+ "aws.skip_signature": "true",
+ "aws.nosign": "true",
+ "aws.region": "us-west-2",
+ "aws.endpoint": "https://s3.amazonaws.com",
+ }
+
+ tab = con.read_parquet(url, options=s3_options).to_arrow_table()
+ assert len(tab) > 0
+ assert "geometry" in tab.schema.names
+
+
+def test_read_geoparquet_s3_anonymous_access():
+ """Test reading from a public S3 bucket geoparquet file with anonymous
access"""
+ con = sedonadb.connect()
+ s3_url = "s3://wherobots-examples/data/onboarding_1/nyc_buildings.parquet"
+
+ # Use aws.skip_signature with region for anonymous access
+ tab = con.read_parquet(
+ s3_url, options={"aws.skip_signature": True, "aws.region": "us-west-2"}
+ ).to_arrow_table()
+ assert len(tab) > 0
+ assert "geom" in tab.schema.names # This dataset uses 'geom' instead of
'geometry'
+
+
+def test_read_parquet_invalid_aws_option():
+ """Test that invalid AWS options are caught and provide helpful error
messages"""
+ con = sedonadb.connect()
+ url =
"https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0/example/files/example_geometry_geo.parquet"
Review Comment:
Should this be an S3 URL?
##########
python/sedonadb/src/context.rs:
##########
@@ -74,11 +74,33 @@ impl InternalContext {
&self,
py: Python<'py>,
table_paths: Vec<String>,
+ options: HashMap<String, PyObject>,
) -> Result<InternalDataFrame, PySedonaError> {
+ // Convert Python options to strings, filtering out None values
+ let rust_options: HashMap<String, String> = options
+ .into_iter()
+ .filter_map(|(k, v)| {
+ if v.is_none(py) {
+ None
+ } else {
+ v.bind(py)
+ .str()
+ .and_then(|s| s.extract())
+ .map(|s: String| (k, s))
+ .ok()
+ }
+ })
+ .collect();
+
+ let geo_options =
+
sedona_geoparquet::provider::GeoParquetReadOptions::from_table_options(rust_options)
+ .map_err(|e| {
+ PySedonaError::SedonaPython(format!("Invalid table
options: {}", e))
Review Comment:
```suggestion
PySedonaError::SedonaPython(format!("Invalid table
options: {e}"))
```
(Causes a lint error for newer rust versions)
##########
rust/sedona/src/context.rs:
##########
@@ -232,18 +231,21 @@ impl SedonaContext {
options: GeoParquetReadOptions<'_>,
) -> Result<DataFrame> {
let urls = table_paths.to_urls()?;
- let provider =
- match geoparquet_listing_table(&self.ctx, urls.clone(),
options.clone()).await {
- Ok(provider) => provider,
- Err(e) => {
- if urls.is_empty() {
- return Err(e);
- }
-
- ensure_object_store_registered(&mut self.ctx.state(),
urls[0].as_str()).await?;
- geoparquet_listing_table(&self.ctx, urls, options).await?
- }
- };
+
+ // Pre-register object store with our custom options before creating
GeoParquetReadOptions
+ if !urls.is_empty() {
+ use
crate::object_storage::ensure_object_store_registered_with_options;
+ // Extract the table options from GeoParquetReadOptions for object
store registration
+ let table_options_map =
options.table_options().cloned().unwrap_or_default();
+ ensure_object_store_registered_with_options(
+ &mut self.ctx.state(),
+ urls[0].as_str(),
+ Some(&table_options_map),
+ )
Review Comment:
I'm curious if this would prevent future object store accesses from
accessing credentials. For example, a user loads Overature Data with
`aws.nosign` and then wants to load something from a private bucket (e.g.,
managed cloud storage). (I don't think we have to solve that now, just
wondering 🙂 )
##########
rust/sedona/src/object_storage.rs:
##########
@@ -97,6 +122,11 @@ pub async fn ensure_object_store_registered(state: &mut
SessionState, name: &str
Ok(())
}
+// Backward compatibility wrapper
+pub async fn ensure_object_store_registered(state: &mut SessionState, name:
&str) -> Result<()> {
+ ensure_object_store_registered_with_options(state, name, None).await
+}
Review Comment:
Is this still needed?
##########
python/sedonadb/tests/test_context.py:
##########
@@ -63,7 +63,81 @@ def test_dynamic_object_stores():
schema = pa.schema(con.read_parquet(url))
assert schema.field("geometry").type.extension_name == "geoarrow.wkb"
- # Fresh context
+
+def test_read_parquet_options_parameter(con, geoarrow_data):
+ """Test the options parameter functionality for read_parquet()"""
+ test_file = geoarrow_data /
"quadrangles/files/quadrangles_100k_geo.parquet"
+
+ # Test 1: Backward compatibility - no options parameter
+ tab1 = con.read_parquet(test_file).to_arrow_table()
+ assert tab1["geometry"].type.extension_name == "geoarrow.wkb"
+
+ # Test 2: options=None (explicit None)
+ tab2 = con.read_parquet(test_file, options=None).to_arrow_table()
+ assert tab2["geometry"].type.extension_name == "geoarrow.wkb"
+ assert len(tab2) == len(tab1) # Should be identical
+
+ # Test 3: Empty options dictionary
+ tab3 = con.read_parquet(test_file, options={}).to_arrow_table()
+ assert tab3["geometry"].type.extension_name == "geoarrow.wkb"
+ assert len(tab3) == len(tab1) # Should be identical
+
+ # Test 4: Options with string values
+ tab4 = con.read_parquet(
+ test_file, options={"test.option": "value"}
+ ).to_arrow_table()
+ assert tab4["geometry"].type.extension_name == "geoarrow.wkb"
+ assert len(tab4) == len(
+ tab1
+ ) # Should be identical (option ignored but not errored)
+
+
+def test_read_parquet_s3_options_parameter():
+ """Test that S3 options are accepted without errors (using HTTP URL for
reliability)"""
con = sedonadb.connect()
- schema = pa.schema(con.sql(f"SELECT * FROM '{url}'"))
- assert schema.field("geometry").type.extension_name == "geoarrow.wkb"
+ url =
"https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0/example/files/example_geometry_geo.parquet"
Review Comment:
Should this be
`"s3://wherobots-examples/data/onboarding_1/nyc_buildings.parquet"`?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]