paleolimbot commented on code in PR #630:
URL: https://github.com/apache/sedona-db/pull/630#discussion_r2818049102


##########
rust/sedona-schema/src/crs.rs:
##########
@@ -157,6 +151,108 @@ impl CachedCrsToSRIDMapping {
     }
 }
 
+/// Batch-local cache for converting integer SRIDs to CRS strings.
+///
+/// Maps SRID integers to their CRS string representation with caching to avoid
+/// repeated validation/deserialization of the same SRID within a batch:

Review Comment:
   ```suggestion
   /// Cache for converting integer SRIDs to CRS strings.
   ///
   /// Maps SRID integers to their CRS string representation with caching to 
avoid
   /// repeated validation/deserialization of the same SRID:
   ```
   
   (Currently it's batch-local but it doesn't have to be)



##########
rust/sedona-schema/src/crs.rs:
##########
@@ -157,6 +151,108 @@ impl CachedCrsToSRIDMapping {
     }
 }
 
+/// Batch-local cache for converting integer SRIDs to CRS strings.
+///
+/// Maps SRID integers to their CRS string representation with caching to avoid
+/// repeated validation/deserialization of the same SRID within a batch:
+/// - `0` → `None` (no CRS)
+/// - `4326` → `Some("OGC:CRS84")`
+/// - other → `Some("EPSG:{srid}")`, validated once via the caller-provided 
closure
+#[derive(Default)]
+pub struct CachedSRIDToCrs {
+    cache: HashMap<i64, Option<String>>,
+}
+
+impl CachedSRIDToCrs {
+    /// Create a new CachedSRIDToCrs with an empty cache.
+    pub fn new() -> Self {
+        Self {
+            cache: HashMap::new(),
+        }
+    }
+
+    /// Create a new CachedSRIDToCrs with a pre-allocated cache.
+    pub fn with_capacity(capacity: usize) -> Self {
+        Self {
+            cache: HashMap::with_capacity(capacity),
+        }
+    }
+
+    /// Get the CRS string for a given SRID, using the cache to avoid repeated 
validation.
+    pub fn get_crs(&mut self, srid: i64) -> Result<Option<String>> {
+        if let Some(cached) = self.cache.get(&srid) {
+            return Ok(cached.clone());
+        }
+
+        let result = if srid == 0 {
+            None
+        } else if srid == 4326 {
+            Some("OGC:CRS84".to_string())
+        } else {
+            let auth_code = format!("EPSG:{srid}");
+            Some(auth_code)
+        };
+
+        self.cache.insert(srid, result.clone());
+        Ok(result)
+    }
+}
+
+/// Batch-local cache for normalizing CRS strings to their abbreviated form.
+///
+/// Maps CRS input strings to their abbreviated `authority:code` representation
+/// with caching to avoid repeated deserialization of the same CRS string 
within a batch:
+/// - `"0"` or `""` → `None` (no CRS)
+/// - other → deserialized and abbreviated to `authority:code` if possible

Review Comment:
   ```suggestion
   /// Cache for normalizing CRS strings to their abbreviated form.
   ///
   /// Maps CRS input strings to their abbreviated `authority:code` 
representation
   /// with caching to avoid repeated deserialization of the same CRS string:
   /// - `"0"` or `""` → `None` (no CRS)
   /// - other → deserialized and abbreviated to `authority:code` if possible
   ```



##########
rust/sedona-raster-functions/src/rs_setsrid.rs:
##########
@@ -0,0 +1,722 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use arrow_array::{Array, ArrayRef, StringViewArray, StructArray};
+use arrow_buffer::NullBuffer;
+use arrow_schema::DataType;
+use datafusion_common::cast::{as_int64_array, as_string_view_array};
+use datafusion_common::error::Result;
+use datafusion_common::{exec_err, DataFusionError, ScalarValue};
+use datafusion_expr::{
+    scalar_doc_sections::DOC_SECTION_OTHER, ColumnarValue, Documentation, 
Volatility,
+};
+use sedona_expr::scalar_udf::{SedonaScalarKernel, SedonaScalarUDF};
+use sedona_geometry::transform::CrsEngine;
+use sedona_schema::crs::{CachedCrsNormalization, CachedSRIDToCrs};
+use sedona_schema::datatypes::SedonaType;
+use sedona_schema::matchers::ArgMatcher;
+use sedona_schema::raster::{raster_indices, RasterSchema};
+
+/// RS_SetSRID() scalar UDF implementation
+///
+/// An implementation of RS_SetSRID providing a scalar function implementation
+/// based on an optional [CrsEngine]. If provided, it will be used to validate
+/// the provided SRID (otherwise, all SRID input is applied without 
validation).
+pub fn rs_set_srid_with_engine_udf(
+    engine: Option<Arc<dyn CrsEngine + Send + Sync>>,
+) -> SedonaScalarUDF {
+    SedonaScalarUDF::new(
+        "rs_setsrid",
+        vec![Arc::new(RsSetSrid { engine })],
+        Volatility::Immutable,
+        Some(rs_set_srid_doc()),
+    )
+}
+
+/// RS_SetCRS() scalar UDF implementation
+///
+/// An implementation of RS_SetCRS providing a scalar function implementation
+/// based on an optional [CrsEngine]. If provided, it will be used to validate
+/// the provided CRS (otherwise, all CRS input is applied without validation).
+pub fn rs_set_crs_with_engine_udf(
+    engine: Option<Arc<dyn CrsEngine + Send + Sync>>,
+) -> SedonaScalarUDF {
+    SedonaScalarUDF::new(
+        "rs_setcrs",
+        vec![Arc::new(RsSetCrs { engine })],
+        Volatility::Immutable,
+        Some(rs_set_crs_doc()),
+    )
+}
+
+/// RS_SetSRID() scalar UDF implementation without CRS validation
+///
+/// See [rs_set_srid_with_engine_udf] for a validating version of this function
+pub fn rs_set_srid_udf() -> SedonaScalarUDF {
+    rs_set_srid_with_engine_udf(None)
+}
+
+/// RS_SetCRS() scalar UDF implementation without CRS validation
+///
+/// See [rs_set_crs_with_engine_udf] for a validating version of this function
+pub fn rs_set_crs_udf() -> SedonaScalarUDF {
+    rs_set_crs_with_engine_udf(None)
+}
+
+fn rs_set_srid_doc() -> Documentation {
+    Documentation::builder(
+        DOC_SECTION_OTHER,
+        "Set the spatial reference system identifier (SRID) of the 
raster".to_string(),
+        "RS_SetSRID(raster: Raster, srid: Integer)".to_string(),
+    )
+    .with_argument("raster", "Raster: Input raster")
+    .with_argument("srid", "Integer: EPSG code to set (e.g., 4326)")
+    .with_sql_example("SELECT RS_SetSRID(RS_Example(), 3857)".to_string())
+    .build()
+}
+
+fn rs_set_crs_doc() -> Documentation {
+    Documentation::builder(
+        DOC_SECTION_OTHER,
+        "Set the coordinate reference system (CRS) of the raster".to_string(),
+        "RS_SetCRS(raster: Raster, crs: String)".to_string(),
+    )
+    .with_argument("raster", "Raster: Input raster")
+    .with_argument(
+        "crs",
+        "String: Coordinate reference system identifier (e.g., 'OGC:CRS84', 
'EPSG:4326')",
+    )
+    .with_sql_example("SELECT RS_SetCRS(RS_Example(), 
'EPSG:3857')".to_string())
+    .build()
+}
+
+// ---------------------------------------------------------------------------
+// RS_SetSRID kernel
+// ---------------------------------------------------------------------------
+
+#[derive(Debug)]
+struct RsSetSrid {
+    engine: Option<Arc<dyn CrsEngine + Send + Sync>>,
+}
+
+impl SedonaScalarKernel for RsSetSrid {
+    fn return_type(&self, args: &[SedonaType]) -> Result<Option<SedonaType>> {
+        let matcher = ArgMatcher::new(
+            vec![ArgMatcher::is_raster(), ArgMatcher::is_integer()],
+            SedonaType::Raster,
+        );
+        matcher.match_args(args)
+    }
+
+    fn invoke_batch(
+        &self,
+        _arg_types: &[SedonaType],
+        args: &[ColumnarValue],
+    ) -> Result<ColumnarValue> {
+        let raster_arg = &args[0];
+        let srid_arg = &args[1];
+
+        let input_nulls = extract_input_nulls(srid_arg);
+
+        // Convert SRID integer(s) to CRS string(s)
+        let crs_columnar = srid_to_crs_columnar(srid_arg, 
self.engine.as_ref())?;
+
+        replace_raster_crs(raster_arg, &crs_columnar, input_nulls)
+    }
+}
+
+// ---------------------------------------------------------------------------
+// RS_SetCRS kernel
+// ---------------------------------------------------------------------------
+
+#[derive(Debug)]
+struct RsSetCrs {
+    engine: Option<Arc<dyn CrsEngine + Send + Sync>>,
+}
+
+impl SedonaScalarKernel for RsSetCrs {
+    fn return_type(&self, args: &[SedonaType]) -> Result<Option<SedonaType>> {
+        let matcher = ArgMatcher::new(
+            vec![ArgMatcher::is_raster(), ArgMatcher::is_string()],
+            SedonaType::Raster,
+        );
+        matcher.match_args(args)
+    }
+
+    fn invoke_batch(
+        &self,
+        _arg_types: &[SedonaType],
+        args: &[ColumnarValue],
+    ) -> Result<ColumnarValue> {
+        let raster_arg = &args[0];
+        let crs_arg = &args[1];
+
+        let input_nulls = extract_input_nulls(crs_arg);
+
+        // Normalize the CRS string(s) — abbreviate PROJJSON to 
authority:code, map "0" to null
+        let crs_columnar = normalize_crs_columnar(crs_arg, 
self.engine.as_ref())?;
+
+        replace_raster_crs(raster_arg, &crs_columnar, input_nulls)
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Core: zero-copy CRS column swap
+// ---------------------------------------------------------------------------
+
+/// Replace the CRS column of a raster StructArray with a new CRS value.
+///
+/// This is a zero-copy operation for the metadata and bands columns:
+/// we clone the Arc pointers for columns 0 (metadata) and 2 (bands),
+/// and only rebuild column 1 (CRS) from the provided value.
+///
+/// When `input_nulls` is provided, rows where the original SRID/CRS input was
+/// null will have the entire raster nulled out (not just the CRS column).
+fn replace_raster_crs(
+    raster_arg: &ColumnarValue,
+    crs_array: &StringViewArray,
+    input_nulls: Option<NullBuffer>,
+) -> Result<ColumnarValue> {
+    match raster_arg {
+        ColumnarValue::Array(raster_array) => {
+            let raster_struct = raster_array
+                .as_any()
+                .downcast_ref::<StructArray>()
+                .ok_or_else(|| {
+                    datafusion_common::DataFusionError::Internal(
+                        "Expected StructArray for raster data".to_string(),
+                    )
+                })?;
+
+            let num_rows = raster_struct.len();
+            let new_crs: ArrayRef = broadcast_string_view(crs_array, num_rows);
+            let new_struct = swap_crs_column(raster_struct, new_crs)?;
+
+            let input_nulls = input_nulls.map(|nulls| {
+                if nulls.len() == 1 && num_rows != 1 {
+                    if nulls.is_valid(0) {
+                        NullBuffer::new_valid(num_rows)
+                    } else {
+                        NullBuffer::new_null(num_rows)
+                    }
+                } else {
+                    nulls
+                }
+            });
+
+            // Merge input nulls: rows where the SRID/CRS input was null 
become null rasters
+            let merged_nulls = NullBuffer::union(new_struct.nulls(), 
input_nulls.as_ref());
+            let new_struct = StructArray::new(
+                RasterSchema::fields(),
+                new_struct.columns().to_vec(),
+                merged_nulls,
+            );
+
+            Ok(ColumnarValue::Array(Arc::new(new_struct)))
+        }
+        ColumnarValue::Scalar(ScalarValue::Struct(arc_struct)) => {
+            let new_crs: ArrayRef = Arc::new(crs_array.clone());
+            let new_struct = swap_crs_column(arc_struct.as_ref(), new_crs)?;
+
+            // Merge input nulls: null SRID/CRS input produces a null raster
+            let merged_nulls = NullBuffer::union(new_struct.nulls(), 
input_nulls.as_ref());
+            let new_struct = StructArray::new(
+                RasterSchema::fields(),
+                new_struct.columns().to_vec(),
+                merged_nulls,
+            );
+
+            Ok(ColumnarValue::Scalar(ScalarValue::Struct(Arc::new(
+                new_struct,
+            ))))
+        }
+        ColumnarValue::Scalar(ScalarValue::Null) => 
Ok(ColumnarValue::Scalar(ScalarValue::Null)),
+        _ => exec_err!("Expected raster (Struct) input for 
RS_SetSRID/RS_SetCRS"),
+    }
+}
+
+/// Broadcast a `StringViewArray` to a target length.
+///
+/// If the array already has the target length, it is returned as-is (clone of 
Arc).
+/// Otherwise the array must have length 1, and its single value is repeated.
+fn broadcast_string_view(array: &StringViewArray, len: usize) -> ArrayRef {
+    if array.len() == len {
+        return Arc::new(array.clone());
+    }
+    debug_assert_eq!(array.len(), 1);
+    if array.is_null(0) {
+        Arc::new(StringViewArray::new_null(len))
+    } else {
+        let value = array.value(0);
+        Arc::new(std::iter::repeat_n(Some(value), 
len).collect::<StringViewArray>())

Review Comment:
   I think there is an optimization for string views with respect to appending 
repeated values:
   
   
https://github.com/apache/datafusion/blob/6798dff0d7e1c52d6bea74b141d4df6ad9cc1516/datafusion/common/src/scalar/mod.rs#L3120-L3127



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to