james-willis commented on code in PR #858: URL: https://github.com/apache/sedona-db/pull/858#discussion_r3269074474
########## rust/sedona-raster-zarr/src/loader.rs: ########## @@ -0,0 +1,513 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Zarr group → N-D raster `StructArray` entry points. +//! +//! Both `group_to_indb_rasters` and `group_to_outdb_rasters` produce the +//! same row shape: one raster row per chunk position, with one band per +//! array in the group. They differ only in how each row's pixel bytes +//! are delivered: +//! +//! - **InDb** — every chunk is fetched eagerly and copied into the +//! Arrow `data` column. Heavy for large datacubes; intended for +//! snapshots. +//! - **OutDb** — `data` is left empty; each band's `outdb_uri` carries a +//! chunk anchor (`zarr://<store-uri>/<array-path>#chunk=i0,i1,...`). +//! Byte resolution awaits the format-keyed dispatch work in a +//! follow-up PR. 
+ +use std::sync::Arc; + +use arrow_array::StructArray; +use arrow_schema::ArrowError; +use sedona_common::sedona_internal_datafusion_err; +use sedona_raster::builder::RasterBuilder; +use sedona_schema::raster::BandDataType; +use zarrs::array::{Array, ArrayBytes}; +use zarrs::group::Group; +use zarrs_filesystem::FilesystemStore; + +use crate::dtype::zarr_to_band_data_type; +use crate::geozarr::GroupGeoMetadata; +use crate::source_uri::{build_chunk_anchor, group_uri_to_filesystem_path}; + +/// Open a Zarr group and eagerly fetch every chunk's bytes into the +/// returned `StructArray`. Each row holds one chunk position's data +/// across every array in the group. +pub fn group_to_indb_rasters(group_uri: &str) -> Result<StructArray, ArrowError> { + build_rasters(group_uri, Mode::InDb) +} + +/// Open a Zarr group and emit one row per chunk position with chunk-anchor +/// URIs in each band's `outdb_uri`. The `data` column is empty; bytes +/// resolve on demand through whichever OutDb loader is registered for +/// the `zarr` format. +pub fn group_to_outdb_rasters(group_uri: &str) -> Result<StructArray, ArrowError> { + build_rasters(group_uri, Mode::OutDb) +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum Mode { + InDb, + OutDb, +} + +/// Per-array metadata extracted once at group open and reused for every +/// chunk position. Caching this avoids re-reading Zarr metadata for each +/// of the (potentially thousands of) chunk rows. +struct ArrayInfo { + /// Array path within the store, used to build chunk anchor URIs and + /// surface in band names. + path: String, + /// Open zarrs handle. + array: Array<FilesystemStore>, + /// SedonaDB BandDataType corresponding to this array's zarrs dtype. + data_type: BandDataType, + /// Dimension names in array order. Required to be `Some(_)` for every + /// dim; missing names error at validation time. + dim_names: Vec<String>, + /// Inner chunk grid shape, one entry per dimension. 
Used to enumerate + /// chunk positions and validated to match across arrays. + chunk_grid_shape: Vec<u64>, + /// Chunk shape (elements per chunk per dim). Same for every chunk + /// position in Phase 1 (no ragged final chunks emitted as separate + /// short rows). + chunk_shape: Vec<u64>, + /// Encoded fill value in native-endian byte representation, for the + /// `nodata` field. None when the array has no fill value declared. + nodata: Option<Vec<u8>>, +} + +fn build_rasters(group_uri: &str, mode: Mode) -> Result<StructArray, ArrowError> { + let fs_path = group_uri_to_filesystem_path(group_uri)?; + let store = FilesystemStore::new(&fs_path).map_err(|e| { + ArrowError::ExternalError(Box::new(sedona_internal_datafusion_err!( + "failed to open Zarr filesystem store at {}: {e}", + fs_path.display() + ))) + })?; + let storage: Arc<FilesystemStore> = Arc::new(store); + + let group = Group::open(storage.clone(), "/").map_err(|e| { + ArrowError::ExternalError(Box::new(sedona_internal_datafusion_err!( + "failed to open Zarr group at {group_uri}: {e}" + ))) + })?; + + let geo = GroupGeoMetadata::from_attributes(group.attributes())?; + + let arrays = group.child_arrays().map_err(|e| { + ArrowError::ExternalError(Box::new(sedona_internal_datafusion_err!( + "failed to enumerate child arrays in {group_uri}: {e}" + ))) + })?; + if arrays.is_empty() { + return Err(ArrowError::InvalidArgumentError(format!( + "Zarr group at {group_uri} has no child arrays" + ))); + } + + let array_infos = collect_array_infos(arrays)?; + validate_group_constraints(&array_infos)?; + + // Spatial-dim resolution. Phase 1 supports two configurations: + // - dim_names ends with ["y", "x"] (canonical for georeferenced + // 2-D and time-series rasters); the spatial extent is the chunk's + // last two dims. + // - `spatial:dims` attribute on the group explicitly names them. + // Anything else errors with a clear message — silently picking dims + // would produce wrong per-row transforms. 
+ let spatial_dim_indices = + resolve_spatial_dim_indices(&array_infos[0].dim_names, geo.spatial_dims.as_deref())?; + let spatial_dims_names: Vec<&str> = spatial_dim_indices + .iter() + .map(|&i| array_infos[0].dim_names[i].as_str()) + .collect(); + let chunk_spatial_shape: Vec<i64> = spatial_dim_indices + .iter() + .map(|&i| array_infos[0].chunk_shape[i] as i64) + .collect(); + + let group_transform = geo.transform.unwrap_or([0.0, 1.0, 0.0, 0.0, 0.0, -1.0]); Review Comment: Currently the type implementation does not allow null geotransforms, so we need to provide this potentially misleading value here when geozarr metadata is not present. Haven't considered what to do about this. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
