b4l commented on code in PR #471:
URL: https://github.com/apache/sedona-db/pull/471#discussion_r2693586648


##########
rust/sedona-pointcloud/src/laz/metadata.rs:
##########
@@ -0,0 +1,403 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::{any::Any, collections::HashMap, error::Error, io::Cursor, ops::Range, sync::Arc};
+
+use arrow_schema::{DataType, Schema, SchemaRef};
+use datafusion_common::{
+    error::DataFusionError, scalar::ScalarValue, stats::Precision, ColumnStatistics, Statistics,
+};
+use datafusion_execution::cache::cache_manager::{FileMetadata, FileMetadataCache};
+use las::{
+    raw::{Header as RawHeader, Vlr as RawVlr},
+    Builder, Header, Vlr,
+};
+use laz::laszip::ChunkTable;
+use object_store::{ObjectMeta, ObjectStore};
+
+use crate::laz::{options::LazTableOptions, schema::schema_from_header};
+
+/// Laz chunk metadata
+#[derive(Debug, Clone)]
+pub struct ChunkMeta {
+    pub num_points: u64,
+    pub point_offset: u64,
+    pub byte_range: Range<u64>,
+}
+
+/// Laz metadata
+#[derive(Debug, Clone)]
+pub struct LazMetadata {
+    pub header: Arc<Header>,
+    pub chunk_table: Vec<ChunkMeta>,
+    pub extra_attributes: Arc<Vec<ExtraAttribute>>,
+}
+
+impl FileMetadata for LazMetadata {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn memory_size(&self) -> usize {
+        // TODO: proper size
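+        // Note: std::mem::size_of_val only measures the struct itself; the
+        // heap allocations behind the Arc and Vec fields are not counted.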
+        std::mem::size_of_val(self)
+    }
+
+    fn extra_info(&self) -> HashMap<String, String> {
+        HashMap::new()
+    }
+}
+
+/// Reader for laz file metadata in object storage.
+pub struct LazMetadataReader<'a> {
+    store: &'a dyn ObjectStore,
+    object_meta: &'a ObjectMeta,
+    file_metadata_cache: Option<Arc<dyn FileMetadataCache>>,
+    options: LazTableOptions,
+}
+
+impl<'a> LazMetadataReader<'a> {
+    pub fn new(store: &'a dyn ObjectStore, object_meta: &'a ObjectMeta) -> Self {
+        Self {
+            store,
+            object_meta,
+            file_metadata_cache: None,
+            options: Default::default(),
+        }
+    }
+
+    /// Set the file metadata cache
+    pub fn with_file_metadata_cache(
+        mut self,
+        file_metadata_cache: Option<Arc<dyn FileMetadataCache>>,
+    ) -> Self {
+        self.file_metadata_cache = file_metadata_cache;
+        self
+    }
+
+    /// Set the table options
+    pub fn with_options(mut self, options: LazTableOptions) -> Self {
+        self.options = options;
+        self
+    }
+
+    /// Fetch header
+    pub async fn fetch_header(&self) -> Result<Header, DataFusionError> {
+        fetch_header(self.store, self.object_meta, false)
+            .await
+            .map_err(DataFusionError::External)
+    }
+
+    /// Fetch laz metadata from the remote object store
+    pub async fn fetch_metadata(&self) -> Result<Arc<LazMetadata>, DataFusionError> {
+        let Self {
+            store,
+            object_meta,
+            file_metadata_cache,
+            options: _,
+        } = self;
+
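+        // Check the cache first; on a hit we avoid re-reading the header and
+        // chunk table from object storage.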
+        if let Some(las_file_metadata) = file_metadata_cache
+            .as_ref()
+            .and_then(|file_metadata_cache| file_metadata_cache.get(object_meta))
+            .and_then(|file_metadata| {
+                file_metadata
+                    .as_any()
+                    .downcast_ref::<LazMetadata>()
+                    .map(|laz_file_metadata| Arc::new(laz_file_metadata.to_owned()))
+            })
+        {
+            return Ok(las_file_metadata);
+        }
+
+        let header = self.fetch_header().await?;
+        let extra_attributes = extra_bytes_attributes(&header);
+        let chunk_table = chunk_table(*store, object_meta, &header)
+            .await
+            .map_err(DataFusionError::External)?;
+
+        let metadata = Arc::new(LazMetadata {
+            header: Arc::new(header),
+            chunk_table,
+            extra_attributes: Arc::new(extra_attributes),
+        });
+
+        if let Some(file_metadata_cache) = file_metadata_cache {
+            file_metadata_cache.put(object_meta, metadata.clone());
+        }
+
+        Ok(metadata)
+    }
+
+    /// Read and parse the schema of the laz file
+    pub async fn fetch_schema(&mut self) -> Result<Schema, DataFusionError> {
+        let metadata = self.fetch_metadata().await?;
+
+        let schema = schema_from_header(
+            &metadata.header,
+            self.options.point_encoding,
+            self.options.extra_bytes,
+        );
+
+        Ok(schema)
+    }
+
+    /// Fetch the metadata from the laz file via [`Self::fetch_metadata`] and
+    /// extract the statistics from it
+    pub async fn fetch_statistics(
+        &self,
+        table_schema: &SchemaRef,
+    ) -> Result<Statistics, DataFusionError> {
+        let metadata = self.fetch_metadata().await?;
+
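+        // The exact row count comes from the header; the total byte size is
+        // the sum of the compressed chunk lengths in the chunk table.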
+        let mut statistics = Statistics::new_unknown(table_schema)
+            .with_num_rows(Precision::Exact(metadata.header.number_of_points() as usize))
+            .with_total_byte_size(Precision::Exact(
+                metadata
+                    .chunk_table
+                    .iter()
+                    .map(|meta| meta.byte_range.end - meta.byte_range.start)
+                    .sum::<u64>() as usize,
+            ));
+
+        let bounds = metadata.header.bounds();
+        for field in table_schema.fields() {
+            let cs = match field.name().as_str() {

Review Comment:
   Deriving the bounding box from the header is sufficient to prune files for
now. Maybe we can address more elaborate statistics extraction and handling in
a follow-up?
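
   For reference, a minimal sketch of the kind of bbox-to-statistics mapping
the truncated `match` above could perform, assuming `Float64` coordinate
columns named `x`/`y`/`z` (the helper name and column names are illustrative,
not taken from the PR):

```rust
use datafusion_common::{scalar::ScalarValue, stats::Precision, ColumnStatistics};

// Hypothetical helper: map the LAS header bounds onto per-column min/max
// statistics so that file-level pruning can use the bounding box.
fn coordinate_statistics(name: &str, bounds: &las::Bounds) -> ColumnStatistics {
    let (min, max) = match name {
        "x" => (bounds.min.x, bounds.max.x),
        "y" => (bounds.min.y, bounds.max.y),
        "z" => (bounds.min.z, bounds.max.z),
        // Non-coordinate columns get no min/max from the header.
        _ => return ColumnStatistics::new_unknown(),
    };
    ColumnStatistics {
        min_value: Precision::Exact(ScalarValue::Float64(Some(min))),
        max_value: Precision::Exact(ScalarValue::Float64(Some(max))),
        ..ColumnStatistics::new_unknown()
    }
}
```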


