b4l commented on code in PR #471:
URL: https://github.com/apache/sedona-db/pull/471#discussion_r2751289241


##########
rust/sedona-pointcloud/src/laz/metadata.rs:
##########
@@ -0,0 +1,451 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::{
+    any::Any,
+    collections::HashMap,
+    error::Error,
+    io::{Cursor, Read},
+    ops::Range,
+    sync::Arc,
+};
+
+use arrow_schema::{DataType, Schema, SchemaRef};
+use datafusion_common::{
+    error::DataFusionError, scalar::ScalarValue, stats::Precision, ColumnStatistics, Statistics,
+};
+use datafusion_execution::cache::cache_manager::{FileMetadata, FileMetadataCache};
+use las::{
+    raw::{Header as RawHeader, Vlr as RawVlr},
+    Builder, Header, Vlr,
+};
+use laz::laszip::ChunkTable;
+use object_store::{ObjectMeta, ObjectStore};
+
+use crate::laz::{options::LazTableOptions, schema::try_schema_from_header};
+
+/// Laz chunk metadata
+#[derive(Debug, Clone)]
+pub struct ChunkMeta {
+    pub num_points: u64,
+    pub point_offset: u64,
+    pub byte_range: Range<u64>,
+}
+
+/// Laz metadata
+#[derive(Debug, Clone)]
+pub struct LazMetadata {
+    pub header: Arc<Header>,
+    pub chunk_table: Vec<ChunkMeta>,
+    pub extra_attributes: Arc<Vec<ExtraAttribute>>,
+}
+
+impl FileMetadata for LazMetadata {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn memory_size(&self) -> usize {
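+        // Approximate heap usage: only the two Vec allocations are counted;
+        // the Header itself is not included in the estimate.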
+        self.chunk_table.capacity() * std::mem::size_of::<ChunkMeta>()
+            + self.extra_attributes.capacity() * std::mem::size_of::<ExtraAttribute>()
+    }
+
+    fn extra_info(&self) -> HashMap<String, String> {
+        HashMap::new()
+    }
+}
+
+/// Reader for laz file metadata in object storage.
+pub struct LazMetadataReader<'a> {
+    store: &'a dyn ObjectStore,
+    object_meta: &'a ObjectMeta,
+    file_metadata_cache: Option<Arc<dyn FileMetadataCache>>,
+    options: LazTableOptions,
+}
+
+impl<'a> LazMetadataReader<'a> {
+    pub fn new(store: &'a dyn ObjectStore, object_meta: &'a ObjectMeta) -> Self {
+        Self {
+            store,
+            object_meta,
+            file_metadata_cache: None,
+            options: Default::default(),
+        }
+    }
+
+    /// Set the file metadata cache
+    pub fn with_file_metadata_cache(
+        mut self,
+        file_metadata_cache: Option<Arc<dyn FileMetadataCache>>,
+    ) -> Self {
+        self.file_metadata_cache = file_metadata_cache;
+        self
+    }
+
+    /// Set the table options
+    pub fn with_options(mut self, options: LazTableOptions) -> Self {
+        self.options = options;
+        self
+    }
+
+    /// Fetch the header, including VLRs and EVLRs
+    pub async fn fetch_header(&self) -> Result<Header, DataFusionError> {
+        fetch_header(self.store, self.object_meta)
+            .await
+            .map_err(DataFusionError::External)
+    }
+
+    /// Fetch laz metadata from the remote object store
+    pub async fn fetch_metadata(&self) -> Result<Arc<LazMetadata>, DataFusionError> {
+        let Self {
+            store,
+            object_meta,
+            file_metadata_cache,
+            options: _,
+        } = self;
+
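+        // Return a cached copy if this file's metadata has already been parsed.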
+        if let Some(las_file_metadata) = file_metadata_cache
+            .as_ref()
+            .and_then(|file_metadata_cache| file_metadata_cache.get(object_meta))
+            .and_then(|file_metadata| {
+                file_metadata
+                    .as_any()
+                    .downcast_ref::<LazMetadata>()
+                    .map(|laz_file_metadata| Arc::new(laz_file_metadata.to_owned()))
+            })
+        {
+            return Ok(las_file_metadata);
+        }
+
+        let header = self.fetch_header().await?;
+        let extra_attributes = extra_bytes_attributes(&header)?;
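+        // Derive per-chunk point counts and byte ranges from the LASzip chunk table.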
+        let chunk_table = chunk_table(*store, object_meta, &header).await?;
+
+        let metadata = Arc::new(LazMetadata {
+            header: Arc::new(header),
+            chunk_table,
+            extra_attributes: Arc::new(extra_attributes),
+        });
+
+        if let Some(file_metadata_cache) = file_metadata_cache {
+            file_metadata_cache.put(object_meta, metadata.clone());
+        }
+
+        Ok(metadata)
+    }
+
+    /// Read and parse the schema of the laz file
+    pub async fn fetch_schema(&mut self) -> Result<Schema, DataFusionError> {
+        let metadata = self.fetch_metadata().await?;
+
+        let schema = try_schema_from_header(
+            &metadata.header,
+            self.options.point_encoding,
+            self.options.extra_bytes,
+        )?;
+
+        Ok(schema)
+    }
+
+    /// Fetch the metadata from the laz file via [`Self::fetch_metadata`] and
+    /// extract the statistics it contains
+    pub async fn fetch_statistics(
+        &self,
+        table_schema: &SchemaRef,
+    ) -> Result<Statistics, DataFusionError> {
+        let metadata = self.fetch_metadata().await?;
+
+        let mut statistics = Statistics::new_unknown(table_schema)
+            .with_num_rows(Precision::Exact(metadata.header.number_of_points() as usize))
+            .with_total_byte_size(Precision::Exact(
+                metadata
+                    .chunk_table
+                    .iter()
+                    .map(|meta| meta.byte_range.end - meta.byte_range.start)
+                    .sum::<u64>() as usize,
+            ));
+
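+        // The LAS header stores exact min/max bounds, which map directly onto
+        // column statistics for the x, y, and z coordinate columns.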
+        let bounds = metadata.header.bounds();
+        for field in table_schema.fields() {
+            let cs = match field.name().as_str() {
+                "x" => ColumnStatistics::new_unknown()
+                    .with_min_value(Precision::Exact(ScalarValue::Float64(Some(bounds.min.x))))
+                    .with_max_value(Precision::Exact(ScalarValue::Float64(Some(bounds.max.x))))
+                    .with_null_count(Precision::Exact(0)),
+                "y" => ColumnStatistics::new_unknown()
+                    .with_min_value(Precision::Exact(ScalarValue::Float64(Some(bounds.min.y))))
+                    .with_max_value(Precision::Exact(ScalarValue::Float64(Some(bounds.max.y))))
+                    .with_null_count(Precision::Exact(0)),
+                "z" => ColumnStatistics::new_unknown()
+                    .with_min_value(Precision::Exact(ScalarValue::Float64(Some(bounds.min.z))))
+                    .with_max_value(Precision::Exact(ScalarValue::Float64(Some(bounds.max.z))))
+                    .with_null_count(Precision::Exact(0)),
+                _ => ColumnStatistics::new_unknown(),
+            };
+
+            statistics = statistics.add_column_statistics(cs);
+        }
+
+        Ok(statistics)
+    }
+}
+
+pub(crate) async fn fetch_header(
+    store: &(impl ObjectStore + ?Sized),
+    object_meta: &ObjectMeta,
+) -> Result<Header, Box<dyn Error + Send + Sync>> {
+    let location = &object_meta.location;
+
+    // Header
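+    // 375 bytes covers the largest (LAS 1.4) header; the actual header_size
+    // is taken from the parsed raw header below.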
+    let bytes = store.get_range(location, 0..375).await?;
+    let reader = Cursor::new(bytes);
+    let raw_header = RawHeader::read_from(reader)?;
+
+    let header_size = raw_header.header_size as u64;
+    let offset_to_point_data = raw_header.offset_to_point_data as u64;
+    let num_vlr = raw_header.number_of_variable_length_records;
+    let evlr = raw_header.evlr;
+
+    let mut builder = Builder::new(raw_header)?;
+
+    // VLRs
+    let bytes = store
+        .get_range(location, header_size..offset_to_point_data)
+        .await?;
+    let mut reader = Cursor::new(bytes);
+
+    for _ in 0..num_vlr {
+        let vlr = RawVlr::read_from(&mut reader, false).map(Vlr::new)?;
+        builder.vlrs.push(vlr);
+    }
+
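+    // Preserve any bytes between the last VLR and the start of point data.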
+    reader.read_to_end(&mut builder.vlr_padding)?;
+
+    // EVLRs
+    if let Some(evlr) = evlr {
+        let mut start = evlr.start_of_first_evlr;
+
+        for _ in 0..evlr.number_of_evlrs {
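+            // Each EVLR begins with a fixed 60-byte header.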
+            let mut end = start + 60;
+
+            let bytes = store.get_range(location, start..end).await?;
+
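+            // Bytes 20..28 of the EVLR header hold record_length_after_header
+            // (little-endian u64): the size of the payload that follows.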
+            end += u64::from_le_bytes(bytes[20..28].try_into()?);
+
+            let bytes = store.get_range(location, start..end).await?;
+            let mut reader = Cursor::new(bytes);
+            let evlr = RawVlr::read_from(&mut reader, true).map(Vlr::new)?;
+
+            builder.evlrs.push(evlr);
+
+            start = end;
+        }
+    }
+
+    Ok(builder.into_header()?)
+}
+
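+/// An "extra bytes" point attribute, as described by an Extra Bytes VLR.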
+#[derive(Debug, Clone, PartialEq)]
+pub struct ExtraAttribute {
+    pub data_type: DataType,
+    pub no_data: Option<[u8; 8]>,
+    pub scale: Option<f64>,
+    pub offset: Option<f64>,
+}
+
+pub(crate) fn extra_bytes_attributes(
+    header: &Header,
+) -> Result<Vec<ExtraAttribute>, Box<dyn Error + Send + Sync>> {

Review Comment:
   I would keep it as is if you don't mind.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]
