Copilot commented on code in PR #599:
URL: https://github.com/apache/hudi-rs/pull/599#discussion_r3197889256


##########
crates/core/src/table/fs_view.rs:
##########
@@ -27,6 +27,8 @@ use crate::config::HudiConfigs;
 use crate::config::table::BaseFileFormatValue;
 use crate::config::table::HudiTableConfig::BaseFileFormat;
 use crate::file_group::FileGroup;
+use crate::file_group::base_file::parquet::ParquetBaseFileReader;
+use crate::file_group::base_file::reader::BaseFileReader;

Review Comment:
   `BaseFileReader` is imported but not used in this module. With `make 
check-rust` running clippy with `-D warnings`, this will fail the build; please 
remove the unused import (or use it).
   



##########
crates/core/src/config/table.rs:
##########
@@ -360,6 +360,63 @@ pub enum BaseFileFormatValue {
     HFile,
 }
 
+impl BaseFileFormatValue {
+    fn ends_with_ignore_ascii_case(s: &str, suffix: &str) -> bool {
+        let s_bytes = s.as_bytes();
+        let suffix_bytes = suffix.as_bytes();
+        s_bytes.len() >= suffix_bytes.len()
+            && s_bytes[s_bytes.len() - 
suffix_bytes.len()..].eq_ignore_ascii_case(suffix_bytes)
+    }
+
+    /// Detect format from a file extension, returning `None` if unrecognized.
+    pub fn from_extension(path: &str) -> Option<Self> {
+        if Self::ends_with_ignore_ascii_case(path, ".parquet") {
+            Some(Self::Parquet)
+        } else if Self::ends_with_ignore_ascii_case(path, ".hfile") {
+            Some(Self::HFile)
+        } else {
+            None
+        }
+    }
+
+    /// Returns true when `path` has this format's base-file suffix.
+    pub fn matches_extension(&self, path: &str) -> bool {
+        let suffix = format!(".{}", self.as_ref());
+        Self::ends_with_ignore_ascii_case(path, &suffix)

Review Comment:
   `BaseFileFormatValue::matches_extension` allocates a new `String` via 
`format!(".{}", ...)` on every call. Since this is used on hot paths (e.g., 
per-file pruning), consider using a match on `self` with static suffixes (e.g., 
".parquet"/".hfile") to avoid allocation.
   



##########
crates/core/src/file_group/reader.rs:
##########
@@ -74,10 +87,13 @@ impl FileGroupReader {
         final_opts.extend(extra_hudi_opts);
         let hudi_configs = Arc::new(HudiConfigs::new(final_opts));
         let storage = Storage::new(Arc::new(storage_opts), 
hudi_configs.clone())?;
+        let format = base_file_reader::resolve_base_file_format(&hudi_configs, 
None)?;
+        let base_file_reader = 
base_file_reader::create_base_file_reader(&storage, &format).ok();
 
         Ok(Self {
             hudi_configs,
             storage,
+            base_file_reader,
         })

Review Comment:
   Both constructors swallow `create_base_file_reader` errors via `.ok()`, 
which can hide unsupported formats (e.g., `hfile`) and later produce a 
less-informative runtime error. Since these constructors already return 
`Result`, consider propagating the error (or only suppressing it when you are 
intentionally doing extension-based detection).



##########
crates/core/src/file_group/reader.rs:
##########
@@ -99,24 +115,46 @@ impl FileGroupReader {
         resolver.resolve_options().await?;
         let hudi_configs = Arc::new(HudiConfigs::new(resolver.hudi_options));
         let storage = Storage::new(Arc::new(resolver.storage_options), 
hudi_configs.clone())?;
+        let format = base_file_reader::resolve_base_file_format(&hudi_configs, 
None)?;
+        let base_file_reader = 
base_file_reader::create_base_file_reader(&storage, &format).ok();
 
         Ok(Self {
             hudi_configs,
             storage,
+            base_file_reader,
         })
     }
 
     fn resolve_read_options(&self, options: &ReadOptions) -> 
Result<ReadOptions> {
         options.with_defaults_from(&self.hudi_configs)
     }
 
+    /// Returns the base file reader for a given path. When the table config
+    /// explicitly sets the format, returns the cached reader. Otherwise falls
+    /// back to extension-based detection.
+    fn reader_for_path(&self, relative_path: &str) -> Result<Arc<dyn 
BaseFileReader>> {
+        if self
+            .hudi_configs
+            .contains(HudiTableConfig::BaseFileFormat.as_ref())
+        {
+            return self.base_file_reader.clone().ok_or_else(|| {
+                ReadFileSliceError(format!("No base file reader available for 
{relative_path}"))

Review Comment:
   When `hoodie.table.base.file.format` is set but no cached reader is 
available, the error `No base file reader available...` loses the real cause 
(e.g., unsupported format). Consider preserving the original `StorageError` 
from reader creation (store it, or include the resolved format in the message) 
so users can act on the failure.
   



##########
crates/core/src/table/fs_view.rs:
##########
@@ -195,16 +188,16 @@ impl FileSystemView {
                     }
                 };
 
-                // Case-insensitive check for configured base file extension.
-                if !Self::ends_with_ignore_ascii_case(&relative_path, 
&base_file_suffix) {
+                if !base_file_format.matches_extension(&relative_path) {
                     retained.push(fg);
                     continue;
                 }
 
-                let (file_metadata, col_stats) = match self
-                    .storage
-                    .get_file_metadata_and_stats(&relative_path, table_schema)
-                    .await
+                let (file_metadata, col_stats) = match 
ParquetBaseFileReader::new(
+                    self.storage.clone(),
+                )
+                .get_metadata_and_stats(&relative_path, table_schema)
+                .await

Review Comment:
   `ParquetBaseFileReader::new(self.storage.clone())` is created inside the 
per-file-group loop, which adds repeated allocations/clones on large tables. 
Consider constructing a single reader once before the loop (or caching it on 
`FileSystemView`) and reusing it for all files in this method.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to