Copilot commented on code in PR #537:
URL: https://github.com/apache/hudi-rs/pull/537#discussion_r2970459715


##########
README.md:
##########
@@ -210,7 +210,7 @@ record_batch = 
reader.read_file_slice_by_base_file_path("relative/path.parquet")
 use hudi::file_group::reader::FileGroupReader;
 
 let reader = FileGroupReader::new_with_options(
-    "/table/base/path", [("hoodie.read.file_group.start_timestamp", "0")])?;
+    "/table/base/path", [("hoodie.read.file_group.start_timestamp", 
"0")]).await?;
 
 // Returns an Arrow RecordBatch
 let record_batch = 
reader.read_file_slice_by_base_file_path("relative/path.parquet").await?;

Review Comment:
   This Rust README snippet now uses `.await?` but the example isn’t shown 
inside an `async fn`/runtime context, so it won’t compile as-is. Consider 
wrapping it in a minimal `#[tokio::main] async fn main() -> Result<...>` (or 
mark the code block as pseudocode) to keep the docs copy/pasteable.



##########
cpp/src/lib.rs:
##########
@@ -37,81 +37,95 @@ mod ffi {
         fn new_file_group_reader_with_options(
             base_uri: &CxxString,
             options: &CxxVector<CxxString>,
-        ) -> Box<HudiFileGroupReader>;
+        ) -> Result<Box<HudiFileGroupReader>>;
 
         type HudiFileSlice;
         fn new_file_slice_from_file_names(
             partition_path: &CxxString,
             base_file_name: &CxxString,
             log_file_names: &CxxVector<CxxString>,
-        ) -> Box<HudiFileSlice>;
+        ) -> Result<Box<HudiFileSlice>>;
 
         fn read_file_slice_by_base_file_path(
             self: &HudiFileGroupReader,
             relative_path: &CxxString,
-        ) -> *mut ArrowArrayStream;
+        ) -> Result<*mut ArrowArrayStream>;
 
         fn read_file_slice(
             self: &HudiFileGroupReader,
             file_slice: &HudiFileSlice,
-        ) -> *mut ArrowArrayStream;
+        ) -> Result<*mut ArrowArrayStream>;
     }
 }
 
 pub struct HudiFileGroupReader {
     inner: FileGroupReader,
+    rt: tokio::runtime::Runtime,
 }
 
 pub fn new_file_group_reader_with_options(
     base_uri: &CxxString,
     options: &CxxVector<CxxString>,
-) -> Box<HudiFileGroupReader> {
+) -> std::result::Result<Box<HudiFileGroupReader>, String> {
     let base_uri = base_uri
         .to_str()
-        .expect("Failed to convert CxxString to str: Invalid UTF-8 sequence");
+        .map_err(|e| format!("Failed to convert CxxString to str: {e}"))?;
 
     let mut opt_vec = Vec::new();
     for opt in options.iter() {
         let opt_str = opt
             .to_str()
-            .expect("Failed to convert CxxString to str: Invalid UTF-8 
sequence");
+            .map_err(|e| format!("Failed to convert CxxString to str: {e}"))?;
         if let Some((key, value)) = opt_str.split_once('=') {
             opt_vec.push((key, value))
         }
     }
 
-    let reader = FileGroupReader::new_with_options(base_uri, opt_vec)
-        .expect("Failed to create FileGroupReader with options");
-    let reader_wrapper = HudiFileGroupReader { inner: reader };
-    Box::new(reader_wrapper)
+    let rt = tokio::runtime::Builder::new_current_thread()
+        .enable_all()
+        .build()
+        .map_err(|e| format!("Failed to create tokio runtime: {e}"))?;
+    let reader = rt

Review Comment:
   `HudiFileGroupReader` stores a `tokio::runtime::Runtime` built with 
`new_current_thread()`. A current-thread runtime generally must be driven from 
the thread that created it; if the C++ side uses the same reader from a 
different thread, `block_on` can panic or behave unexpectedly. Consider using a 
multi-thread runtime (`Runtime::new()` / `new_multi_thread`) or storing a 
`Handle` to a shared runtime that is safe to call from any thread.



##########
crates/core/src/schema/resolver.rs:
##########
@@ -97,8 +107,7 @@ pub(crate) async fn resolve_schema_from_commit_metadata(
         Err(e) => return Err(e),
     };
 
-    let arrow_schema = arrow_schema_from_avro_schema_str(&avro_schema_str)?;
-    prepend_meta_fields(SchemaRef::new(arrow_schema))
+    arrow_schema_from_avro_schema_str(&avro_schema_str)

Review Comment:
   `resolve_data_schema_from_commit_metadata` falls back to 
`resolve_schema_from_base_file`, which reads the Parquet file schema via 
`Storage::get_parquet_file_schema`. Those schemas typically include `_hoodie_*` 
meta fields, but this function is supposed to return the *data* schema without 
meta fields. This can cause `Table::get_schema()` / 
`Timeline::get_latest_schema()` to unexpectedly include meta fields, and 
`resolve_schema()` to prepend meta fields again (duplicating them). Consider 
stripping meta fields from the base-file-derived schema before returning it 
(and/or de-duping in `prepend_meta_fields`).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to