Copilot commented on code in PR #510:
URL: https://github.com/apache/hudi-rs/pull/510#discussion_r2660028398


##########
crates/datafusion/src/lib.rs:
##########
@@ -95,10 +116,27 @@ impl HudiDataSource {
         K: AsRef<str>,
         V: Into<String>,
     {
-        match HudiTable::new_with_options(base_uri, options).await {
-            Ok(t) => Ok(Self { table: Arc::new(t) }),
-            Err(e) => Err(Execution(format!("Failed to create Hudi table: 
{e}"))),
-        }
+        let table = HudiTable::new_with_options(base_uri, options)
+            .await
+            .map_err(|e| Execution(format!("Failed to create Hudi table: 
{e}")))?;
+
+        // Cache schema at construction to avoid async workarounds later
+        let cached_schema = table
+            .get_schema()
+            .await
+            .map(|s| Arc::new(s) as SchemaRef)
+            .unwrap_or_else(|_| Arc::new(Schema::empty()));
+
+        let cached_partition_schema = table
+            .get_partition_schema()
+            .await
+            .unwrap_or_else(|_| Schema::empty());

Review Comment:
   The schema errors are silently ignored with `unwrap_or_else`, returning an 
empty schema. This could lead to confusing behavior if schema retrieval fails - 
DataFusion would operate with an empty schema rather than surfacing the error. 
Consider logging the error or propagating it as a DataFusion error.
   ```suggestion
           let cached_schema: SchemaRef = Arc::new(
               table
                   .get_schema()
                   .await
                   .map_err(|e| Execution(format!("Failed to get Hudi table 
schema: {e}")))?,
           );
   
           let cached_partition_schema = table
               .get_partition_schema()
               .await
               .map_err(|e| Execution(format!("Failed to get Hudi partition 
schema: {e}")))?;
   ```



##########
crates/core/src/storage/mod.rs:
##########
@@ -297,7 +328,29 @@ impl Storage {
             builder = builder.with_batch_size(batch_size);
         }
 
-        if let Some(projection) = options.projection {
+        // Handle projection: prefer column names over indices for reliability
+        if let Some(ref column_names) = options.projection_columns {
+            // Convert column names to indices using builder's schema
+            let arrow_schema = builder.schema();
+            let projection: Vec<usize> = column_names
+                .iter()
+                .map(|name| {
+                    arrow_schema.index_of(name).map_err(|_| {
+                        StorageError::InvalidPath(format!(
+                            "Column '{}' not found in parquet file schema",
+                            name
+                        ))
+                    })

Review Comment:
   The error message uses `InvalidPath` error type, but this isn't really a 
path issue - it's a schema mismatch where a column name doesn't exist in the 
schema. Consider using a more appropriate error variant like `Schema` or 
creating a new error type for column-not-found errors to make debugging easier.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to