Re: [PR] feat: support IN and NOT IN operators in partition filter [hudi-rs]

via GitHub Sun, 29 Mar 2026 18:35:16 -0700


Copilot commented on code in PR #566:
URL: https://github.com/apache/hudi-rs/pull/566#discussion_r3007130344



##########
crates/core/src/keygen/timestamp_based.rs:
##########
@@ -418,29 +418,72 @@ impl TimestampBasedKeyGenerator {
             .replace("'T'", "T")
     }
 
-    /// Extracts partition values from a datetime based on output format,
-    /// applying the configured output timezone.
-    fn extract_partition_values(&self, dt: &DateTime<Utc>) -> HashMap<String, 
String> {
+    /// Formats a datetime into the full partition path string.
+    ///
+    /// For non-hive-style with format `yyyy/MM/dd`: `"2023/04/15"`
+    /// For hive-style with format `yyyy/MM/dd`: `"year=2023/month=04/day=15"`
+    fn format_partition_path(&self, dt: &DateTime<Utc>) -> String {
         let local_dt = dt.with_timezone(&self.output_timezone);
-        let mut values = HashMap::new();
 
-        let segments: Vec<&str> = self.output_dateformat.split('/').collect();
+        let segments: Vec<String> = self
+            .output_dateformat
+            .split('/')
+            .enumerate()
+            .map(|(i, segment)| {
+                let value = Self::format_segment_value(segment, &local_dt);
+                if self.is_hive_style {
+                    let field_name = &self.partition_fields[i];
+                    format!("{field_name}={value}")
+                } else {
+                    value
+                }
+            })
+            .collect();
 
-        for (i, segment) in segments.iter().enumerate() {
-            let field_name = &self.partition_fields[i];
-            let value = match *segment {
-                "yyyy" => format!("{:04}", local_dt.year()),
-                "MM" => format!("{:02}", local_dt.month()),
-                "dd" => format!("{:02}", local_dt.day()),
-                "HH" => format!("{:02}", local_dt.hour()),
-                "mm" => format!("{:02}", local_dt.minute()),
-                "ss" => format!("{:02}", local_dt.second()),
-                _ => segment.to_string(),
-            };
-            values.insert(field_name.clone(), value);
+        segments.join("/")
+    }
+
+    /// Formats a single date segment into its value string.
+    fn format_segment_value<T: Datelike + Timelike>(segment: &str, dt: &T) -> 
String {
+        match segment {
+            "yyyy" => format!("{:04}", dt.year()),
+            "MM" => format!("{:02}", dt.month()),
+            "dd" => format!("{:02}", dt.day()),
+            "HH" => format!("{:02}", dt.hour()),
+            "mm" => format!("{:02}", dt.minute()),
+            "ss" => format!("{:02}", dt.second()),
+            _ => segment.to_string(),
         }

Review Comment:
   `format_segment_value` falls back to returning the raw `segment` string for 
unsupported patterns. For output formats like `yyyyMMdd` (a valid Java 
SimpleDateFormat), this will produce a constant partition path ("yyyyMMdd") and 
cause partition pruning to incorrectly exclude all partitions for equality/IN 
filters. Consider either rejecting unsupported `output.dateformat` patterns at 
construction time, or formatting the full `output_dateformat` via 
`java_to_chrono_format`/`DateTime::format` so compound patterns like `yyyyMMdd` 
are handled correctly.
   ```suggestion
       ///
       /// This supports both simple segments like `"yyyy"` and compound 
patterns
       /// like `"yyyyMMdd"` or `"yyyy-MM-dd"`, by replacing all known date 
tokens
       /// inside the segment. If no known tokens are found, the segment is
       /// returned unchanged.
       fn format_segment_value<T: Datelike + Timelike>(segment: &str, dt: &T) 
-> String {
           // Start with the raw segment and progressively substitute known 
tokens.
           let mut result = segment.to_string();
           let mut changed = false;
   
           let year_str = format!("{:04}", dt.year());
           if result.contains("yyyy") {
               result = result.replace("yyyy", &year_str);
               changed = true;
           }
   
           let month_str = format!("{:02}", dt.month());
           if result.contains("MM") {
               result = result.replace("MM", &month_str);
               changed = true;
           }
   
           let day_str = format!("{:02}", dt.day());
           if result.contains("dd") {
               result = result.replace("dd", &day_str);
               changed = true;
           }
   
           let hour_str = format!("{:02}", dt.hour());
           if result.contains("HH") {
               result = result.replace("HH", &hour_str);
               changed = true;
           }
   
           let minute_str = format!("{:02}", dt.minute());
           if result.contains("mm") {
               result = result.replace("mm", &minute_str);
               changed = true;
           }
   
           let second_str = format!("{:02}", dt.second());
           if result.contains("ss") {
               result = result.replace("ss", &second_str);
               changed = true;
           }
   
           if changed {
               result
           } else {
               segment.to_string()
           }
   ```



##########
crates/core/src/table/file_pruner.rs:
##########
@@ -155,6 +155,10 @@ impl FilePruner {
                 // Prune if: max < value
                 self.can_prune_gte(&filter_value, max)
             }
+            ExprOperator::In | ExprOperator::NotIn => {
+                // TODO: Multi-value operators not yet supported for 
file-level pruning
+                false
+            }

Review Comment:
   `can_prune_by_filter` always extracts `filter_value` before matching on the 
operator. For `IN`/`NOT IN` you currently return `false` (no pruning), so this 
extraction is unnecessary work and also assumes `filter.values[0]` exists. 
Consider short-circuiting on `IN`/`NOT IN` before extracting, or moving the 
extraction into the match arms that actually need a scalar value.



##########
crates/core/src/expr/filter.rs:
##########
@@ -195,12 +244,30 @@ impl SchemableFilter {
 
     pub fn apply_comparison(&self, value: &dyn Datum) -> Result<BooleanArray> {
         match self.operator {
-            ExprOperator::Eq => eq(value, &self.value),
-            ExprOperator::Ne => neq(value, &self.value),
-            ExprOperator::Lt => lt(value, &self.value),
-            ExprOperator::Lte => lt_eq(value, &self.value),
-            ExprOperator::Gt => gt(value, &self.value),
-            ExprOperator::Gte => gt_eq(value, &self.value),
+            ExprOperator::Eq => eq(value, &self.values[0]),
+            ExprOperator::Ne => neq(value, &self.values[0]),
+            ExprOperator::Lt => lt(value, &self.values[0]),
+            ExprOperator::Lte => lt_eq(value, &self.values[0]),
+            ExprOperator::Gt => gt(value, &self.values[0]),
+            ExprOperator::Gte => gt_eq(value, &self.values[0]),

Review Comment:
   `SchemableFilter::apply_comparison` indexes `self.values[0]` (and slices 
`[1..]`) without validating that `values` is non-empty (or that single-value 
operators have exactly one value). Because `Filter` still exposes `pub values: 
Vec<String>`, callers can construct an invalid filter and trigger a panic here. 
Consider enforcing invariants in `TryFrom<(Filter, &Schema)>` (and/or making 
`Filter` fields private) and returning a `CoreError` when `values` is empty or 
has an unexpected length for the operator.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] feat: support IN and NOT IN operators in partition filter [hudi-rs]

Reply via email to