Copilot commented on code in PR #566:
URL: https://github.com/apache/hudi-rs/pull/566#discussion_r3007130344
##########
crates/core/src/keygen/timestamp_based.rs:
##########
@@ -418,29 +418,72 @@ impl TimestampBasedKeyGenerator {
.replace("'T'", "T")
}
- /// Extracts partition values from a datetime based on output format,
- /// applying the configured output timezone.
- fn extract_partition_values(&self, dt: &DateTime<Utc>) -> HashMap<String,
String> {
+ /// Formats a datetime into the full partition path string.
+ ///
+ /// For non-hive-style with format `yyyy/MM/dd`: `"2023/04/15"`
+ /// For hive-style with format `yyyy/MM/dd`: `"year=2023/month=04/day=15"`
+ fn format_partition_path(&self, dt: &DateTime<Utc>) -> String {
let local_dt = dt.with_timezone(&self.output_timezone);
- let mut values = HashMap::new();
- let segments: Vec<&str> = self.output_dateformat.split('/').collect();
+ let segments: Vec<String> = self
+ .output_dateformat
+ .split('/')
+ .enumerate()
+ .map(|(i, segment)| {
+ let value = Self::format_segment_value(segment, &local_dt);
+ if self.is_hive_style {
+ let field_name = &self.partition_fields[i];
+ format!("{field_name}={value}")
+ } else {
+ value
+ }
+ })
+ .collect();
- for (i, segment) in segments.iter().enumerate() {
- let field_name = &self.partition_fields[i];
- let value = match *segment {
- "yyyy" => format!("{:04}", local_dt.year()),
- "MM" => format!("{:02}", local_dt.month()),
- "dd" => format!("{:02}", local_dt.day()),
- "HH" => format!("{:02}", local_dt.hour()),
- "mm" => format!("{:02}", local_dt.minute()),
- "ss" => format!("{:02}", local_dt.second()),
- _ => segment.to_string(),
- };
- values.insert(field_name.clone(), value);
+ segments.join("/")
+ }
+
+ /// Formats a single date segment into its value string.
+ fn format_segment_value<T: Datelike + Timelike>(segment: &str, dt: &T) ->
String {
+ match segment {
+ "yyyy" => format!("{:04}", dt.year()),
+ "MM" => format!("{:02}", dt.month()),
+ "dd" => format!("{:02}", dt.day()),
+ "HH" => format!("{:02}", dt.hour()),
+ "mm" => format!("{:02}", dt.minute()),
+ "ss" => format!("{:02}", dt.second()),
+ _ => segment.to_string(),
}
Review Comment:
`format_segment_value` falls back to returning the raw `segment` string for
unsupported patterns. For output formats like `yyyyMMdd` (a valid Java
SimpleDateFormat), this will produce a constant partition path ("yyyyMMdd") and
cause partition pruning to incorrectly exclude all partitions for equality/IN
filters. Consider either rejecting unsupported `output.dateformat` patterns at
construction time, or formatting the full `output_dateformat` via
`java_to_chrono_format`/`DateTime::format` so compound patterns like `yyyyMMdd`
are handled correctly.
```suggestion
///
/// This supports both simple segments like `"yyyy"` and compound
patterns
/// like `"yyyyMMdd"` or `"yyyy-MM-dd"`, by replacing all known date
tokens
/// inside the segment. If no known tokens are found, the segment is
/// returned unchanged.
fn format_segment_value<T: Datelike + Timelike>(segment: &str, dt: &T)
-> String {
// Start with the raw segment and progressively substitute known
tokens.
let mut result = segment.to_string();
let mut changed = false;
let year_str = format!("{:04}", dt.year());
if result.contains("yyyy") {
result = result.replace("yyyy", &year_str);
changed = true;
}
let month_str = format!("{:02}", dt.month());
if result.contains("MM") {
result = result.replace("MM", &month_str);
changed = true;
}
let day_str = format!("{:02}", dt.day());
if result.contains("dd") {
result = result.replace("dd", &day_str);
changed = true;
}
let hour_str = format!("{:02}", dt.hour());
if result.contains("HH") {
result = result.replace("HH", &hour_str);
changed = true;
}
let minute_str = format!("{:02}", dt.minute());
if result.contains("mm") {
result = result.replace("mm", &minute_str);
changed = true;
}
let second_str = format!("{:02}", dt.second());
if result.contains("ss") {
result = result.replace("ss", &second_str);
changed = true;
}
if changed {
result
} else {
segment.to_string()
}
```
##########
crates/core/src/table/file_pruner.rs:
##########
@@ -155,6 +155,10 @@ impl FilePruner {
// Prune if: max < value
self.can_prune_gte(&filter_value, max)
}
+ ExprOperator::In | ExprOperator::NotIn => {
+ // TODO: Multi-value operators not yet supported for
file-level pruning
+ false
+ }
Review Comment:
`can_prune_by_filter` always extracts `filter_value` before matching on the
operator. For `IN`/`NOT IN` you currently return `false` (no pruning), so this
extraction is unnecessary work and also assumes `filter.values[0]` exists.
Consider short-circuiting on `IN`/`NOT IN` before extracting, or moving the
extraction into the match arms that actually need a scalar value.
##########
crates/core/src/expr/filter.rs:
##########
@@ -195,12 +244,30 @@ impl SchemableFilter {
pub fn apply_comparison(&self, value: &dyn Datum) -> Result<BooleanArray> {
match self.operator {
- ExprOperator::Eq => eq(value, &self.value),
- ExprOperator::Ne => neq(value, &self.value),
- ExprOperator::Lt => lt(value, &self.value),
- ExprOperator::Lte => lt_eq(value, &self.value),
- ExprOperator::Gt => gt(value, &self.value),
- ExprOperator::Gte => gt_eq(value, &self.value),
+ ExprOperator::Eq => eq(value, &self.values[0]),
+ ExprOperator::Ne => neq(value, &self.values[0]),
+ ExprOperator::Lt => lt(value, &self.values[0]),
+ ExprOperator::Lte => lt_eq(value, &self.values[0]),
+ ExprOperator::Gt => gt(value, &self.values[0]),
+ ExprOperator::Gte => gt_eq(value, &self.values[0]),
Review Comment:
`SchemableFilter::apply_comparison` indexes `self.values[0]` (and slices
`[1..]`) without validating that `values` is non-empty (or that single-value
operators have exactly one value). Because `Filter` still exposes `pub values:
Vec<String>`, callers can construct an invalid filter and trigger a panic here.
Consider enforcing invariants in `TryFrom<(Filter, &Schema)>` (and/or making
`Filter` fields private) and returning a `CoreError` when `values` is empty or
has an unexpected length for the operator.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]