andygrove commented on code in PR #3017:
URL: https://github.com/apache/datafusion-comet/pull/3017#discussion_r2662535704
##########
native/spark-expr/src/conversion_funcs/cast.rs:
##########
@@ -1957,41 +1967,46 @@ fn cast_string_to_int_with_range_check(
/// Equivalent to
/// - org.apache.spark.unsafe.types.UTF8String.toInt(IntWrapper intWrapper,
boolean allowDecimal)
/// - org.apache.spark.unsafe.types.UTF8String.toLong(LongWrapper longWrapper,
boolean allowDecimal)
-fn do_cast_string_to_int<
- T: Num + PartialOrd + Integer + CheckedSub + CheckedNeg + From<i32> + Copy,
->(
+fn do_cast_string_to_int<T: Integer + CheckedSub + CheckedNeg + From<u8> +
Copy>(
str: &str,
eval_mode: EvalMode,
type_name: &str,
min_value: T,
) -> SparkResult<Option<T>> {
- let trimmed_str = str.trim();
- if trimmed_str.is_empty() {
+ let bytes = str.as_bytes();
+ let mut start = 0;
+ let mut end = bytes.len();
+
+ while start < end && bytes[start].is_ascii_whitespace() {
+ start += 1;
+ }
+ while end > start && bytes[end - 1].is_ascii_whitespace() {
+ end -= 1;
+ }
+
+ if start == end {
return none_or_err(eval_mode, type_name, str);
}
- let len = trimmed_str.len();
+ let trimmed_bytes = &bytes[start..end];
+ let len = trimmed_bytes.len();
let mut result: T = T::zero();
- let mut negative = false;
- let radix = T::from(10);
+ let mut idx = 0;
+ let first_char = trimmed_bytes[0];
+ let negative = first_char == b'-';
+ if negative || first_char == b'+' {
+ idx = 1;
+ if len == 1 {
+ return none_or_err(eval_mode, type_name, str);
+ }
+ }
+
+ let radix = T::from(10_u8);
let stop_value = min_value / radix;
let mut parse_sign_and_digits = true;
- for (i, ch) in trimmed_str.char_indices() {
+ for &ch in &trimmed_bytes[idx..] {
if parse_sign_and_digits {
- if i == 0 {
- negative = ch == '-';
- let positive = ch == '+';
- if negative || positive {
- if i + 1 == len {
- // input string is just "+" or "-"
- return none_or_err(eval_mode, type_name, str);
- }
- // consume this char
- continue;
- }
- }
-
- if ch == '.' {
+ if ch == b'.' {
if eval_mode == EvalMode::Legacy {
// truncate decimal in legacy mode
parse_sign_and_digits = false;
Review Comment:
The `eval_mode` does not change for different rows. It would likely be more
performant to have separate implementations for legacy vs other modes to avoid
the conditional in the hot loop.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]