Omega359 commented on code in PR #9181:
URL: https://github.com/apache/arrow-datafusion/pull/9181#discussion_r1486283497
##########
datafusion/physical-expr/src/datetime_expressions.rs:
##########
@@ -502,6 +503,176 @@ pub fn make_current_time(
move |_arg| Ok(ColumnarValue::Scalar(ScalarValue::Time64Nanosecond(nano)))
}
+/// Returns a string representation of a date, time, timestamp or duration based
+/// on a Chrono pattern.
+///
+/// The syntax for the patterns can be found at
+/// <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>
+///
+/// # Examples
+///
+/// ```
+/// # use chrono::prelude::*;
+/// # use datafusion::prelude::*;
+/// # use datafusion::error::Result;
+/// # use datafusion_common::ScalarValue::TimestampNanosecond;
+/// # use std::sync::Arc;
+/// # use arrow_array::{Date32Array, RecordBatch, StringArray};
+/// # use arrow_schema::{DataType, Field, Schema};
+/// # #[tokio::main]
+/// # async fn main() -> Result<()> {
+/// let schema = Arc::new(Schema::new(vec![
+/// Field::new("values", DataType::Date32, false),
+/// Field::new("patterns", DataType::Utf8, false),
+/// ]));
+///
+/// let batch = RecordBatch::try_new(
+/// schema,
+/// vec![
+/// Arc::new(Date32Array::from(vec![
+/// 18506,
+/// 18507,
+/// 18508,
+/// 18509,
+/// ])),
+/// Arc::new(StringArray::from(vec![
+/// "%Y-%m-%d",
+/// "%Y:%m:%d",
+/// "%Y%m%d",
+/// "%d-%m-%Y",
+/// ])),
+/// ],
+/// )?;
+///
+/// let ctx = SessionContext::new();
+/// ctx.register_batch("t", batch)?;
+/// let df = ctx.table("t").await?;
+///
+/// // use the to_char function to convert col 'values',
+/// // to strings using patterns in col 'patterns'
+/// let df = df.with_column(
+/// "date_str",
+/// to_char(col("values"), col("patterns"))
+/// )?;
+/// // Note that providing a scalar value for the pattern
+/// // is more performant
+/// let df = df.with_column(
+/// "date_str2",
+/// to_char(col("values"), lit("%d-%m-%Y"))
+/// )?;
+/// // literals can be used as well with dataframe calls
+/// let timestamp = "2026-07-08T09:10:11"
+/// .parse::<NaiveDateTime>()
+/// .unwrap()
+/// .with_nanosecond(56789)
+/// .unwrap()
+/// .timestamp_nanos_opt()
+/// .unwrap();
+/// let df = df.with_column(
+/// "timestamp_str",
+/// to_char(lit(TimestampNanosecond(Some(timestamp), None)), lit("%d-%m-%Y %H:%M:%S"))
+/// )?;
+///
+/// df.show().await?;
+///
+/// # Ok(())
+/// # }
+/// ```
+pub fn to_char(args: &[ColumnarValue]) -> Result<ColumnarValue> {
+    // Exactly two arguments are expected: the value to format and the pattern.
+    if args.len() != 2 {
+        return exec_err!("to_char function requires 2 arguments, got {}", args.len());
+    }
+
+    // The scalar path is taken only when no argument is an array. This must be
+    // decided before `values_to_arrays` below, which turns every argument into
+    // an array.
+    let is_scalar = args
+        .iter()
+        .all(|arg| matches!(arg, ColumnarValue::Scalar(_)));
+
+    let args = ColumnarValue::values_to_arrays(args)?;
+    if is_scalar {
+        _to_char_scalar(&args)
+    } else {
+        _to_char_array(&args)
+    }
+}
+
+/// Builds the [`FormatOptions`] used to render values of `data_type` with the
+/// caller-supplied `format` string.
+///
+/// Date, time and timestamp types install `format` as their pattern. For
+/// durations, `format` is only compared (ASCII case-insensitively) against
+/// `"ISO8601"` to choose between the ISO8601 and the pretty duration styles.
+///
+/// # Errors
+///
+/// Any other data type yields `Err` holding an already-built execution error
+/// `Result`, so a caller can `return` the payload directly.
+fn _build_format_options<'a>(
+    data_type: &DataType,
+    format: &'a str,
+) -> Result<FormatOptions<'a>, Result<ColumnarValue>> {
+    let base = FormatOptions::new();
+    Ok(match data_type {
+        DataType::Date32 => base.with_date_format(Some(format)),
+        DataType::Date64 => base.with_datetime_format(Some(format)),
+        DataType::Time32(_) | DataType::Time64(_) => {
+            base.with_time_format(Some(format))
+        }
+        DataType::Timestamp(_, _) => base
+            .with_timestamp_format(Some(format))
+            .with_timestamp_tz_format(Some(format)),
+        DataType::Duration(_) => {
+            let duration_format = if "ISO8601".eq_ignore_ascii_case(format) {
+                DurationFormat::ISO8601
+            } else {
+                DurationFormat::Pretty
+            };
+            base.with_duration_format(duration_format)
+        }
+        other => {
+            return Err(exec_err!(
+                "to_char only supports date, time, timestamp and duration data types, received {other:?}"
+            ));
+        }
+    })
+}
+
+/// Formats row 0 of `args[0]` with the pattern held in row 0 of `args[1]` and
+/// returns the result as a `Utf8` scalar.
+///
+/// `to_char` calls this when neither argument was an array, after converting
+/// both arguments to arrays.
+///
+/// # Errors
+///
+/// Returns an error when `args[1]` is not a `Utf8` array, when the data type of
+/// `args[0]` is not supported by `_build_format_options`, when the formatter
+/// cannot be constructed, or when formatting the value fails.
+fn _to_char_scalar(args: &[ArrayRef]) -> Result<ColumnarValue> {
+    if &DataType::Utf8 != args[1].data_type() {
+        return exec_err!(
+            "Format for `to_char` must be non-null Utf8, received {:?}",
+            args[1].data_type()
+        );
+    }
+
+    let format = args[1].as_string::<i32>().value(0);
+    let format_options = match _build_format_options(args[0].data_type(), format) {
+        Ok(value) => value,
+        Err(value) => return value,
+    };
+
+    let formatter = ArrayFormatter::try_new(args[0].as_ref(), &format_options)?;
+    // Only row 0 is returned, so only row 0 is formatted. `try_to_string` is used
+    // instead of `to_string` because `to_string` panics when the `Display`
+    // implementation reports an error (presumably the case for an invalid
+    // pattern -- confirm against arrow-cast); here that becomes an `Err`.
+    let formatted = formatter.value(0).try_to_string()?;
+
+    Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some(formatted))))
+}
+
+fn _to_char_array(args: &[ArrayRef]) -> Result<ColumnarValue> {
+ let mut results: Vec<String> = vec![];
+ let format_array = args[1].as_string::<i32>();
+ let data_type = args[0].data_type();
+
+ for idx in 0..args[0].len() {
+ let format = format_array.value(idx);
+ let format_options = match _build_format_options(data_type, format) {
+ Ok(value) => value,
+ Err(value) => return value,
+ };
+ // this isn't ideal but this can't use ValueFormatter as it isn't
independent
Review Comment:
I was thinking I might have to do that, but the benchmark doesn't seem to
indicate it would be worth it:
```
❯ cargo criterion --bench to_char
Finished bench [optimized] target(s) in 0.24s
Gnuplot not found, using plotters backend
to_char_array_array_1000
time: [255.18 µs 257.84 µs 260.62 µs]
change: [-0.3634% +1.5030% +3.4367%] (p = 0.11 >
0.05)
No change in performance detected.
to_char_array_scalar_1000
time: [257.70 µs 260.75 µs 263.83 µs]
change: [-2.4221% -0.6517% +1.0318%] (p = 0.45 >
0.05)
No change in performance detected.
to_char_scalar_scalar_1000
time: [839.74 ns 850.89 ns 863.68 ns]
change: [-1.2744% +0.7158% +2.5192%] (p = 0.47 >
0.05)
No change in performance detected.
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]