alamb commented on code in PR #3647:
URL: https://github.com/apache/arrow-rs/pull/3647#discussion_r1099423872
##########
arrow-cast/src/display.rs:
##########
@@ -19,57 +19,499 @@
//! purposes. See the `pretty` crate for additional functions for
//! record batch pretty printing.
-use std::fmt::Write;
-use std::sync::Arc;
+use std::fmt::{Debug, Display, Formatter, Write};
+use std::ops::Range;
+use arrow_array::cast::*;
+use arrow_array::temporal_conversions::*;
use arrow_array::timezone::Tz;
use arrow_array::types::*;
use arrow_array::*;
use arrow_buffer::ArrowNativeType;
use arrow_schema::*;
-use chrono::prelude::SecondsFormat;
-use chrono::{DateTime, Utc};
+use chrono::{NaiveDate, NaiveDateTime, SecondsFormat, TimeZone, Utc};
+use lexical_core::FormattedSize;
-fn invalid_cast_error(dt: &str, col_idx: usize, row_idx: usize) -> ArrowError {
- ArrowError::CastError(format!(
- "Cannot cast to {dt} at col index: {col_idx} row index: {row_idx}"
- ))
+type TimeFormat<'a> = Option<&'a str>;
+
+/// Options for formatting arrays
+///
+/// By default nulls are formatted as `""` and temporal types formatted
+/// according to RFC3339
+///
+#[derive(Debug, Clone)]
+pub struct FormatOptions<'a> {
+ safe: bool,
+ /// Format string for nulls
+ null: &'a str,
+ /// Date format for date arrays
+ date_format: TimeFormat<'a>,
+ /// Format for DateTime arrays
+ datetime_format: TimeFormat<'a>,
+ /// Timestamp format for timestamp arrays
+ timestamp_format: TimeFormat<'a>,
+ /// Timestamp format for timestamp with timezone arrays
+ timestamp_tz_format: TimeFormat<'a>,
+ /// Time format for time arrays
+ time_format: TimeFormat<'a>,
+}
+
+impl<'a> Default for FormatOptions<'a> {
+ fn default() -> Self {
+ Self {
+ safe: true,
+ null: "",
+ date_format: None,
+ datetime_format: None,
+ timestamp_format: None,
+ timestamp_tz_format: None,
+ time_format: None,
+ }
+ }
+}
+
+impl<'a> FormatOptions<'a> {
+ /// If set to `true` any formatting errors will be written to the output
+ /// instead of being converted into a [`std::fmt::Error`]
+ pub fn with_display_error(mut self, safe: bool) -> Self {
+ self.safe = safe;
+ self
+ }
+
+ /// Overrides the string used to represent a null
+ ///
+ /// Defaults to `""`
+ pub fn with_null(self, null: &'a str) -> Self {
+ Self { null, ..self }
+ }
+
+ /// Overrides the format used for [`DataType::Date32`] columns
+ pub fn with_date_format(self, date_format: Option<&'a str>) -> Self {
+ Self {
+ date_format,
+ ..self
+ }
+ }
+
+ /// Overrides the format used for [`DataType::Date64`] columns
+ pub fn with_datetime_format(self, datetime_format: Option<&'a str>) ->
Self {
+ Self {
+ datetime_format,
+ ..self
+ }
+ }
+
+ /// Overrides the format used for [`DataType::Timestamp`] columns without
a timezone
+ pub fn with_timestamp_format(self, timestamp_format: Option<&'a str>) ->
Self {
+ Self {
+ timestamp_format,
+ ..self
+ }
+ }
+
+ /// Overrides the format used for [`DataType::Timestamp`] columns with a
timezone
+ pub fn with_timestamp_tz_format(self, timestamp_tz_format: Option<&'a
str>) -> Self {
+ Self {
+ timestamp_tz_format,
+ ..self
+ }
+ }
+
+ /// Overrides the format used for [`DataType::Time32`] and
[`DataType::Time64`] columns
+ pub fn with_time_format(self, time_format: Option<&'a str>) -> Self {
+ Self {
+ time_format,
+ ..self
+ }
+ }
+}
+
+/// Implements [`Display`] for a specific array value
+pub struct ValueFormatter<'a> {
+ idx: usize,
+ formatter: &'a ArrayFormatter<'a>,
+}
+
+impl<'a> ValueFormatter<'a> {
+ /// Writes this value to the provided [`Write`]
+ ///
+ /// Note: this ignores [`FormatOptions::with_display_error`]
+ pub fn write(&self, s: &mut dyn Write) -> Result<(), ArrowError> {
+ match self.formatter.format.write(self.idx, s) {
+ Ok(_) => Ok(()),
+ Err(FormatError::Arrow(e)) => Err(e),
+ Err(FormatError::Format(_)) => {
+ Err(ArrowError::CastError("Format error".to_string()))
+ }
+ }
+ }
+
+ /// Fallibly converts this to a string
+ pub fn try_to_string(&self) -> Result<String, ArrowError> {
+ let mut s = String::new();
+ self.write(&mut s)?;
+ Ok(s)
+ }
+}
+
+impl<'a> Display for ValueFormatter<'a> {
+ fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+ match self.formatter.format.write(self.idx, f) {
+ Ok(()) => Ok(()),
+ Err(FormatError::Arrow(e)) if self.formatter.safe => {
+ write!(f, "ERROR: {e}")
+ }
+ Err(_) => Err(std::fmt::Error),
+ }
+ }
+}
+
+/// A string formatter for an [`Array`]
+pub struct ArrayFormatter<'a> {
Review Comment:
As far as I understand this is the main new interface. I think if we added
an example to the docs here it would be great and help this usability a lot. A
follow on PR would be fine (and I would be happy to help write it as well)
##########
arrow-cast/src/display.rs:
##########
@@ -19,57 +19,499 @@
//! purposes. See the `pretty` crate for additional functions for
//! record batch pretty printing.
-use std::fmt::Write;
-use std::sync::Arc;
+use std::fmt::{Debug, Display, Formatter, Write};
+use std::ops::Range;
+use arrow_array::cast::*;
+use arrow_array::temporal_conversions::*;
use arrow_array::timezone::Tz;
use arrow_array::types::*;
use arrow_array::*;
use arrow_buffer::ArrowNativeType;
use arrow_schema::*;
-use chrono::prelude::SecondsFormat;
-use chrono::{DateTime, Utc};
+use chrono::{NaiveDate, NaiveDateTime, SecondsFormat, TimeZone, Utc};
+use lexical_core::FormattedSize;
-fn invalid_cast_error(dt: &str, col_idx: usize, row_idx: usize) -> ArrowError {
- ArrowError::CastError(format!(
- "Cannot cast to {dt} at col index: {col_idx} row index: {row_idx}"
- ))
+type TimeFormat<'a> = Option<&'a str>;
+
+/// Options for formatting arrays
+///
+/// By default nulls are formatted as `""` and temporal types formatted
+/// according to RFC3339
+///
+#[derive(Debug, Clone)]
+pub struct FormatOptions<'a> {
+ safe: bool,
+ /// Format string for nulls
+ null: &'a str,
+ /// Date format for date arrays
+ date_format: TimeFormat<'a>,
+ /// Format for DateTime arrays
+ datetime_format: TimeFormat<'a>,
+ /// Timestamp format for timestamp arrays
+ timestamp_format: TimeFormat<'a>,
+ /// Timestamp format for timestamp with timezone arrays
+ timestamp_tz_format: TimeFormat<'a>,
+ /// Time format for time arrays
+ time_format: TimeFormat<'a>,
+}
+
+impl<'a> Default for FormatOptions<'a> {
+ fn default() -> Self {
+ Self {
+ safe: true,
+ null: "",
+ date_format: None,
+ datetime_format: None,
+ timestamp_format: None,
+ timestamp_tz_format: None,
+ time_format: None,
+ }
+ }
+}
+
+impl<'a> FormatOptions<'a> {
+ /// If set to `true` any formatting errors will be written to the output
+ /// instead of being converted into a [`std::fmt::Error`]
+ pub fn with_display_error(mut self, safe: bool) -> Self {
+ self.safe = safe;
+ self
+ }
+
+ /// Overrides the string used to represent a null
+ ///
+ /// Defaults to `""`
+ pub fn with_null(self, null: &'a str) -> Self {
+ Self { null, ..self }
+ }
+
+ /// Overrides the format used for [`DataType::Date32`] columns
+ pub fn with_date_format(self, date_format: Option<&'a str>) -> Self {
+ Self {
+ date_format,
+ ..self
+ }
+ }
+
+ /// Overrides the format used for [`DataType::Date64`] columns
+ pub fn with_datetime_format(self, datetime_format: Option<&'a str>) ->
Self {
+ Self {
+ datetime_format,
+ ..self
+ }
+ }
+
+ /// Overrides the format used for [`DataType::Timestamp`] columns without
a timezone
+ pub fn with_timestamp_format(self, timestamp_format: Option<&'a str>) ->
Self {
+ Self {
+ timestamp_format,
+ ..self
+ }
+ }
+
+ /// Overrides the format used for [`DataType::Timestamp`] columns with a
timezone
+ pub fn with_timestamp_tz_format(self, timestamp_tz_format: Option<&'a
str>) -> Self {
+ Self {
+ timestamp_tz_format,
+ ..self
+ }
+ }
+
+ /// Overrides the format used for [`DataType::Time32`] and
[`DataType::Time64`] columns
+ pub fn with_time_format(self, time_format: Option<&'a str>) -> Self {
+ Self {
+ time_format,
+ ..self
+ }
+ }
+}
+
+/// Implements [`Display`] for a specific array value
+pub struct ValueFormatter<'a> {
+ idx: usize,
+ formatter: &'a ArrayFormatter<'a>,
+}
+
+impl<'a> ValueFormatter<'a> {
+ /// Writes this value to the provided [`Write`]
+ ///
+ /// Note: this ignores [`FormatOptions::with_display_error`]
+ pub fn write(&self, s: &mut dyn Write) -> Result<(), ArrowError> {
+ match self.formatter.format.write(self.idx, s) {
+ Ok(_) => Ok(()),
+ Err(FormatError::Arrow(e)) => Err(e),
+ Err(FormatError::Format(_)) => {
+ Err(ArrowError::CastError("Format error".to_string()))
+ }
+ }
+ }
+
+ /// Fallibly converts this to a string
+ pub fn try_to_string(&self) -> Result<String, ArrowError> {
+ let mut s = String::new();
+ self.write(&mut s)?;
+ Ok(s)
+ }
+}
+
+impl<'a> Display for ValueFormatter<'a> {
+ fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+ match self.formatter.format.write(self.idx, f) {
+ Ok(()) => Ok(()),
+ Err(FormatError::Arrow(e)) if self.formatter.safe => {
+ write!(f, "ERROR: {e}")
+ }
+ Err(_) => Err(std::fmt::Error),
+ }
+ }
+}
+
+/// A string formatter for an [`Array`]
+pub struct ArrayFormatter<'a> {
+ format: Box<dyn DisplayIndex + 'a>,
+ safe: bool,
+}
+
+impl<'a> ArrayFormatter<'a> {
+ /// Returns an [`ArrayFormatter`] that can be used to format `array`
+ ///
+ /// This returns an error if an array of the given data type cannot be
formatted
+ pub fn try_new(
+ array: &'a dyn Array,
+ options: &FormatOptions<'a>,
+ ) -> Result<Self, ArrowError> {
+ Ok(Self {
+ format: make_formatter(array, options)?,
+ safe: options.safe,
+ })
+ }
+
+ /// Returns a [`ValueFormatter`] that implements [`Display`] for
+ /// the value of the array at `idx`
+ pub fn value(&self, idx: usize) -> ValueFormatter<'_> {
+ ValueFormatter {
+ formatter: self,
+ idx,
+ }
+ }
+}
+
+fn make_formatter<'a>(
+ array: &'a dyn Array,
+ options: &FormatOptions<'a>,
+) -> Result<Box<dyn DisplayIndex + 'a>, ArrowError> {
+ downcast_primitive_array! {
+ array => array_format(array, options),
+ DataType::Null => array_format(as_null_array(array), options),
+ DataType::Boolean => array_format(as_boolean_array(array), options),
+ DataType::Utf8 => array_format(as_string_array(array), options),
+ DataType::LargeUtf8 => array_format(as_largestring_array(array),
options),
+ DataType::Binary =>
array_format(as_generic_binary_array::<i32>(array), options),
+ DataType::LargeBinary =>
array_format(as_generic_binary_array::<i64>(array), options),
+ DataType::FixedSizeBinary(_) => {
+ let a =
array.as_any().downcast_ref::<FixedSizeBinaryArray>().unwrap();
+ array_format(a, options)
+ }
+ DataType::Dictionary(_, _) => downcast_dictionary_array! {
+ array => array_format(array, options),
+ _ => unreachable!()
+ }
+ DataType::List(_) => array_format(as_generic_list_array::<i32>(array),
options),
+ DataType::LargeList(_) =>
array_format(as_generic_list_array::<i64>(array), options),
+ DataType::FixedSizeList(_, _) => {
+ let a =
array.as_any().downcast_ref::<FixedSizeListArray>().unwrap();
+ array_format(a, options)
+ }
+ DataType::Struct(_) => array_format(as_struct_array(array), options),
+ DataType::Map(_, _) => array_format(as_map_array(array), options),
+ DataType::Union(_, _, _) => array_format(as_union_array(array),
options),
+ d => Err(ArrowError::NotYetImplemented(format!("formatting {d} is not
yet supported"))),
+ }
+}
+
+/// Either an [`ArrowError`] or [`std::fmt::Error`]
+enum FormatError {
+ Format(std::fmt::Error),
+ Arrow(ArrowError),
+}
+
+type FormatResult = Result<(), FormatError>;
+
+impl From<std::fmt::Error> for FormatError {
+ fn from(value: std::fmt::Error) -> Self {
+ Self::Format(value)
+ }
+}
+
+impl From<ArrowError> for FormatError {
+ fn from(value: ArrowError) -> Self {
+ Self::Arrow(value)
+ }
+}
+
+/// [`Display`] but accepting an index
+trait DisplayIndex {
+ fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult;
+}
+
+/// [`DisplayIndex`] with additional state
+trait DisplayIndexState<'a> {
+ type State;
+
+ fn prepare(&self, options: &FormatOptions<'a>) -> Result<Self::State,
ArrowError>;
+
+ fn write(&self, state: &Self::State, idx: usize, f: &mut dyn Write) ->
FormatResult;
+}
+
+impl<'a, T: DisplayIndex> DisplayIndexState<'a> for T {
+ type State = ();
+
+ fn prepare(&self, _options: &FormatOptions<'a>) -> Result<Self::State,
ArrowError> {
+ Ok(())
+ }
+
+ fn write(&self, _: &Self::State, idx: usize, f: &mut dyn Write) ->
FormatResult {
+ DisplayIndex::write(self, idx, f)
+ }
+}
+
+struct ArrayFormat<'a, F: DisplayIndexState<'a>> {
+ state: F::State,
+ array: F,
+ null: &'a str,
+}
+
+fn array_format<'a, F>(
+ array: F,
+ options: &FormatOptions<'a>,
+) -> Result<Box<dyn DisplayIndex + 'a>, ArrowError>
+where
+ F: DisplayIndexState<'a> + Array + 'a,
+{
+ let state = array.prepare(options)?;
+ Ok(Box::new(ArrayFormat {
+ state,
+ array,
+ null: options.null,
+ }))
+}
+
+impl<'a, F: DisplayIndexState<'a> + Array> DisplayIndex for ArrayFormat<'a, F>
{
+ fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult {
+ if self.array.is_null(idx) {
+ if !self.null.is_empty() {
+ f.write_str(self.null)?
+ }
+ return Ok(());
+ }
+ DisplayIndexState::write(&self.array, &self.state, idx, f)
+ }
+}
+
+impl<'a> DisplayIndex for &'a BooleanArray {
+ fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult {
+ write!(f, "{}", self.value(idx))?;
+ Ok(())
+ }
+}
+
+impl<'a> DisplayIndex for &'a NullArray {
+ fn write(&self, _idx: usize, _f: &mut dyn Write) -> FormatResult {
+ Ok(())
+ }
+}
+
+macro_rules! primitive_display {
+ ($($t:ty),+) => {
+ $(impl<'a> DisplayIndex for &'a PrimitiveArray<$t>
+ {
+ fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult {
+ let value = self.value(idx);
+ let mut buffer = [0u8; <$t as
ArrowPrimitiveType>::Native::FORMATTED_SIZE];
+ // SAFETY:
+ // buffer is T::FORMATTED_SIZE
+ let b = unsafe { lexical_core::write_unchecked(value, &mut
buffer) };
+ // Lexical core produces valid UTF-8
+ let s = unsafe { std::str::from_utf8_unchecked(b) };
+ f.write_str(s)?;
+ Ok(())
+ }
+ })+
+ };
+}
+
+primitive_display!(Int8Type, Int16Type, Int32Type, Int64Type);
+primitive_display!(UInt8Type, UInt16Type, UInt32Type, UInt64Type);
+primitive_display!(Float32Type, Float64Type);
+
+impl<'a> DisplayIndex for &'a PrimitiveArray<Float16Type> {
+ fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult {
+ write!(f, "{}", self.value(idx))?;
+ Ok(())
+ }
+}
+
+macro_rules! decimal_display {
+ ($($t:ty),+) => {
+ $(impl<'a> DisplayIndexState<'a> for &'a PrimitiveArray<$t> {
Review Comment:
I am impressed that you got the types to work out with references within
macros 🤯
##########
arrow-cast/src/display.rs:
##########
@@ -135,657 +575,218 @@ macro_rules! make_string_interval_month_day_nano {
secs_sign,
secs.abs(),
nanoseconds.abs(),
- ))
- }};
-}
-
-macro_rules! make_string_date {
- ($array_type:ty, $dt:expr, $column: ident, $col_idx:ident, $row_idx:
ident) => {{
- Ok($column
- .as_any()
- .downcast_ref::<$array_type>()
- .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?
- .value_as_date($row_idx)
- .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?
- .to_string())
- }};
-}
-
-macro_rules! make_string_date_with_format {
- ($array_type:ty, $dt:expr, $format: ident, $column: ident, $col_idx:ident,
$row_idx: ident) => {{
- Ok($column
- .as_any()
- .downcast_ref::<$array_type>()
- .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?
- .value_as_datetime($row_idx)
- .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?
- .format($format)
- .to_string())
- }};
-}
-
-macro_rules! handle_string_date {
- ($array_type:ty, $dt:expr, $format: ident, $column: ident, $col_idx:ident,
$row_idx: ident) => {{
- match $format {
- Some(format) => {
- make_string_date_with_format!(
- $array_type,
- $dt,
- format,
- $column,
- $col_idx,
- $row_idx
- )
- }
- None => make_string_date!($array_type, $dt, $column, $col_idx,
$row_idx),
- }
- }};
-}
-
-macro_rules! make_string_time {
- ($array_type:ty, $dt:expr, $column: ident, $col_idx:ident, $row_idx:
ident) => {{
- Ok($column
- .as_any()
- .downcast_ref::<$array_type>()
- .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?
- .value_as_time($row_idx)
- .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?
- .to_string())
- }};
-}
-
-macro_rules! make_string_time_with_format {
- ($array_type:ty, $dt:expr, $format: ident, $column: ident, $col_idx:ident,
$row_idx: ident) => {{
- Ok($column
- .as_any()
- .downcast_ref::<$array_type>()
- .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?
- .value_as_time($row_idx)
- .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?
- .format($format)
- .to_string())
- }};
-}
-
-macro_rules! handle_string_time {
- ($array_type:ty, $dt:expr, $format: ident, $column: ident, $col_idx:ident,
$row_idx: ident) => {
- match $format {
- Some(format) => {
- make_string_time_with_format!(
- $array_type,
- $dt,
- format,
- $column,
- $col_idx,
- $row_idx
- )
- }
- None => make_string_time!($array_type, $dt, $column, $col_idx,
$row_idx),
- }
- };
+ )?;
+ Ok(())
+ }
}
-macro_rules! make_string_datetime {
- ($array_type:ty, $dt:expr, $tz_string: ident, $column: ident,
$col_idx:ident, $row_idx: ident) => {{
- let array = $column
- .as_any()
- .downcast_ref::<$array_type>()
- .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?;
-
- let s = match $tz_string {
- Some(tz_string) => match tz_string.parse::<Tz>() {
- Ok(tz) => array
- .value_as_datetime_with_tz($row_idx, tz)
- .ok_or_else(|| invalid_cast_error($dt, $col_idx,
$row_idx))?
- .to_rfc3339_opts(SecondsFormat::AutoSi, true)
- .to_string(),
- Err(_) => {
- let datetime = array
- .value_as_datetime($row_idx)
- .ok_or_else(|| invalid_cast_error($dt, $col_idx,
$row_idx))?;
- format!("{:?} (Unknown Time Zone '{}')", datetime,
tz_string)
- }
- },
- None => {
- let datetime = array
- .value_as_datetime($row_idx)
- .ok_or_else(|| invalid_cast_error($dt, $col_idx,
$row_idx))?;
- format!("{:?}", datetime)
- }
- };
-
- Ok(s)
- }};
-}
-
-macro_rules! make_string_datetime_with_format {
- ($array_type:ty, $dt:expr, $format: ident, $tz_string: ident, $column:
ident, $col_idx:ident, $row_idx: ident) => {{
- let array = $column
- .as_any()
- .downcast_ref::<$array_type>()
- .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?;
- let datetime = array
- .value_as_datetime($row_idx)
- .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?;
-
- let s = match $tz_string {
- Some(tz_string) => match tz_string.parse::<Tz>() {
- Ok(tz) => {
- let utc_time = DateTime::<Utc>::from_utc(datetime, Utc);
- let local_time = utc_time.with_timezone(&tz);
- local_time.format($format).to_string()
- }
- Err(_) => {
- format!("{:?} (Unknown Time Zone '{}')", datetime,
tz_string)
- }
- },
- None => datetime.format($format).to_string(),
- };
+impl<'a, O: OffsetSizeTrait> DisplayIndex for &'a GenericStringArray<O> {
+ fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult {
+ write!(f, "{}", self.value(idx))?;
+ Ok(())
+ }
+}
- Ok(s)
- }};
-}
-
-macro_rules! handle_string_datetime {
- ($array_type:ty, $dt:expr, $format: ident, $tz_string: ident, $column:
ident, $col_idx:ident, $row_idx: ident) => {
- match $format {
- Some(format) => make_string_datetime_with_format!(
- $array_type,
- $dt,
- format,
- $tz_string,
- $column,
- $col_idx,
- $row_idx
- ),
- None => make_string_datetime!(
- $array_type,
- $dt,
- $tz_string,
- $column,
- $col_idx,
- $row_idx
- ),
+impl<'a, O: OffsetSizeTrait> DisplayIndex for &'a GenericBinaryArray<O> {
+ fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult {
+ let v = self.value(idx);
+ for byte in v {
+ write!(f, "{byte:02x}")?;
}
- };
+ Ok(())
+ }
}
-// It's not possible to do array.value($row).to_string() for &[u8], let's
format it as hex
-macro_rules! make_string_hex {
- ($array_type:ty, $column: ident, $row: ident) => {{
- let array = $column.as_any().downcast_ref::<$array_type>().unwrap();
+impl<'a> DisplayIndex for &'a FixedSizeBinaryArray {
+ fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult {
+ let v = self.value(idx);
+ for byte in v {
+ write!(f, "{byte:02x}")?;
+ }
+ Ok(())
+ }
+}
- let mut tmp = "".to_string();
+impl<'a, K: ArrowDictionaryKeyType> DisplayIndexState<'a> for &'a
DictionaryArray<K> {
+ type State = Box<dyn DisplayIndex + 'a>;
- for character in array.value($row) {
- let _ = write!(tmp, "{:02x}", character);
- }
+ fn prepare(&self, options: &FormatOptions<'a>) -> Result<Self::State,
ArrowError> {
+ make_formatter(self.values().as_ref(), options)
+ }
- Ok(tmp)
- }};
-}
-
-macro_rules! make_string_from_list {
- ($column: ident, $row: ident) => {{
- let list = $column
- .as_any()
- .downcast_ref::<array::ListArray>()
- .ok_or(ArrowError::InvalidArgumentError(format!(
- "Repl error: could not convert list column to list array."
- )))?
- .value($row);
- let string_values = (0..list.len())
- .map(|i| array_value_to_string(&list.clone(), i))
- .collect::<Result<Vec<_>, _>>()?;
- Ok(format!("[{}]", string_values.join(", ")))
- }};
-}
-
-macro_rules! make_string_from_large_list {
- ($column: ident, $row: ident) => {{
- let list = $column
- .as_any()
- .downcast_ref::<array::LargeListArray>()
- .ok_or(ArrowError::InvalidArgumentError(format!(
- "Repl error: could not convert large list column to list
array."
- )))?
- .value($row);
- let string_values = (0..list.len())
- .map(|i| array_value_to_string(&list, i))
- .collect::<Result<Vec<_>, _>>()?;
- Ok(format!("[{}]", string_values.join(", ")))
- }};
-}
-
-macro_rules! make_string_from_fixed_size_list {
- ($column: ident, $row: ident) => {{
- let list = $column
- .as_any()
- .downcast_ref::<array::FixedSizeListArray>()
- .ok_or(ArrowError::InvalidArgumentError(format!(
- "Repl error: could not convert list column to list array."
- )))?
- .value($row);
- let string_values = (0..list.len())
- .map(|i| array_value_to_string(&list.clone(), i))
- .collect::<Result<Vec<_>, _>>()?;
- Ok(format!("[{}]", string_values.join(", ")))
- }};
-}
-
-macro_rules! make_string_from_duration {
- ($array_type:ty, $dt:expr, $column:ident, $col_idx:ident, $row_idx: ident)
=> {{
- Ok($column
- .as_any()
- .downcast_ref::<$array_type>()
- .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?
- .value_as_duration($row_idx)
- .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?
- .to_string())
- }};
-}
-
-#[inline(always)]
-pub fn make_string_from_decimal(
- column: &Arc<dyn Array>,
- row: usize,
-) -> Result<String, ArrowError> {
- let array = column.as_any().downcast_ref::<Decimal128Array>().unwrap();
+ fn write(&self, s: &Self::State, idx: usize, f: &mut dyn Write) ->
FormatResult {
+ let value_idx = self.keys().values()[idx].as_usize();
+ s.as_ref().write(value_idx, f)
+ }
+}
- Ok(array.value_as_string(row))
+fn write_list(
+ f: &mut dyn Write,
+ mut range: Range<usize>,
+ values: &dyn DisplayIndex,
+) -> FormatResult {
+ f.write_char('[')?;
+ if let Some(idx) = range.next() {
+ values.write(idx, f)?;
+ }
+ for idx in range {
+ write!(f, ", ")?;
+ values.write(idx, f)?;
+ }
+ f.write_char(']')?;
+ Ok(())
}
-fn append_struct_field_string(
- target: &mut String,
- name: &str,
- field_col: &Arc<dyn Array>,
- row: usize,
-) -> Result<(), ArrowError> {
- target.push('"');
- target.push_str(name);
- target.push_str("\": ");
-
- if field_col.is_null(row) {
- target.push_str("null");
- } else {
- match field_col.data_type() {
- DataType::Utf8 | DataType::LargeUtf8 => {
- target.push('"');
- target.push_str(array_value_to_string(field_col,
row)?.as_str());
- target.push('"');
- }
- _ => {
- target.push_str(array_value_to_string(field_col,
row)?.as_str());
- }
- }
+impl<'a, O: OffsetSizeTrait> DisplayIndexState<'a> for &'a GenericListArray<O>
{
+ type State = Box<dyn DisplayIndex + 'a>;
+
+ fn prepare(&self, options: &FormatOptions<'a>) -> Result<Self::State,
ArrowError> {
+ make_formatter(self.values().as_ref(), options)
}
- Ok(())
+ fn write(&self, s: &Self::State, idx: usize, f: &mut dyn Write) ->
FormatResult {
+ let offsets = self.value_offsets();
+ let end = offsets[idx + 1].as_usize();
+ let start = offsets[idx].as_usize();
+ write_list(f, start..end, s.as_ref())
+ }
}
-fn append_map_field_string(
- target: &mut String,
- field_col: &Arc<dyn Array>,
- row: usize,
-) -> Result<(), ArrowError> {
- if field_col.is_null(row) {
- target.push_str("null");
- } else {
- match field_col.data_type() {
- DataType::Utf8 | DataType::LargeUtf8 => {
- target.push('"');
- target.push_str(array_value_to_string(field_col,
row)?.as_str());
- target.push('"');
- }
- _ => {
- target.push_str(array_value_to_string(field_col,
row)?.as_str());
- }
- }
+impl<'a> DisplayIndexState<'a> for &'a FixedSizeListArray {
+ type State = (usize, Box<dyn DisplayIndex + 'a>);
+
+ fn prepare(&self, options: &FormatOptions<'a>) -> Result<Self::State,
ArrowError> {
+ let values = make_formatter(self.values().as_ref(), options)?;
+ let length = self.value_length();
+ Ok((length as usize, values))
}
- Ok(())
+ fn write(&self, s: &Self::State, idx: usize, f: &mut dyn Write) ->
FormatResult {
+ let start = idx * s.0;
+ let end = start + s.0;
+ write_list(f, start..end, s.1.as_ref())
+ }
}
-/// Get the value at the given row in an array as a String.
-///
-/// Note this function is quite inefficient and is unlikely to be
-/// suitable for converting large arrays or record batches.
-fn array_value_to_string_internal(
- column: &ArrayRef,
- col_idx: usize,
- row_idx: usize,
- format: Option<&str>,
-) -> Result<String, ArrowError> {
- if column.is_null(row_idx) {
- return Ok("".to_string());
- }
- match column.data_type() {
- DataType::Utf8 => make_string!(array::StringArray, column, row_idx),
- DataType::LargeUtf8 => make_string!(array::LargeStringArray, column,
row_idx),
- DataType::Binary => make_string_hex!(array::BinaryArray, column,
row_idx),
- DataType::LargeBinary => {
- make_string_hex!(array::LargeBinaryArray, column, row_idx)
- }
- DataType::FixedSizeBinary(_) => {
- make_string_hex!(array::FixedSizeBinaryArray, column, row_idx)
- }
- DataType::Boolean => make_string!(array::BooleanArray, column,
row_idx),
- DataType::Int8 => make_string!(array::Int8Array, column, row_idx),
- DataType::Int16 => make_string!(array::Int16Array, column, row_idx),
- DataType::Int32 => make_string!(array::Int32Array, column, row_idx),
- DataType::Int64 => make_string!(array::Int64Array, column, row_idx),
- DataType::UInt8 => make_string!(array::UInt8Array, column, row_idx),
- DataType::UInt16 => make_string!(array::UInt16Array, column, row_idx),
- DataType::UInt32 => make_string!(array::UInt32Array, column, row_idx),
- DataType::UInt64 => make_string!(array::UInt64Array, column, row_idx),
- DataType::Float16 => make_string!(array::Float16Array, column,
row_idx),
- DataType::Float32 => make_string!(array::Float32Array, column,
row_idx),
- DataType::Float64 => make_string!(array::Float64Array, column,
row_idx),
- DataType::Decimal128(..) => make_string_from_decimal(column, row_idx),
- DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Second
=> {
- handle_string_datetime!(
- array::TimestampSecondArray,
- "Timestamp",
- format,
- tz_string_opt,
- column,
- col_idx,
- row_idx
- )
- }
- DataType::Timestamp(unit, tz_string_opt) if *unit ==
TimeUnit::Millisecond => {
- handle_string_datetime!(
- array::TimestampMillisecondArray,
- "Timestamp",
- format,
- tz_string_opt,
- column,
- col_idx,
- row_idx
- )
- }
- DataType::Timestamp(unit, tz_string_opt) if *unit ==
TimeUnit::Microsecond => {
- handle_string_datetime!(
- array::TimestampMicrosecondArray,
- "Timestamp",
- format,
- tz_string_opt,
- column,
- col_idx,
- row_idx
- )
- }
- DataType::Timestamp(unit, tz_string_opt) if *unit ==
TimeUnit::Nanosecond => {
- handle_string_datetime!(
- array::TimestampNanosecondArray,
- "Timestamp",
- format,
- tz_string_opt,
- column,
- col_idx,
- row_idx
- )
- }
- DataType::Date32 => {
- handle_string_date!(
- array::Date32Array,
- "Date32",
- format,
- column,
- col_idx,
- row_idx
- )
- }
- DataType::Date64 => {
- handle_string_date!(
- array::Date64Array,
- "Date64",
- format,
- column,
- col_idx,
- row_idx
- )
- }
- DataType::Time32(unit) if *unit == TimeUnit::Second => {
- handle_string_time!(
- array::Time32SecondArray,
- "Time32",
- format,
- column,
- col_idx,
- row_idx
- )
- }
- DataType::Time32(unit) if *unit == TimeUnit::Millisecond => {
- handle_string_time!(
- array::Time32MillisecondArray,
- "Time32",
- format,
- column,
- col_idx,
- row_idx
- )
- }
- DataType::Time64(unit) if *unit == TimeUnit::Microsecond => {
- handle_string_time!(
- array::Time64MicrosecondArray,
- "Time64",
- format,
- column,
- col_idx,
- row_idx
- )
- }
- DataType::Time64(unit) if *unit == TimeUnit::Nanosecond => {
- handle_string_time!(
- array::Time64NanosecondArray,
- "Time64",
- format,
- column,
- col_idx,
- row_idx
- )
- }
- DataType::Interval(unit) => match unit {
- IntervalUnit::DayTime => {
- make_string_interval_day_time!(column, row_idx)
- }
- IntervalUnit::YearMonth => {
- make_string_interval_year_month!(column, row_idx)
- }
- IntervalUnit::MonthDayNano => {
- make_string_interval_month_day_nano!(column, row_idx)
- }
- },
- DataType::List(_) => make_string_from_list!(column, row_idx),
- DataType::LargeList(_) => make_string_from_large_list!(column,
row_idx),
- DataType::Dictionary(index_type, _value_type) => match **index_type {
- DataType::Int8 => dict_array_value_to_string::<Int8Type>(column,
row_idx),
- DataType::Int16 => dict_array_value_to_string::<Int16Type>(column,
row_idx),
- DataType::Int32 => dict_array_value_to_string::<Int32Type>(column,
row_idx),
- DataType::Int64 => dict_array_value_to_string::<Int64Type>(column,
row_idx),
- DataType::UInt8 => dict_array_value_to_string::<UInt8Type>(column,
row_idx),
- DataType::UInt16 =>
dict_array_value_to_string::<UInt16Type>(column, row_idx),
- DataType::UInt32 =>
dict_array_value_to_string::<UInt32Type>(column, row_idx),
- DataType::UInt64 =>
dict_array_value_to_string::<UInt64Type>(column, row_idx),
- _ => Err(ArrowError::InvalidArgumentError(format!(
- "Pretty printing not supported for {:?} due to index type",
- column.data_type()
- ))),
- },
- DataType::FixedSizeList(_, _) => {
- make_string_from_fixed_size_list!(column, row_idx)
- }
- DataType::Struct(_) => {
- let st = column
- .as_any()
- .downcast_ref::<array::StructArray>()
- .ok_or_else(|| {
- ArrowError::InvalidArgumentError(
- "Repl error: could not convert struct column to struct
array."
- .to_string(),
- )
- })?;
+/// Pairs a boxed [`DisplayIndex`] with its field name
+type FieldDisplay<'a> = (&'a str, Box<dyn DisplayIndex + 'a>);
- let mut s = String::new();
- s.push('{');
- let mut kv_iter = st.columns().iter().zip(st.column_names());
- if let Some((col, name)) = kv_iter.next() {
- append_struct_field_string(&mut s, name, col, row_idx)?;
- }
- for (col, name) in kv_iter {
- s.push_str(", ");
- append_struct_field_string(&mut s, name, col, row_idx)?;
- }
- s.push('}');
+impl<'a> DisplayIndexState<'a> for &'a StructArray {
+ type State = Vec<FieldDisplay<'a>>;
- Ok(s)
- }
- DataType::Map(_, _) => {
- let map_array =
- column.as_any().downcast_ref::<MapArray>().ok_or_else(|| {
- ArrowError::InvalidArgumentError(
- "Repl error: could not convert column to map
array.".to_string(),
- )
- })?;
- let map_entry = map_array.value(row_idx);
- let st = map_entry
- .as_any()
- .downcast_ref::<StructArray>()
- .ok_or_else(|| {
- ArrowError::InvalidArgumentError(
- "Repl error: could not convert map entry to struct
array."
- .to_string(),
- )
- })?;
- let mut s = String::new();
- s.push('{');
- let entries_count = st.column(0).len();
- for i in 0..entries_count {
- if i > 0 {
- s.push_str(", ");
- }
- append_map_field_string(&mut s, st.column(0), i)?;
- s.push_str(": ");
- append_map_field_string(&mut s, st.column(1), i)?;
- }
- s.push('}');
+ fn prepare(&self, options: &FormatOptions<'a>) -> Result<Self::State,
ArrowError> {
+ let fields = match (*self).data_type() {
+ DataType::Struct(f) => f,
+ _ => unreachable!(),
+ };
- Ok(s)
+ self.columns()
+ .iter()
+ .zip(fields)
+ .map(|(a, f)| {
+ let format = make_formatter(a.as_ref(), options)?;
+ Ok((f.name().as_str(), format))
+ })
+ .collect()
+ }
+
+ fn write(&self, s: &Self::State, idx: usize, f: &mut dyn Write) ->
FormatResult {
+ let mut iter = s.iter();
+ f.write_char('{')?;
+ if let Some((name, display)) = iter.next() {
+ write!(f, "{name}: ")?;
+ display.as_ref().write(idx, f)?;
}
- DataType::Union(field_vec, type_ids, mode) => {
- union_to_string(column, row_idx, field_vec, type_ids, mode)
+ for (name, display) in iter {
+ write!(f, ", {name}: ")?;
+ display.as_ref().write(idx, f)?;
}
- DataType::Duration(unit) => match *unit {
- TimeUnit::Second => {
- make_string_from_duration!(
- array::DurationSecondArray,
- "Duration",
- column,
- col_idx,
- row_idx
- )
- }
- TimeUnit::Millisecond => {
- make_string_from_duration!(
- array::DurationMillisecondArray,
- "Duration",
- column,
- col_idx,
- row_idx
- )
- }
- TimeUnit::Microsecond => {
- make_string_from_duration!(
- array::DurationMicrosecondArray,
- "Duration",
- column,
- col_idx,
- row_idx
- )
- }
- TimeUnit::Nanosecond => {
- make_string_from_duration!(
- array::DurationNanosecondArray,
- "Duration",
- column,
- col_idx,
- row_idx
- )
- }
- },
- _ => Err(ArrowError::InvalidArgumentError(format!(
- "Pretty printing not implemented for {:?} type",
- column.data_type()
- ))),
+ f.write_char('}')?;
+ Ok(())
}
}
-pub fn temporal_array_value_to_string(
- column: &ArrayRef,
- col_idx: usize,
- row_idx: usize,
- format: Option<&str>,
-) -> Result<String, ArrowError> {
- array_value_to_string_internal(column, col_idx, row_idx, format)
-}
+impl<'a> DisplayIndexState<'a> for &'a MapArray {
+ type State = (Box<dyn DisplayIndex + 'a>, Box<dyn DisplayIndex + 'a>);
-pub fn array_value_to_string(
- column: &ArrayRef,
- row_idx: usize,
-) -> Result<String, ArrowError> {
- array_value_to_string_internal(column, 0, row_idx, None)
-}
+ fn prepare(&self, options: &FormatOptions<'a>) -> Result<Self::State,
ArrowError> {
+ let keys = make_formatter(self.keys().as_ref(), options)?;
+ let values = make_formatter(self.values().as_ref(), options)?;
+ Ok((keys, values))
+ }
-/// Converts the value of the union array at `row` to a String
-fn union_to_string(
- column: &ArrayRef,
- row: usize,
- fields: &[Field],
- type_ids: &[i8],
- mode: &UnionMode,
-) -> Result<String, ArrowError> {
- let list = column
- .as_any()
- .downcast_ref::<array::UnionArray>()
- .ok_or_else(|| {
- ArrowError::InvalidArgumentError(
- "Repl error: could not convert union column to union
array.".to_string(),
- )
- })?;
- let type_id = list.type_id(row);
- let field_idx = type_ids.iter().position(|t| t == &type_id).ok_or_else(|| {
- ArrowError::InvalidArgumentError(format!(
- "Repl error: could not get field name for type id: {type_id} in
union array.",
- ))
- })?;
- let name = fields.get(field_idx).unwrap().name();
-
- let value = array_value_to_string(
- list.child(type_id),
- match mode {
- UnionMode::Dense => list.value_offset(row) as usize,
- UnionMode::Sparse => row,
- },
- )?;
+ fn write(&self, s: &Self::State, idx: usize, f: &mut dyn Write) ->
FormatResult {
+ let offsets = self.value_offsets();
+ let end = offsets[idx + 1].as_usize();
+ let start = offsets[idx].as_usize();
+ let mut iter = start..end;
+
+ f.write_char('{')?;
+ if let Some(idx) = iter.next() {
+ s.0.write(idx, f)?;
+ write!(f, ": ")?;
+ s.1.write(idx, f)?;
+ }
+
+ for idx in iter {
+ write!(f, ", ")?;
+ s.0.write(idx, f)?;
+ write!(f, ": ")?;
+ s.1.write(idx, f)?;
+ }
- Ok(format!("{{{name}={value}}}"))
+ f.write_char('}')?;
+ Ok(())
+ }
}
-/// Converts the value of the dictionary array at `row` to a String
-fn dict_array_value_to_string<K: ArrowPrimitiveType>(
- colum: &ArrayRef,
- row: usize,
-) -> Result<String, ArrowError> {
- let dict_array =
colum.as_any().downcast_ref::<DictionaryArray<K>>().unwrap();
- let keys_array = dict_array.keys();
+impl<'a> DisplayIndexState<'a> for &'a UnionArray {
+ type State = (
+ Vec<Option<(&'a str, Box<dyn DisplayIndex + 'a>)>>,
+ UnionMode,
+ );
- if keys_array.is_null(row) {
- return Ok(String::from(""));
+ fn prepare(&self, options: &FormatOptions<'a>) -> Result<Self::State,
ArrowError> {
+ let (fields, type_ids, mode) = match (*self).data_type() {
+ DataType::Union(fields, type_ids, mode) => (fields, type_ids,
mode),
+ _ => unreachable!(),
+ };
+
+ let max_id = type_ids.iter().copied().max().unwrap_or_default() as
usize;
+ let mut out: Vec<Option<FieldDisplay>> = (0..max_id + 1).map(|_|
None).collect();
+ for (i, field) in type_ids.iter().zip(fields) {
+ let formatter = make_formatter(self.child(*i).as_ref(), options)?;
+ out[*i as usize] = Some((field.name().as_str(), formatter))
+ }
+ Ok((out, *mode))
+ }
+
+ fn write(&self, s: &Self::State, idx: usize, f: &mut dyn Write) ->
FormatResult {
+ let id = self.type_id(idx);
+ let idx = match s.1 {
+ UnionMode::Dense => self.value_offset(idx) as usize,
+ UnionMode::Sparse => idx,
+ };
+ let (name, field) = s.0[id as usize].as_ref().unwrap();
+
+ write!(f, "{{{name}=")?;
+ field.write(idx, f)?;
+ f.write_char('}')?;
+ Ok(())
}
+}
- let dict_index = keys_array.value(row).as_usize();
- array_value_to_string(dict_array.values(), dict_index)
+/// Get the value at the given row in an array as a String.
+///
+/// Note this function is quite inefficient and is unlikely to be
+/// suitable for converting large arrays or record batches.
Review Comment:
```suggestion
/// suitable for converting large arrays or record batches.
///
/// Please see [`ArrayFormatter`] for a better interface
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]