This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new a3b344de3 Lazy array display (#3638) (#3647)
a3b344de3 is described below
commit a3b344de39dd5652f1216b0497e15ca263b7d648
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Wed Feb 8 16:58:01 2023 +0000
Lazy array display (#3638) (#3647)
* Lazy array display (#3638)
* Update CSV writer
* Borrow
* Time formatting
* Update pretty
* Add FixedSizeBinaryArray
* Further tweaks
* Clippy
* More clippy
* More tweaks
* More clippy
* Clippy
* Use lexical_core
* Update doctest
* Review feedback
* Bump CI
* Review feedback
---
arrow-cast/src/display.rs | 1378 ++++++++++++++++--------------
arrow-csv/src/writer.rs | 213 ++---
arrow-json/src/writer.rs | 88 +-
arrow-schema/src/datatype.rs | 2 +-
arrow/src/util/pretty.rs | 152 ++--
arrow/tests/csv.rs | 2 +-
parquet/src/arrow/arrow_writer/levels.rs | 4 +-
parquet/src/arrow/arrow_writer/mod.rs | 22 +-
parquet/src/arrow/async_reader/mod.rs | 8 +-
9 files changed, 873 insertions(+), 996 deletions(-)
diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs
index bd4829898..6e06a0e39 100644
--- a/arrow-cast/src/display.rs
+++ b/arrow-cast/src/display.rs
@@ -19,56 +19,553 @@
//! purposes. See the `pretty` crate for additional functions for
//! record batch pretty printing.
-use std::fmt::Write;
+use std::fmt::{Display, Formatter, Write};
+use std::ops::Range;
+use arrow_array::cast::*;
+use arrow_array::temporal_conversions::*;
use arrow_array::timezone::Tz;
use arrow_array::types::*;
use arrow_array::*;
use arrow_buffer::ArrowNativeType;
use arrow_schema::*;
-use chrono::prelude::SecondsFormat;
-use chrono::{DateTime, Utc};
+use chrono::{NaiveDate, NaiveDateTime, SecondsFormat, TimeZone, Utc};
+use lexical_core::FormattedSize;
-fn invalid_cast_error(dt: &str, col_idx: usize, row_idx: usize) -> ArrowError {
- ArrowError::CastError(format!(
- "Cannot cast to {dt} at col index: {col_idx} row index: {row_idx}"
- ))
+type TimeFormat<'a> = Option<&'a str>;
+
+/// Options for formatting arrays
+///
+/// By default nulls are formatted as `""` and temporal types formatted
+/// according to RFC3339
+///
+#[derive(Debug, Clone)]
+pub struct FormatOptions<'a> {
+ /// If set to `true` any formatting errors will be written to the output
+ /// instead of being converted into a [`std::fmt::Error`]
+ safe: bool,
+ /// Format string for nulls
+ null: &'a str,
+ /// Date format for date arrays
+ date_format: TimeFormat<'a>,
+ /// Format for DateTime arrays
+ datetime_format: TimeFormat<'a>,
+ /// Timestamp format for timestamp arrays
+ timestamp_format: TimeFormat<'a>,
+ /// Timestamp format for timestamp with timezone arrays
+ timestamp_tz_format: TimeFormat<'a>,
+ /// Time format for time arrays
+ time_format: TimeFormat<'a>,
+}
+
+impl<'a> Default for FormatOptions<'a> {
+ fn default() -> Self {
+ Self {
+ safe: true,
+ null: "",
+ date_format: None,
+ datetime_format: None,
+ timestamp_format: None,
+ timestamp_tz_format: None,
+ time_format: None,
+ }
+ }
+}
+
+impl<'a> FormatOptions<'a> {
+ /// If set to `true` any formatting errors will be written to the output
+ /// instead of being converted into a [`std::fmt::Error`]
+ pub fn with_display_error(mut self, safe: bool) -> Self {
+ self.safe = safe;
+ self
+ }
+
+ /// Overrides the string used to represent a null
+ ///
+ /// Defaults to `""`
+ pub fn with_null(self, null: &'a str) -> Self {
+ Self { null, ..self }
+ }
+
+ /// Overrides the format used for [`DataType::Date32`] columns
+ pub fn with_date_format(self, date_format: Option<&'a str>) -> Self {
+ Self {
+ date_format,
+ ..self
+ }
+ }
+
+ /// Overrides the format used for [`DataType::Date64`] columns
+ pub fn with_datetime_format(self, datetime_format: Option<&'a str>) ->
Self {
+ Self {
+ datetime_format,
+ ..self
+ }
+ }
+
+ /// Overrides the format used for [`DataType::Timestamp`] columns without a timezone
+ pub fn with_timestamp_format(self, timestamp_format: Option<&'a str>) ->
Self {
+ Self {
+ timestamp_format,
+ ..self
+ }
+ }
+
+ /// Overrides the format used for [`DataType::Timestamp`] columns with a timezone
+ pub fn with_timestamp_tz_format(self, timestamp_tz_format: Option<&'a
str>) -> Self {
+ Self {
+ timestamp_tz_format,
+ ..self
+ }
+ }
+
+ /// Overrides the format used for [`DataType::Time32`] and [`DataType::Time64`] columns
+ pub fn with_time_format(self, time_format: Option<&'a str>) -> Self {
+ Self {
+ time_format,
+ ..self
+ }
+ }
+}
+
+/// Implements [`Display`] for a specific array value
+pub struct ValueFormatter<'a> {
+ idx: usize,
+ formatter: &'a ArrayFormatter<'a>,
+}
+
+impl<'a> ValueFormatter<'a> {
+ /// Writes this value to the provided [`Write`]
+ ///
+ /// Note: this ignores [`FormatOptions::with_display_error`] and
+ /// will return an error on formatting issue
+ pub fn write(&self, s: &mut dyn Write) -> Result<(), ArrowError> {
+ match self.formatter.format.write(self.idx, s) {
+ Ok(_) => Ok(()),
+ Err(FormatError::Arrow(e)) => Err(e),
+ Err(FormatError::Format(_)) => {
+ Err(ArrowError::CastError("Format error".to_string()))
+ }
+ }
+ }
+
+ /// Fallibly converts this to a string
+ pub fn try_to_string(&self) -> Result<String, ArrowError> {
+ let mut s = String::new();
+ self.write(&mut s)?;
+ Ok(s)
+ }
+}
+
+impl<'a> Display for ValueFormatter<'a> {
+ fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+ match self.formatter.format.write(self.idx, f) {
+ Ok(()) => Ok(()),
+ Err(FormatError::Arrow(e)) if self.formatter.safe => {
+ write!(f, "ERROR: {e}")
+ }
+ Err(_) => Err(std::fmt::Error),
+ }
+ }
+}
+
+/// A string formatter for an [`Array`]
+///
+/// This can be used with [`std::write`] to write type-erased `dyn Array`
+///
+/// ```
+/// # use std::fmt::{Display, Formatter, Write};
+/// # use arrow_array::{Array, ArrayRef, Int32Array};
+/// # use arrow_cast::display::{ArrayFormatter, FormatOptions};
+/// # use arrow_schema::ArrowError;
+/// struct MyContainer {
+/// values: ArrayRef,
+/// }
+///
+/// impl Display for MyContainer {
+/// fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+/// let options = FormatOptions::default();
+/// let formatter = ArrayFormatter::try_new(self.values.as_ref(),
&options)
+/// .map_err(|_| std::fmt::Error)?;
+///
+/// let mut iter = 0..self.values.len();
+/// if let Some(idx) = iter.next() {
+/// write!(f, "{}", formatter.value(idx))?;
+/// }
+/// for idx in iter {
+/// write!(f, ", {}", formatter.value(idx))?;
+/// }
+/// Ok(())
+/// }
+/// }
+/// ```
+///
+/// [`ValueFormatter::write`] can also be used to get a semantic error, instead of the
+/// opaque [`std::fmt::Error`]
+///
+/// ```
+/// # use std::fmt::Write;
+/// # use arrow_array::Array;
+/// # use arrow_cast::display::{ArrayFormatter, FormatOptions};
+/// # use arrow_schema::ArrowError;
+/// fn format_array(
+/// f: &mut dyn Write,
+/// array: &dyn Array,
+/// options: &FormatOptions,
+/// ) -> Result<(), ArrowError> {
+/// let formatter = ArrayFormatter::try_new(array, options)?;
+/// for i in 0..array.len() {
+/// formatter.value(i).write(f)?
+/// }
+/// Ok(())
+/// }
+/// ```
+///
+pub struct ArrayFormatter<'a> {
+ format: Box<dyn DisplayIndex + 'a>,
+ safe: bool,
+}
+
+impl<'a> ArrayFormatter<'a> {
+ /// Returns an [`ArrayFormatter`] that can be used to format `array`
+ ///
+ /// This returns an error if an array of the given data type cannot be formatted
+ pub fn try_new(
+ array: &'a dyn Array,
+ options: &FormatOptions<'a>,
+ ) -> Result<Self, ArrowError> {
+ Ok(Self {
+ format: make_formatter(array, options)?,
+ safe: options.safe,
+ })
+ }
+
+ /// Returns a [`ValueFormatter`] that implements [`Display`] for
+ /// the value of the array at `idx`
+ pub fn value(&self, idx: usize) -> ValueFormatter<'_> {
+ ValueFormatter {
+ formatter: self,
+ idx,
+ }
+ }
+}
+
+fn make_formatter<'a>(
+ array: &'a dyn Array,
+ options: &FormatOptions<'a>,
+) -> Result<Box<dyn DisplayIndex + 'a>, ArrowError> {
+ downcast_primitive_array! {
+ array => array_format(array, options),
+ DataType::Null => array_format(as_null_array(array), options),
+ DataType::Boolean => array_format(as_boolean_array(array), options),
+ DataType::Utf8 => array_format(as_string_array(array), options),
+ DataType::LargeUtf8 => array_format(as_largestring_array(array),
options),
+ DataType::Binary =>
array_format(as_generic_binary_array::<i32>(array), options),
+ DataType::LargeBinary =>
array_format(as_generic_binary_array::<i64>(array), options),
+ DataType::FixedSizeBinary(_) => {
+ let a =
array.as_any().downcast_ref::<FixedSizeBinaryArray>().unwrap();
+ array_format(a, options)
+ }
+ DataType::Dictionary(_, _) => downcast_dictionary_array! {
+ array => array_format(array, options),
+ _ => unreachable!()
+ }
+ DataType::List(_) => array_format(as_generic_list_array::<i32>(array),
options),
+ DataType::LargeList(_) =>
array_format(as_generic_list_array::<i64>(array), options),
+ DataType::FixedSizeList(_, _) => {
+ let a =
array.as_any().downcast_ref::<FixedSizeListArray>().unwrap();
+ array_format(a, options)
+ }
+ DataType::Struct(_) => array_format(as_struct_array(array), options),
+ DataType::Map(_, _) => array_format(as_map_array(array), options),
+ DataType::Union(_, _, _) => array_format(as_union_array(array),
options),
+ d => Err(ArrowError::NotYetImplemented(format!("formatting {d} is not
yet supported"))),
+ }
+}
+
+/// Either an [`ArrowError`] or [`std::fmt::Error`]
+enum FormatError {
+ Format(std::fmt::Error),
+ Arrow(ArrowError),
+}
+
+type FormatResult = Result<(), FormatError>;
+
+impl From<std::fmt::Error> for FormatError {
+ fn from(value: std::fmt::Error) -> Self {
+ Self::Format(value)
+ }
+}
+
+impl From<ArrowError> for FormatError {
+ fn from(value: ArrowError) -> Self {
+ Self::Arrow(value)
+ }
+}
+
+/// [`Display`] but accepting an index
+trait DisplayIndex {
+ fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult;
+}
+
+/// [`DisplayIndex`] with additional state
+trait DisplayIndexState<'a> {
+ type State;
+
+ fn prepare(&self, options: &FormatOptions<'a>) -> Result<Self::State,
ArrowError>;
+
+ fn write(&self, state: &Self::State, idx: usize, f: &mut dyn Write) ->
FormatResult;
+}
+
+impl<'a, T: DisplayIndex> DisplayIndexState<'a> for T {
+ type State = ();
+
+ fn prepare(&self, _options: &FormatOptions<'a>) -> Result<Self::State,
ArrowError> {
+ Ok(())
+ }
+
+ fn write(&self, _: &Self::State, idx: usize, f: &mut dyn Write) ->
FormatResult {
+ DisplayIndex::write(self, idx, f)
+ }
+}
+
+struct ArrayFormat<'a, F: DisplayIndexState<'a>> {
+ state: F::State,
+ array: F,
+ null: &'a str,
+}
+
+fn array_format<'a, F>(
+ array: F,
+ options: &FormatOptions<'a>,
+) -> Result<Box<dyn DisplayIndex + 'a>, ArrowError>
+where
+ F: DisplayIndexState<'a> + Array + 'a,
+{
+ let state = array.prepare(options)?;
+ Ok(Box::new(ArrayFormat {
+ state,
+ array,
+ null: options.null,
+ }))
+}
+
+impl<'a, F: DisplayIndexState<'a> + Array> DisplayIndex for ArrayFormat<'a, F>
{
+ fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult {
+ if self.array.is_null(idx) {
+ if !self.null.is_empty() {
+ f.write_str(self.null)?
+ }
+ return Ok(());
+ }
+ DisplayIndexState::write(&self.array, &self.state, idx, f)
+ }
+}
+
+impl<'a> DisplayIndex for &'a BooleanArray {
+ fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult {
+ write!(f, "{}", self.value(idx))?;
+ Ok(())
+ }
+}
+
+impl<'a> DisplayIndex for &'a NullArray {
+ fn write(&self, _idx: usize, _f: &mut dyn Write) -> FormatResult {
+ Ok(())
+ }
+}
+
+macro_rules! primitive_display {
+ ($($t:ty),+) => {
+ $(impl<'a> DisplayIndex for &'a PrimitiveArray<$t>
+ {
+ fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult {
+ let value = self.value(idx);
+ let mut buffer = [0u8; <$t as
ArrowPrimitiveType>::Native::FORMATTED_SIZE];
+ // SAFETY:
+ // buffer is T::FORMATTED_SIZE
+ let b = unsafe { lexical_core::write_unchecked(value, &mut
buffer) };
+ // Lexical core produces valid UTF-8
+ let s = unsafe { std::str::from_utf8_unchecked(b) };
+ f.write_str(s)?;
+ Ok(())
+ }
+ })+
+ };
}
-macro_rules! make_string {
- ($array_type:ty, $column: ident, $row: ident) => {{
- let array = $column.as_any().downcast_ref::<$array_type>().unwrap();
+primitive_display!(Int8Type, Int16Type, Int32Type, Int64Type);
+primitive_display!(UInt8Type, UInt16Type, UInt32Type, UInt64Type);
+primitive_display!(Float32Type, Float64Type);
- Ok(array.value($row).to_string())
- }};
+impl<'a> DisplayIndex for &'a PrimitiveArray<Float16Type> {
+ fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult {
+ write!(f, "{}", self.value(idx))?;
+ Ok(())
+ }
+}
+
+macro_rules! decimal_display {
+ ($($t:ty),+) => {
+ $(impl<'a> DisplayIndexState<'a> for &'a PrimitiveArray<$t> {
+ type State = (u8, i8);
+
+ fn prepare(&self, _options: &FormatOptions<'a>) ->
Result<Self::State, ArrowError> {
+ Ok((self.precision(), self.scale()))
+ }
+
+ fn write(&self, s: &Self::State, idx: usize, f: &mut dyn Write) ->
FormatResult {
+ write!(f, "{}", <$t>::format_decimal(self.values()[idx], s.0,
s.1))?;
+ Ok(())
+ }
+ })+
+ };
+}
+
+decimal_display!(Decimal128Type, Decimal256Type);
+
+fn write_timestamp(
+ f: &mut dyn Write,
+ naive: NaiveDateTime,
+ timezone: Option<Tz>,
+ format: Option<&str>,
+) -> FormatResult {
+ match timezone {
+ Some(tz) => {
+ let date = Utc.from_utc_datetime(&naive).with_timezone(&tz);
+ match format {
+ Some(s) => write!(f, "{}", date.format(s))?,
+ None => {
+ write!(f, "{}",
date.to_rfc3339_opts(SecondsFormat::AutoSi, true))?
+ }
+ }
+ }
+ None => match format {
+ Some(s) => write!(f, "{}", naive.format(s))?,
+ None => write!(f, "{naive:?}")?,
+ },
+ }
+ Ok(())
}
-macro_rules! make_string_interval_year_month {
- ($column: ident, $row: ident) => {{
- let array = $column
- .as_any()
- .downcast_ref::<array::IntervalYearMonthArray>()
- .unwrap();
+macro_rules! timestamp_display {
+ ($($t:ty),+) => {
+ $(impl<'a> DisplayIndexState<'a> for &'a PrimitiveArray<$t> {
+ type State = (Option<Tz>, TimeFormat<'a>);
- let interval = array.value($row) as f64;
+ fn prepare(&self, options: &FormatOptions<'a>) ->
Result<Self::State, ArrowError> {
+ match self.data_type() {
+ DataType::Timestamp(_, Some(tz)) => Ok((Some(tz.parse()?),
options.timestamp_tz_format)),
+ DataType::Timestamp(_, None) => Ok((None,
options.timestamp_format)),
+ _ => unreachable!(),
+ }
+ }
+
+ fn write(&self, s: &Self::State, idx: usize, f: &mut dyn Write) ->
FormatResult {
+ let value = self.value(idx);
+ let naive = as_datetime::<$t>(value).ok_or_else(|| {
+ ArrowError::CastError(format!(
+ "Failed to convert {} to datetime for {}",
+ value,
+ self.data_type()
+ ))
+ })?;
+
+ write_timestamp(f, naive, s.0, s.1.clone())
+ }
+ })+
+ };
+}
+
+timestamp_display!(
+ TimestampSecondType,
+ TimestampMillisecondType,
+ TimestampMicrosecondType,
+ TimestampNanosecondType
+);
+
+macro_rules! temporal_display {
+ ($convert:ident, $format:ident, $t:ty) => {
+ impl<'a> DisplayIndexState<'a> for &'a PrimitiveArray<$t> {
+ type State = TimeFormat<'a>;
+
+ fn prepare(
+ &self,
+ options: &FormatOptions<'a>,
+ ) -> Result<Self::State, ArrowError> {
+ Ok(options.$format)
+ }
+
+ fn write(
+ &self,
+ fmt: &Self::State,
+ idx: usize,
+ f: &mut dyn Write,
+ ) -> FormatResult {
+ let value = self.value(idx);
+ let naive = $convert(value as _).ok_or_else(|| {
+ ArrowError::CastError(format!(
+ "Failed to convert {} to temporal for {}",
+ value,
+ self.data_type()
+ ))
+ })?;
+
+ match fmt {
+ Some(s) => write!(f, "{}", naive.format(s))?,
+ None => write!(f, "{naive:?}")?,
+ }
+ Ok(())
+ }
+ }
+ };
+}
+
+#[inline]
+fn date32_to_date(value: i32) -> Option<NaiveDate> {
+ Some(date32_to_datetime(value)?.date())
+}
+
+temporal_display!(date32_to_date, date_format, Date32Type);
+temporal_display!(date64_to_datetime, datetime_format, Date64Type);
+temporal_display!(time32s_to_time, time_format, Time32SecondType);
+temporal_display!(time32ms_to_time, time_format, Time32MillisecondType);
+temporal_display!(time64us_to_time, time_format, Time64MicrosecondType);
+temporal_display!(time64ns_to_time, time_format, Time64NanosecondType);
+
+macro_rules! duration_display {
+ ($convert:ident, $t:ty) => {
+ impl<'a> DisplayIndex for &'a PrimitiveArray<$t> {
+ fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult {
+ write!(f, "{}", $convert(self.value(idx)))?;
+ Ok(())
+ }
+ }
+ };
+}
+
+duration_display!(duration_s_to_duration, DurationSecondType);
+duration_display!(duration_ms_to_duration, DurationMillisecondType);
+duration_display!(duration_us_to_duration, DurationMicrosecondType);
+duration_display!(duration_ns_to_duration, DurationNanosecondType);
+
+impl<'a> DisplayIndex for &'a PrimitiveArray<IntervalYearMonthType> {
+ fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult {
+ let interval = self.value(idx) as f64;
let years = (interval / 12_f64).floor();
let month = interval - (years * 12_f64);
- Ok(format!(
- "{} years {} mons 0 days 0 hours 0 mins 0.00 secs",
- years, month,
- ))
- }};
+ write!(
+ f,
+ "{years} years {month} mons 0 days 0 hours 0 mins 0.00 secs",
+ )?;
+ Ok(())
+ }
}
-macro_rules! make_string_interval_day_time {
- ($column: ident, $row: ident) => {{
- let array = $column
- .as_any()
- .downcast_ref::<array::IntervalDayTimeArray>()
- .unwrap();
-
- let value: u64 = array.value($row) as u64;
+impl<'a> DisplayIndex for &'a PrimitiveArray<IntervalDayTimeType> {
+ fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult {
+ let value: u64 = self.value(idx) as u64;
let days_parts: i32 = ((value & 0xFFFFFFFF00000000) >> 32) as i32;
let milliseconds_part: i32 = (value & 0xFFFFFFFF) as i32;
@@ -88,7 +585,8 @@ macro_rules! make_string_interval_day_time {
""
};
- Ok(format!(
+ write!(
+ f,
"0 years 0 mons {} days {} hours {} mins {}{}.{:03} secs",
days_parts,
hours,
@@ -96,18 +594,14 @@ macro_rules! make_string_interval_day_time {
secs_sign,
secs.abs(),
milliseconds.abs(),
- ))
- }};
+ )?;
+ Ok(())
+ }
}
-macro_rules! make_string_interval_month_day_nano {
- ($column: ident, $row: ident) => {{
- let array = $column
- .as_any()
- .downcast_ref::<array::IntervalMonthDayNanoArray>()
- .unwrap();
-
- let value: u128 = array.value($row) as u128;
+impl<'a> DisplayIndex for &'a PrimitiveArray<IntervalMonthDayNanoType> {
+ fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult {
+ let value: u128 = self.value(idx) as u128;
let months_part: i32 =
((value & 0xFFFFFFFF000000000000000000000000) >> 96) as i32;
@@ -125,7 +619,8 @@ macro_rules! make_string_interval_month_day_nano {
let secs_sign = if secs < 0 || nanoseconds < 0 { "-" } else { "" };
- Ok(format!(
+ write!(
+ f,
"0 years {} mons {} days {} hours {} mins {}{}.{:09} secs",
months_part,
days_part,
@@ -134,657 +629,220 @@ macro_rules! make_string_interval_month_day_nano {
secs_sign,
secs.abs(),
nanoseconds.abs(),
- ))
- }};
-}
-
-macro_rules! make_string_date {
- ($array_type:ty, $dt:expr, $column: ident, $col_idx:ident, $row_idx:
ident) => {{
- Ok($column
- .as_any()
- .downcast_ref::<$array_type>()
- .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?
- .value_as_date($row_idx)
- .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?
- .to_string())
- }};
-}
-
-macro_rules! make_string_date_with_format {
- ($array_type:ty, $dt:expr, $format: ident, $column: ident, $col_idx:ident,
$row_idx: ident) => {{
- Ok($column
- .as_any()
- .downcast_ref::<$array_type>()
- .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?
- .value_as_datetime($row_idx)
- .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?
- .format($format)
- .to_string())
- }};
-}
-
-macro_rules! handle_string_date {
- ($array_type:ty, $dt:expr, $format: ident, $column: ident, $col_idx:ident,
$row_idx: ident) => {{
- match $format {
- Some(format) => {
- make_string_date_with_format!(
- $array_type,
- $dt,
- format,
- $column,
- $col_idx,
- $row_idx
- )
- }
- None => make_string_date!($array_type, $dt, $column, $col_idx,
$row_idx),
- }
- }};
-}
-
-macro_rules! make_string_time {
- ($array_type:ty, $dt:expr, $column: ident, $col_idx:ident, $row_idx:
ident) => {{
- Ok($column
- .as_any()
- .downcast_ref::<$array_type>()
- .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?
- .value_as_time($row_idx)
- .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?
- .to_string())
- }};
-}
-
-macro_rules! make_string_time_with_format {
- ($array_type:ty, $dt:expr, $format: ident, $column: ident, $col_idx:ident,
$row_idx: ident) => {{
- Ok($column
- .as_any()
- .downcast_ref::<$array_type>()
- .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?
- .value_as_time($row_idx)
- .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?
- .format($format)
- .to_string())
- }};
-}
-
-macro_rules! handle_string_time {
- ($array_type:ty, $dt:expr, $format: ident, $column: ident, $col_idx:ident,
$row_idx: ident) => {
- match $format {
- Some(format) => {
- make_string_time_with_format!(
- $array_type,
- $dt,
- format,
- $column,
- $col_idx,
- $row_idx
- )
- }
- None => make_string_time!($array_type, $dt, $column, $col_idx,
$row_idx),
- }
- };
+ )?;
+ Ok(())
+ }
}
-macro_rules! make_string_datetime {
- ($array_type:ty, $dt:expr, $tz_string: ident, $column: ident,
$col_idx:ident, $row_idx: ident) => {{
- let array = $column
- .as_any()
- .downcast_ref::<$array_type>()
- .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?;
-
- let s = match $tz_string {
- Some(tz_string) => match tz_string.parse::<Tz>() {
- Ok(tz) => array
- .value_as_datetime_with_tz($row_idx, tz)
- .ok_or_else(|| invalid_cast_error($dt, $col_idx,
$row_idx))?
- .to_rfc3339_opts(SecondsFormat::AutoSi, true)
- .to_string(),
- Err(_) => {
- let datetime = array
- .value_as_datetime($row_idx)
- .ok_or_else(|| invalid_cast_error($dt, $col_idx,
$row_idx))?;
- format!("{:?} (Unknown Time Zone '{}')", datetime,
tz_string)
- }
- },
- None => {
- let datetime = array
- .value_as_datetime($row_idx)
- .ok_or_else(|| invalid_cast_error($dt, $col_idx,
$row_idx))?;
- format!("{:?}", datetime)
- }
- };
-
- Ok(s)
- }};
-}
-
-macro_rules! make_string_datetime_with_format {
- ($array_type:ty, $dt:expr, $format: ident, $tz_string: ident, $column:
ident, $col_idx:ident, $row_idx: ident) => {{
- let array = $column
- .as_any()
- .downcast_ref::<$array_type>()
- .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?;
- let datetime = array
- .value_as_datetime($row_idx)
- .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?;
-
- let s = match $tz_string {
- Some(tz_string) => match tz_string.parse::<Tz>() {
- Ok(tz) => {
- let utc_time = DateTime::<Utc>::from_utc(datetime, Utc);
- let local_time = utc_time.with_timezone(&tz);
- local_time.format($format).to_string()
- }
- Err(_) => {
- format!("{:?} (Unknown Time Zone '{}')", datetime,
tz_string)
- }
- },
- None => datetime.format($format).to_string(),
- };
+impl<'a, O: OffsetSizeTrait> DisplayIndex for &'a GenericStringArray<O> {
+ fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult {
+ write!(f, "{}", self.value(idx))?;
+ Ok(())
+ }
+}
- Ok(s)
- }};
-}
-
-macro_rules! handle_string_datetime {
- ($array_type:ty, $dt:expr, $format: ident, $tz_string: ident, $column:
ident, $col_idx:ident, $row_idx: ident) => {
- match $format {
- Some(format) => make_string_datetime_with_format!(
- $array_type,
- $dt,
- format,
- $tz_string,
- $column,
- $col_idx,
- $row_idx
- ),
- None => make_string_datetime!(
- $array_type,
- $dt,
- $tz_string,
- $column,
- $col_idx,
- $row_idx
- ),
+impl<'a, O: OffsetSizeTrait> DisplayIndex for &'a GenericBinaryArray<O> {
+ fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult {
+ let v = self.value(idx);
+ for byte in v {
+ write!(f, "{byte:02x}")?;
}
- };
+ Ok(())
+ }
}
-// It's not possible to do array.value($row).to_string() for &[u8], let's format it as hex
-macro_rules! make_string_hex {
- ($array_type:ty, $column: ident, $row: ident) => {{
- let array = $column.as_any().downcast_ref::<$array_type>().unwrap();
+impl<'a> DisplayIndex for &'a FixedSizeBinaryArray {
+ fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult {
+ let v = self.value(idx);
+ for byte in v {
+ write!(f, "{byte:02x}")?;
+ }
+ Ok(())
+ }
+}
- let mut tmp = "".to_string();
+impl<'a, K: ArrowDictionaryKeyType> DisplayIndexState<'a> for &'a
DictionaryArray<K> {
+ type State = Box<dyn DisplayIndex + 'a>;
- for character in array.value($row) {
- let _ = write!(tmp, "{:02x}", character);
- }
+ fn prepare(&self, options: &FormatOptions<'a>) -> Result<Self::State,
ArrowError> {
+ make_formatter(self.values().as_ref(), options)
+ }
- Ok(tmp)
- }};
-}
-
-macro_rules! make_string_from_list {
- ($column: ident, $row: ident) => {{
- let list = $column
- .as_any()
- .downcast_ref::<array::ListArray>()
- .ok_or(ArrowError::InvalidArgumentError(format!(
- "Repl error: could not convert list column to list array."
- )))?
- .value($row);
- let string_values = (0..list.len())
- .map(|i| array_value_to_string(&list.clone(), i))
- .collect::<Result<Vec<_>, _>>()?;
- Ok(format!("[{}]", string_values.join(", ")))
- }};
-}
-
-macro_rules! make_string_from_large_list {
- ($column: ident, $row: ident) => {{
- let list = $column
- .as_any()
- .downcast_ref::<array::LargeListArray>()
- .ok_or(ArrowError::InvalidArgumentError(format!(
- "Repl error: could not convert large list column to list
array."
- )))?
- .value($row);
- let string_values = (0..list.len())
- .map(|i| array_value_to_string(&list, i))
- .collect::<Result<Vec<_>, _>>()?;
- Ok(format!("[{}]", string_values.join(", ")))
- }};
-}
-
-macro_rules! make_string_from_fixed_size_list {
- ($column: ident, $row: ident) => {{
- let list = $column
- .as_any()
- .downcast_ref::<array::FixedSizeListArray>()
- .ok_or(ArrowError::InvalidArgumentError(format!(
- "Repl error: could not convert list column to list array."
- )))?
- .value($row);
- let string_values = (0..list.len())
- .map(|i| array_value_to_string(&list.clone(), i))
- .collect::<Result<Vec<_>, _>>()?;
- Ok(format!("[{}]", string_values.join(", ")))
- }};
-}
-
-macro_rules! make_string_from_duration {
- ($array_type:ty, $dt:expr, $column:ident, $col_idx:ident, $row_idx: ident)
=> {{
- Ok($column
- .as_any()
- .downcast_ref::<$array_type>()
- .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?
- .value_as_duration($row_idx)
- .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?
- .to_string())
- }};
-}
-
-#[inline(always)]
-pub fn make_string_from_decimal(
- column: &dyn Array,
- row: usize,
-) -> Result<String, ArrowError> {
- let array = column.as_any().downcast_ref::<Decimal128Array>().unwrap();
+ fn write(&self, s: &Self::State, idx: usize, f: &mut dyn Write) ->
FormatResult {
+ let value_idx = self.keys().values()[idx].as_usize();
+ s.as_ref().write(value_idx, f)
+ }
+}
- Ok(array.value_as_string(row))
+fn write_list(
+ f: &mut dyn Write,
+ mut range: Range<usize>,
+ values: &dyn DisplayIndex,
+) -> FormatResult {
+ f.write_char('[')?;
+ if let Some(idx) = range.next() {
+ values.write(idx, f)?;
+ }
+ for idx in range {
+ write!(f, ", ")?;
+ values.write(idx, f)?;
+ }
+ f.write_char(']')?;
+ Ok(())
}
-fn append_struct_field_string(
- target: &mut String,
- name: &str,
- field_col: &dyn Array,
- row: usize,
-) -> Result<(), ArrowError> {
- target.push('"');
- target.push_str(name);
- target.push_str("\": ");
-
- if field_col.is_null(row) {
- target.push_str("null");
- } else {
- match field_col.data_type() {
- DataType::Utf8 | DataType::LargeUtf8 => {
- target.push('"');
- target.push_str(array_value_to_string(field_col,
row)?.as_str());
- target.push('"');
- }
- _ => {
- target.push_str(array_value_to_string(field_col,
row)?.as_str());
- }
- }
+impl<'a, O: OffsetSizeTrait> DisplayIndexState<'a> for &'a GenericListArray<O>
{
+ type State = Box<dyn DisplayIndex + 'a>;
+
+ fn prepare(&self, options: &FormatOptions<'a>) -> Result<Self::State,
ArrowError> {
+ make_formatter(self.values().as_ref(), options)
}
- Ok(())
+ fn write(&self, s: &Self::State, idx: usize, f: &mut dyn Write) ->
FormatResult {
+ let offsets = self.value_offsets();
+ let end = offsets[idx + 1].as_usize();
+ let start = offsets[idx].as_usize();
+ write_list(f, start..end, s.as_ref())
+ }
}
-fn append_map_field_string(
- target: &mut String,
- field_col: &dyn Array,
- row: usize,
-) -> Result<(), ArrowError> {
- if field_col.is_null(row) {
- target.push_str("null");
- } else {
- match field_col.data_type() {
- DataType::Utf8 | DataType::LargeUtf8 => {
- target.push('"');
- target.push_str(array_value_to_string(field_col,
row)?.as_str());
- target.push('"');
- }
- _ => {
- target.push_str(array_value_to_string(field_col,
row)?.as_str());
- }
- }
+impl<'a> DisplayIndexState<'a> for &'a FixedSizeListArray {
+ type State = (usize, Box<dyn DisplayIndex + 'a>);
+
+ fn prepare(&self, options: &FormatOptions<'a>) -> Result<Self::State,
ArrowError> {
+ let values = make_formatter(self.values().as_ref(), options)?;
+ let length = self.value_length();
+ Ok((length as usize, values))
}
- Ok(())
+ fn write(&self, s: &Self::State, idx: usize, f: &mut dyn Write) ->
FormatResult {
+ let start = idx * s.0;
+ let end = start + s.0;
+ write_list(f, start..end, s.1.as_ref())
+ }
}
-/// Get the value at the given row in an array as a String.
-///
-/// Note this function is quite inefficient and is unlikely to be
-/// suitable for converting large arrays or record batches.
-fn array_value_to_string_internal(
- column: &dyn Array,
- col_idx: usize,
- row_idx: usize,
- format: Option<&str>,
-) -> Result<String, ArrowError> {
- if column.is_null(row_idx) {
- return Ok("".to_string());
- }
- match column.data_type() {
- DataType::Utf8 => make_string!(array::StringArray, column, row_idx),
- DataType::LargeUtf8 => make_string!(array::LargeStringArray, column,
row_idx),
- DataType::Binary => make_string_hex!(array::BinaryArray, column,
row_idx),
- DataType::LargeBinary => {
- make_string_hex!(array::LargeBinaryArray, column, row_idx)
- }
- DataType::FixedSizeBinary(_) => {
- make_string_hex!(array::FixedSizeBinaryArray, column, row_idx)
- }
- DataType::Boolean => make_string!(array::BooleanArray, column,
row_idx),
- DataType::Int8 => make_string!(array::Int8Array, column, row_idx),
- DataType::Int16 => make_string!(array::Int16Array, column, row_idx),
- DataType::Int32 => make_string!(array::Int32Array, column, row_idx),
- DataType::Int64 => make_string!(array::Int64Array, column, row_idx),
- DataType::UInt8 => make_string!(array::UInt8Array, column, row_idx),
- DataType::UInt16 => make_string!(array::UInt16Array, column, row_idx),
- DataType::UInt32 => make_string!(array::UInt32Array, column, row_idx),
- DataType::UInt64 => make_string!(array::UInt64Array, column, row_idx),
- DataType::Float16 => make_string!(array::Float16Array, column,
row_idx),
- DataType::Float32 => make_string!(array::Float32Array, column,
row_idx),
- DataType::Float64 => make_string!(array::Float64Array, column,
row_idx),
- DataType::Decimal128(..) => make_string_from_decimal(column, row_idx),
- DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Second
=> {
- handle_string_datetime!(
- array::TimestampSecondArray,
- "Timestamp",
- format,
- tz_string_opt,
- column,
- col_idx,
- row_idx
- )
- }
- DataType::Timestamp(unit, tz_string_opt) if *unit ==
TimeUnit::Millisecond => {
- handle_string_datetime!(
- array::TimestampMillisecondArray,
- "Timestamp",
- format,
- tz_string_opt,
- column,
- col_idx,
- row_idx
- )
- }
- DataType::Timestamp(unit, tz_string_opt) if *unit ==
TimeUnit::Microsecond => {
- handle_string_datetime!(
- array::TimestampMicrosecondArray,
- "Timestamp",
- format,
- tz_string_opt,
- column,
- col_idx,
- row_idx
- )
- }
- DataType::Timestamp(unit, tz_string_opt) if *unit ==
TimeUnit::Nanosecond => {
- handle_string_datetime!(
- array::TimestampNanosecondArray,
- "Timestamp",
- format,
- tz_string_opt,
- column,
- col_idx,
- row_idx
- )
- }
- DataType::Date32 => {
- handle_string_date!(
- array::Date32Array,
- "Date32",
- format,
- column,
- col_idx,
- row_idx
- )
- }
- DataType::Date64 => {
- handle_string_date!(
- array::Date64Array,
- "Date64",
- format,
- column,
- col_idx,
- row_idx
- )
- }
- DataType::Time32(unit) if *unit == TimeUnit::Second => {
- handle_string_time!(
- array::Time32SecondArray,
- "Time32",
- format,
- column,
- col_idx,
- row_idx
- )
- }
- DataType::Time32(unit) if *unit == TimeUnit::Millisecond => {
- handle_string_time!(
- array::Time32MillisecondArray,
- "Time32",
- format,
- column,
- col_idx,
- row_idx
- )
- }
- DataType::Time64(unit) if *unit == TimeUnit::Microsecond => {
- handle_string_time!(
- array::Time64MicrosecondArray,
- "Time64",
- format,
- column,
- col_idx,
- row_idx
- )
- }
- DataType::Time64(unit) if *unit == TimeUnit::Nanosecond => {
- handle_string_time!(
- array::Time64NanosecondArray,
- "Time64",
- format,
- column,
- col_idx,
- row_idx
- )
- }
- DataType::Interval(unit) => match unit {
- IntervalUnit::DayTime => {
- make_string_interval_day_time!(column, row_idx)
- }
- IntervalUnit::YearMonth => {
- make_string_interval_year_month!(column, row_idx)
- }
- IntervalUnit::MonthDayNano => {
- make_string_interval_month_day_nano!(column, row_idx)
- }
- },
- DataType::List(_) => make_string_from_list!(column, row_idx),
- DataType::LargeList(_) => make_string_from_large_list!(column,
row_idx),
- DataType::Dictionary(index_type, _value_type) => match **index_type {
- DataType::Int8 => dict_array_value_to_string::<Int8Type>(column,
row_idx),
- DataType::Int16 => dict_array_value_to_string::<Int16Type>(column,
row_idx),
- DataType::Int32 => dict_array_value_to_string::<Int32Type>(column,
row_idx),
- DataType::Int64 => dict_array_value_to_string::<Int64Type>(column,
row_idx),
- DataType::UInt8 => dict_array_value_to_string::<UInt8Type>(column,
row_idx),
- DataType::UInt16 =>
dict_array_value_to_string::<UInt16Type>(column, row_idx),
- DataType::UInt32 =>
dict_array_value_to_string::<UInt32Type>(column, row_idx),
- DataType::UInt64 =>
dict_array_value_to_string::<UInt64Type>(column, row_idx),
- _ => Err(ArrowError::InvalidArgumentError(format!(
- "Pretty printing not supported for {:?} due to index type",
- column.data_type()
- ))),
- },
- DataType::FixedSizeList(_, _) => {
- make_string_from_fixed_size_list!(column, row_idx)
- }
- DataType::Struct(_) => {
- let st = column
- .as_any()
- .downcast_ref::<array::StructArray>()
- .ok_or_else(|| {
- ArrowError::InvalidArgumentError(
- "Repl error: could not convert struct column to struct
array."
- .to_string(),
- )
- })?;
+/// Pairs a boxed [`DisplayIndex`] with its field name
+type FieldDisplay<'a> = (&'a str, Box<dyn DisplayIndex + 'a>);
- let mut s = String::new();
- s.push('{');
- let mut kv_iter = st.columns().iter().zip(st.column_names());
- if let Some((col, name)) = kv_iter.next() {
- append_struct_field_string(&mut s, name, col, row_idx)?;
- }
- for (col, name) in kv_iter {
- s.push_str(", ");
- append_struct_field_string(&mut s, name, col, row_idx)?;
- }
- s.push('}');
+impl<'a> DisplayIndexState<'a> for &'a StructArray {
+ type State = Vec<FieldDisplay<'a>>;
- Ok(s)
- }
- DataType::Map(_, _) => {
- let map_array =
- column.as_any().downcast_ref::<MapArray>().ok_or_else(|| {
- ArrowError::InvalidArgumentError(
- "Repl error: could not convert column to map
array.".to_string(),
- )
- })?;
- let map_entry = map_array.value(row_idx);
- let st = map_entry
- .as_any()
- .downcast_ref::<StructArray>()
- .ok_or_else(|| {
- ArrowError::InvalidArgumentError(
- "Repl error: could not convert map entry to struct
array."
- .to_string(),
- )
- })?;
- let mut s = String::new();
- s.push('{');
- let entries_count = st.column(0).len();
- for i in 0..entries_count {
- if i > 0 {
- s.push_str(", ");
- }
- append_map_field_string(&mut s, st.column(0), i)?;
- s.push_str(": ");
- append_map_field_string(&mut s, st.column(1), i)?;
- }
- s.push('}');
+ fn prepare(&self, options: &FormatOptions<'a>) -> Result<Self::State,
ArrowError> {
+ let fields = match (*self).data_type() {
+ DataType::Struct(f) => f,
+ _ => unreachable!(),
+ };
- Ok(s)
+ self.columns()
+ .iter()
+ .zip(fields)
+ .map(|(a, f)| {
+ let format = make_formatter(a.as_ref(), options)?;
+ Ok((f.name().as_str(), format))
+ })
+ .collect()
+ }
+
+ fn write(&self, s: &Self::State, idx: usize, f: &mut dyn Write) ->
FormatResult {
+ let mut iter = s.iter();
+ f.write_char('{')?;
+ if let Some((name, display)) = iter.next() {
+ write!(f, "{name}: ")?;
+ display.as_ref().write(idx, f)?;
}
- DataType::Union(field_vec, type_ids, mode) => {
- union_to_string(column, row_idx, field_vec, type_ids, mode)
+ for (name, display) in iter {
+ write!(f, ", {name}: ")?;
+ display.as_ref().write(idx, f)?;
}
- DataType::Duration(unit) => match *unit {
- TimeUnit::Second => {
- make_string_from_duration!(
- array::DurationSecondArray,
- "Duration",
- column,
- col_idx,
- row_idx
- )
- }
- TimeUnit::Millisecond => {
- make_string_from_duration!(
- array::DurationMillisecondArray,
- "Duration",
- column,
- col_idx,
- row_idx
- )
- }
- TimeUnit::Microsecond => {
- make_string_from_duration!(
- array::DurationMicrosecondArray,
- "Duration",
- column,
- col_idx,
- row_idx
- )
- }
- TimeUnit::Nanosecond => {
- make_string_from_duration!(
- array::DurationNanosecondArray,
- "Duration",
- column,
- col_idx,
- row_idx
- )
- }
- },
- _ => Err(ArrowError::InvalidArgumentError(format!(
- "Pretty printing not implemented for {:?} type",
- column.data_type()
- ))),
+ f.write_char('}')?;
+ Ok(())
}
}
-pub fn temporal_array_value_to_string(
- column: &dyn Array,
- col_idx: usize,
- row_idx: usize,
- format: Option<&str>,
-) -> Result<String, ArrowError> {
- array_value_to_string_internal(column, col_idx, row_idx, format)
-}
+impl<'a> DisplayIndexState<'a> for &'a MapArray {
+ type State = (Box<dyn DisplayIndex + 'a>, Box<dyn DisplayIndex + 'a>);
-pub fn array_value_to_string(
- column: &dyn Array,
- row_idx: usize,
-) -> Result<String, ArrowError> {
- array_value_to_string_internal(column, 0, row_idx, None)
-}
+ fn prepare(&self, options: &FormatOptions<'a>) -> Result<Self::State,
ArrowError> {
+ let keys = make_formatter(self.keys().as_ref(), options)?;
+ let values = make_formatter(self.values().as_ref(), options)?;
+ Ok((keys, values))
+ }
-/// Converts the value of the union array at `row` to a String
-fn union_to_string(
- column: &dyn Array,
- row: usize,
- fields: &[Field],
- type_ids: &[i8],
- mode: &UnionMode,
-) -> Result<String, ArrowError> {
- let list = column
- .as_any()
- .downcast_ref::<array::UnionArray>()
- .ok_or_else(|| {
- ArrowError::InvalidArgumentError(
- "Repl error: could not convert union column to union
array.".to_string(),
- )
- })?;
- let type_id = list.type_id(row);
- let field_idx = type_ids.iter().position(|t| t == &type_id).ok_or_else(|| {
- ArrowError::InvalidArgumentError(format!(
- "Repl error: could not get field name for type id: {type_id} in
union array.",
- ))
- })?;
- let name = fields.get(field_idx).unwrap().name();
-
- let value = array_value_to_string(
- list.child(type_id),
- match mode {
- UnionMode::Dense => list.value_offset(row) as usize,
- UnionMode::Sparse => row,
- },
- )?;
+ fn write(&self, s: &Self::State, idx: usize, f: &mut dyn Write) ->
FormatResult {
+ let offsets = self.value_offsets();
+ let end = offsets[idx + 1].as_usize();
+ let start = offsets[idx].as_usize();
+ let mut iter = start..end;
+
+ f.write_char('{')?;
+ if let Some(idx) = iter.next() {
+ s.0.write(idx, f)?;
+ write!(f, ": ")?;
+ s.1.write(idx, f)?;
+ }
+
+ for idx in iter {
+ write!(f, ", ")?;
+ s.0.write(idx, f)?;
+ write!(f, ": ")?;
+ s.1.write(idx, f)?;
+ }
- Ok(format!("{{{name}={value}}}"))
+ f.write_char('}')?;
+ Ok(())
+ }
}
-/// Converts the value of the dictionary array at `row` to a String
-fn dict_array_value_to_string<K: ArrowPrimitiveType>(
- colum: &dyn Array,
- row: usize,
-) -> Result<String, ArrowError> {
- let dict_array =
colum.as_any().downcast_ref::<DictionaryArray<K>>().unwrap();
- let keys_array = dict_array.keys();
+impl<'a> DisplayIndexState<'a> for &'a UnionArray {
+ type State = (
+ Vec<Option<(&'a str, Box<dyn DisplayIndex + 'a>)>>,
+ UnionMode,
+ );
- if keys_array.is_null(row) {
- return Ok(String::from(""));
+ fn prepare(&self, options: &FormatOptions<'a>) -> Result<Self::State,
ArrowError> {
+ let (fields, type_ids, mode) = match (*self).data_type() {
+ DataType::Union(fields, type_ids, mode) => (fields, type_ids,
mode),
+ _ => unreachable!(),
+ };
+
+ let max_id = type_ids.iter().copied().max().unwrap_or_default() as
usize;
+ let mut out: Vec<Option<FieldDisplay>> = (0..max_id + 1).map(|_|
None).collect();
+ for (i, field) in type_ids.iter().zip(fields) {
+ let formatter = make_formatter(self.child(*i).as_ref(), options)?;
+ out[*i as usize] = Some((field.name().as_str(), formatter))
+ }
+ Ok((out, *mode))
}
- let dict_index = keys_array.value(row).as_usize();
- array_value_to_string(dict_array.values(), dict_index)
+ fn write(&self, s: &Self::State, idx: usize, f: &mut dyn Write) ->
FormatResult {
+ let id = self.type_id(idx);
+ let idx = match s.1 {
+ UnionMode::Dense => self.value_offset(idx) as usize,
+ UnionMode::Sparse => idx,
+ };
+ let (name, field) = s.0[id as usize].as_ref().unwrap();
+
+ write!(f, "{{{name}=")?;
+ field.write(idx, f)?;
+ f.write_char('}')?;
+ Ok(())
+ }
+}
+
+/// Get the value at the given row in an array as a String.
+///
+/// Note this function is quite inefficient and is unlikely to be
+/// suitable for converting large arrays or record batches.
+///
+/// Please see [`ArrayFormatter`] for a more performant interface
+pub fn array_value_to_string(
+ column: &dyn Array,
+ row: usize,
+) -> Result<String, ArrowError> {
+ let options = FormatOptions::default().with_display_error(true);
+ let formatter = ArrayFormatter::try_new(column, &options)?;
+ Ok(formatter.value(row).to_string())
}
/// Converts numeric type to a `String`
@@ -824,7 +882,7 @@ mod tests {
)
.unwrap();
assert_eq!(
- "{\"d\": 30, \"e\": 40, \"f\": 50}",
+ "{d: 30, e: 40, f: 50}",
array_value_to_string(&map_array, 1).unwrap()
);
}
diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs
index 94620be66..e0734a15f 100644
--- a/arrow-csv/src/writer.rs
+++ b/arrow-csv/src/writer.rs
@@ -63,12 +63,10 @@
//! }
//! ```
-use arrow_array::types::*;
use arrow_array::*;
-use arrow_cast::display::{
- array_value_to_string, lexical_to_string, temporal_array_value_to_string,
-};
+use arrow_cast::display::*;
use arrow_schema::*;
+use csv::ByteRecord;
use std::io::Write;
use crate::map_csv_error;
@@ -79,15 +77,6 @@ const DEFAULT_TIMESTAMP_FORMAT: &str = "%FT%H:%M:%S.%9f";
const DEFAULT_TIMESTAMP_TZ_FORMAT: &str = "%FT%H:%M:%S.%9f%:z";
const DEFAULT_NULL_VALUE: &str = "";
-fn write_primitive_value<T>(array: &ArrayRef, i: usize) -> String
-where
- T: ArrowPrimitiveType,
- T::Native: lexical_core::ToLexical,
-{
- let c = array.as_any().downcast_ref::<PrimitiveArray<T>>().unwrap();
- lexical_to_string(c.value(i))
-}
-
/// A CSV writer
#[derive(Debug)]
pub struct Writer<W: Write> {
@@ -100,10 +89,8 @@ pub struct Writer<W: Write> {
/// The datetime format for datetime arrays
datetime_format: Option<String>,
/// The timestamp format for timestamp arrays
- #[allow(dead_code)]
timestamp_format: Option<String>,
/// The timestamp format for timestamp (with timezone) arrays
- #[allow(dead_code)]
timestamp_tz_format: Option<String>,
/// The time format for time arrays
time_format: Option<String>,
@@ -132,113 +119,6 @@ impl<W: Write> Writer<W> {
}
}
- /// Convert a record to a string vector
- fn convert(
- &self,
- batch: &[ArrayRef],
- row_index: usize,
- buffer: &mut [String],
- ) -> Result<(), ArrowError> {
- // TODO: it'd be more efficient if we could create `record: Vec<&[u8]>
- for (col_index, item) in buffer.iter_mut().enumerate() {
- let col = &batch[col_index];
- if col.is_null(row_index) {
- // write the configured null value
- *item = self.null_value.clone();
- continue;
- }
- let string = match col.data_type() {
- DataType::Float64 => write_primitive_value::<Float64Type>(col,
row_index),
- DataType::Float32 => write_primitive_value::<Float32Type>(col,
row_index),
- DataType::Int8 => write_primitive_value::<Int8Type>(col,
row_index),
- DataType::Int16 => write_primitive_value::<Int16Type>(col,
row_index),
- DataType::Int32 => write_primitive_value::<Int32Type>(col,
row_index),
- DataType::Int64 => write_primitive_value::<Int64Type>(col,
row_index),
- DataType::UInt8 => write_primitive_value::<UInt8Type>(col,
row_index),
- DataType::UInt16 => write_primitive_value::<UInt16Type>(col,
row_index),
- DataType::UInt32 => write_primitive_value::<UInt32Type>(col,
row_index),
- DataType::UInt64 => write_primitive_value::<UInt64Type>(col,
row_index),
- DataType::Boolean => array_value_to_string(col,
row_index)?.to_string(),
- DataType::Utf8 => array_value_to_string(col,
row_index)?.to_string(),
- DataType::LargeUtf8 => array_value_to_string(col,
row_index)?.to_string(),
- DataType::Date32 => temporal_array_value_to_string(
- col,
- col_index,
- row_index,
- self.date_format.as_deref(),
- )?
- .to_string(),
- DataType::Date64 => temporal_array_value_to_string(
- col,
- col_index,
- row_index,
- self.datetime_format.as_deref(),
- )?
- .to_string(),
- DataType::Time32(TimeUnit::Second) =>
temporal_array_value_to_string(
- col,
- col_index,
- row_index,
- self.time_format.as_deref(),
- )?
- .to_string(),
- DataType::Time32(TimeUnit::Millisecond) => {
- temporal_array_value_to_string(
- col,
- col_index,
- row_index,
- self.time_format.as_deref(),
- )?
- .to_string()
- }
- DataType::Time64(TimeUnit::Microsecond) => {
- temporal_array_value_to_string(
- col,
- col_index,
- row_index,
- self.time_format.as_deref(),
- )?
- .to_string()
- }
- DataType::Time64(TimeUnit::Nanosecond) =>
temporal_array_value_to_string(
- col,
- col_index,
- row_index,
- self.time_format.as_deref(),
- )?
- .to_string(),
- DataType::Timestamp(_, time_zone) => match time_zone {
- Some(_tz) => temporal_array_value_to_string(
- col,
- col_index,
- row_index,
- self.timestamp_tz_format.as_deref(),
- )?
- .to_string(),
- None => temporal_array_value_to_string(
- col,
- col_index,
- row_index,
- self.timestamp_format.as_deref(),
- )?
- .to_string(),
- },
- DataType::Decimal128(..) => {
- array_value_to_string(col, row_index)?.to_string()
- }
- t => {
- // List and Struct arrays not supported by the writer, any
- // other type needs to be implemented
- return Err(ArrowError::CsvError(format!(
- "CSV Writer does not support {t:?} data type"
- )));
- }
- };
- *item = string;
- }
- Ok(())
- }
-
/// Write a vector of record batches to a writable object
pub fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> {
let num_columns = batch.num_columns();
@@ -257,23 +137,49 @@ impl<W: Write> Writer<W> {
self.beginning = false;
}
- let columns: Vec<_> = batch
+ let options = FormatOptions::default()
+ .with_null(&self.null_value)
+ .with_date_format(self.date_format.as_deref())
+ .with_datetime_format(self.datetime_format.as_deref())
+ .with_timestamp_format(self.timestamp_format.as_deref())
+ .with_timestamp_tz_format(self.timestamp_tz_format.as_deref())
+ .with_time_format(self.time_format.as_deref());
+
+ let converters = batch
.columns()
.iter()
- .map(|array| match array.data_type() {
- DataType::Dictionary(_, value_type) => {
- arrow_cast::cast(array, value_type)
- .expect("cannot cast dictionary to underlying values")
- }
- _ => array.clone(),
+ .map(|a| match a.data_type() {
+ d if d.is_nested() => Err(ArrowError::CsvError(format!(
+ "Nested type {} is not supported in CSV",
+ a.data_type()
+ ))),
+ DataType::Binary | DataType::LargeBinary =>
Err(ArrowError::CsvError(
+ "Binary data cannot be written to CSV".to_string(),
+ )),
+ _ => ArrayFormatter::try_new(a.as_ref(), &options),
})
- .collect();
-
- let mut buffer = vec!["".to_string(); batch.num_columns()];
+ .collect::<Result<Vec<_>, ArrowError>>()?;
+
+ let mut buffer = String::with_capacity(1024);
+ let mut byte_record = ByteRecord::with_capacity(1024,
converters.len());
+
+ for row_idx in 0..batch.num_rows() {
+ byte_record.clear();
+ for (col_idx, converter) in converters.iter().enumerate() {
+ buffer.clear();
+ converter.value(row_idx).write(&mut buffer).map_err(|e| {
+ ArrowError::CsvError(format!(
+ "Error formatting row {} and column {}: {e}",
+ row_idx + 1,
+ col_idx + 1
+ ))
+ })?;
+ byte_record.push_field(buffer.as_bytes());
+ }
- for row_index in 0..batch.num_rows() {
- self.convert(columns.as_slice(), row_index, &mut buffer)?;
- self.writer.write_record(&buffer).map_err(map_csv_error)?;
+ self.writer
+ .write_byte_record(&byte_record)
+ .map_err(map_csv_error)?;
}
self.writer.flush()?;
@@ -384,16 +290,13 @@ impl WriterBuilder {
self
}
- /// Use RFC3339 format for date/time/timestamps by clearing all
- /// date/time specific formats.
- pub fn with_rfc3339(mut self, use_rfc3339: bool) -> Self {
- if use_rfc3339 {
- self.date_format = None;
- self.datetime_format = None;
- self.time_format = None;
- self.timestamp_format = None;
- self.timestamp_tz_format = None;
- }
+ /// Use RFC3339 format for date/time/timestamps
+ pub fn with_rfc3339(mut self) -> Self {
+ self.date_format = None;
+ self.datetime_format = None;
+ self.time_format = None;
+ self.timestamp_format = None;
+ self.timestamp_tz_format = None;
self
}
@@ -423,15 +326,10 @@ mod tests {
use super::*;
use crate::Reader;
+ use arrow_array::types::*;
use std::io::{Cursor, Read, Seek};
use std::sync::Arc;
- fn invalid_cast_error(dt: &str, col_idx: usize, row_idx: usize) ->
ArrowError {
- ArrowError::CastError(format!(
- "Cannot cast to {dt} at col index: {col_idx} row index: {row_idx}"
- ))
- }
-
#[test]
fn test_write_csv() {
let schema = Schema::new(vec![
@@ -654,15 +552,8 @@ sed do eiusmod
tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo
let batches = vec![&batch, &batch];
for batch in batches {
- writer
- .write(batch)
- .map_err(|e| {
- dbg!(e.to_string());
- assert!(e.to_string().ends_with(
- invalid_cast_error("Date64", 1, 1).to_string().as_str()
- ))
- })
- .unwrap_err();
+ let err = writer.write(batch).unwrap_err().to_string();
+ assert_eq!(err, "Csv error: Error formatting row 2 and column 2:
Cast error: Failed to convert 1926632005177685347 to temporal for Date64")
}
drop(writer);
}
@@ -700,7 +591,7 @@ sed do eiusmod
tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo
let mut file = tempfile::tempfile().unwrap();
- let builder = WriterBuilder::new().with_rfc3339(true);
+ let builder = WriterBuilder::new().with_rfc3339();
let mut writer = builder.build(&mut file);
let batches = vec![&batch];
for batch in batches {
diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs
index d2425a3d5..028b7d889 100644
--- a/arrow-json/src/writer.rs
+++ b/arrow-json/src/writer.rs
@@ -105,7 +105,7 @@ use arrow_array::types::*;
use arrow_array::*;
use arrow_schema::*;
-use arrow_cast::display::temporal_array_value_to_string;
+use arrow_cast::display::{ArrayFormatter, FormatOptions};
fn primitive_array_to_json<T>(array: &ArrayRef) -> Result<Vec<Value>,
ArrowError>
where
@@ -137,7 +137,6 @@ fn struct_array_to_jsonmap_array(
row_count,
struct_col,
inner_col_names[j],
- j,
)?
}
Ok(inner_objs)
@@ -217,26 +216,6 @@ macro_rules! set_column_by_array_type {
};
}
-macro_rules! set_temporal_column_by_array_type {
- ($col_name:ident, $col_idx:ident, $rows:ident, $array:ident,
$row_count:ident) => {
- $rows
- .iter_mut()
- .enumerate()
- .take($row_count)
- .for_each(|(i, row)| {
- if !$array.is_null(i) {
- row.insert(
- $col_name.to_string(),
- temporal_array_value_to_string($array, $col_idx, i,
None)
- .unwrap()
- .to_string()
- .into(),
- );
- }
- });
- };
-}
-
fn set_column_by_primitive_type<T>(
rows: &mut [JsonMap<String, Value>],
row_count: usize,
@@ -264,7 +243,6 @@ fn set_column_for_json_rows(
row_count: usize,
array: &ArrayRef,
col_name: &str,
- col_idx: usize,
) -> Result<(), ArrowError> {
match array.data_type() {
DataType::Int8 => {
@@ -315,47 +293,23 @@ fn set_column_for_json_rows(
row_count
);
}
- DataType::Date32 => {
- set_temporal_column_by_array_type!(col_name, col_idx, rows, array,
row_count);
- }
- DataType::Date64 => {
- set_temporal_column_by_array_type!(col_name, col_idx, rows, array,
row_count);
- }
- DataType::Timestamp(TimeUnit::Second, _) => {
- set_temporal_column_by_array_type!(col_name, col_idx, rows, array,
row_count);
- }
- DataType::Timestamp(TimeUnit::Millisecond, _) => {
- set_temporal_column_by_array_type!(col_name, col_idx, rows, array,
row_count);
- }
- DataType::Timestamp(TimeUnit::Microsecond, _) => {
- set_temporal_column_by_array_type!(col_name, col_idx, rows, array,
row_count);
- }
- DataType::Timestamp(TimeUnit::Nanosecond, _) => {
- set_temporal_column_by_array_type!(col_name, col_idx, rows, array,
row_count);
- }
- DataType::Time32(TimeUnit::Second) => {
- set_temporal_column_by_array_type!(col_name, col_idx, rows, array,
row_count);
- }
- DataType::Time32(TimeUnit::Millisecond) => {
- set_temporal_column_by_array_type!(col_name, col_idx, rows, array,
row_count);
- }
- DataType::Time64(TimeUnit::Microsecond) => {
- set_temporal_column_by_array_type!(col_name, col_idx, rows, array,
row_count);
- }
- DataType::Time64(TimeUnit::Nanosecond) => {
- set_temporal_column_by_array_type!(col_name, col_idx, rows, array,
row_count);
- }
- DataType::Duration(TimeUnit::Second) => {
- set_temporal_column_by_array_type!(col_name, col_idx, rows, array,
row_count);
- }
- DataType::Duration(TimeUnit::Millisecond) => {
- set_temporal_column_by_array_type!(col_name, col_idx, rows, array,
row_count);
- }
- DataType::Duration(TimeUnit::Microsecond) => {
- set_temporal_column_by_array_type!(col_name, col_idx, rows, array,
row_count);
- }
- DataType::Duration(TimeUnit::Nanosecond) => {
- set_temporal_column_by_array_type!(col_name, col_idx, rows, array,
row_count);
+ DataType::Date32
+ | DataType::Date64
+ | DataType::Timestamp(_, _)
+ | DataType::Time32(_)
+ | DataType::Time64(_)
+ | DataType::Duration(_) => {
+ let options = FormatOptions::default();
+ let formatter = ArrayFormatter::try_new(array.as_ref(), &options)?;
+ let data = array.data();
+ rows.iter_mut().enumerate().for_each(|(idx, row)| {
+ if data.is_valid(idx) {
+ row.insert(
+ col_name.to_string(),
+ formatter.value(idx).to_string().into(),
+ );
+ }
+ });
}
DataType::Struct(_) => {
let inner_objs =
@@ -399,7 +353,7 @@ fn set_column_for_json_rows(
let slice = array.slice(0, row_count);
let hydrated = arrow_cast::cast::cast(&slice, value_type)
.expect("cannot cast dictionary to underlying values");
- set_column_for_json_rows(rows, row_count, &hydrated, col_name,
col_idx)?;
+ set_column_for_json_rows(rows, row_count, &hydrated, col_name)?;
}
DataType::Map(_, _) => {
let maparr = as_map_array(array);
@@ -465,7 +419,7 @@ pub fn record_batches_to_json_rows(
let row_count = batch.num_rows();
for (j, col) in batch.columns().iter().enumerate() {
let col_name = schema.field(j).name();
- set_column_for_json_rows(&mut rows[base..], row_count, col,
col_name, j)?
+ set_column_for_json_rows(&mut rows[base..], row_count, col,
col_name)?
}
base += row_count;
}
@@ -937,7 +891,7 @@ mod tests {
assert_json_eq(
&buf,
- r#"{"date32":"2018-11-13","date64":"2018-11-13","name":"a"}
+
r#"{"date32":"2018-11-13","date64":"2018-11-13T17:11:10.011","name":"a"}
{"name":"b"}
"#,
);
diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs
index 78ad0258d..9476535fa 100644
--- a/arrow-schema/src/datatype.rs
+++ b/arrow-schema/src/datatype.rs
@@ -290,7 +290,7 @@ pub enum IntervalUnit {
}
// Sparse or Dense union layouts
-#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
+#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Copy)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum UnionMode {
Sparse,
diff --git a/arrow/src/util/pretty.rs b/arrow/src/util/pretty.rs
index 9027a1cdc..4defa71a7 100644
--- a/arrow/src/util/pretty.rs
+++ b/arrow/src/util/pretty.rs
@@ -19,6 +19,7 @@
//! available unless `feature = "prettyprint"` is enabled.
use crate::{array::ArrayRef, record_batch::RecordBatch};
+use arrow_cast::display::{ArrayFormatter, FormatOptions};
use comfy_table::{Cell, Table};
use std::fmt::Display;
@@ -68,12 +69,19 @@ fn create_table(results: &[RecordBatch]) -> Result<Table> {
}
table.set_header(header);
+ let options = FormatOptions::default().with_display_error(true);
+
for batch in results {
+ let formatters = batch
+ .columns()
+ .iter()
+ .map(|c| ArrayFormatter::try_new(c.as_ref(), &options))
+ .collect::<Result<Vec<_>>>()?;
+
for row in 0..batch.num_rows() {
let mut cells = Vec::new();
- for col in 0..batch.num_columns() {
- let column = batch.column(col);
- cells.push(Cell::new(array_value_to_string(column, row)?));
+ for formatter in &formatters {
+ cells.push(Cell::new(formatter.value(row)));
}
table.add_row(cells);
}
@@ -123,6 +131,8 @@ mod tests {
use std::fmt::Write;
use std::sync::Arc;
+ use arrow_array::builder::PrimitiveBuilder;
+ use arrow_array::types::{ArrowTimestampType, TimestampSecondType};
use half::f16;
#[test]
@@ -366,42 +376,33 @@ mod tests {
let expected = $EXPECTED_RESULT;
let actual: Vec<&str> = table.lines().collect();
- assert_eq!(expected, actual, "Actual result:\n\n{:#?}\n\n",
actual);
+ assert_eq!(expected, actual, "Actual result:\n\n{actual:#?}\n\n");
};
}
- /// Generate an array with type $ARRAYTYPE with a numeric value of
- /// $VALUE, and compare $EXPECTED_RESULT to the output of
- /// formatting that array with `pretty_format_batches`
- macro_rules! check_datetime_with_timezone {
- ($ARRAYTYPE:ident, $VALUE:expr, $TZ_STRING:expr,
$EXPECTED_RESULT:expr) => {
- let mut builder = $ARRAYTYPE::builder(10);
- builder.append_value($VALUE);
- builder.append_null();
- let array = builder.finish();
- let array = array.with_timezone($TZ_STRING);
-
- let schema = Arc::new(Schema::new(vec![Field::new(
- "f",
- array.data_type().clone(),
- true,
- )]));
- let batch = RecordBatch::try_new(schema,
vec![Arc::new(array)]).unwrap();
-
- let table = pretty_format_batches(&[batch])
- .expect("formatting batches")
- .to_string();
-
- let expected = $EXPECTED_RESULT;
- let actual: Vec<&str> = table.lines().collect();
+ fn timestamp_batch<T: ArrowTimestampType>(
+ timezone: &str,
+ value: T::Native,
+ ) -> RecordBatch {
+ let mut builder = PrimitiveBuilder::<T>::with_capacity(10);
+ builder.append_value(value);
+ builder.append_null();
+ let array = builder.finish();
+ let array = array.with_timezone(timezone);
- assert_eq!(expected, actual, "Actual result:\n\n{:#?}\n\n",
actual);
- };
+ let schema = Arc::new(Schema::new(vec![Field::new(
+ "f",
+ array.data_type().clone(),
+ true,
+ )]));
+ RecordBatch::try_new(schema, vec![Arc::new(array)]).unwrap()
}
#[test]
#[cfg(features = "chrono-tz")]
fn test_pretty_format_timestamp_second_with_utc_timezone() {
+ let batch = timestamp_batch::<TimestampSecondType>("UTC", 11111111);
+ let table = pretty_format_batches(&[batch]).unwrap().to_string();
let expected = vec![
"+---------------------------+",
"| f |",
@@ -410,17 +411,15 @@ mod tests {
"| |",
"+---------------------------+",
];
- check_datetime_with_timezone!(
- TimestampSecondArray,
- 11111111,
- "UTC".to_string(),
- expected
- );
+ let actual: Vec<&str> = table.lines().collect();
+ assert_eq!(expected, actual, "Actual result:\n\n{actual:#?}\n\n");
}
#[test]
#[cfg(features = "chrono-tz")]
fn test_pretty_format_timestamp_second_with_non_utc_timezone() {
+ let batch = timestamp_batch::<TimestampSecondType>("Asia/Taipei",
11111111);
+ let table = pretty_format_batches(&[batch]).unwrap().to_string();
let expected = vec![
"+---------------------------+",
"| f |",
@@ -429,16 +428,15 @@ mod tests {
"| |",
"+---------------------------+",
];
- check_datetime_with_timezone!(
- TimestampSecondArray,
- 11111111,
- "Asia/Taipei".to_string(),
- expected
- );
+ let actual: Vec<&str> = table.lines().collect();
+ assert_eq!(expected, actual, "Actual result:\n\n{actual:#?}\n\n");
}
#[test]
fn test_pretty_format_timestamp_second_with_fixed_offset_timezone() {
+ let batch = timestamp_batch::<TimestampSecondType>("+08:00", 11111111);
+ let table = pretty_format_batches(&[batch]).unwrap().to_string();
+
let expected = vec![
"+---------------------------+",
"| f |",
@@ -447,48 +445,24 @@ mod tests {
"| |",
"+---------------------------+",
];
- check_datetime_with_timezone!(
- TimestampSecondArray,
- 11111111,
- "+08:00".to_string(),
- expected
- );
+ let actual: Vec<&str> = table.lines().collect();
+ assert_eq!(expected, actual, "Actual result:\n\n{actual:#?}\n\n");
}
#[test]
+ #[cfg(not(feature = "chrono-tz"))]
fn
test_pretty_format_timestamp_second_with_incorrect_fixed_offset_timezone() {
- let expected = vec![
- "+-------------------------------------------------+",
- "| f |",
- "+-------------------------------------------------+",
- "| 1970-05-09T14:25:11 (Unknown Time Zone '08:00') |",
- "| |",
- "+-------------------------------------------------+",
- ];
- check_datetime_with_timezone!(
- TimestampSecondArray,
- 11111111,
- "08:00".to_string(),
- expected
- );
+ let batch = timestamp_batch::<TimestampSecondType>("08:00", 11111111);
+ let err = pretty_format_batches(&[batch]).err().unwrap().to_string();
+ assert_eq!(err, "Parser error: Invalid timezone \"08:00\": only offset
based timezones supported without chrono-tz feature");
}
#[test]
+ #[cfg(not(feature = "chrono-tz"))]
fn test_pretty_format_timestamp_second_with_unknown_timezone() {
- let expected = vec![
- "+---------------------------------------------------+",
- "| f |",
- "+---------------------------------------------------+",
- "| 1970-05-09T14:25:11 (Unknown Time Zone 'Unknown') |",
- "| |",
- "+---------------------------------------------------+",
- ];
- check_datetime_with_timezone!(
- TimestampSecondArray,
- 11111111,
- "Unknown".to_string(),
- expected
- );
+ let batch = timestamp_batch::<TimestampSecondType>("unknown",
11111111);
+ let err = pretty_format_batches(&[batch]).err().unwrap().to_string();
+ assert_eq!(err, "Parser error: Invalid timezone \"unknown\": only
offset based timezones supported without chrono-tz feature");
}
#[test]
@@ -559,12 +533,12 @@ mod tests {
#[test]
fn test_pretty_format_date_64() {
let expected = vec![
- "+------------+",
- "| f |",
- "+------------+",
- "| 2005-03-18 |",
- "| |",
- "+------------+",
+ "+---------------------+",
+ "| f |",
+ "+---------------------+",
+ "| 2005-03-18T01:58:20 |",
+ "| |",
+ "+---------------------+",
];
check_datetime!(Date64Array, 1111111100000, expected);
}
@@ -751,13 +725,13 @@ mod tests {
let table = pretty_format_batches(&[batch])?.to_string();
let expected = vec![
- r#"+-------------------------------------+----+"#,
- r#"| c1 | c2 |"#,
- r#"+-------------------------------------+----+"#,
- r#"| {"c11": 1, "c12": {"c121": "e"}} | a |"#,
- r#"| {"c11": null, "c12": {"c121": "f"}} | b |"#,
- r#"| {"c11": 5, "c12": {"c121": "g"}} | c |"#,
- r#"+-------------------------------------+----+"#,
+ "+--------------------------+----+",
+ "| c1 | c2 |",
+ "+--------------------------+----+",
+ "| {c11: 1, c12: {c121: e}} | a |",
+ "| {c11: , c12: {c121: f}} | b |",
+ "| {c11: 5, c12: {c121: g}} | c |",
+ "+--------------------------+----+",
];
let actual: Vec<&str> = table.lines().collect();
diff --git a/arrow/tests/csv.rs b/arrow/tests/csv.rs
index 5a7c7e962..dbb399948 100644
--- a/arrow/tests/csv.rs
+++ b/arrow/tests/csv.rs
@@ -93,7 +93,7 @@ fn test_export_csv_timestamps_using_rfc3339() {
let mut sw = Vec::new();
let mut writer = arrow_csv::WriterBuilder::new()
- .with_rfc3339(true)
+ .with_rfc3339()
.build(&mut sw);
let batches = vec![&batch];
for batch in batches {
diff --git a/parquet/src/arrow/arrow_writer/levels.rs
b/parquet/src/arrow/arrow_writer/levels.rs
index 15197c02e..f427ce3e1 100644
--- a/parquet/src/arrow/arrow_writer/levels.rs
+++ b/parquet/src/arrow/arrow_writer/levels.rs
@@ -1360,8 +1360,8 @@ mod tests {
r#""#.to_string(),
r#""#.to_string(),
r#"[]"#.to_string(),
- r#"[{"list": [3, ], "integers": null}]"#.to_string(),
- r#"[, {"list": null, "integers": 5}]"#.to_string(),
+ r#"[{list: [3, ], integers: }]"#.to_string(),
+ r#"[, {list: , integers: 5}]"#.to_string(),
r#"[]"#.to_string(),
];
diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs
index 9235706d5..6260c2ed4 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -2314,17 +2314,17 @@ mod tests {
// Verify data is as expected
let expected = r#"
- +-------------------------------------------------------------------------------------------------------------------------------------+
- | struct_b |
- +-------------------------------------------------------------------------------------------------------------------------------------+
- | {"list": [{"leaf_a": 1, "leaf_b": 1}]} |
- | {"list": null} |
- | {"list": [{"leaf_a": 2, "leaf_b": null}, {"leaf_a": 3, "leaf_b": 2}]} |
- | {"list": null} |
- | {"list": [{"leaf_a": 4, "leaf_b": null}, {"leaf_a": 5, "leaf_b": null}]} |
- | {"list": [{"leaf_a": 6, "leaf_b": null}, {"leaf_a": 7, "leaf_b": null}, {"leaf_a": 8, "leaf_b": null}, {"leaf_a": 9, "leaf_b": 1}]} |
- | {"list": [{"leaf_a": 10, "leaf_b": null}]} |
- +-------------------------------------------------------------------------------------------------------------------------------------+
+ +-------------------------------------------------------------------------------------------------------+
+ | struct_b |
+ +-------------------------------------------------------------------------------------------------------+
+ | {list: [{leaf_a: 1, leaf_b: 1}]} |
+ | {list: } |
+ | {list: [{leaf_a: 2, leaf_b: }, {leaf_a: 3, leaf_b: 2}]} |
+ | {list: } |
+ | {list: [{leaf_a: 4, leaf_b: }, {leaf_a: 5, leaf_b: }]} |
+ | {list: [{leaf_a: 6, leaf_b: }, {leaf_a: 7, leaf_b: }, {leaf_a: 8, leaf_b: }, {leaf_a: 9, leaf_b: 1}]} |
+ | {list: [{leaf_a: 10, leaf_b: }]} |
+ +-------------------------------------------------------------------------------------------------------+
"#.trim().split('\n').map(|x| x.trim()).collect::<Vec<_>>().join("\n");
let actual = pretty_format_batches(batches).unwrap().to_string();
diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs
index 71f95e07a..3e0d865c0 100644
--- a/parquet/src/arrow/async_reader/mod.rs
+++ b/parquet/src/arrow/async_reader/mod.rs
@@ -61,13 +61,13 @@
//! "+----------+-------------+-----------+",
//! "| bool_col | tinyint_col | float_col |",
//! "+----------+-------------+-----------+",
-//! "| true | 0 | 0 |",
+//! "| true | 0 | 0.0 |",
//! "| false | 1 | 1.1 |",
-//! "| true | 0 | 0 |",
+//! "| true | 0 | 0.0 |",
//! "| false | 1 | 1.1 |",
-//! "| true | 0 | 0 |",
+//! "| true | 0 | 0.0 |",
//! "| false | 1 | 1.1 |",
-//! "| true | 0 | 0 |",
+//! "| true | 0 | 0.0 |",
//! "| false | 1 | 1.1 |",
//! "+----------+-------------+-----------+",
//! ],