alamb commented on a change in pull request #849:
URL: https://github.com/apache/arrow-rs/pull/849#discussion_r746624516
##########
File path: arrow/src/compute/kernels/temporal.rs
##########
@@ -435,4 +452,78 @@ mod tests {
));
assert!(matches!(hour(&a), Err(ArrowError::ComputeError(_))))
}
+
+ #[cfg(feature = "chrono-tz")]
+ #[test]
+ fn test_using_chrono_tz() {
+ let sydney_offset = FixedOffset::east(10 * 60 * 60);
+ assert_eq!(
+ using_chrono_tz(&"Australia/Sydney".to_string()),
+ Some(sydney_offset)
+ );
+
+ let nyc_offset = FixedOffset::west(5 * 60 * 60);
+ assert_eq!(
+ using_chrono_tz(&"America/New_York".to_string()),
+ Some(nyc_offset)
+ );
+ }
+
+ #[cfg(feature = "chrono-tz")]
+ #[test]
+ fn test_using_chrono_tz_and_utc_naive_date_time() {
+ let sydney_tz = "Australia/Sydney".to_string();
+ let sydney_offset_without_dst = FixedOffset::east(10 * 60 * 60);
+ let sydney_offset_with_dst = FixedOffset::east(11 * 60 * 60);
+ // Daylight savings ends
+ // When local daylight time was about to reach
+ // Sunday, 4 April 2021, 3:00:00 am clocks were turned backward 1 hour
to
+ // Sunday, 4 April 2021, 2:00:00 am local standard time instead.
+
+ // Daylight savings starts
+ // When local standard time was about to reach
+ // Sunday, 3 October 2021, 2:00:00 am clocks were turned forward 1
hour to
+ // Sunday, 3 October 2021, 3:00:00 am local daylight time instead.
+
+ // Sydney 2021-04-04T02:30:00+11:00 is 2021-04-03T15:30:00Z
Review comment:
I really like the names of the variables and comments in this test 👍 .
It makes them very easy to understand
##########
File path: arrow/src/csv/writer.rs
##########
@@ -626,6 +787,134 @@ sed do eiusmod
tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,\n";
assert_eq!(Some(left.to_string()), right.ok());
}
+ #[cfg(feature = "chrono-tz")]
+ #[test]
+ fn test_export_csv_timestamps() {
+ let schema = Schema::new(vec![
+ Field::new(
+ "c1",
+ DataType::Timestamp(
+ TimeUnit::Millisecond,
+ Some("Australia/Sydney".to_string()),
+ ),
+ true,
+ ),
+ Field::new("c2", DataType::Timestamp(TimeUnit::Millisecond, None),
true),
+ ]);
+
+ let c1 = TimestampMillisecondArray::from_opt_vec(
+ // 1555584887 converts to 2019-04-18, 20:54:47 in time zone
Australia/Sydney (AEST).
+ // The offset (difference to UTC) is +10:00.
+ // 1635577147 converts to 2021-10-30 17:59:07 in time zone
Australia/Sydney (AEDT)
Review comment:
👍
##########
File path: arrow/src/csv/writer.rs
##########
@@ -675,4 +964,76 @@ sed do eiusmod
tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,\n";
let expected = vec![Some(3), Some(2), Some(1)];
assert_eq!(actual, expected);
}
+
+ #[cfg(feature = "chrono-tz")]
+ #[test]
+ fn test_conversion_consistency() {
+ // test if we can serialize and deserialize whilst retaining the same
type information/ precision
Review comment:
👍
##########
File path: arrow/src/csv/writer.rs
##########
@@ -626,6 +787,134 @@ sed do eiusmod
tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,\n";
assert_eq!(Some(left.to_string()), right.ok());
}
+ #[cfg(feature = "chrono-tz")]
+ #[test]
+ fn test_export_csv_timestamps() {
+ let schema = Schema::new(vec![
+ Field::new(
+ "c1",
+ DataType::Timestamp(
+ TimeUnit::Millisecond,
+ Some("Australia/Sydney".to_string()),
+ ),
+ true,
+ ),
+ Field::new("c2", DataType::Timestamp(TimeUnit::Millisecond, None),
true),
+ ]);
+
+ let c1 = TimestampMillisecondArray::from_opt_vec(
+ // 1555584887 converts to 2019-04-18, 20:54:47 in time zone
Australia/Sydney (AEST).
+ // The offset (difference to UTC) is +10:00.
+ // 1635577147 converts to 2021-10-30 17:59:07 in time zone
Australia/Sydney (AEDT)
+ // The offset (difference to UTC) is +11:00. Note that daylight
savings is in effect on 2021-10-30.
+ //
+ vec![Some(1555584887378), Some(1635577147000)],
+ Some("Australia/Sydney".to_string()),
+ );
+ let c2 = TimestampMillisecondArray::from_opt_vec(
+ vec![Some(1555584887378), Some(1635577147000)],
+ None,
+ );
+ let batch =
+ RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c1),
Arc::new(c2)])
+ .unwrap();
+
+ let sw = StringWriter::new();
+ let mut writer = Writer::new(sw);
+ let batches = vec![&batch];
+ for batch in batches {
+ writer.write(batch).unwrap();
+ }
+
+ let left = "c1,c2
+2019-04-18T20:54:47.378000000+10:00,2019-04-18T10:54:47.378000000Z
+2021-10-30T17:59:07.000000000+11:00,2021-10-30T06:59:07.000000000Z\n";
+ let right = writer.writer.into_inner().map(|s| s.to_string());
+ assert_eq!(Some(left.to_string()), right.ok());
+ }
+
+ #[cfg(feature = "chrono-tz")]
+ #[test]
+ fn test_export_csv_string() {
Review comment:
is this testing anything that is not covered in `test_csv_writer()`?
##########
File path: arrow/src/csv/writer.rs
##########
@@ -499,6 +582,83 @@ sed do eiusmod
tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo
);
}
+ #[cfg(feature = "chrono-tz")]
+ #[test]
+ fn test_write_csv() {
Review comment:
What do you think about reducing the replication in the test by keeping
a single `test_write_csv` function and instead conditionalize the expected
output. This would not only reduce the replication, it would also make it
clearer what was the expected difference
Something like (untested)
```rust
#[test]
fn test_write_csv() {
...
#[cfg(not(feature = "chrono-tz"))]
let expected = r#"c1,c2,c3,c4,c5,c6,c7
Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34,cupcakes
consectetur adipiscing
elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20,cupcakes
sed do eiusmod
tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo
Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34,cupcakes
consectetur adipiscing
elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20,cupcakes
sed do eiu
"#;
#[cfg(feature = "chrono-tz")]
let expected = r#"c1,c2,c3,c4,c5,c6,c7
Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34,cupcakes
consectetur adipiscing
elit,,2,false,2019-04-18T10:54:47.378000000Z,06:51:20,cupcakes
sed do eiusmod
tempor,-556132.25,1,,2019-04-18T02:45:55.555000000Z,23:46:03,foo
Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34,cupcakes
consectetur adipiscing
elit,,2,false,2019-04-18T10:54:47.378000000Z,06:51:20,cupcakes
sed do eiusmod
tempor,-556132.25,1,,2019-04-18T02:45:55.555000000Z,23:46:03,foo
"#;
assert_eq!(expected.to_string(), String::from_utf8(buffer).unwrap());
```
##########
File path: arrow/src/csv/writer.rs
##########
@@ -258,6 +240,100 @@ impl<W: Write> Writer<W> {
Ok(())
}
+ #[cfg(not(feature = "chrono-tz"))]
+ fn handle_timestamp(
+ &self,
+ time_unit: &TimeUnit,
+ _time_zone: &Option<String>,
+ row_index: usize,
+ col: &ArrayRef,
+ ) -> Result<String> {
+ use TimeUnit::*;
+ let datetime = match time_unit {
+ Second => col
+ .as_any()
+ .downcast_ref::<TimestampSecondArray>()
+ .unwrap()
+ .value_as_datetime(row_index)
+ .unwrap(),
+ Millisecond => col
+ .as_any()
+ .downcast_ref::<TimestampMillisecondArray>()
+ .unwrap()
+ .value_as_datetime(row_index)
+ .unwrap(),
+ Microsecond => col
+ .as_any()
+ .downcast_ref::<TimestampMicrosecondArray>()
+ .unwrap()
+ .value_as_datetime(row_index)
+ .unwrap(),
+ Nanosecond => col
+ .as_any()
+ .downcast_ref::<TimestampNanosecondArray>()
+ .unwrap()
+ .value_as_datetime(row_index)
+ .unwrap(),
+ };
+ Ok(format!("{}", datetime.format(&self.timestamp_format)))
+ }
+
+ #[cfg(feature = "chrono-tz")]
+ fn handle_timestamp(
+ &self,
+ time_unit: &TimeUnit,
+ time_zone: &Option<String>,
+ row_index: usize,
+ col: &ArrayRef,
+ ) -> Result<String> {
+ use TimeUnit::*;
+
+ let datetime = match time_unit {
+ Second => col
+ .as_any()
+ .downcast_ref::<TimestampSecondArray>()
+ .unwrap()
+ .value_as_datetime(row_index)
+ .unwrap(),
+ Millisecond => col
+ .as_any()
+ .downcast_ref::<TimestampMillisecondArray>()
+ .unwrap()
+ .value_as_datetime(row_index)
+ .unwrap(),
+ Microsecond => col
+ .as_any()
+ .downcast_ref::<TimestampMicrosecondArray>()
+ .unwrap()
+ .value_as_datetime(row_index)
+ .unwrap(),
+ Nanosecond => col
+ .as_any()
+ .downcast_ref::<TimestampNanosecondArray>()
+ .unwrap()
+ .value_as_datetime(row_index)
+ .unwrap(),
+ };
+ match time_zone {
+ None => Ok(format!("{}Z",
datetime.format(&self.timestamp_format))),
+ Some(tzs) => match using_chrono_tz_and_utc_naive_date_time(tzs,
datetime) {
Review comment:
I think we can avoid some of this code duplication by following the
model of
What would you think about following the model of `using_chrono_tz` -- and
providing an implementation of `using_chrono_tz_and_utc_naive_date_time` that
doesn't do any conversion of `chrono-tz` isn't available.
Something like the following (not tested)
```rust
/// Ignore timezones if chrono-tz is not enabled and as utc
#[not(cfg(feature = "chrono-tz")]
pub fn using_chrono_tz_and_utc_naive_date_time(
tz: &str,
utc: chrono::NaiveDateTime,
) -> Option<FixedOffset> {
Some(FixedOffset(0))
}
```
The code in `arrow/src/csv/writer.rs` could thenbe simpler and always call
`using_chrono_tz_and_utc_naive_date_time` but if `chrono-tz` wasn't enabled
then it would retain the current behavior
##########
File path: arrow/src/csv/writer.rs
##########
@@ -675,4 +964,76 @@ sed do eiusmod
tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,\n";
let expected = vec![Some(3), Some(2), Some(1)];
assert_eq!(actual, expected);
}
+
+ #[cfg(feature = "chrono-tz")]
+ #[test]
+ fn test_conversion_consistency() {
+ // test if we can serialize and deserialize whilst retaining the same
type information/ precision
+
+ let schema = Schema::new(vec![
+ Field::new("c1", DataType::Date32, false),
+ Field::new("c2", DataType::Date64, false),
+ Field::new("c3", DataType::Timestamp(TimeUnit::Nanosecond, None),
false),
+ ]);
+
+ let c1 = Date32Array::from(vec![3, 2, 1]);
+ let c2 = Date64Array::from(vec![3, 2, 1]);
+ let c3 = TimestampNanosecondArray::from_vec(
+ vec![
+ 1599566300000000000,
+ 1599566200000000000,
+ 1599566100000000000,
+ ],
+ None,
+ );
+
+ let batch = RecordBatch::try_new(
+ Arc::new(schema.clone()),
+ vec![Arc::new(c1), Arc::new(c2), Arc::new(c3)],
+ )
+ .unwrap();
+
+ let builder = WriterBuilder::new().has_headers(false);
+
+ let mut buf: Cursor<Vec<u8>> = Default::default();
+ // drop the writer early to release the borrow.
+ {
+ let mut writer = builder.build(&mut buf);
+ writer.write(&batch).unwrap();
+ }
+ buf.set_position(0);
+
+ let mut reader = Reader::new(
+ buf,
+ Arc::new(schema),
+ false,
+ None,
+ 3,
+ // starting at row 2 and up to row 6.
+ None,
+ None,
+ );
+ let rb = reader.next().unwrap().unwrap();
+ let c1 = rb.column(0).as_any().downcast_ref::<Date32Array>().unwrap();
+ let c2 = rb.column(1).as_any().downcast_ref::<Date64Array>().unwrap();
+ let c3 = rb
+ .column(2)
+ .as_any()
+ .downcast_ref::<TimestampNanosecondArray>()
+ .unwrap();
+
+ let actual = c1.into_iter().collect::<Vec<_>>();
+ let expected = vec![Some(3), Some(2), Some(1)];
+ assert_eq!(actual, expected);
+ let actual = c2.into_iter().collect::<Vec<_>>();
+ let expected = vec![Some(3), Some(2), Some(1)];
+ assert_eq!(actual, expected);
+ let actual = c3.into_iter().collect::<Vec<_>>();
+ let expected = vec![
Review comment:
I wonder if it would be possible to use the same vec in the creation of
the input and the expected output which would make it explicit they were
supposed to be the same
Something like
```rust
let c3 = TimestampNanosecondArray::from_vec(expected.clone(), None);
```
above, perhaps
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]