This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 02a3f5cd2 Move CSV test data (#3044) (#3051)
02a3f5cd2 is described below
commit 02a3f5cd24ef586cdf57af1f06cad662a094a9af
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Sat Nov 12 07:18:52 2022 +1300
Move CSV test data (#3044) (#3051)
* Move CSV test data (#3044)
* Format
---
arrow-csv/src/reader.rs | 425 ++++++++++++++++++++-
{arrow => arrow-csv}/test/data/decimal_test.csv | 0
{arrow => arrow-csv}/test/data/null_test.csv | 0
{arrow => arrow-csv}/test/data/uk_cities.csv | 0
.../test/data/uk_cities_with_headers.csv | 0
{arrow => arrow-csv}/test/data/various_types.csv | 0
.../test/data/various_types_invalid.csv | 0
arrow/Cargo.toml | 2 +-
arrow/examples/read_csv.rs | 5 +-
arrow/examples/read_csv_infer_schema.rs | 2 +-
arrow/tests/csv.rs | 422 --------------------
dev/release/rat_exclude_files.txt | 1 +
12 files changed, 430 insertions(+), 427 deletions(-)
diff --git a/arrow-csv/src/reader.rs b/arrow-csv/src/reader.rs
index 459c23ad2..2fb6493e1 100644
--- a/arrow-csv/src/reader.rs
+++ b/arrow-csv/src/reader.rs
@@ -22,7 +22,7 @@
//!
//! Example:
//!
-//! ```no_run
+//! ```
//! # use arrow_schema::*;
//! # use arrow_csv::Reader;
//! # use std::fs::File;
@@ -1131,11 +1131,432 @@ impl ReaderBuilder {
mod tests {
use super::*;
- use std::io::Write;
+ use std::io::{Cursor, Write};
use tempfile::NamedTempFile;
use chrono::prelude::*;
+ #[test]
+ fn test_csv() {
+ let _: Vec<()> = vec![None, Some("%Y-%m-%dT%H:%M:%S%.f%:z".to_string())]
+ .into_iter()
+ .map(|format| {
+ let schema = Schema::new(vec![
+ Field::new("city", DataType::Utf8, false),
+ Field::new("lat", DataType::Float64, false),
+ Field::new("lng", DataType::Float64, false),
+ ]);
+
+ let file = File::open("test/data/uk_cities.csv").unwrap();
+ let mut csv = Reader::new(
+ file,
+ Arc::new(schema.clone()),
+ false,
+ None,
+ 1024,
+ None,
+ None,
+ format,
+ );
+ assert_eq!(Arc::new(schema), csv.schema());
+ let batch = csv.next().unwrap().unwrap();
+ assert_eq!(37, batch.num_rows());
+ assert_eq!(3, batch.num_columns());
+
+ // access data from a primitive array
+ let lat = batch
+ .column(1)
+ .as_any()
+ .downcast_ref::<Float64Array>()
+ .unwrap();
+ assert_eq!(57.653484, lat.value(0));
+
+ // access data from a string array (ListArray<u8>)
+ let city = batch
+ .column(0)
+ .as_any()
+ .downcast_ref::<StringArray>()
+ .unwrap();
+
+ assert_eq!("Aberdeen, Aberdeen City, UK", city.value(13));
+ })
+ .collect();
+ }
+
+ #[test]
+ fn test_csv_schema_metadata() {
+ let mut metadata = std::collections::HashMap::new();
+ metadata.insert("foo".to_owned(), "bar".to_owned());
+ let schema = Schema::new_with_metadata(
+ vec![
+ Field::new("city", DataType::Utf8, false),
+ Field::new("lat", DataType::Float64, false),
+ Field::new("lng", DataType::Float64, false),
+ ],
+ metadata.clone(),
+ );
+
+ let file = File::open("test/data/uk_cities.csv").unwrap();
+
+ let mut csv = Reader::new(
+ file,
+ Arc::new(schema.clone()),
+ false,
+ None,
+ 1024,
+ None,
+ None,
+ None,
+ );
+ assert_eq!(Arc::new(schema), csv.schema());
+ let batch = csv.next().unwrap().unwrap();
+ assert_eq!(37, batch.num_rows());
+ assert_eq!(3, batch.num_columns());
+
+ assert_eq!(&metadata, batch.schema().metadata());
+ }
+
+ #[test]
+ fn test_csv_reader_with_decimal() {
+ let schema = Schema::new(vec![
+ Field::new("city", DataType::Utf8, false),
+ Field::new("lat", DataType::Decimal128(38, 6), false),
+ Field::new("lng", DataType::Decimal128(38, 6), false),
+ ]);
+
+ let file = File::open("test/data/decimal_test.csv").unwrap();
+
+ let mut csv =
+ Reader::new(file, Arc::new(schema), false, None, 1024, None, None, None);
+ let batch = csv.next().unwrap().unwrap();
+ // access data from a primitive array
+ let lat = batch
+ .column(1)
+ .as_any()
+ .downcast_ref::<Decimal128Array>()
+ .unwrap();
+
+ assert_eq!("57.653484", lat.value_as_string(0));
+ assert_eq!("53.002666", lat.value_as_string(1));
+ assert_eq!("52.412811", lat.value_as_string(2));
+ assert_eq!("51.481583", lat.value_as_string(3));
+ assert_eq!("12.123456", lat.value_as_string(4));
+ assert_eq!("50.760000", lat.value_as_string(5));
+ assert_eq!("0.123000", lat.value_as_string(6));
+ assert_eq!("123.000000", lat.value_as_string(7));
+ assert_eq!("123.000000", lat.value_as_string(8));
+ assert_eq!("-50.760000", lat.value_as_string(9));
+ }
+
+ #[test]
+ fn test_csv_from_buf_reader() {
+ let schema = Schema::new(vec![
+ Field::new("city", DataType::Utf8, false),
+ Field::new("lat", DataType::Float64, false),
+ Field::new("lng", DataType::Float64, false),
+ ]);
+
+ let file_with_headers =
+ File::open("test/data/uk_cities_with_headers.csv").unwrap();
+ let file_without_headers = File::open("test/data/uk_cities.csv").unwrap();
+ let both_files = file_with_headers
+ .chain(Cursor::new("\n".to_string()))
+ .chain(file_without_headers);
+ let mut csv = Reader::from_reader(
+ both_files,
+ Arc::new(schema),
+ true,
+ None,
+ 1024,
+ None,
+ None,
+ None,
+ );
+ let batch = csv.next().unwrap().unwrap();
+ assert_eq!(74, batch.num_rows());
+ assert_eq!(3, batch.num_columns());
+ }
+
+ #[test]
+ fn test_csv_with_schema_inference() {
+ let file = File::open("test/data/uk_cities_with_headers.csv").unwrap();
+
+ let builder = ReaderBuilder::new().has_header(true).infer_schema(None);
+
+ let mut csv = builder.build(file).unwrap();
+ let expected_schema = Schema::new(vec![
+ Field::new("city", DataType::Utf8, true),
+ Field::new("lat", DataType::Float64, true),
+ Field::new("lng", DataType::Float64, true),
+ ]);
+ assert_eq!(Arc::new(expected_schema), csv.schema());
+ let batch = csv.next().unwrap().unwrap();
+ assert_eq!(37, batch.num_rows());
+ assert_eq!(3, batch.num_columns());
+
+ // access data from a primitive array
+ let lat = batch
+ .column(1)
+ .as_any()
+ .downcast_ref::<Float64Array>()
+ .unwrap();
+ assert_eq!(57.653484, lat.value(0));
+
+ // access data from a string array (ListArray<u8>)
+ let city = batch
+ .column(0)
+ .as_any()
+ .downcast_ref::<StringArray>()
+ .unwrap();
+
+ assert_eq!("Aberdeen, Aberdeen City, UK", city.value(13));
+ }
+
+ #[test]
+ fn test_csv_with_schema_inference_no_headers() {
+ let file = File::open("test/data/uk_cities.csv").unwrap();
+
+ let builder = ReaderBuilder::new().infer_schema(None);
+
+ let mut csv = builder.build(file).unwrap();
+
+ // csv field names should be 'column_{number}'
+ let schema = csv.schema();
+ assert_eq!("column_1", schema.field(0).name());
+ assert_eq!("column_2", schema.field(1).name());
+ assert_eq!("column_3", schema.field(2).name());
+ let batch = csv.next().unwrap().unwrap();
+ let batch_schema = batch.schema();
+
+ assert_eq!(schema, batch_schema);
+ assert_eq!(37, batch.num_rows());
+ assert_eq!(3, batch.num_columns());
+
+ // access data from a primitive array
+ let lat = batch
+ .column(1)
+ .as_any()
+ .downcast_ref::<Float64Array>()
+ .unwrap();
+ assert_eq!(57.653484, lat.value(0));
+
+ // access data from a string array (ListArray<u8>)
+ let city = batch
+ .column(0)
+ .as_any()
+ .downcast_ref::<StringArray>()
+ .unwrap();
+
+ assert_eq!("Aberdeen, Aberdeen City, UK", city.value(13));
+ }
+
+ #[test]
+ fn test_csv_builder_with_bounds() {
+ let file = File::open("test/data/uk_cities.csv").unwrap();
+
+ // Set the bounds to the lines 0, 1 and 2.
+ let mut csv = ReaderBuilder::new().with_bounds(0, 2).build(file).unwrap();
+ let batch = csv.next().unwrap().unwrap();
+
+ // access data from a string array (ListArray<u8>)
+ let city = batch
+ .column(0)
+ .as_any()
+ .downcast_ref::<StringArray>()
+ .unwrap();
+
+ // The value on line 0 is within the bounds
+ assert_eq!("Elgin, Scotland, the UK", city.value(0));
+
+ // The value on line 13 is outside of the bounds. Therefore
+ // the call to .value() will panic.
+ let result = std::panic::catch_unwind(|| city.value(13));
+ assert!(result.is_err());
+ }
+
+ #[test]
+ fn test_csv_with_projection() {
+ let schema = Schema::new(vec![
+ Field::new("city", DataType::Utf8, false),
+ Field::new("lat", DataType::Float64, false),
+ Field::new("lng", DataType::Float64, false),
+ ]);
+
+ let file = File::open("test/data/uk_cities.csv").unwrap();
+
+ let mut csv = Reader::new(
+ file,
+ Arc::new(schema),
+ false,
+ None,
+ 1024,
+ None,
+ Some(vec![0, 1]),
+ None,
+ );
+ let projected_schema = Arc::new(Schema::new(vec![
+ Field::new("city", DataType::Utf8, false),
+ Field::new("lat", DataType::Float64, false),
+ ]));
+ assert_eq!(projected_schema, csv.schema());
+ let batch = csv.next().unwrap().unwrap();
+ assert_eq!(projected_schema, batch.schema());
+ assert_eq!(37, batch.num_rows());
+ assert_eq!(2, batch.num_columns());
+ }
+
+ #[test]
+ fn test_csv_with_dictionary() {
+ let schema = Schema::new(vec![
+ Field::new(
+ "city",
+ DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+ false,
+ ),
+ Field::new("lat", DataType::Float64, false),
+ Field::new("lng", DataType::Float64, false),
+ ]);
+
+ let file = File::open("test/data/uk_cities.csv").unwrap();
+
+ let mut csv = Reader::new(
+ file,
+ Arc::new(schema),
+ false,
+ None,
+ 1024,
+ None,
+ Some(vec![0, 1]),
+ None,
+ );
+ let projected_schema = Arc::new(Schema::new(vec![
+ Field::new(
+ "city",
+ DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+ false,
+ ),
+ Field::new("lat", DataType::Float64, false),
+ ]));
+ assert_eq!(projected_schema, csv.schema());
+ let batch = csv.next().unwrap().unwrap();
+ assert_eq!(projected_schema, batch.schema());
+ assert_eq!(37, batch.num_rows());
+ assert_eq!(2, batch.num_columns());
+
+ let strings = arrow_cast::cast(batch.column(0), &DataType::Utf8).unwrap();
+ let strings = strings.as_any().downcast_ref::<StringArray>().unwrap();
+
+ assert_eq!(strings.value(0), "Elgin, Scotland, the UK");
+ assert_eq!(strings.value(4), "Eastbourne, East Sussex, UK");
+ assert_eq!(strings.value(29), "Uckfield, East Sussex, UK");
+ }
+
+ #[test]
+ fn test_nulls() {
+ let schema = Schema::new(vec![
+ Field::new("c_int", DataType::UInt64, false),
+ Field::new("c_float", DataType::Float32, true),
+ Field::new("c_string", DataType::Utf8, false),
+ ]);
+
+ let file = File::open("test/data/null_test.csv").unwrap();
+
+ let mut csv =
+ Reader::new(file, Arc::new(schema), true, None, 1024, None, None, None);
+ let batch = csv.next().unwrap().unwrap();
+
+ assert!(!batch.column(1).is_null(0));
+ assert!(!batch.column(1).is_null(1));
+ assert!(batch.column(1).is_null(2));
+ assert!(!batch.column(1).is_null(3));
+ assert!(!batch.column(1).is_null(4));
+ }
+
+ #[test]
+ fn test_nulls_with_inference() {
+ let file = File::open("test/data/various_types.csv").unwrap();
+
+ let builder = ReaderBuilder::new()
+ .infer_schema(None)
+ .has_header(true)
+ .with_delimiter(b'|')
+ .with_batch_size(512)
+ .with_projection(vec![0, 1, 2, 3, 4, 5]);
+
+ let mut csv = builder.build(file).unwrap();
+ let batch = csv.next().unwrap().unwrap();
+
+ assert_eq!(7, batch.num_rows());
+ assert_eq!(6, batch.num_columns());
+
+ let schema = batch.schema();
+
+ assert_eq!(&DataType::Int64, schema.field(0).data_type());
+ assert_eq!(&DataType::Float64, schema.field(1).data_type());
+ assert_eq!(&DataType::Float64, schema.field(2).data_type());
+ assert_eq!(&DataType::Boolean, schema.field(3).data_type());
+ assert_eq!(&DataType::Date32, schema.field(4).data_type());
+ assert_eq!(&DataType::Date64, schema.field(5).data_type());
+
+ let names: Vec<&str> =
+ schema.fields().iter().map(|x| x.name().as_str()).collect();
+ assert_eq!(
+ names,
+ vec![
+ "c_int",
+ "c_float",
+ "c_string",
+ "c_bool",
+ "c_date",
+ "c_datetime"
+ ]
+ );
+
+ assert!(schema.field(0).is_nullable());
+ assert!(schema.field(1).is_nullable());
+ assert!(schema.field(2).is_nullable());
+ assert!(schema.field(3).is_nullable());
+ assert!(schema.field(4).is_nullable());
+ assert!(schema.field(5).is_nullable());
+
+ assert!(!batch.column(1).is_null(0));
+ assert!(!batch.column(1).is_null(1));
+ assert!(batch.column(1).is_null(2));
+ assert!(!batch.column(1).is_null(3));
+ assert!(!batch.column(1).is_null(4));
+ }
+
+ #[test]
+ fn test_parse_invalid_csv() {
+ let file = File::open("test/data/various_types_invalid.csv").unwrap();
+
+ let schema = Schema::new(vec![
+ Field::new("c_int", DataType::UInt64, false),
+ Field::new("c_float", DataType::Float32, false),
+ Field::new("c_string", DataType::Utf8, false),
+ Field::new("c_bool", DataType::Boolean, false),
+ ]);
+
+ let builder = ReaderBuilder::new()
+ .with_schema(Arc::new(schema))
+ .has_header(true)
+ .with_delimiter(b'|')
+ .with_batch_size(512)
+ .with_projection(vec![0, 1, 2, 3]);
+
+ let mut csv = builder.build(file).unwrap();
+ match csv.next() {
+ Some(e) => match e {
+ Err(e) => assert_eq!(
+ "ParseError(\"Error while parsing value 4.x4 for column 1 at line 4\")",
+ format!("{:?}", e)
+ ),
+ Ok(_) => panic!("should have failed"),
+ },
+ None => panic!("should have failed"),
+ }
+ }
+
#[test]
fn test_infer_field_schema() {
assert_eq!(infer_field_schema("A", None), DataType::Utf8);
diff --git a/arrow/test/data/decimal_test.csv b/arrow-csv/test/data/decimal_test.csv
similarity index 100%
rename from arrow/test/data/decimal_test.csv
rename to arrow-csv/test/data/decimal_test.csv
diff --git a/arrow/test/data/null_test.csv b/arrow-csv/test/data/null_test.csv
similarity index 100%
rename from arrow/test/data/null_test.csv
rename to arrow-csv/test/data/null_test.csv
diff --git a/arrow/test/data/uk_cities.csv b/arrow-csv/test/data/uk_cities.csv
similarity index 100%
rename from arrow/test/data/uk_cities.csv
rename to arrow-csv/test/data/uk_cities.csv
diff --git a/arrow/test/data/uk_cities_with_headers.csv b/arrow-csv/test/data/uk_cities_with_headers.csv
similarity index 100%
rename from arrow/test/data/uk_cities_with_headers.csv
rename to arrow-csv/test/data/uk_cities_with_headers.csv
diff --git a/arrow/test/data/various_types.csv b/arrow-csv/test/data/various_types.csv
similarity index 100%
rename from arrow/test/data/various_types.csv
rename to arrow-csv/test/data/various_types.csv
diff --git a/arrow/test/data/various_types_invalid.csv b/arrow-csv/test/data/various_types_invalid.csv
similarity index 100%
rename from arrow/test/data/various_types_invalid.csv
rename to arrow-csv/test/data/various_types_invalid.csv
diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml
index 2e33014db..452cc4bbd 100644
--- a/arrow/Cargo.toml
+++ b/arrow/Cargo.toml
@@ -267,4 +267,4 @@ required-features = ["test_utils", "ipc"]
[[test]]
name = "csv"
-required-features = ["csv"]
+required-features = ["csv", "chrono-tz"]
diff --git a/arrow/examples/read_csv.rs b/arrow/examples/read_csv.rs
index a1a592134..efb55c6d2 100644
--- a/arrow/examples/read_csv.rs
+++ b/arrow/examples/read_csv.rs
@@ -31,7 +31,10 @@ fn main() {
Field::new("lng", DataType::Float64, false),
]);
- let path = format!("{}/test/data/uk_cities.csv", env!("CARGO_MANIFEST_DIR"));
+ let path = format!(
+ "{}/../arrow-csv/test/data/uk_cities.csv",
+ env!("CARGO_MANIFEST_DIR")
+ );
let file = File::open(path).unwrap();
let mut csv =
diff --git a/arrow/examples/read_csv_infer_schema.rs b/arrow/examples/read_csv_infer_schema.rs
index 120a7b819..2a713ba61 100644
--- a/arrow/examples/read_csv_infer_schema.rs
+++ b/arrow/examples/read_csv_infer_schema.rs
@@ -23,7 +23,7 @@ use std::fs::File;
fn main() {
let path = format!(
- "{}/test/data/uk_cities_with_headers.csv",
+ "{}/../arrow-csv/test/data/uk_cities_with_headers.csv",
env!("CARGO_MANIFEST_DIR")
);
let file = File::open(path).unwrap();
diff --git a/arrow/tests/csv.rs b/arrow/tests/csv.rs
index 11e1b30e1..83a279ce4 100644
--- a/arrow/tests/csv.rs
+++ b/arrow/tests/csv.rs
@@ -15,16 +15,12 @@
// specific language governing permissions and limitations
// under the License.
-use std::fs::File;
-use std::io::{Cursor, Read};
use std::sync::Arc;
use arrow_array::*;
-use arrow_csv::{Reader, ReaderBuilder};
use arrow_schema::*;
#[test]
-#[cfg(feature = "chrono-tz")]
fn test_export_csv_timestamps() {
let schema = Schema::new(vec![
Field::new(
@@ -66,421 +62,3 @@ fn test_export_csv_timestamps() {
let right = String::from_utf8(sw).unwrap();
assert_eq!(left, right);
}
-
-#[test]
-fn test_csv() {
- let _: Vec<()> = vec![None, Some("%Y-%m-%dT%H:%M:%S%.f%:z".to_string())]
- .into_iter()
- .map(|format| {
- let schema = Schema::new(vec![
- Field::new("city", DataType::Utf8, false),
- Field::new("lat", DataType::Float64, false),
- Field::new("lng", DataType::Float64, false),
- ]);
-
- let file = File::open("test/data/uk_cities.csv").unwrap();
- let mut csv = Reader::new(
- file,
- Arc::new(schema.clone()),
- false,
- None,
- 1024,
- None,
- None,
- format,
- );
- assert_eq!(Arc::new(schema), csv.schema());
- let batch = csv.next().unwrap().unwrap();
- assert_eq!(37, batch.num_rows());
- assert_eq!(3, batch.num_columns());
-
- // access data from a primitive array
- let lat = batch
- .column(1)
- .as_any()
- .downcast_ref::<Float64Array>()
- .unwrap();
- assert_eq!(57.653484, lat.value(0));
-
- // access data from a string array (ListArray<u8>)
- let city = batch
- .column(0)
- .as_any()
- .downcast_ref::<StringArray>()
- .unwrap();
-
- assert_eq!("Aberdeen, Aberdeen City, UK", city.value(13));
- })
- .collect();
-}
-
-#[test]
-fn test_csv_schema_metadata() {
- let mut metadata = std::collections::HashMap::new();
- metadata.insert("foo".to_owned(), "bar".to_owned());
- let schema = Schema::new_with_metadata(
- vec![
- Field::new("city", DataType::Utf8, false),
- Field::new("lat", DataType::Float64, false),
- Field::new("lng", DataType::Float64, false),
- ],
- metadata.clone(),
- );
-
- let file = File::open("test/data/uk_cities.csv").unwrap();
-
- let mut csv = Reader::new(
- file,
- Arc::new(schema.clone()),
- false,
- None,
- 1024,
- None,
- None,
- None,
- );
- assert_eq!(Arc::new(schema), csv.schema());
- let batch = csv.next().unwrap().unwrap();
- assert_eq!(37, batch.num_rows());
- assert_eq!(3, batch.num_columns());
-
- assert_eq!(&metadata, batch.schema().metadata());
-}
-
-#[test]
-fn test_csv_reader_with_decimal() {
- let schema = Schema::new(vec![
- Field::new("city", DataType::Utf8, false),
- Field::new("lat", DataType::Decimal128(38, 6), false),
- Field::new("lng", DataType::Decimal128(38, 6), false),
- ]);
-
- let file = File::open("test/data/decimal_test.csv").unwrap();
-
- let mut csv =
- Reader::new(file, Arc::new(schema), false, None, 1024, None, None, None);
- let batch = csv.next().unwrap().unwrap();
- // access data from a primitive array
- let lat = batch
- .column(1)
- .as_any()
- .downcast_ref::<Decimal128Array>()
- .unwrap();
-
- assert_eq!("57.653484", lat.value_as_string(0));
- assert_eq!("53.002666", lat.value_as_string(1));
- assert_eq!("52.412811", lat.value_as_string(2));
- assert_eq!("51.481583", lat.value_as_string(3));
- assert_eq!("12.123456", lat.value_as_string(4));
- assert_eq!("50.760000", lat.value_as_string(5));
- assert_eq!("0.123000", lat.value_as_string(6));
- assert_eq!("123.000000", lat.value_as_string(7));
- assert_eq!("123.000000", lat.value_as_string(8));
- assert_eq!("-50.760000", lat.value_as_string(9));
-}
-
-#[test]
-fn test_csv_from_buf_reader() {
- let schema = Schema::new(vec![
- Field::new("city", DataType::Utf8, false),
- Field::new("lat", DataType::Float64, false),
- Field::new("lng", DataType::Float64, false),
- ]);
-
- let file_with_headers = File::open("test/data/uk_cities_with_headers.csv").unwrap();
- let file_without_headers = File::open("test/data/uk_cities.csv").unwrap();
- let both_files = file_with_headers
- .chain(Cursor::new("\n".to_string()))
- .chain(file_without_headers);
- let mut csv = Reader::from_reader(
- both_files,
- Arc::new(schema),
- true,
- None,
- 1024,
- None,
- None,
- None,
- );
- let batch = csv.next().unwrap().unwrap();
- assert_eq!(74, batch.num_rows());
- assert_eq!(3, batch.num_columns());
-}
-
-#[test]
-fn test_csv_with_schema_inference() {
- let file = File::open("test/data/uk_cities_with_headers.csv").unwrap();
-
- let builder = ReaderBuilder::new().has_header(true).infer_schema(None);
-
- let mut csv = builder.build(file).unwrap();
- let expected_schema = Schema::new(vec![
- Field::new("city", DataType::Utf8, true),
- Field::new("lat", DataType::Float64, true),
- Field::new("lng", DataType::Float64, true),
- ]);
- assert_eq!(Arc::new(expected_schema), csv.schema());
- let batch = csv.next().unwrap().unwrap();
- assert_eq!(37, batch.num_rows());
- assert_eq!(3, batch.num_columns());
-
- // access data from a primitive array
- let lat = batch
- .column(1)
- .as_any()
- .downcast_ref::<Float64Array>()
- .unwrap();
- assert_eq!(57.653484, lat.value(0));
-
- // access data from a string array (ListArray<u8>)
- let city = batch
- .column(0)
- .as_any()
- .downcast_ref::<StringArray>()
- .unwrap();
-
- assert_eq!("Aberdeen, Aberdeen City, UK", city.value(13));
-}
-
-#[test]
-fn test_csv_with_schema_inference_no_headers() {
- let file = File::open("test/data/uk_cities.csv").unwrap();
-
- let builder = ReaderBuilder::new().infer_schema(None);
-
- let mut csv = builder.build(file).unwrap();
-
- // csv field names should be 'column_{number}'
- let schema = csv.schema();
- assert_eq!("column_1", schema.field(0).name());
- assert_eq!("column_2", schema.field(1).name());
- assert_eq!("column_3", schema.field(2).name());
- let batch = csv.next().unwrap().unwrap();
- let batch_schema = batch.schema();
-
- assert_eq!(schema, batch_schema);
- assert_eq!(37, batch.num_rows());
- assert_eq!(3, batch.num_columns());
-
- // access data from a primitive array
- let lat = batch
- .column(1)
- .as_any()
- .downcast_ref::<Float64Array>()
- .unwrap();
- assert_eq!(57.653484, lat.value(0));
-
- // access data from a string array (ListArray<u8>)
- let city = batch
- .column(0)
- .as_any()
- .downcast_ref::<StringArray>()
- .unwrap();
-
- assert_eq!("Aberdeen, Aberdeen City, UK", city.value(13));
-}
-
-#[test]
-fn test_csv_builder_with_bounds() {
- let file = File::open("test/data/uk_cities.csv").unwrap();
-
- // Set the bounds to the lines 0, 1 and 2.
- let mut csv = ReaderBuilder::new().with_bounds(0, 2).build(file).unwrap();
- let batch = csv.next().unwrap().unwrap();
-
- // access data from a string array (ListArray<u8>)
- let city = batch
- .column(0)
- .as_any()
- .downcast_ref::<StringArray>()
- .unwrap();
-
- // The value on line 0 is within the bounds
- assert_eq!("Elgin, Scotland, the UK", city.value(0));
-
- // The value on line 13 is outside of the bounds. Therefore
- // the call to .value() will panic.
- let result = std::panic::catch_unwind(|| city.value(13));
- assert!(result.is_err());
-}
-
-#[test]
-fn test_csv_with_projection() {
- let schema = Schema::new(vec![
- Field::new("city", DataType::Utf8, false),
- Field::new("lat", DataType::Float64, false),
- Field::new("lng", DataType::Float64, false),
- ]);
-
- let file = File::open("test/data/uk_cities.csv").unwrap();
-
- let mut csv = Reader::new(
- file,
- Arc::new(schema),
- false,
- None,
- 1024,
- None,
- Some(vec![0, 1]),
- None,
- );
- let projected_schema = Arc::new(Schema::new(vec![
- Field::new("city", DataType::Utf8, false),
- Field::new("lat", DataType::Float64, false),
- ]));
- assert_eq!(projected_schema, csv.schema());
- let batch = csv.next().unwrap().unwrap();
- assert_eq!(projected_schema, batch.schema());
- assert_eq!(37, batch.num_rows());
- assert_eq!(2, batch.num_columns());
-}
-
-#[test]
-fn test_csv_with_dictionary() {
- let schema = Schema::new(vec![
- Field::new(
- "city",
- DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
- false,
- ),
- Field::new("lat", DataType::Float64, false),
- Field::new("lng", DataType::Float64, false),
- ]);
-
- let file = File::open("test/data/uk_cities.csv").unwrap();
-
- let mut csv = Reader::new(
- file,
- Arc::new(schema),
- false,
- None,
- 1024,
- None,
- Some(vec![0, 1]),
- None,
- );
- let projected_schema = Arc::new(Schema::new(vec![
- Field::new(
- "city",
- DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
- false,
- ),
- Field::new("lat", DataType::Float64, false),
- ]));
- assert_eq!(projected_schema, csv.schema());
- let batch = csv.next().unwrap().unwrap();
- assert_eq!(projected_schema, batch.schema());
- assert_eq!(37, batch.num_rows());
- assert_eq!(2, batch.num_columns());
-
- let strings = arrow_cast::cast(batch.column(0), &DataType::Utf8).unwrap();
- let strings = strings.as_any().downcast_ref::<StringArray>().unwrap();
-
- assert_eq!(strings.value(0), "Elgin, Scotland, the UK");
- assert_eq!(strings.value(4), "Eastbourne, East Sussex, UK");
- assert_eq!(strings.value(29), "Uckfield, East Sussex, UK");
-}
-
-#[test]
-fn test_nulls() {
- let schema = Schema::new(vec![
- Field::new("c_int", DataType::UInt64, false),
- Field::new("c_float", DataType::Float32, true),
- Field::new("c_string", DataType::Utf8, false),
- ]);
-
- let file = File::open("test/data/null_test.csv").unwrap();
-
- let mut csv = Reader::new(file, Arc::new(schema), true, None, 1024, None, None, None);
- let batch = csv.next().unwrap().unwrap();
-
- assert!(!batch.column(1).is_null(0));
- assert!(!batch.column(1).is_null(1));
- assert!(batch.column(1).is_null(2));
- assert!(!batch.column(1).is_null(3));
- assert!(!batch.column(1).is_null(4));
-}
-
-#[test]
-fn test_nulls_with_inference() {
- let file = File::open("test/data/various_types.csv").unwrap();
-
- let builder = ReaderBuilder::new()
- .infer_schema(None)
- .has_header(true)
- .with_delimiter(b'|')
- .with_batch_size(512)
- .with_projection(vec![0, 1, 2, 3, 4, 5]);
-
- let mut csv = builder.build(file).unwrap();
- let batch = csv.next().unwrap().unwrap();
-
- assert_eq!(7, batch.num_rows());
- assert_eq!(6, batch.num_columns());
-
- let schema = batch.schema();
-
- assert_eq!(&DataType::Int64, schema.field(0).data_type());
- assert_eq!(&DataType::Float64, schema.field(1).data_type());
- assert_eq!(&DataType::Float64, schema.field(2).data_type());
- assert_eq!(&DataType::Boolean, schema.field(3).data_type());
- assert_eq!(&DataType::Date32, schema.field(4).data_type());
- assert_eq!(&DataType::Date64, schema.field(5).data_type());
-
- let names: Vec<&str> = schema.fields().iter().map(|x| x.name().as_str()).collect();
- assert_eq!(
- names,
- vec![
- "c_int",
- "c_float",
- "c_string",
- "c_bool",
- "c_date",
- "c_datetime"
- ]
- );
-
- assert!(schema.field(0).is_nullable());
- assert!(schema.field(1).is_nullable());
- assert!(schema.field(2).is_nullable());
- assert!(schema.field(3).is_nullable());
- assert!(schema.field(4).is_nullable());
- assert!(schema.field(5).is_nullable());
-
- assert!(!batch.column(1).is_null(0));
- assert!(!batch.column(1).is_null(1));
- assert!(batch.column(1).is_null(2));
- assert!(!batch.column(1).is_null(3));
- assert!(!batch.column(1).is_null(4));
-}
-
-#[test]
-fn test_parse_invalid_csv() {
- let file = File::open("test/data/various_types_invalid.csv").unwrap();
-
- let schema = Schema::new(vec![
- Field::new("c_int", DataType::UInt64, false),
- Field::new("c_float", DataType::Float32, false),
- Field::new("c_string", DataType::Utf8, false),
- Field::new("c_bool", DataType::Boolean, false),
- ]);
-
- let builder = ReaderBuilder::new()
- .with_schema(Arc::new(schema))
- .has_header(true)
- .with_delimiter(b'|')
- .with_batch_size(512)
- .with_projection(vec![0, 1, 2, 3]);
-
- let mut csv = builder.build(file).unwrap();
- match csv.next() {
- Some(e) => match e {
- Err(e) => assert_eq!(
- "ParseError(\"Error while parsing value 4.x4 for column 1 at line 4\")",
- format!("{:?}", e)
- ),
- Ok(_) => panic!("should have failed"),
- },
- None => panic!("should have failed"),
- }
-}
diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt
index 0ca2ab91a..fad1a5a7d 100644
--- a/dev/release/rat_exclude_files.txt
+++ b/dev/release/rat_exclude_files.txt
@@ -3,6 +3,7 @@ testing/*
target/*
dev/release/rat_exclude_files.txt
arrow/test/data/*
+arrow-csv/test/data/*
arrow-json/test/data/*
arrow/test/dependency/*
arrow-integration-test/data/*