This is an automated email from the ASF dual-hosted git repository.
agrove pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new ac45f32 ARROW-3839: [Rust] Add ability to infer schema in CSV reader
ac45f32 is described below
commit ac45f3210a194049ef35f49847dbc4ff5e70d48f
Author: Neville Dipale <[email protected]>
AuthorDate: Tue Jan 8 16:49:12 2019 -0700
ARROW-3839: [Rust] Add ability to infer schema in CSV reader
Resubmission of #3128
Author: Neville Dipale <[email protected]>
Closes #3349 from nevi-me/rust/infer-csv-schema and squashes the following
commits:
0838199 <Neville Dipale> ARROW-3839: Add ability to infer schema in CSV
reader
---
ci/rust-build-main.bat | 1 +
ci/travis_script_rust.sh | 1 +
rust/arrow/Cargo.toml | 2 +
rust/arrow/examples/read_csv_infer_schema.rs | 66 +++++
rust/arrow/src/csv/mod.rs | 1 +
rust/arrow/src/csv/reader.rs | 373 +++++++++++++++++++++++-
rust/arrow/src/datatypes.rs | 4 +-
rust/arrow/src/error.rs | 37 +++
rust/arrow/test/data/uk_cities_with_headers.csv | 38 +++
rust/arrow/test/data/various_types.csv | 6 +
10 files changed, 524 insertions(+), 5 deletions(-)
diff --git a/ci/rust-build-main.bat b/ci/rust-build-main.bat
index ac5c9e7..b36a97a 100644
--- a/ci/rust-build-main.bat
+++ b/ci/rust-build-main.bat
@@ -40,5 +40,6 @@ cd arrow
cargo run --example builders --target %TARGET% --release || exit /B
cargo run --example dynamic_types --target %TARGET% --release || exit /B
cargo run --example read_csv --target %TARGET% --release || exit /B
+cargo run --example read_csv_infer_schema --target %TARGET% --release || exit
/B
popd
diff --git a/ci/travis_script_rust.sh b/ci/travis_script_rust.sh
index 8e3c8c3..c25d64e 100755
--- a/ci/travis_script_rust.sh
+++ b/ci/travis_script_rust.sh
@@ -39,5 +39,6 @@ cd arrow
cargo run --example builders
cargo run --example dynamic_types
cargo run --example read_csv
+cargo run --example read_csv_infer_schema
popd
diff --git a/rust/arrow/Cargo.toml b/rust/arrow/Cargo.toml
index 77e8d53..38e7e5e 100644
--- a/rust/arrow/Cargo.toml
+++ b/rust/arrow/Cargo.toml
@@ -43,6 +43,8 @@ serde_json = "1.0.13"
rand = "0.5"
csv = "1.0.0"
num = "0.2"
+regex = "1.1"
+lazy_static = "1.2"
[dev-dependencies]
criterion = "0.2"
diff --git a/rust/arrow/examples/read_csv_infer_schema.rs
b/rust/arrow/examples/read_csv_infer_schema.rs
new file mode 100644
index 0000000..9dd2d2a
--- /dev/null
+++ b/rust/arrow/examples/read_csv_infer_schema.rs
@@ -0,0 +1,66 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+extern crate arrow;
+
+use arrow::array::{BinaryArray, Float64Array};
+use arrow::csv;
+use std::fs::File;
+
+fn main() {
+ let file = File::open("test/data/uk_cities_with_headers.csv").unwrap();
+ let builder = csv::ReaderBuilder::new()
+ .has_headers(true)
+ .infer_schema(Some(100));
+ let mut csv = builder.build(file).unwrap();
+ let batch = csv.next().unwrap().unwrap();
+
+ println!(
+ "Loaded {} rows containing {} columns",
+ batch.num_rows(),
+ batch.num_columns()
+ );
+
+ println!("Inferred schema: {:?}", batch.schema());
+
+ let city = batch
+ .column(0)
+ .as_any()
+ .downcast_ref::<BinaryArray>()
+ .unwrap();
+ let lat = batch
+ .column(1)
+ .as_any()
+ .downcast_ref::<Float64Array>()
+ .unwrap();
+ let lng = batch
+ .column(2)
+ .as_any()
+ .downcast_ref::<Float64Array>()
+ .unwrap();
+
+ for i in 0..batch.num_rows() {
+ let city_name: String =
String::from_utf8(city.value(i).to_vec()).unwrap();
+
+ println!(
+ "City: {}, Latitude: {}, Longitude: {}",
+ city_name,
+ lat.value(i),
+ lng.value(i)
+ );
+ }
+}
diff --git a/rust/arrow/src/csv/mod.rs b/rust/arrow/src/csv/mod.rs
index 9f2bd1d..6521b19 100644
--- a/rust/arrow/src/csv/mod.rs
+++ b/rust/arrow/src/csv/mod.rs
@@ -18,3 +18,4 @@
pub mod reader;
pub use self::reader::Reader;
+pub use self::reader::ReaderBuilder;
diff --git a/rust/arrow/src/csv/reader.rs b/rust/arrow/src/csv/reader.rs
index 57c7dde..49e0302 100644
--- a/rust/arrow/src/csv/reader.rs
+++ b/rust/arrow/src/csv/reader.rs
@@ -40,8 +40,11 @@
//! let batch = csv.next().unwrap().unwrap();
//! ```
+use lazy_static::lazy_static;
+use regex::{Regex, RegexBuilder};
+use std::collections::HashSet;
use std::fs::File;
-use std::io::BufReader;
+use std::io::{BufReader, Seek, SeekFrom};
use std::sync::Arc;
use csv as csv_crate;
@@ -54,6 +57,130 @@ use crate::record_batch::RecordBatch;
use self::csv_crate::{StringRecord, StringRecordsIntoIter};
+lazy_static! {
+ static ref DECIMAL_RE: Regex = Regex::new(r"^-?(\d+\.\d+)$").unwrap();
+ static ref INTEGER_RE: Regex = Regex::new(r"^-?(\d*.)$").unwrap();
+ static ref BOOLEAN_RE: Regex = RegexBuilder::new(r"^(true)$|^(false)$")
+ .case_insensitive(true)
+ .build()
+ .unwrap();
+}
+
+/// Infer the data type of a record
+fn infer_field_schema(string: &str) -> DataType {
+ // when quoting is enabled in the reader, these quotes aren't escaped, we
default to Utf8 for them
+ if string.starts_with("\"") {
+ return DataType::Utf8;
+ }
+ // match regex in a particular order
+ if BOOLEAN_RE.is_match(string) {
+ return DataType::Boolean;
+ } else if DECIMAL_RE.is_match(string) {
+ return DataType::Float64;
+ } else if INTEGER_RE.is_match(string) {
+ return DataType::Int64;
+ } else {
+ return DataType::Utf8;
+ }
+}
+
+/// Infer the schema of a CSV file by reading through the first n records of
the file,
+/// with `max_read_records` controlling the maximum number of records to read.
+///
+/// If `max_read_records` is not set, the whole file is read to infer its
schema.
+fn infer_file_schema(
+ mut file: File,
+ delimiter: u8,
+ max_read_records: Option<usize>,
+ has_headers: bool,
+) -> Result<Schema> {
+ let mut csv_reader = csv::ReaderBuilder::new()
+ .delimiter(delimiter)
+ .from_reader(BufReader::new(file.try_clone()?));
+
+ // get or create header names
+ // when has_headers is false, creates default column names with column_
prefix
+ let headers: Vec<String> = if has_headers {
+ let headers = &csv_reader.headers()?.clone();
+ headers.iter().map(|s| s.to_string()).collect()
+ } else {
+ let first_record_count = &csv_reader.headers()?.len();
+ (0..*first_record_count)
+ .map(|i| format!("column_{}", i + 1))
+ .into_iter()
+ .collect()
+ };
+
+ // save the csv reader position after reading headers
+ let position = csv_reader.position().clone();
+
+ let header_length = headers.len();
+ // keep track of inferred field types
+ let mut column_types: Vec<HashSet<DataType>> = vec![HashSet::new();
header_length];
+ // keep track of columns with nulls
+ let mut nulls: Vec<bool> = vec![false; header_length];
+
+ // return csv reader position to after headers
+ csv_reader.seek(position)?;
+
+ let mut fields = vec![];
+
+ for result in csv_reader
+ .into_records()
+ .take(max_read_records.unwrap_or(std::usize::MAX))
+ {
+ let record = result?;
+
+ for i in 0..header_length {
+ let string: Option<&str> = record.get(i);
+ match string {
+ Some(s) => {
+ if s == "" {
+ nulls[i] = true;
+ } else {
+ column_types[i].insert(infer_field_schema(s));
+ }
+ }
+ _ => {}
+ }
+ }
+ }
+
+ // build schema from inference results
+ for i in 0..header_length {
+ let possibilities = &column_types[i];
+ let has_nulls = nulls[i];
+ let field_name = &headers[i];
+
+ // determine data type based on possible types
+ // if there are incompatible types, use DataType::Utf8
+ match possibilities.len() {
+ 1 => {
+ for dtype in possibilities.iter() {
+ fields.push(Field::new(&field_name, dtype.clone(),
has_nulls));
+ }
+ }
+ 2 => {
+ if possibilities.contains(&DataType::Int64)
+ && possibilities.contains(&DataType::Float64)
+ {
+ // we have an integer and double, fall down to double
+ fields.push(Field::new(&field_name, DataType::Float64,
has_nulls));
+ } else {
+ // default to Utf8 for conflicting datatypes (e.g bool and
int)
+ fields.push(Field::new(&field_name, DataType::Utf8,
has_nulls));
+ }
+ }
+ _ => fields.push(Field::new(&field_name, DataType::Utf8,
has_nulls)),
+ }
+ }
+
+ // return the file seek back to the start
+ file.seek(SeekFrom::Start(0))?;
+
+ Ok(Schema::new(fields))
+}
+
/// CSV file reader
pub struct Reader {
/// Explicit schema for the CSV file
@@ -68,6 +195,8 @@ pub struct Reader {
impl Reader {
/// Create a new CsvReader
+ ///
+ /// To customise the Reader, such as to enable schema inference, use
`ReaderBuilder`
pub fn new(
file: File,
schema: Arc<Schema>,
@@ -78,10 +207,9 @@ impl Reader {
let csv_reader = csv::ReaderBuilder::new()
.has_headers(has_headers)
.from_reader(BufReader::new(file));
-
let record_iter = csv_reader.into_records();
Reader {
- schema: schema.clone(),
+ schema,
projection,
record_iter,
batch_size,
@@ -194,6 +322,141 @@ impl Reader {
}
}
+/// CSV file reader builder
+pub struct ReaderBuilder {
+ /// Optional schema for the CSV file
+ ///
+ /// If the schema is not supplied, the reader will try to infer the schema
+ /// based on the CSV structure.
+ schema: Option<Arc<Schema>>,
+ /// Whether the file has headers or not
+ ///
+ /// If schema inference is run on a file with no headers, default column
names
+ /// are created.
+ has_headers: bool,
+ /// An optional column delimiter. Defaults to `b','`
+ delimiter: Option<u8>,
+ /// Optional maximum number of records to read during schema inference
+ ///
+ /// If a number is not provided, all the records are read.
+ max_records: Option<usize>,
+ /// Batch size (number of records to load each time)
+ ///
+ /// The default batch size when using the `ReaderBuilder` is 1024 records
+ batch_size: usize,
+ /// Optional projection for which columns to load (zero-based column
indices)
+ projection: Option<Vec<usize>>,
+}
+
+impl Default for ReaderBuilder {
+ fn default() -> ReaderBuilder {
+ ReaderBuilder {
+ schema: None,
+ has_headers: false,
+ delimiter: None,
+ max_records: None,
+ batch_size: 1024,
+ projection: None,
+ }
+ }
+}
+
+impl ReaderBuilder {
+ /// Create a new builder for configuring CSV parsing options.
+ ///
+ /// To convert a builder into a reader, call `ReaderBuilder::build`
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// extern crate arrow;
+ ///
+ /// use arrow::csv;
+ /// use std::fs::File;
+ ///
+ /// fn example() -> csv::Reader {
+ /// let file =
File::open("test/data/uk_cities_with_headers.csv").unwrap();
+ ///
+ /// // create a builder, inferring the schema with the first 100
records
+ /// let builder = csv::ReaderBuilder::new().infer_schema(Some(100));
+ ///
+ /// let reader = builder.build(file).unwrap();
+ ///
+ /// reader
+ /// }
+ /// ```
+ pub fn new() -> ReaderBuilder {
+ ReaderBuilder::default()
+ }
+
+ /// Set the CSV file's schema
+ pub fn with_schema(mut self, schema: Arc<Schema>) -> Self {
+ self.schema = Some(schema);
+ self
+ }
+
+ /// Set whether the CSV file has headers
+ pub fn has_headers(mut self, has_headers: bool) -> Self {
+ self.has_headers = has_headers;
+ self
+ }
+
+ /// Set the CSV file's column delimiter as a byte character
+ pub fn with_delimiter(mut self, delimiter: u8) -> Self {
+ self.delimiter = Some(delimiter);
+ self
+ }
+
+ /// Set the CSV reader to infer the schema of the file
+ pub fn infer_schema(mut self, max_records: Option<usize>) -> Self {
+ // remove any schema that is set
+ self.schema = None;
+ self.max_records = max_records;
+ self
+ }
+
+ /// Set the batch size (number of records to load at one time)
+ pub fn with_batch_size(mut self, batch_size: usize) -> Self {
+ self.batch_size = batch_size;
+ self
+ }
+
+ /// Set the reader's column projection
+ pub fn with_projection(mut self, projection: Vec<usize>) -> Self {
+ self.projection = Some(projection);
+ self
+ }
+
+ /// Create a new `Reader` from the `ReaderBuilder`
+ pub fn build(self, file: File) -> Result<Reader> {
+ // check if schema should be inferred
+ let schema = match self.schema {
+ Some(schema) => schema,
+ None => {
+ let inferred_schema = infer_file_schema(
+ file.try_clone().unwrap(),
+ self.delimiter.unwrap_or(b','),
+ self.max_records,
+ self.has_headers,
+ )?;
+
+ Arc::new(inferred_schema)
+ }
+ };
+ let csv_reader = csv::ReaderBuilder::new()
+ .delimiter(self.delimiter.unwrap_or(b','))
+ .has_headers(self.has_headers)
+ .from_reader(BufReader::new(file));
+ let record_iter = csv_reader.into_records();
+ Ok(Reader {
+ schema,
+ projection: self.projection.clone(),
+ record_iter,
+ batch_size: self.batch_size,
+ })
+ }
+}
+
#[cfg(test)]
mod tests {
use super::*;
@@ -237,6 +500,75 @@ mod tests {
}
#[test]
+ fn test_csv_with_schema_inference() {
+ let file = File::open("test/data/uk_cities_with_headers.csv").unwrap();
+
+ let builder =
ReaderBuilder::new().has_headers(true).infer_schema(None);
+
+ let mut csv = builder.build(file).unwrap();
+ let batch = csv.next().unwrap().unwrap();
+ assert_eq!(37, batch.num_rows());
+ assert_eq!(3, batch.num_columns());
+
+ // access data from a primitive array
+ let lat = batch
+ .column(1)
+ .as_any()
+ .downcast_ref::<Float64Array>()
+ .unwrap();
+ assert_eq!(57.653484, lat.value(0));
+
+ // access data from a string array (ListArray<u8>)
+ let city = batch
+ .column(0)
+ .as_any()
+ .downcast_ref::<BinaryArray>()
+ .unwrap();
+
+ let city_name: String =
String::from_utf8(city.value(13).to_vec()).unwrap();
+
+ assert_eq!("Aberdeen, Aberdeen City, UK", city_name);
+ }
+
+ #[test]
+ fn test_csv_with_schema_inference_no_headers() {
+ let file = File::open("test/data/uk_cities.csv").unwrap();
+
+ let builder = ReaderBuilder::new().infer_schema(None);
+
+ let mut csv = builder.build(file).unwrap();
+ let batch = csv.next().unwrap().unwrap();
+
+ // csv field names should be 'column_{number}'
+ let schema = batch.schema();
+ assert_eq!("column_1", schema.field(0).name());
+ assert_eq!("column_2", schema.field(1).name());
+ assert_eq!("column_3", schema.field(2).name());
+
+ assert_eq!(37, batch.num_rows());
+ assert_eq!(3, batch.num_columns());
+
+ // access data from a primitive array
+ let lat = batch
+ .column(1)
+ .as_any()
+ .downcast_ref::<Float64Array>()
+ .unwrap();
+ assert_eq!(57.653484, lat.value(0));
+
+ // access data from a string array (ListArray<u8>)
+ let city = batch
+ .column(0)
+ .as_any()
+ .downcast_ref::<BinaryArray>()
+ .unwrap();
+
+ let city_name: String =
String::from_utf8(city.value(13).to_vec()).unwrap();
+
+ assert_eq!("Aberdeen, Aberdeen City, UK", city_name);
+ }
+
+ #[test]
fn test_csv_with_projection() {
let schema = Schema::new(vec![
Field::new("city", DataType::Utf8, false),
@@ -272,4 +604,39 @@ mod tests {
assert_eq!(false, batch.column(1).is_null(4));
}
+ #[test]
+ fn test_nulls_with_inference() {
+ let file = File::open("test/data/various_types.csv").unwrap();
+
+ let builder = ReaderBuilder::new()
+ .infer_schema(None)
+ .has_headers(true)
+ .with_delimiter(b'|')
+ .with_batch_size(512)
+ .with_projection(vec![0, 1, 2, 3]);
+
+ let mut csv = builder.build(file).unwrap();
+ let batch = csv.next().unwrap().unwrap();
+
+ assert_eq!(5, batch.num_rows());
+ assert_eq!(4, batch.num_columns());
+
+ let schema = batch.schema();
+
+ assert_eq!(&DataType::Int64, schema.field(0).data_type());
+ assert_eq!(&DataType::Float64, schema.field(1).data_type());
+ assert_eq!(&DataType::Float64, schema.field(2).data_type());
+ assert_eq!(&DataType::Boolean, schema.field(3).data_type());
+
+ assert_eq!(false, schema.field(0).is_nullable());
+ assert_eq!(true, schema.field(1).is_nullable());
+ assert_eq!(true, schema.field(2).is_nullable());
+ assert_eq!(false, schema.field(3).is_nullable());
+
+ assert_eq!(false, batch.column(1).is_null(0));
+ assert_eq!(false, batch.column(1).is_null(1));
+ assert_eq!(true, batch.column(1).is_null(2));
+ assert_eq!(false, batch.column(1).is_null(3));
+ assert_eq!(false, batch.column(1).is_null(4));
+ }
}
diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs
index 0627b45..05db6ce 100644
--- a/rust/arrow/src/datatypes.rs
+++ b/rust/arrow/src/datatypes.rs
@@ -42,7 +42,7 @@ use crate::error::{ArrowError, Result};
/// Nested types can themselves be nested within other arrays.
/// For more information on these types please see
/// [here](https://arrow.apache.org/docs/memory_layout.html).
-#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash)]
pub enum DataType {
Boolean,
Int8,
@@ -64,7 +64,7 @@ pub enum DataType {
/// Contains the meta-data for a single relative type.
///
/// The `Schema` object is an ordered collection of `Field` objects.
-#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash)]
pub struct Field {
name: String,
data_type: DataType,
diff --git a/rust/arrow/src/error.rs b/rust/arrow/src/error.rs
index 559b2d7..b75111f 100644
--- a/rust/arrow/src/error.rs
+++ b/rust/arrow/src/error.rs
@@ -15,12 +15,49 @@
// specific language governing permissions and limitations
// under the License.
+use std::error::Error;
+
+use csv as csv_crate;
+
#[derive(Debug, Clone, PartialEq)]
pub enum ArrowError {
MemoryError(String),
ParseError(String),
ComputeError(String),
DivideByZero,
+ CsvError(String),
+ IoError(String),
+}
+
+impl From<::std::io::Error> for ArrowError {
+ fn from(error: ::std::io::Error) -> Self {
+ ArrowError::IoError(error.description().to_string())
+ }
+}
+
+impl From<csv_crate::Error> for ArrowError {
+ fn from(error: csv_crate::Error) -> Self {
+ match error.kind() {
+ csv_crate::ErrorKind::Io(error) => {
+ ArrowError::CsvError(error.description().to_string())
+ }
+ csv_crate::ErrorKind::Utf8 {pos: _, err} => {
+ ArrowError::CsvError(format!("Encountered UTF-8 error while
reading CSV file: {:?}", err.description()))
+ }
+ csv_crate::ErrorKind::UnequalLengths {pos: _, expected_len, len}
=> {
+ ArrowError::CsvError(
+ format!(
+ "Encountered unequal lengths between records on CSV
file. Expected {} records, found {} records",
+ len,
+ expected_len
+ )
+ )
+ }
+ _ => {
+ ArrowError::CsvError("Error reading CSV file".to_string())
+ }
+ }
+ }
}
pub type Result<T> = ::std::result::Result<T, ArrowError>;
diff --git a/rust/arrow/test/data/uk_cities_with_headers.csv
b/rust/arrow/test/data/uk_cities_with_headers.csv
new file mode 100644
index 0000000..92f5a17
--- /dev/null
+++ b/rust/arrow/test/data/uk_cities_with_headers.csv
@@ -0,0 +1,38 @@
+city,lat,lng
+"Elgin, Scotland, the UK",57.653484,-3.335724
+"Stoke-on-Trent, Staffordshire, the UK",53.002666,-2.179404
+"Solihull, Birmingham, UK",52.412811,-1.778197
+"Cardiff, Cardiff county, UK",51.481583,-3.179090
+"Eastbourne, East Sussex, UK",50.768036,0.290472
+"Oxford, Oxfordshire, UK",51.752022,-1.257677
+"London, UK",51.509865,-0.118092
+"Swindon, Swindon, UK",51.568535,-1.772232
+"Gravesend, Kent, UK",51.441883,0.370759
+"Northampton, Northamptonshire, UK",52.240479,-0.902656
+"Rugby, Warwickshire, UK",52.370876,-1.265032
+"Sutton Coldfield, West Midlands, UK",52.570385,-1.824042
+"Harlow, Essex, UK",51.772938,0.102310
+"Aberdeen, Aberdeen City, UK",57.149651,-2.099075
+"Swansea, Swansea, UK",51.621441,-3.943646
+"Chesterfield, Derbyshire, UK",53.235046,-1.421629
+"Londonderry, Derry, UK",55.006763,-7.318268
+"Salisbury, Wiltshire, UK",51.068787,-1.794472
+"Weymouth, Dorset, UK",50.614429,-2.457621
+"Wolverhampton, West Midlands, UK",52.591370,-2.110748
+"Preston, Lancashire, UK",53.765762,-2.692337
+"Bournemouth, UK",50.720806,-1.904755
+"Doncaster, South Yorkshire, UK",53.522820,-1.128462
+"Ayr, South Ayrshire, UK",55.458565,-4.629179
+"Hastings, East Sussex, UK",50.854259,0.573453
+"Bedford, UK",52.136436,-0.460739
+"Basildon, Essex, UK",51.572376,0.470009
+"Chippenham, Wiltshire, UK",51.458057,-2.116074
+"Belfast, UK",54.607868,-5.926437
+"Uckfield, East Sussex, UK",50.967941,0.085831
+"Worthing, West Sussex, UK",50.825024,-0.383835
+"Leeds, West Yorkshire, UK",53.801277,-1.548567
+"Kendal, Cumbria, UK",54.328506,-2.743870
+"Plymouth, UK",50.376289,-4.143841
+"Haverhill, Suffolk, UK",52.080875,0.444517
+"Frankton, Warwickshire, UK",52.328415,-1.377561
+"Inverness, the UK",57.477772,-4.224721
\ No newline at end of file
diff --git a/rust/arrow/test/data/various_types.csv
b/rust/arrow/test/data/various_types.csv
new file mode 100644
index 0000000..322d9c3
--- /dev/null
+++ b/rust/arrow/test/data/various_types.csv
@@ -0,0 +1,6 @@
+c_int|c_float|c_string|c_bool
+1|1.1|"1.11"|true
+2|2.2|"2.22"|true
+3||"3.33"|true
+4|4.4||false
+5|6.6|""|false
\ No newline at end of file