This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 6ab67df984d Allow specifying comment character for CSV reader (#5759)
6ab67df984d is described below
commit 6ab67df984d358b7bad83a675dd6a2c1c2965155
Author: Benjamin Bannier <[email protected]>
AuthorDate: Mon May 13 12:19:11 2024 +0200
Allow specifying comment character for CSV reader (#5759)
This patch adds reader support for a comment character for reading CSV
files. While comments, like almost everything else about the CSV format,
are not truly standardized, a common format supported by many CSV
readers[^1][^2] is to ignore full lines starting with a comment
character (often `#`); inline or end-of-line comments are not supported.
Example:
# This is a comment in a CSV file without header.
1,2
# Comment inside the data block.
11,22
The implementation of this for Arrow is pretty straightforward, as all
we need to do is expose the existing `comment` option of `csv_core` used
to read CSV files.
Closes #5758.
[^1]:
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
[^2]:
https://stat.ethz.ch/R-manual/R-devel/library/utils/html/read.table.html
---
arrow-csv/src/reader/mod.rs | 51 +++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 51 insertions(+)
diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs
index 09087ca3195..9721349b018 100644
--- a/arrow-csv/src/reader/mod.rs
+++ b/arrow-csv/src/reader/mod.rs
@@ -230,6 +230,7 @@ pub struct Format {
escape: Option<u8>,
quote: Option<u8>,
terminator: Option<u8>,
+ comment: Option<u8>,
null_regex: NullRegex,
truncated_rows: bool,
}
@@ -260,6 +261,11 @@ impl Format {
self
}
+ pub fn with_comment(mut self, comment: u8) -> Self {
+ self.comment = Some(comment);
+ self
+ }
+
/// Provide a regex to match null values, defaults to `^$`
pub fn with_null_regex(mut self, null_regex: Regex) -> Self {
self.null_regex = NullRegex(Some(null_regex));
@@ -353,6 +359,9 @@ impl Format {
if let Some(t) = self.terminator {
builder.terminator(csv::Terminator::Any(t));
}
+ if let Some(comment) = self.comment {
+ builder.comment(Some(comment));
+ }
builder.from_reader(reader)
}
@@ -360,6 +369,7 @@ impl Format {
fn build_parser(&self) -> csv_core::Reader {
let mut builder = csv_core::ReaderBuilder::new();
builder.escape(self.escape);
+ builder.comment(self.comment);
if let Some(c) = self.delimiter {
builder.delimiter(c);
@@ -1109,6 +1119,11 @@ impl ReaderBuilder {
self
}
+ pub fn with_comment(mut self, comment: u8) -> Self {
+ self.format.comment = Some(comment);
+ self
+ }
+
/// Provide a regex to match null values, defaults to `^$`
pub fn with_null_regex(mut self, null_regex: Regex) -> Self {
self.format.null_regex = NullRegex(Some(null_regex));
@@ -2536,4 +2551,40 @@ mod tests {
assert_eq!(&t.get(), expected, "{values:?}")
}
}
+
+ #[test]
+ fn test_comment() {
+ let schema = Schema::new(vec![
+ Field::new("a", DataType::Int8, false),
+ Field::new("b", DataType::Int8, false),
+ ]);
+
+ let csv = "# comment1 \n1,2\n#comment2\n11,22";
+ let mut read = Cursor::new(csv.as_bytes());
+ let reader = ReaderBuilder::new(Arc::new(schema))
+ .with_comment(b'#')
+ .build(&mut read)
+ .unwrap();
+
+ let batches = reader.collect::<Result<Vec<_>, _>>().unwrap();
+ assert_eq!(batches.len(), 1);
+ let b = batches.first().unwrap();
+ assert_eq!(b.num_columns(), 2);
+ assert_eq!(
+ b.column(0)
+ .as_any()
+ .downcast_ref::<Int8Array>()
+ .unwrap()
+ .values(),
+ &vec![1, 11]
+ );
+ assert_eq!(
+ b.column(1)
+ .as_any()
+ .downcast_ref::<Int8Array>()
+ .unwrap()
+ .values(),
+ &vec![2, 22]
+ );
+ }
}