(arrow-rs) branch master updated: Allow specifying comment character for CSV reader (#5759)

tustvold Mon, 13 May 2024 03:20:39 -0700

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git



The following commit(s) were added to refs/heads/master by this push:
     new 6ab67df984d Allow specifying comment character for CSV reader (#5759)
6ab67df984d is described below

commit 6ab67df984d358b7bad83a675dd6a2c1c2965155
Author: Benjamin Bannier <[email protected]>
AuthorDate: Mon May 13 12:19:11 2024 +0200

    Allow specifying comment character for CSV reader (#5759)
    
    This patch adds reader support for a comment character when reading CSV
    files. While comments, like almost everything else around the CSV format,
    are not truly standardized, a common convention supported by many CSV
    readers[^1][^2] is to ignore full lines starting with a comment
    character (often `#`); inline or end-of-line comments are not supported.
    
    Example:
    
        # This is a comment in a CSV file without header.
        1,2
        # Comment inside the data block.
        11,22
    
    The implementation of this for Arrow is pretty straightforward, as all
    we need to do is expose the existing `comment` option of `csv_core` used
    to read CSV files.
    
    Closes #5758.
    
    [^1]: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
    [^2]: https://stat.ethz.ch/R-manual/R-devel/library/utils/html/read.table.html
---
 arrow-csv/src/reader/mod.rs | 51 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs
index 09087ca3195..9721349b018 100644
--- a/arrow-csv/src/reader/mod.rs
+++ b/arrow-csv/src/reader/mod.rs
@@ -230,6 +230,7 @@ pub struct Format {
     escape: Option<u8>,
     quote: Option<u8>,
     terminator: Option<u8>,
+    comment: Option<u8>,
     null_regex: NullRegex,
     truncated_rows: bool,
 }
@@ -260,6 +261,11 @@ impl Format {
         self
     }
 
+    pub fn with_comment(mut self, comment: u8) -> Self {
+        self.comment = Some(comment);
+        self
+    }
+
     /// Provide a regex to match null values, defaults to `^$`
     pub fn with_null_regex(mut self, null_regex: Regex) -> Self {
         self.null_regex = NullRegex(Some(null_regex));
@@ -353,6 +359,9 @@ impl Format {
         if let Some(t) = self.terminator {
             builder.terminator(csv::Terminator::Any(t));
         }
+        if let Some(comment) = self.comment {
+            builder.comment(Some(comment));
+        }
         builder.from_reader(reader)
     }
 
@@ -360,6 +369,7 @@ impl Format {
     fn build_parser(&self) -> csv_core::Reader {
         let mut builder = csv_core::ReaderBuilder::new();
         builder.escape(self.escape);
+        builder.comment(self.comment);
 
         if let Some(c) = self.delimiter {
             builder.delimiter(c);
@@ -1109,6 +1119,11 @@ impl ReaderBuilder {
         self
     }
 
+    pub fn with_comment(mut self, comment: u8) -> Self {
+        self.format.comment = Some(comment);
+        self
+    }
+
     /// Provide a regex to match null values, defaults to `^$`
     pub fn with_null_regex(mut self, null_regex: Regex) -> Self {
         self.format.null_regex = NullRegex(Some(null_regex));
@@ -2536,4 +2551,40 @@ mod tests {
             assert_eq!(&t.get(), expected, "{values:?}")
         }
     }
+
+    #[test]
+    fn test_comment() {
+        let schema = Schema::new(vec![
+            Field::new("a", DataType::Int8, false),
+            Field::new("b", DataType::Int8, false),
+        ]);
+
+        let csv = "# comment1 \n1,2\n#comment2\n11,22";
+        let mut read = Cursor::new(csv.as_bytes());
+        let reader = ReaderBuilder::new(Arc::new(schema))
+            .with_comment(b'#')
+            .build(&mut read)
+            .unwrap();
+
+        let batches = reader.collect::<Result<Vec<_>, _>>().unwrap();
+        assert_eq!(batches.len(), 1);
+        let b = batches.first().unwrap();
+        assert_eq!(b.num_columns(), 2);
+        assert_eq!(
+            b.column(0)
+                .as_any()
+                .downcast_ref::<Int8Array>()
+                .unwrap()
+                .values(),
+            &vec![1, 11]
+        );
+        assert_eq!(
+            b.column(1)
+                .as_any()
+                .downcast_ref::<Int8Array>()
+                .unwrap()
+                .values(),
+            &vec![2, 22]
+        );
+    }
 }

(arrow-rs) branch master updated: Allow specifying comment character for CSV reader (#5759)

Reply via email to