Hello, Manuel,
Recently, I needed to parse some rather large ( 1+ GB) csv files and found the 
performance of the csv library lacking. The attached patch addresses the 
performance issues and simplifies the implementation. The api remains the same 
but I have changed the library so that it preserves leading and trailing 
whitespace for fields. The prior version stripped these, but I now believe that 
the parser should not do so. I have also updated the gflu example and recette 
so that they function correctly.
Almost forgot, the patch is against bigloo4.3b-alpha12Jul17. 
 
Best Regards,Joseph Donaldson
diff -ur bigloo4.3b/api/csv/examples/gflu.scm bigloo4.3b_mod/api/csv/examples/gflu.scm
--- bigloo4.3b/api/csv/examples/gflu.scm	2017-07-10 00:53:04.000000000 -0700
+++ bigloo4.3b_mod/api/csv/examples/gflu.scm	2017-07-12 15:15:42.734033699 -0700
@@ -20,7 +20,7 @@
 ;*    gflu-base-url ...                                                */
 ;*---------------------------------------------------------------------*/
 (define gflu-base-url 
-   "http://www.google.org/flutrends";)
+   "http://www.google.org/flutrends/about/data/flu";)
 
 ;*---------------------------------------------------------------------*/
 ;*    main ...                                                         */
diff -ur bigloo4.3b/api/csv/recette/recette.scm bigloo4.3b_mod/api/csv/recette/recette.scm
--- bigloo4.3b/api/csv/recette/recette.scm	2017-07-10 00:53:04.000000000 -0700
+++ bigloo4.3b_mod/api/csv/recette/recette.scm	2017-07-12 16:01:40.908898361 -0700
@@ -212,11 +212,11 @@
 	  (in (open-input-string test-string)))
       (unwind-protect
 	 (read-csv-records in +psv-lexer+)
-	 (close-input-port in)))
+       	 (close-input-port in)))
    :result (lambda (v)
 	      (if (eq? v 'result)
 		  '(("dog" "cat" "horse" "pig")
-		    ("pig" "horse" "cat" "dog"))
+		    ("pig" "horse" "cat" "dog")) 
 		  (every csv-record=? v '(("dog" "cat" "horse" "pig")
 					  ("pig" "horse" "cat" "dog"))))))
 
@@ -344,8 +344,8 @@
 	 (close-input-port in)))
    :result (lambda (v)
 	      (if (eq? v 'result)
-		  '("dog" "cat")
-		  (csv-record=? v '("dog" "cat")))))
+		  '("dog" " cat")
+		  (csv-record=? v '("dog" " cat")))))
 
 
 (define-test space-after-quote-before-sep
@@ -356,8 +356,8 @@
 	 (close-input-port in)))
    :result (lambda (v)
 	      (if (eq? v 'result)
-		  '("dog" "cat")
-		  (csv-record=? v '("dog" "cat")))))
+		  '("dog " "cat")
+		  (csv-record=? v '("dog " "cat")))))
 
 (define-test quotes-with-spaces-around-sep
    (let* ((test-string "\"dog\" , \"cat\"")
@@ -367,8 +367,8 @@
 	 (close-input-port in)))
    :result (lambda (v)
 	      (if (eq? v 'result)
-		  '("dog" "cat")
-		  (csv-record=? v '("dog" "cat")))))
+		  '("dog " " cat")
+		  (csv-record=? v '("dog " " cat")))))
 
 
 
diff -ur bigloo4.3b/api/csv/src/Llib/csv.sch bigloo4.3b_mod/api/csv/src/Llib/csv.sch
--- bigloo4.3b/api/csv/src/Llib/csv.sch	2017-07-10 00:53:04.000000000 -0700
+++ bigloo4.3b_mod/api/csv/src/Llib/csv.sch	2017-07-12 15:10:43.085678003 -0700
@@ -13,50 +13,38 @@
 ;*    make-csv-lexer                                                   */
 ;*---------------------------------------------------------------------*/
 (define-macro (make-csv-lexer sep quot)
-   (if (and (char? sep) (char? quot))
-       `(lambda (in-quote?)
-	   (regular-grammar ((quote ,quot)
-			     (separator ,sep))
-	      ((when in-quote?
-		  (: quote quote))
-	       (cons '2quote (string ,quot)))
-	      (quote 
-		 (begin
-		    (set! in-quote? (not in-quote?))
-		    (cons 'kwote (the-string))))
-	      ,(cond ((and (or (char=? sep #\space)
-			      (char=? quot #\space))
-			  (or (char=? sep #\tab)
-			      (char=? quot #\tab))) 
-		     `(define ,(gensym 'dummy) #unspecified))
-	       ((or (char=? sep #\space)
-		    (char=? quot #\space))
-		'((when (not in-quote?) (+ #\tab))
-		  (cons 'space (the-string))))
-	       ((or (char=? sep #\tab)
-		    (char=? quot #\tab))
-		'((when (not in-quote?) (+ #\space))
-		  (cons 'space (the-string))))
-	       (else
-		'((when (not in-quote?) (+ (or #\space #\tab)))
-		  (cons 'space (the-string)))))
-	      (separator
-		 'separator)
-	      ((or (: #\return #\newline)
-		   #\newline)
-	       'newline)
-	      ((when (not in-quote?)
-		  (+ (out quote separator #\return #\newline)))
-	       (cons 'text (the-string)))
-	      ((when in-quote?
-		  (+ (out quote)))
-	       (cons 'text (the-string)))
-	      (else 
-	       (let ((c (the-failure)))
-		  (set! in-quote? #f)
-		  (if (eof-object? c)
-		      c
-		      (error "csv-lexer" "Illegal character" c))))))
-       (error "csv-lexer"
-	  "separator and quote must be a single character"
-	  (list sep quot))))
+   (if (and (char? sep)
+	    (char? quot))
+       `(regular-grammar ((quote ,quot)
+			  (separator ,sep))
+	   (quote
+	    (let loop ((curr (read-char (the-port)))
+		       (res ""))
+	       (cond ((eof-object? curr)
+		      (raise (instantiate::&io-parse-error (proc "csv lexer")
+							   (msg "failed to parse")
+							   (obj curr))))
+		     ((and (char=? curr ,quot)
+			   (not (eof-object? (peek-char (the-port))))
+			   (char=? (peek-char (the-port)) ,quot))
+		      (read-char (the-port))
+		      (loop (read-char (the-port))
+			 (string-append res (string ,quot))))
+		     ((char=? curr ,quot)
+		      (cons 'text res))
+		     (else
+		      (loop (read-char (the-port))
+			    (string-append res (string curr)))))))
+	   (separator
+	    'separator)
+	   ((or (: #\return #\newline)
+		#\newline)
+	    'newline)
+	   ((+ (out quote separator #\return #\newline))
+	    (cons 'text (the-string)))
+	   (else 
+	    (let ((c (the-failure)))
+	       (if (eof-object? c)
+		   c
+		   (error 'csv-lexer "Illegal character" c)))))
+       (error 'csv-lexer "separator and quote must be a single character" (list sep quot))))
\ No newline at end of file
diff -ur bigloo4.3b/api/csv/src/Llib/csv.scm bigloo4.3b_mod/api/csv/src/Llib/csv.scm
--- bigloo4.3b/api/csv/src/Llib/csv.scm	2017-07-10 00:53:04.000000000 -0700
+++ bigloo4.3b_mod/api/csv/src/Llib/csv.scm	2017-07-12 15:07:44.209992907 -0700
@@ -39,8 +39,7 @@
 ;*    +csv-parser+ ...                                                 */
 ;*---------------------------------------------------------------------*/
 (define +csv-parser+
-   (lalr-grammar (kwote 2quote space separator newline text)
-      ;;; production rules		 
+   (lalr-grammar (separator text)
       (fields
 	 ((field)
 	  (list field))
@@ -50,60 +49,8 @@
       (field
 	 (()
 	  "")
-	 ((spaces)
-	   spaces)
-	 ((possible-space@a text possible-space@b)
-	  (string-append a text b))
-	 ((possible-space@a escaped possible-space@b)
-	  escaped))
-
-      (spaces
-	 ((space)
-	  space)
-	 ((spaces space)
-	  (string-append spaces space)))
-      
-      (possible-space
-	 (()
-	  "")
-	 ((space)
-	  space))
-      
-      (escaped
-	 ((kwote kwote)
-	  "")
-	 ((kwote edata kwote)
-	  edata))
-      
-      ; (escaped
-      ; 	 ((possible-space+kwote kwote+possible-space)
-      ; 	  "")
-      ; 	 ((possible-space+kwote edata kwote+possible-space)
-      ; 	  edata))
-      
-      ; (possible-space+kwote
-      ; 	 ((kwote)
-      ; 	  kwote)
-      ; 	 ((space kwote)
-      ; 	  kwote))
-      ; (kwote+possible-space
-      ; 	 ((kwote)
-      ; 	  kwote)
-      ; 	 ((kwote space)
-      ; 	  kwote))
-      
-      
-      (edata
-	 ((edatum)
-	  edatum)
-	 ((edatum edata)
-	  (string-append edatum edata)))
-      
-      (edatum
-	 ((text)
-	  text)
-	 ((2quote)
-	  2quote))))
+	 ((text field)
+	  (string-append text field)))))
 		 
 ;*---------------------------------------------------------------------*/
 ;*    read-csv-record ...                                              */
@@ -113,7 +60,7 @@
        (let ((pc (peek-char in)))
 	  (if (eof-object? pc)
 	      pc
-	      (read/lalrp +csv-parser+ (lexer #f) in
+	      (read/lalrp +csv-parser+ lexer in
 		 (lambda (x) (or (eof-object? x) (eq? x 'newline))))))
        (raise
 	  (instantiate::&io-port-error

Reply via email to