Hello, Manuel, Recently, I needed to parse some rather large ( 1+ GB) csv files and found the performance of the csv library lacking. The attached patch addresses the performance issues and simplifies the implementation. The api remains the same but I have changed the library so that it preserves leading and trailing whitespace for fields. The prior version stripped these, but I now believe that the parser should not do so. I have also updated the gflu example and recette so that they function correctly. Almost forgot, the patch is against bigloo4.3b-alpha12Jul17. Best Regards,Joseph Donaldson
diff -ur bigloo4.3b/api/csv/examples/gflu.scm bigloo4.3b_mod/api/csv/examples/gflu.scm --- bigloo4.3b/api/csv/examples/gflu.scm 2017-07-10 00:53:04.000000000 -0700 +++ bigloo4.3b_mod/api/csv/examples/gflu.scm 2017-07-12 15:15:42.734033699 -0700 @@ -20,7 +20,7 @@ ;* gflu-base-url ... */ ;*---------------------------------------------------------------------*/ (define gflu-base-url - "http://www.google.org/flutrends") + "http://www.google.org/flutrends/about/data/flu") ;*---------------------------------------------------------------------*/ ;* main ... */ diff -ur bigloo4.3b/api/csv/recette/recette.scm bigloo4.3b_mod/api/csv/recette/recette.scm --- bigloo4.3b/api/csv/recette/recette.scm 2017-07-10 00:53:04.000000000 -0700 +++ bigloo4.3b_mod/api/csv/recette/recette.scm 2017-07-12 16:01:40.908898361 -0700 @@ -212,11 +212,11 @@ (in (open-input-string test-string))) (unwind-protect (read-csv-records in +psv-lexer+) - (close-input-port in))) + (close-input-port in))) :result (lambda (v) (if (eq? v 'result) '(("dog" "cat" "horse" "pig") - ("pig" "horse" "cat" "dog")) + ("pig" "horse" "cat" "dog")) (every csv-record=? v '(("dog" "cat" "horse" "pig") ("pig" "horse" "cat" "dog")))))) @@ -344,8 +344,8 @@ (close-input-port in))) :result (lambda (v) (if (eq? v 'result) - '("dog" "cat") - (csv-record=? v '("dog" "cat"))))) + '("dog" " cat") + (csv-record=? v '("dog" " cat"))))) (define-test space-after-quote-before-sep @@ -356,8 +356,8 @@ (close-input-port in))) :result (lambda (v) (if (eq? v 'result) - '("dog" "cat") - (csv-record=? v '("dog" "cat"))))) + '("dog " "cat") + (csv-record=? v '("dog " "cat"))))) (define-test quotes-with-spaces-around-sep (let* ((test-string "\"dog\" , \"cat\"") @@ -367,8 +367,8 @@ (close-input-port in))) :result (lambda (v) (if (eq? v 'result) - '("dog" "cat") - (csv-record=? v '("dog" "cat"))))) + '("dog " " cat") + (csv-record=? v '("dog " " cat"))))) diff -ur bigloo4.3b/api/csv/src/Llib/csv.sch bigloo4.3b_mod/api/csv/src/Llib/csv.sch --- bigloo4.3b/api/csv/src/Llib/csv.sch 2017-07-10 00:53:04.000000000 -0700 +++ bigloo4.3b_mod/api/csv/src/Llib/csv.sch 2017-07-12 15:10:43.085678003 -0700 @@ -13,50 +13,38 @@ ;* make-csv-lexer */ ;*---------------------------------------------------------------------*/ (define-macro (make-csv-lexer sep quot) - (if (and (char? sep) (char? quot)) - `(lambda (in-quote?) - (regular-grammar ((quote ,quot) - (separator ,sep)) - ((when in-quote? - (: quote quote)) - (cons '2quote (string ,quot))) - (quote - (begin - (set! in-quote? (not in-quote?)) - (cons 'kwote (the-string)))) - ,(cond ((and (or (char=? sep #\space) - (char=? quot #\space)) - (or (char=? sep #\tab) - (char=? quot #\tab))) - `(define ,(gensym 'dummy) #unspecified)) - ((or (char=? sep #\space) - (char=? quot #\space)) - '((when (not in-quote?) (+ #\tab)) - (cons 'space (the-string)))) - ((or (char=? sep #\tab) - (char=? quot #\tab)) - '((when (not in-quote?) (+ #\space)) - (cons 'space (the-string)))) - (else - '((when (not in-quote?) (+ (or #\space #\tab))) - (cons 'space (the-string))))) - (separator - 'separator) - ((or (: #\return #\newline) - #\newline) - 'newline) - ((when (not in-quote?) - (+ (out quote separator #\return #\newline))) - (cons 'text (the-string))) - ((when in-quote? - (+ (out quote))) - (cons 'text (the-string))) - (else - (let ((c (the-failure))) - (set! in-quote? #f) - (if (eof-object? c) - c - (error "csv-lexer" "Illegal character" c)))))) - (error "csv-lexer" - "separator and quote must be a single character" - (list sep quot)))) + (if (and (char? sep) + (char? quot)) + `(regular-grammar ((quote ,quot) + (separator ,sep)) + (quote + (let loop ((curr (read-char (the-port))) + (res "")) + (cond ((eof-object? curr) + (raise (instantiate::&io-parse-error (proc "csv lexer") + (msg "failed to parse") + (obj curr)))) + ((and (char=? curr ,quot) + (not (eof-object? (peek-char (the-port)))) + (char=? (peek-char (the-port)) ,quot)) + (read-char (the-port)) + (loop (read-char (the-port)) + (string-append res (string ,quot)))) + ((char=? curr ,quot) + (cons 'text res)) + (else + (loop (read-char (the-port)) + (string-append res (string curr))))))) + (separator + 'separator) + ((or (: #\return #\newline) + #\newline) + 'newline) + ((+ (out quote separator #\return #\newline)) + (cons 'text (the-string))) + (else + (let ((c (the-failure))) + (if (eof-object? c) + c + (error 'csv-lexer "Illegal character" c))))) + (error 'csv-lexer "separator and quote must be a single character" (list sep quot)))) \ No newline at end of file diff -ur bigloo4.3b/api/csv/src/Llib/csv.scm bigloo4.3b_mod/api/csv/src/Llib/csv.scm --- bigloo4.3b/api/csv/src/Llib/csv.scm 2017-07-10 00:53:04.000000000 -0700 +++ bigloo4.3b_mod/api/csv/src/Llib/csv.scm 2017-07-12 15:07:44.209992907 -0700 @@ -39,8 +39,7 @@ ;* +csv-parser+ ... */ ;*---------------------------------------------------------------------*/ (define +csv-parser+ - (lalr-grammar (kwote 2quote space separator newline text) - ;;; production rules + (lalr-grammar (separator text) (fields ((field) (list field)) @@ -50,60 +49,8 @@ (field (() "") - ((spaces) - spaces) - ((possible-space@a text possible-space@b) - (string-append a text b)) - ((possible-space@a escaped possible-space@b) - escaped)) - - (spaces - ((space) - space) - ((spaces space) - (string-append spaces space))) - - (possible-space - (() - "") - ((space) - space)) - - (escaped - ((kwote kwote) - "") - ((kwote edata kwote) - edata)) - - ; (escaped - ; ((possible-space+kwote kwote+possible-space) - ; "") - ; ((possible-space+kwote edata kwote+possible-space) - ; edata)) - - ; (possible-space+kwote - ; ((kwote) - ; kwote) - ; ((space kwote) - ; kwote)) - ; (kwote+possible-space - ; ((kwote) - ; kwote) - ; ((kwote space) - ; kwote)) - - - (edata - ((edatum) - edatum) - ((edatum edata) - (string-append edatum edata))) - - (edatum - ((text) - text) - ((2quote) - 2quote)))) + ((text field) + (string-append text field))))) ;*---------------------------------------------------------------------*/ ;* read-csv-record ... */ @@ -113,7 +60,7 @@ (let ((pc (peek-char in))) (if (eof-object? pc) pc - (read/lalrp +csv-parser+ (lexer #f) in + (read/lalrp +csv-parser+ lexer in (lambda (x) (or (eof-object? x) (eq? x 'newline)))))) (raise (instantiate::&io-port-error
