From: Rutger van Beusekom <rut...@dezyne.org> * module/ice-9/peg/codegen.scm: (%peg:locations?, %peg:skip?): New exported parameters. (wrap-parser-for-users): Use them to enable skip parsing and switch having locations on comments and whitespace. * test-suite/tests/peg.test ("Skip parser"): Test it. * doc/ref/api-peg.texi (Whitespace and comments): Document it.
Co-authored-by: Janneke Nieuwenhuizen <jann...@gnu.org> --- doc/ref/api-peg.texi | 44 +++++++++++++++++++++ module/ice-9/peg.scm | 3 ++ module/ice-9/peg/codegen.scm | 74 ++++++++++++++++++++++++------------ test-suite/tests/peg.test | 47 +++++++++++++++++++++++ 4 files changed, 143 insertions(+), 25 deletions(-) diff --git a/doc/ref/api-peg.texi b/doc/ref/api-peg.texi index df2e74d05..733cb1c6d 100644 --- a/doc/ref/api-peg.texi +++ b/doc/ref/api-peg.texi @@ -1062,6 +1062,50 @@ current state of the input, as well as the parse result. (and=> (match-pattern grammar input-text) peg:tree)) @end lisp +@subsubheading Whitespace and comments + +To write a PEG parser for a whitespace invariant language or a language +which includes line and block comments requires littering the grammar +with whitespace or comment parser expressions, which not only violates +the DRY principle, but is hard to get right. + +For example, to parse a C-like language one would define these +whitespace and comment parsers + +@lisp +(define-skip-parser peg-eof none (not-followed-by peg-any)) +(define-skip-parser peg-eol none (or "\f" "\n" "\r" "\v")) +(define-skip-parser peg-ws none (or " " "\t")) +(define-skip-parser peg-line all + (and "//" (* (and (not-followed-by peg-eol) peg-any)) + (expect (or "\n" "\r\n" peg-eof)))) +(define-skip-parser peg-block-strict all + (and "/*" + (* (or peg-block (and (not-followed-by "*/") peg-any))) + (expect "*/"))) +(define-skip-parser peg-skip all + (* (or peg-ws peg-eol peg-line peg-block-strict))) +(define-skip-parser peg-block all + (and "/*" (* (or peg-block (and (not-followed-by "*/") peg-any))) + (or "*/" peg-eof))) +@end lisp + +When setting @var{%peg:skip?} to @code{peg-skip}, whitespace and +comments are silently skipped. + +@lisp +(parameterize ((%peg:skip? peg-skip)) + (and=> (match-pattern grammar input-text) peg:tree)) +@end lisp + +If you want to preserve locations and comments, set +@var{%peg:locations?} to @code{#t}. +@lisp +(parameterize ((%peg:skip? peg-skip) + (%peg:locations? #t)) + (and=> (match-pattern grammar input-text) peg:tree)) +@end lisp + @subsubheading Expect parsing The best thing about PEG is its backtracking nature making it diff --git a/module/ice-9/peg.scm b/module/ice-9/peg.scm index 499c3820c..fd9dce54c 100644 --- a/module/ice-9/peg.scm +++ b/module/ice-9/peg.scm @@ -28,7 +28,10 @@ #:use-module (ice-9 peg cache) #:re-export (define-peg-pattern define-peg-string-patterns + define-skip-parser %peg:debug? + %peg:locations? + %peg:skip? match-pattern search-for-pattern compile-peg-pattern diff --git a/module/ice-9/peg/codegen.scm b/module/ice-9/peg/codegen.scm index dd24bdac0..458a7e3ab 100644 --- a/module/ice-9/peg/codegen.scm +++ b/module/ice-9/peg/codegen.scm @@ -21,7 +21,10 @@ #:export (compile-peg-pattern wrap-parser-for-users add-peg-compiler! - %peg:debug?) + define-skip-parser + %peg:debug? + %peg:locations? + %peg:skip?) #:use-module (ice-9 pretty-print) #:use-module (system base pmatch)) @@ -349,6 +352,9 @@ return EXP." ;; Packages the results of a parser (define %peg:debug? (make-parameter #f)) +(define %peg:locations? (make-parameter #f)) +(define %peg:skip? (make-parameter (lambda (str strlen at) `(,at ())))) + (define (trace? symbol) (%peg:debug?)) @@ -361,7 +367,11 @@ return EXP." (make-string indent #\space) '#,s-syn)) (set! indent (+ indent 4)) - (let ((res (#,parser str strlen at))) + (let* ((comment-res ((%peg:skip?) str strlen at)) + (comment-loc (and (%peg:locations?) comment-res + `(location ,at ,(car comment-res)))) + (at (or (and comment-res (car comment-res)) at)) + (res (#,parser str strlen at))) (set! indent (- indent 4)) (let ((pos (or (and res (car res)) 0))) (when (and (trace? '#,s-syn) (< at pos)) @@ -369,26 +379,40 @@ return EXP." (make-string indent #\space) '#,s-syn (substring str at pos) - (substring str pos (min strlen (+ pos 10))))) - ;; Try to match the nonterminal. - (if res - ;; If we matched, do some post-processing to figure out - ;; what data to propagate upward. - (let ((at (car res)) - (body (cadr res))) - #,(cond - ((eq? accumsym 'name) - #`(list at '#,s-syn)) - ((eq? accumsym 'all) - #`(list (car res) - (cond - ((not (list? body)) - (list '#,s-syn body)) - ((null? body) '#,s-syn) - ((symbol? (car body)) - (list '#,s-syn body)) - (else (cons '#,s-syn body))))) - ((eq? accumsym 'none) #`(list (car res) '())) - (else #`(begin res)))) - ;; If we didn't match, just return false. - #f))))) + (substring str pos (min strlen (+ pos 10)))))) + ;; Try to match the nonterminal. + (if res + ;; If we matched, do some post-processing to figure out + ;; what data to propagate upward. + (let* ((body (cadr res)) + (loc `(location ,at ,(car res))) + (annotate (if (not (%peg:locations?)) '() + (if (null? (cadr comment-res)) `(,loc) + `((comment ,(cdr comment-res) ,comment-loc) + ,loc)))) + (at (car res))) + #,(cond + ((eq? accumsym 'name) + #`(list at '#,s-syn ,@annotate)) + ((eq? accumsym 'all) + #`(list (car res) + (cond + ((not (list? body)) + `(,'#,s-syn ,body ,@annotate)) + ((null? body) + `(,'#,s-syn ,@annotate)) + ((symbol? (car body)) + `(,'#,s-syn ,body ,@annotate)) + (else + (cons '#,s-syn (append body annotate)))))) + ((eq? accumsym 'none) #``(,at () ,@annotate)) + (else #``(,at ,body ,@annotate)))) + ;; If we didn't match, just return false. + #f)))) + +(define-syntax define-skip-parser + (lambda (x) + (syntax-case x () + ((_ sym accum pat) + (let* ((matchf (compile-peg-pattern #'pat (syntax->datum #'accum)))) + #`(define sym #,matchf)))))) diff --git a/test-suite/tests/peg.test b/test-suite/tests/peg.test index b3586c891..4f267f561 100644 --- a/test-suite/tests/peg.test +++ b/test-suite/tests/peg.test @@ -332,3 +332,50 @@ three <-- 'three'" (and=> (match-pattern expect-grammar "onethree") peg:tree)) (lambda args args)))) + +(define program-text " +/* + CopyLeft (L) Acme +*/ +foo // the first +bar + bar +baz +") + +(define-skip-parser peg-eof none (not-followed-by peg-any)) +(define-skip-parser peg-eol none (or "\f" "\n" "\r" "\v")) +(define-skip-parser peg-ws none (or " " "\t")) +(define-skip-parser peg-line all + (and "//" (* (and (not-followed-by peg-eol) peg-any)) + (expect (or "\n" "\r\n" peg-eof)))) +(define-skip-parser peg-block all + (and "/*" (* (or peg-block (and (not-followed-by "*/") peg-any))) + (or "*/" peg-eof))) +(define-skip-parser peg-block-strict all + (and "/*" + (* (or peg-block (and (not-followed-by "*/") peg-any))) + (expect "*/"))) +(define-skip-parser peg-skip all + (* (or peg-ws peg-eol peg-line peg-block-strict))) + +(with-test-prefix "Skip parser" + (pass-if-equal "skip comments and whitespace" + '(trace-grammar (foo "foo") (bar (bla "bar") (bla "bar")) (baz "baz")) + (and=> (parameterize ((%peg:skip? peg-skip)) + (match-pattern trace-grammar program-text)) + peg:tree)) + (pass-if-equal "preserve comments and whitespace" + '(trace-grammar (foo "foo" (location 26 29)) + (bar (bla "bar" (location 43 46)) + (bla "bar" (location 48 51)) + (comment "// the first\n" (location 29 43)) + (location 43 51)) + (baz "baz" (location 52 55)) + (comment "/*\n CopyLeft (L) Acme\n*/" + (location 0 26)) + (location 26 55)) + (and=> (parameterize ((%peg:skip? peg-skip) + (%peg:locations? #t)) + (match-pattern trace-grammar program-text)) + peg:tree))) -- 2.46.0