From: Rutger van Beusekom <rut...@dezyne.org>

* module/ice-9/peg/codegen.scm: (%peg:locations?, %peg:skip?): New
exported parameters.
(wrap-parser-for-users): Use them to enable skip parsing and switch
having locations on comments and whitespace.
* test-suite/tests/peg.test ("Skip parser"): Test it.
* doc/ref/api-peg.texi (Whitespace and comments): Document it.

Co-authored-by: Janneke Nieuwenhuizen <jann...@gnu.org>
---
 doc/ref/api-peg.texi         | 44 +++++++++++++++++++++
 module/ice-9/peg.scm         |  3 ++
 module/ice-9/peg/codegen.scm | 74 ++++++++++++++++++++++++------------
 test-suite/tests/peg.test    | 47 +++++++++++++++++++++++
 4 files changed, 143 insertions(+), 25 deletions(-)

diff --git a/doc/ref/api-peg.texi b/doc/ref/api-peg.texi
index df2e74d05..733cb1c6d 100644
--- a/doc/ref/api-peg.texi
+++ b/doc/ref/api-peg.texi
@@ -1062,6 +1062,50 @@ current state of the input, as well as the parse result.
   (and=> (match-pattern grammar input-text) peg:tree))
 @end lisp
 
+@subsubheading Whitespace and comments
+
+To write a PEG parser for a whitespace invariant language or a language
+which includes line and block comments requires littering the grammar
+with whitespace or comment parser expressions, which not only violates
+the DRY principle, but is hard to get right.
+
+For example, to parse a C-like language one would define these
+whitespace and comment parsers
+
+@lisp
+(define-skip-parser peg-eof none (not-followed-by peg-any))
+(define-skip-parser peg-eol none (or "\f" "\n" "\r" "\v"))
+(define-skip-parser peg-ws none (or " " "\t"))
+(define-skip-parser peg-line all
+  (and "//" (* (and (not-followed-by peg-eol) peg-any))
+       (expect (or "\n" "\r\n" peg-eof))))
+(define-skip-parser peg-block-strict all
+  (and "/*"
+       (* (or peg-block (and (not-followed-by "*/") peg-any)))
+       (expect "*/")))
+(define-skip-parser peg-skip all
+  (* (or peg-ws peg-eol peg-line peg-block-strict)))
+(define-skip-parser peg-block all
+  (and "/*" (* (or peg-block (and (not-followed-by "*/") peg-any)))
+       (or "*/" peg-eof)))
+@end lisp
+
+When setting @var{%peg:skip?} to @code{peg-skip}, whitespace and
+comments are silently skipped.
+
+@lisp
+(parameterize ((%peg:skip? peg-skip))
+  (and=> (match-pattern grammar input-text) peg:tree))
+@end lisp
+
+If you want to preserve locations and comments, set
+@var{%peg:locations?} to @code{#t}.
+@lisp
+(parameterize ((%peg:skip? peg-skip)
+               (%peg:locations? #t))
+  (and=> (match-pattern grammar input-text) peg:tree))
+@end lisp
+
 @subsubheading Expect parsing
 
 The best thing about PEG is its backtracking nature making it
diff --git a/module/ice-9/peg.scm b/module/ice-9/peg.scm
index 499c3820c..fd9dce54c 100644
--- a/module/ice-9/peg.scm
+++ b/module/ice-9/peg.scm
@@ -28,7 +28,10 @@
   #:use-module (ice-9 peg cache)
   #:re-export (define-peg-pattern
                define-peg-string-patterns
+               define-skip-parser
                %peg:debug?
+               %peg:locations?
+               %peg:skip?
                match-pattern
                search-for-pattern
                compile-peg-pattern
diff --git a/module/ice-9/peg/codegen.scm b/module/ice-9/peg/codegen.scm
index dd24bdac0..458a7e3ab 100644
--- a/module/ice-9/peg/codegen.scm
+++ b/module/ice-9/peg/codegen.scm
@@ -21,7 +21,10 @@
   #:export (compile-peg-pattern
             wrap-parser-for-users
             add-peg-compiler!
-            %peg:debug?)
+            define-skip-parser
+            %peg:debug?
+            %peg:locations?
+            %peg:skip?)
 
   #:use-module (ice-9 pretty-print)
   #:use-module (system base pmatch))
@@ -349,6 +352,9 @@ return EXP."
 ;; Packages the results of a parser
 
 (define %peg:debug? (make-parameter #f))
+(define %peg:locations? (make-parameter #f))
+(define %peg:skip? (make-parameter (lambda (str strlen at) `(,at ()))))
+
 (define (trace? symbol)
   (%peg:debug?))
 
@@ -361,7 +367,11 @@ return EXP."
                 (make-string indent #\space)
                 '#,s-syn))
       (set! indent (+ indent 4))
-      (let ((res (#,parser str strlen at)))
+      (let* ((comment-res ((%peg:skip?) str strlen at))
+             (comment-loc (and (%peg:locations?) comment-res
+                               `(location ,at ,(car comment-res))))
+             (at (or (and comment-res (car comment-res)) at))
+             (res (#,parser str strlen at)))
         (set! indent (- indent 4))
         (let ((pos (or (and res (car res)) 0)))
           (when (and (trace? '#,s-syn) (< at pos))
@@ -369,26 +379,40 @@ return EXP."
                     (make-string indent #\space)
                     '#,s-syn
                     (substring str at pos)
-                    (substring str pos (min strlen (+ pos 10)))))
-          ;; Try to match the nonterminal.
-          (if res
-              ;; If we matched, do some post-processing to figure out
-              ;; what data to propagate upward.
-              (let ((at (car res))
-                    (body (cadr res)))
-                #,(cond
-                   ((eq? accumsym 'name)
-                    #`(list at '#,s-syn))
-                   ((eq? accumsym 'all)
-                    #`(list (car res)
-                            (cond
-                             ((not (list? body))
-                              (list '#,s-syn body))
-                             ((null? body) '#,s-syn)
-                             ((symbol? (car body))
-                              (list '#,s-syn body))
-                             (else (cons '#,s-syn body)))))
-                   ((eq? accumsym 'none) #`(list (car res) '()))
-                   (else #`(begin res))))
-              ;; If we didn't match, just return false.
-              #f)))))
+                    (substring str pos (min strlen (+ pos 10))))))
+        ;; Try to match the nonterminal.
+        (if res
+            ;; If we matched, do some post-processing to figure out
+            ;; what data to propagate upward.
+            (let* ((body (cadr res))
+                   (loc `(location ,at ,(car res)))
+                   (annotate (if (not (%peg:locations?)) '()
+                                 (if (null? (cadr comment-res)) `(,loc)
+                                     `((comment ,(cdr comment-res) 
,comment-loc)
+                                       ,loc))))
+                   (at (car res)))
+              #,(cond
+                 ((eq? accumsym 'name)
+                  #`(list at '#,s-syn ,@annotate))
+                 ((eq? accumsym 'all)
+                  #`(list (car res)
+                          (cond
+                           ((not (list? body))
+                            `(,'#,s-syn ,body ,@annotate))
+                           ((null? body)
+                            `(,'#,s-syn ,@annotate))
+                           ((symbol? (car body))
+                            `(,'#,s-syn ,body ,@annotate))
+                           (else
+                            (cons '#,s-syn (append body annotate))))))
+                 ((eq? accumsym 'none) #``(,at () ,@annotate))
+                 (else #``(,at ,body ,@annotate))))
+            ;; If we didn't match, just return false.
+            #f))))
+
+(define-syntax define-skip-parser
+  (lambda (x)
+    (syntax-case x ()
+      ((_ sym accum pat)
+       (let* ((matchf (compile-peg-pattern #'pat (syntax->datum #'accum))))
+         #`(define sym #,matchf))))))
diff --git a/test-suite/tests/peg.test b/test-suite/tests/peg.test
index b3586c891..4f267f561 100644
--- a/test-suite/tests/peg.test
+++ b/test-suite/tests/peg.test
@@ -332,3 +332,50 @@ three <-- 'three'"
         (and=> (match-pattern expect-grammar "onethree")
                peg:tree))
       (lambda args args))))
+
+(define program-text "
+/*
+ CopyLeft (L) Acme
+*/
+foo // the first
+bar
+       bar
+baz
+")
+
+(define-skip-parser peg-eof none (not-followed-by peg-any))
+(define-skip-parser peg-eol none (or "\f" "\n" "\r" "\v"))
+(define-skip-parser peg-ws none (or " " "\t"))
+(define-skip-parser peg-line all
+  (and "//" (* (and (not-followed-by peg-eol) peg-any))
+       (expect (or "\n" "\r\n" peg-eof))))
+(define-skip-parser peg-block all
+  (and "/*" (* (or peg-block (and (not-followed-by "*/") peg-any)))
+       (or "*/" peg-eof)))
+(define-skip-parser peg-block-strict all
+  (and "/*"
+       (* (or peg-block (and (not-followed-by "*/") peg-any)))
+       (expect "*/")))
+(define-skip-parser peg-skip all
+  (* (or peg-ws peg-eol peg-line peg-block-strict)))
+
+(with-test-prefix "Skip parser"
+  (pass-if-equal "skip comments and whitespace"
+      '(trace-grammar (foo "foo") (bar (bla "bar") (bla "bar")) (baz "baz"))
+    (and=> (parameterize ((%peg:skip? peg-skip))
+             (match-pattern trace-grammar program-text))
+           peg:tree))
+  (pass-if-equal "preserve comments and whitespace"
+      '(trace-grammar (foo "foo" (location 26 29))
+                      (bar (bla "bar" (location 43 46))
+                           (bla "bar" (location 48 51))
+                           (comment "// the first\n" (location 29 43))
+                           (location 43 51))
+                      (baz "baz" (location 52 55))
+                      (comment "/*\n CopyLeft (L) Acme\n*/"
+                               (location 0 26))
+                      (location 26 55))
+    (and=> (parameterize ((%peg:skip? peg-skip)
+                          (%peg:locations? #t))
+             (match-pattern trace-grammar program-text))
+           peg:tree)))
-- 
2.46.0


Reply via email to