Hi there, I'm currently implementing regexp-split for Guile, which provides a Perl-style split function (including correctly implementing the "limit" parameter), minus the special awk-style whitespace handling (that is used with a pattern of " ", as opposed to / /, with Perl's split).
Attached is a couple of patches, to support the regexp-split function which I'm proposing at the bottom of this message: 1. The first fixes the behaviour of fold-matches and list-matches when the pattern contains a ^ (identical to the patch in my last email). 2. The second adds the ability to limit the number of matches done. This applies on top of the first patch. Some comments about the regexp-split implementation: the value that's being passed to regexp-split-fold is a cons, where the car is the last match's end position, and the cdr is the substrings so far collected. The special check in regexp-split-fold for match-end being zero is to emulate a specific behaviour as documented for Perl's split: "Empty leading fields are produced when there are positive-width matches at the beginning of the string; a zero-width match at the beginning of the string does not produce an empty field." Below is the implementation; comments are welcome! If it all looks good, I'll write tests and documentation, with a view to eventually putting it into (ice-9 regex). Thanks, Chris. * * * (define (regexp-split-fold match prev) (if (zero? (match:end match)) prev (cons* (match:end match) (substring (match:string match) (car prev) (match:start match)) (cdr prev)))) (define (string-empty? str) (zero? (string-length str))) (define* (regexp-split pat str #:optional (limit 0)) (let* ((result (fold-matches pat str '(0) regexp-split-fold 0 (if (positive? limit) (1- limit) #f))) (final (cons (substring str (car result)) (cdr result)))) (reverse! (if (zero? limit) (drop-while string-empty? final) final))))
>From da8b0cd523f6e9bf9e1d46829cccf01e3115c614 Mon Sep 17 00:00:00 2001 From: "Chris K. Jester-Young" <cky...@gmail.com> Date: Sun, 16 Sep 2012 02:20:56 -0400 Subject: [PATCH 1/2] In fold-matches, set regexp/notbol unless matching string start. * module/ice-9/regex.scm (fold-matches): Set regexp/notbol if the starting position is nonzero. * test-suite/tests/regexp.test (fold-matches): Check that when matching /^foo/ against "foofoofoofoo", only one match results. --- module/ice-9/regex.scm | 3 ++- test-suite/tests/regexp.test | 9 ++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/module/ice-9/regex.scm b/module/ice-9/regex.scm index f7b94b7..08ae2c2 100644 --- a/module/ice-9/regex.scm +++ b/module/ice-9/regex.scm @@ -172,8 +172,9 @@ (let loop ((start 0) (value init) (abuts #f)) ; True if start abuts a previous match. + (define bol (if (zero? start) 0 regexp/notbol)) (let ((m (if (> start (string-length string)) #f - (regexp-exec regexp string start flags)))) + (regexp-exec regexp string start (logior flags bol))))) (cond ((not m) value) ((and (= (match:start m) (match:end m)) abuts) diff --git a/test-suite/tests/regexp.test b/test-suite/tests/regexp.test index ef59465..d549df2 100644 --- a/test-suite/tests/regexp.test +++ b/test-suite/tests/regexp.test @@ -132,7 +132,14 @@ (lambda (match result) (cons (match:substring match) result)) - (logior regexp/notbol regexp/noteol))))) + (logior regexp/notbol regexp/noteol)))) + + (pass-if "regexp/notbol is set correctly" + (equal? '("foo") + (fold-matches "^foo" "foofoofoofoo" '() + (lambda (match result) + (cons (match:substring match) + result)))))) ;;; -- 1.7.9.5
>From 147dc0d7fd9ab04d10b4f13cecf47a32c5b6c4b6 Mon Sep 17 00:00:00 2001 From: "Chris K. Jester-Young" <cky...@gmail.com> Date: Mon, 17 Sep 2012 01:06:07 -0400 Subject: [PATCH 2/2] Add "limit" parameter to fold-matches and list-matches. * doc/ref/api-regex.texi: Document new "limit" parameter. * module/ice-9/regex.scm (fold-matches, list-matches): Optionally take a "limit" argument that, if specified, limits how many times the pattern is matched. * test-suite/tests/regexp.test (fold-matches): Add tests for the correct functioning of the limit parameter. --- doc/ref/api-regex.texi | 10 ++++++---- module/ice-9/regex.scm | 18 ++++++++++-------- test-suite/tests/regexp.test | 16 +++++++++++++++- 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/doc/ref/api-regex.texi b/doc/ref/api-regex.texi index 082fb87..2d2243f 100644 --- a/doc/ref/api-regex.texi +++ b/doc/ref/api-regex.texi @@ -189,11 +189,12 @@ or @code{#f} otherwise. @end deffn @sp 1 -@deffn {Scheme Procedure} list-matches regexp str [flags] +@deffn {Scheme Procedure} list-matches regexp str [flags [limit]] Return a list of match structures which are the non-overlapping matches of @var{regexp} in @var{str}. @var{regexp} can be either a pattern string or a compiled regexp. The @var{flags} argument is as -per @code{regexp-exec} above. +per @code{regexp-exec} above. The @var{limit} argument, if specified, +limits how many times @var{regexp} is matched. @example (map match:substring (list-matches "[a-z]+" "abc 42 def 78")) @@ -201,11 +202,12 @@ per @code{regexp-exec} above. @end example @end deffn -@deffn {Scheme Procedure} fold-matches regexp str init proc [flags] +@deffn {Scheme Procedure} fold-matches regexp str init proc [flags [limit]] Apply @var{proc} to the non-overlapping matches of @var{regexp} in @var{str}, to build a result. @var{regexp} can be either a pattern string or a compiled regexp. The @var{flags} argument is as per -@code{regexp-exec} above. +@code{regexp-exec} above. The @var{limit} argument, if specified, +limits how many times @var{regexp} is matched. @var{proc} is called as @code{(@var{proc} match prev)} where @var{match} is a match structure and @var{prev} is the previous return diff --git a/module/ice-9/regex.scm b/module/ice-9/regex.scm index 08ae2c2..0ffe74c 100644 --- a/module/ice-9/regex.scm +++ b/module/ice-9/regex.scm @@ -167,26 +167,28 @@ ;;; `b'. Around or within `xxx', only the match covering all three ;;; x's counts, because the rest are not maximal. -(define* (fold-matches regexp string init proc #:optional (flags 0)) +(define* (fold-matches regexp string init proc #:optional (flags 0) limit) (let ((regexp (if (regexp? regexp) regexp (make-regexp regexp)))) (let loop ((start 0) + (count 0) (value init) (abuts #f)) ; True if start abuts a previous match. - (define bol (if (zero? start) 0 regexp/notbol)) - (let ((m (if (> start (string-length string)) #f - (regexp-exec regexp string start (logior flags bol))))) + (let* ((bol (if (zero? start) 0 regexp/notbol)) + (m (and (or (not limit) (< count limit)) + (<= start (string-length string)) + (regexp-exec regexp string start (logior flags bol))))) (cond ((not m) value) ((and (= (match:start m) (match:end m)) abuts) ;; We matched an empty string, but that would overlap the ;; match immediately before. Try again at a position ;; further to the right. - (loop (+ start 1) value #f)) + (loop (1+ start) count value #f)) (else - (loop (match:end m) (proc m value) #t))))))) + (loop (match:end m) (1+ count) (proc m value) #t))))))) -(define* (list-matches regexp string #:optional (flags 0)) - (reverse! (fold-matches regexp string '() cons flags))) +(define* (list-matches regexp string #:optional (flags 0) limit) + (reverse! (fold-matches regexp string '() cons flags limit))) (define (regexp-substitute/global port regexp string . items) diff --git a/test-suite/tests/regexp.test b/test-suite/tests/regexp.test index d549df2..c3ba698 100644 --- a/test-suite/tests/regexp.test +++ b/test-suite/tests/regexp.test @@ -139,7 +139,21 @@ (fold-matches "^foo" "foofoofoofoo" '() (lambda (match result) (cons (match:substring match) - result)))))) + result))))) + + (pass-if "without limit" + (equal? '("foo" "foo" "foo" "foo") + (fold-matches "foo" "foofoofoofoo" '() + (lambda (match result) + (cons (match:substring match) + result))))) + + (pass-if "with limit" + (equal? '("foo" "foo") + (fold-matches "foo" "foofoofoofoo" '() + (lambda (match result) + (cons (match:substring match) + result)) 0 2)))) ;;; -- 1.7.9.5