ovidiu 01/12/13 01:31:37 Added: scratchpad/schecoon/scheme pregexp.scm Log: Added. Revision Changes Path 1.1 xml-cocoon2/scratchpad/schecoon/scheme/pregexp.scm Index: pregexp.scm =================================================================== ;pregexp.scm ;Portable regular expressions for Scheme ;Dorai Sitaram ;http://www.cs.rice.edu/~dorai ;ds26 AT gte.com ;Oct 2, 1999 (define *pregexp-comment-char* #\;) ;#\return, #\tab are not R5RS (define *pregexp-return-char* (integer->char 13)) (define *pregexp-tab-char* (integer->char 9)) (define *pregexp-space-sensitive?* #t) (define pregexp-reverse! ;the useful reverse! isn't R5RS (lambda (s) (let loop ((s s) (r '())) (if (null? s) r (let ((d (cdr s))) (set-cdr! s r) (loop d s)))))) (define pregexp-error ;R5RS won't give me a portable error procedure. ;modify this as needed (lambda (where . what) (display "Error: ") (display where) (display ": ") (for-each display what) (newline) (error))) ;The comments ;( and ;) are there only to ;match escaped parens so my text editor will ;paren-balance correctly (define pregexp-read-pattern (lambda (s i n) (let loop ((branches '()) (i i)) (if (or (>= i n) (char=? (string-ref s i) ;( #\))) (list (cons ':or (pregexp-reverse! branches)) i) (let ((vv (pregexp-read-branch s (if (char=? (string-ref s i) #\|) (+ i 1) i) n))) (loop (cons (car vv) branches) (cadr vv))))))) (define pregexp-read-branch (lambda (s i n) (let loop ((pieces '()) (i i)) (cond ((>= i n) (list (cons ':seq (pregexp-reverse! pieces)) i)) ((let ((c (string-ref s i))) (or (char=? c #\|) ;( (char=? c #\)))) (list (cons ':seq (pregexp-reverse! pieces)) i)) (else (let ((vv (pregexp-read-piece s i n))) (loop (cons (car vv) pieces) (cadr vv)))))))) (define pregexp-read-piece (lambda (s i n) (let ((c (string-ref s i))) (case c ((#\^) (list ':bos (+ i 1))) ((#\$) (list ':eos (+ i 1))) ((#\.) (pregexp-wrap-quantifier-if-any (list ':any (+ i 1)) s n)) ((#\[) (pregexp-wrap-quantifier-if-any (case (string-ref s (+ i 1)) ((#\^) (let ((vv (pregexp-read-char-list s (+ i 2) n))) (list (list ':neg-char (car vv)) (cadr vv)))) (else (pregexp-read-char-list s (+ i 1) n))) s n)) ((#\() ;) (pregexp-wrap-quantifier-if-any (pregexp-read-subpattern s (+ i 1) n) s n)) ((#\\) (pregexp-wrap-quantifier-if-any (cond ((pregexp-read-escaped-number s i n) => (lambda (num-i) (list (list ':backref (car num-i)) (cadr num-i)))) ((pregexp-read-escaped-char s i n) => (lambda (char-i) (list (car char-i) (cadr char-i)))) (else (error 'pregexp-read-piece/backslash))) s n)) (else (if (or *pregexp-space-sensitive?* (and (not (char-whitespace? c)) (not (char=? c *pregexp-comment-char*)))) (pregexp-wrap-quantifier-if-any (list c (+ i 1)) s n) (let loop ((i i) (in-comment? #f)) (if (>= i n) (list ':empty i) (let ((c (string-ref s i))) (cond (in-comment? (loop (+ i 1) (not (char=? c #\newline)))) ((char-whitespace? c) (loop (+ i 1) #f)) ((char=? c *pregexp-comment-char*) (loop (+ i 1) #t)) (else (list ':empty i)))))))))))) (define pregexp-read-escaped-number (lambda (s i n) ; s[i] = \ (and (< (+ i 1) n) ;must have at least something following \ (let ((c (string-ref s (+ i 1)))) (and (char-numeric? c) (let loop ((i (+ i 2)) (r (list c))) (if (>= i n) (list (string->number (list->string (pregexp-reverse! r))) i) (let ((c (string-ref s i))) (if (char-numeric? c) (loop (+ i 1) (cons c r)) (list (string->number (list->string (pregexp-reverse! r))) i)))))))))) (define pregexp-read-escaped-char (lambda (s i n) ; s[i] = \ (and (< (+ i 1) n) (let ((c (string-ref s (+ i 1)))) (case c ((#\b) (list ':wbdry (+ i 2))) ((#\B) (list ':not-wbdry (+ i 2))) ((#\d) (list ':digit (+ i 2))) ((#\D) (list '(:neg-char :digit) (+ i 2))) ((#\n) (list #\newline (+ i 2))) ((#\r) (list *pregexp-return-char* (+ i 2))) ((#\s) (list ':space (+ i 2))) ((#\S) (list '(:neg-char :space) (+ i 2))) ((#\t) (list *pregexp-tab-char* (+ i 2))) ((#\w) (list ':word (+ i 2))) ((#\W) (list '(:neg-char :word) (+ i 2))) (else (list c (+ i 2)))))))) (define pregexp-read-posix-char-class (lambda (s i n) ; lbrack, colon already read (let ((neg? #f)) (let loop ((i i) (r (list #\:))) (if (>= i n) (error 'pregexp-read-posix-char-class) (let ((c (string-ref s i))) (cond ((char=? c #\^) (set! neg? #t) (loop (+ i 1) r)) ((char-alphabetic? c) (loop (+ i 1) (cons c r))) ((char=? c #\:) (if (or (>= (+ i 1) n) (not (char=? (string-ref s (+ i 1)) #\]))) (error 'pregexp-read-posix-char-class) (let ((posix-class (string->symbol (list->string (pregexp-reverse! r))))) (list (if neg? (list ':neg-char posix-class) posix-class) (+ i 2))))) (else (error 'pregexp-read-posix-char-class))))))))) (define pregexp-read-cluster-type (lambda (s i n) ; s[i-1] = left-paren (let ((c (string-ref s i))) (case c ((#\?) (let ((i (+ i 1))) (case (string-ref s i) ((#\:) (list '() (+ i 1))) ((#\=) (list '(:lookahead) (+ i 1))) ((#\!) (list '(:neg-lookahead) (+ i 1))) ((#\>) (list '(:no-backtrack) (+ i 1))) ((#\<) (list (case (string-ref s (+ i 1)) ((#\=) '(:lookbehind)) ((#\!) '(:neg-lookbehind)) (else (error 'pregexp-read-cluster-type))) (+ i 2))) (else (let loop ((i i) (r '()) (inv? #f)) (let ((c (string-ref s i))) (case c ((#\-) (loop (+ i 1) r #t)) ((#\i) (loop (+ i 1) (cons (if inv? ':case-sensitive ':case-insensitive) r) #f)) ((#\x) (set! *pregexp-space-sensitive?* inv?) (loop (+ i 1) r #f)) ((#\:) (list r (+ i 1))) (else (error 'pregexp-read-cluster-type))))))))) (else (list '(:sub) i)))))) (define pregexp-read-subpattern (lambda (s i n) (let* ((remember-space-sensitive? *pregexp-space-sensitive?*) (ctyp-i (pregexp-read-cluster-type s i n)) (ctyp (car ctyp-i)) (i (cadr ctyp-i)) (vv (pregexp-read-pattern s i n))) (set! *pregexp-space-sensitive?* remember-space-sensitive?) (let ((vv-re (car vv)) (vv-i (cadr vv))) (if (and (< vv-i n) (char=? (string-ref s vv-i) ;( #\))) (list (let loop ((ctyp ctyp) (re vv-re)) (if (null? ctyp) re (loop (cdr ctyp) (list (car ctyp) re)))) (+ vv-i 1)) (error 'pregexp-read-subpattern)))))) (define pregexp-wrap-quantifier-if-any (lambda (vv s n) (let ((re (car vv))) (let loop ((i (cadr vv))) (if (>= i n) vv (let ((c (string-ref s i))) (if (and (char-whitespace? c) (not *pregexp-space-sensitive?*)) (loop (+ i 1)) (case c ((#\* #\+ #\? #\{) (let* ((new-re (list ':between 'minimal? 'at-least 'at-most re)) (new-vv (list new-re 'next-i))) (case c ((#\*) (set-car! (cddr new-re) 0) (set-car! (cdddr new-re) #f)) ((#\+) (set-car! (cddr new-re) 1) (set-car! (cdddr new-re) #f)) ((#\?) (set-car! (cddr new-re) 0) (set-car! (cdddr new-re) 1)) ((#\{) (let ((mn (pregexp-read-nums s (+ i 1)))) (set-car! (cddr new-re) (car mn)) (set-car! (cdddr new-re) (cadr mn)) (set! i (caddr mn))))) (let loop ((i (+ i 1))) (if (>= i n) (begin (set-car! (cdr new-re) #f) (set-car! (cdr new-vv) i)) (let ((c (string-ref s i))) (cond ((and (char-whitespace? c) (not *pregexp-space-sensitive?*)) (loop (+ i 1))) ((char=? c #\?) (set-car! (cdr new-re) #t) (set-car! (cdr new-vv) (+ i 1))) (else (set-car! (cdr new-re) #f) (set-car! (cdr new-vv) i)))))) new-vv)) (else vv))))))))) ; (define pregexp-read-nums (lambda (s i) ; s[i-1] = { ; returns (m n k) where s[k] = } (let loop ((m '()) (n '()) (k i) (reading 1)) (let ((c (string-ref s k))) (cond ((char-numeric? c) (if (= reading 1) (loop (cons c m) n (+ k 1) 1) (loop m (cons c n) (+ k 1) 2))) ((and (char-whitespace? c) (not *pregexp-space-sensitive?*)) (loop m n (+ k 1) reading)) ((and (char=? c #\,) (= reading 1)) (loop m n (+ k 1) 2)) ((char=? c #\}) (let ((m (string->number (list->string (pregexp-reverse! m)))) (n (string->number (list->string (pregexp-reverse! n))))) (cond ((and (not m) (= reading 1)) (list 0 #f k)) ((= reading 1) (list m m k)) (else (list m n k))))) (else #f)))))) (define pregexp-invert-char-list (lambda (vv) (set-car! (car vv) ':none-of-chars) vv)) ; (define pregexp-read-char-list (lambda (s i n) (let loop ((r '()) (i i)) (if (>= i n) (pregexp-error 'pregexp-read-char-list "character class ended too soon") (let ((c (string-ref s i))) (case c ((#\]) (if (null? r) (loop (cons c r) (+ i 1)) (list (cons ':one-of-chars (pregexp-reverse! r)) (+ i 1)))) ((#\\) (let ((char-i (pregexp-read-escaped-char s i n))) (if char-i (loop (cons (car char-i) r) (cadr char-i)) (error 'pregexp-read-char-list/backslash)))) ((#\-) (let ((c-prev (car r))) (if (char? c-prev) (loop (cons (list ':char-range c-prev (string-ref s (+ i 1))) (cdr r)) (+ i 2)) (loop (cons c r) (+ i 1))))) ((#\[) (if (char=? (string-ref s (+ i 1)) #\:) (let ((posix-char-class-i (pregexp-read-posix-char-class s (+ i 2) n))) (loop (cons (car posix-char-class-i) r) (cadr posix-char-class-i))) (loop (cons c r) (+ i 1)))) (else (loop (cons c r) (+ i 1))))))))) ; (define pregexp-string-match (lambda (s1 s i n sk fk) (let ((n1 (string-length s1))) (if (> n1 n) (fk) (let loop ((j 0) (k i)) (cond ((>= j n1) (sk k)) ((>= k n) (fk)) ((char=? (string-ref s1 j) (string-ref s k)) (loop (+ j 1) (+ k 1))) (else (fk)))))))) (define pregexp-char-word? (lambda (c) ;too restrictive for Scheme but this ;is what \w is in most regexp notations (or (char-alphabetic? c) (char-numeric? c) (char=? c #\_)))) (define pregexp-at-word-boundary? (lambda (s i n) (or (= i 0) (>= i n) (let ((c/i (string-ref s i)) (c/i-1 (string-ref s (- i 1)))) (let ((c/i/w? (pregexp-check-if-in-char-class? c/i ':word)) (c/i-1/w? (pregexp-check-if-in-char-class? c/i-1 ':word))) (or (and c/i/w? (not c/i-1/w?)) (and (not c/i/w?) c/i-1/w?))))))) (define pregexp-check-if-in-char-class? (lambda (c char-class) (case char-class ((:any) (not (char=? c #\newline))) ; ((:alnum) (or (char-alphabetic? c) (char-numeric? c))) ((:alpha) (char-alphabetic? c)) ((:ascii) (< (char->integer c) 128)) ((:blank) (or (char=? c #\space) (char=? c *pregexp-tab-char*))) ((:cntrl) (< (char->integer c) 32)) ((:digit) (char-numeric? c)) ((:graph) (and (>= (char->integer c) 32) (not (char-whitespace? c)))) ((:lower) (char-lower-case? c)) ((:print) (>= (char->integer c) 32)) ((:punct) (and (>= (char->integer c) 32) (not (char-whitespace? c)) (not (char-alphabetic? c)) (not (char-numeric? c)))) ((:space) (char-whitespace? c)) ((:upper) (char-upper-case? c)) ((:word) (or (char-alphabetic? c) (char-numeric? c) (char=? c #\_))) ((:xdigit) (or (char-numeric? c) (char-ci=? c #\a) (char-ci=? c #\b) (char-ci=? c #\c) (char-ci=? c #\d) (char-ci=? c #\e) (char-ci=? c #\f))) (else (error 'pregexp-check-if-in-char-class?))))) (define pregexp-list-ref (lambda (s i) ;like list-ref but returns #f if index is ;out of bounds (let loop ((s s) (k 0)) (cond ((null? s) #f) ((= k i) (car s)) (else (loop (cdr s) (+ k 1))))))) ;re is a compiled regexp. It's a list that can't be ;nil. pregexp-match-positions-aux returns a 2-elt list whose ;car is the string-index following the matched ;portion and whose cadr contains the submatches. ;The proc returns false if there's no match. ;Am spelling loop- as loup- because these shouldn't ;be translated into CL loops by scm2cl (although ;they are tail-recursive in Scheme) (define pregexp-match-positions-aux (lambda (re s start n i) (let ((case-sensitive? #t)) (let sub ((re re) (i i) (backrefs '()) (sk list) (fk (lambda () #f))) ;(printf "sub ~s ~s~%" i re) (cond ((eqv? re ':bos) (if (= i start) (sk i backrefs) (fk))) ((eqv? re ':eos) (if (>= i n) (sk i backrefs) (fk))) ((eqv? re ':empty) (sk i backrefs)) ((eqv? re ':wbdry) (if (pregexp-at-word-boundary? s i n) (sk i backrefs) (fk))) ((eqv? re ':not-wbdry) (if (pregexp-at-word-boundary? s i n) (fk) (sk i backrefs))) ((and (char? re) (< i n)) (if ((if case-sensitive? char=? char-ci=?) (string-ref s i) re) (sk (+ i 1) backrefs) (fk))) ((and (not (pair? re)) (< i n)) (if (pregexp-check-if-in-char-class? (string-ref s i) re) (sk (+ i 1) backrefs) (fk))) ((and (pair? re) (eqv? (car re) ':char-range) (< i n)) (let ((c (string-ref s i))) (if (let ((c< (if case-sensitive? char<=? char-ci<=?))) (and (c< (cadr re) c) (c< c (caddr re)))) (sk (+ i 1) backrefs) (fk)))) ((pair? re) (case (car re) ((:char-range) (if (>= i n) (fk) (error 'pregexp-match-positions-aux))) ((:one-of-chars) (if (>= i n) (fk) (let loup-one-of-chars ((chars (cdr re))) (if (null? chars) (fk) (sub (car chars) i backrefs sk (lambda () (loup-one-of-chars (cdr chars)))))))) ((:neg-char) (if (>= i n) (fk) (sub (cadr re) i backrefs (lambda (i1 backrefs1) (fk)) (lambda () (sk (+ i 1) backrefs))))) ((:seq) (let loup-seq ((res (cdr re)) (i i) (backrefs backrefs)) (if (null? res) (sk i backrefs) (sub (car res) i backrefs (lambda (i1 backrefs1) (loup-seq (cdr res) i1 backrefs1)) fk)))) ((:or) (let loup-or ((res (cdr re))) (if (null? res) (fk) (sub (car res) i backrefs (lambda (i1 backrefs1) (or (sk i1 backrefs1) (loup-or (cdr res)))) (lambda () (loup-or (cdr res))))))) ((:backref) (let ((backref (pregexp-list-ref backrefs (cadr re)))) (if backref (pregexp-string-match (substring s (car backref) (cdr backref)) s i n (lambda (i) (sk i backrefs)) fk) (sk i backrefs)))) ((:sub) (let* ((sub-backref (cons i i)) (backrefs (append backrefs (list sub-backref)))) (sub (cadr re) i backrefs (lambda (i1 backrefs1) (set-cdr! sub-backref i1) (sk i1 backrefs1)) fk))) ((:lookahead) (let ((found-it? (sub (cadr re) i backrefs list (lambda () #f)))) (if found-it? (sk i backrefs) (fk)))) ((:neg-lookahead) (let ((found-it? (sub (cadr re) i backrefs list (lambda () #f)))) (if found-it? (fk) (sk i backrefs)))) ((:lookbehind) (let ((n-actual n)) (set! n i) (let ((found-it? (sub (list ':seq '(:between #f 0 #f :any) (cadr re) ':eos) 0 backrefs list (lambda () #f)))) (set! n n-actual) (if found-it? (sk i backrefs) (fk))))) ((:neg-lookbehind) (let ((n-actual n)) (set! n i) (let ((found-it? (sub (list ':seq '(:between #f 0 #f :any) (cadr re) ':eos) 0 backrefs list (lambda () #f)))) (set! n n-actual) (if found-it? (fk) (sk i backrefs))))) ((:no-backtrack) (let ((found-it? (sub (cadr re) i backrefs list (lambda () #f)))) (if found-it? (sk (car found-it?) (cadr found-it?)) (fk)))) ((:case-sensitive :case-insensitive) (let ((old case-sensitive?)) (set! case-sensitive? (eqv? (car re) ':case-sensitive)) (sub (cadr re) i backrefs (lambda (i1 backrefs1) (set! case-sensitive? old) (sk i1 backrefs1)) (lambda () (set! case-sensitive? old) (fk))))) ((:between) (let* ((maximal? (not (cadr re))) (p (caddr re)) (q (cadddr re)) (re (car (cddddr re))) (subpat? (and (pair? re) (eqv? (car re) ':sub)))) (let loup-p ((k 0) (i i) (cbackrefs 'no-match-yet)) (if (< k p) (sub re i backrefs (lambda (i1 backrefs1) (loup-p (+ k 1) i1 backrefs1)) fk) (let ((q (and q (- q p)))) (let loup-q ((k 0) (i i) (cbackrefs cbackrefs)) (let ((fk (lambda () (sk i (if (eqv? cbackrefs 'no-match-yet) (if subpat? (append backrefs (list #f)) backrefs) cbackrefs))))) (if (and q (>= k q)) (fk) (if maximal? (sub re i backrefs (lambda (i1 backrefs1) (or (loup-q (+ k 1) i1 backrefs1) (fk))) fk) (or (fk) (sub re i backrefs (lambda (i1 backrefs1) (loup-q (+ k 1) i1 backrefs1)) fk))))))))))) (else (error 'pregexp-match-positions-aux)))) ((>= i n) (fk)) (else (error 'pregexp-match-positions-aux))))))) (define pregexp-replace-aux (lambda (str ins n backrefs) (let loop ((i 0) (r "")) (if (>= i n) r (let ((c (string-ref ins i))) (if (char=? c #\\) (let* ((br-i (pregexp-read-escaped-number ins i n)) (br (if br-i (car br-i) (if (char=? (string-ref ins (+ i 1)) #\&) 0 #f))) (i (if br-i (cadr br-i) (if br (+ i 2) (+ i 1))))) (if (not br) (let ((c2 (string-ref ins i))) (loop (+ i 1) (if (char=? c2 #\$) r (string-append r (string c2))))) (loop i (let ((backref (pregexp-list-ref backrefs br))) (if backref (string-append r (substring str (car backref) (cdr backref))) r))))) (loop (+ i 1) (string-append r (string c))))))))) (define pregexp (lambda (s) (set! *pregexp-space-sensitive?* #t) ;in case it got corrupted (list ':sub (car (pregexp-read-pattern s 0 (string-length s)))))) (define pregexp-match-positions (lambda (pat str . opt-args) (let* ((pat (if (string? pat) (pregexp pat) pat)) (start (if (null? opt-args) 0 (let ((start (car opt-args))) (set! opt-args (cdr opt-args)) start))) (end (if (null? opt-args) (string-length str) (car opt-args)))) (let loop ((i start)) (and (<= i end) (let ((vv (pregexp-match-positions-aux pat str start end i))) (if vv (cadr vv) (loop (+ i 1))))))))) (define pregexp-match (lambda (pat str . opt-args) (let ((ix-prs (apply pregexp-match-positions pat str opt-args))) (and ix-prs (map (lambda (ix-pr) (and ix-pr (substring str (car ix-pr) (cdr ix-pr)))) ix-prs))))) (define pregexp-replace (lambda (pat str ins) (let* ((n (string-length str)) (pp (pregexp-match-positions pat str 0 n))) (if (not pp) str (let ((ins-len (string-length ins)) (m-i (caar pp)) (m-n (cdar pp))) (string-append (substring str 0 m-i) (pregexp-replace-aux str ins ins-len pp) (substring str m-n n))))))) (define pregexp-replace* (lambda (pat str ins) (let ((pat (if (string? pat) (pregexp pat) pat)) (n (string-length str)) (ins-len (string-length ins))) (let loop ((i 0) (r "")) (let ((pp (pregexp-match-positions pat str i n))) (if pp (loop (cdar pp) (string-append r (substring str i (caar pp)) (pregexp-replace-aux str ins ins-len pp))) (string-append r (substring str i n)))))))) (define // pregexp-match-positions) (define m// pregexp-match) (define s/// pregexp-replace) (define s///g pregexp-replace*) ;(load "d:/public_html/trace.scm")
---------------------------------------------------------------------- In case of troubles, e-mail: [EMAIL PROTECTED] To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]