Hello Joe. donaldsonjw, 2022-02-06 01:00: > Here is another version of xml.scm that I believe addresses the problem you > pointed out. Let me know how it works for you.
Thanks again. I had to fix two calls of 'make' by moving the position to the end, see attached file, then everyting worked like a charm. To finish my problem solution and test it (successfully) on some thousand nontrivial xml files, I had to add a new option content-pos. If true, content strings are turned into (content-string offset). The changes are backward-compatible because the default is content-pos #f. The representation (and the name of the option) might not fit the rest, so feel free to change it. You can open a pull request on bigloo's github soon, I guess. Greetings Sven
;*=====================================================================*/ ;* serrano/prgm/project/bigloo/bigloo/api/web/src/Llib/xml.scm */ ;* ------------------------------------------------------------- */ ;* Author : Manuel Serrano */ ;* Creation : Fri Mar 11 16:23:53 2005 */ ;* Last change : Sun Oct 7 08:46:03 2018 (serrano) */ ;* Copyright : 2005-18 Manuel Serrano */ ;* ------------------------------------------------------------- */ ;* XML parsing */ ;*=====================================================================*/ ;*---------------------------------------------------------------------*/ ;* The module */ ;*---------------------------------------------------------------------*/ (module __web_xml (option (set! *dlopen-init-gc* #t)) (export (xml-parse::pair-nil port::input-port #!key (content-length 0) (procedure (lambda (tag attribs body start-pos) (list tag attribs body))) (specials '()) (strict #t) (content-pos #f) (encoding 'UTF-8) (eoi #f)) (xml-string-decode::bstring ::bstring) (xml-string-decode!::bstring ::bstring) (xml-string-encode::bstring ::bstring) (read-xml #!optional (port::input-port (current-input-port))) (xml-metadata xml-tree::pair-nil))) ;*---------------------------------------------------------------------*/ ;* xml-parse ... */ ;*---------------------------------------------------------------------*/ (define (xml-parse::pair-nil port::input-port #!key (content-length 0) (procedure (lambda (tag attribs body start-pos) (list tag attribs body))) (specials '()) (strict #t) (content-pos #f) (encoding 'UTF-8) (eoi #f)) (when (elong? content-length) (set! content-length (elong->fixnum content-length))) (when (and (fixnum? content-length) (>fx content-length 0)) (input-port-fill-barrier-set! port content-length)) (when (>fx content-length 0) (set! content-length (+fx content-length (input-port-position port)))) (let loop ((decoder (lambda (x) x))) (let ((obj (read/rp xml-grammar port procedure procedure specials strict content-pos decoder encoding (input-port-position port)))) (when (and (fixnum? content-length) (>fx content-length 0)) (input-port-fill-barrier-set! port -1)) (cond ((eof-object? obj) '()) ((or (and (procedure? eoi) (eoi obj)) (and (>fx content-length 0) (>=fx (input-port-position port) content-length))) (list obj)) ((and (pair? obj) (eq? 'xml-decl (car obj))) (let ((enc (assq 'encoding (cdr obj)))) (if enc (cons obj (loop (get-decoder (cdr enc) encoding))) (cons obj (loop decoder))))) (else (cons obj (loop decoder))))))) ;*---------------------------------------------------------------------*/ ;* xml-parse-error ... */ ;*---------------------------------------------------------------------*/ (define (xml-parse-error msg obj name pos) (raise (instantiate::&io-parse-error (proc 'xml-parse) (msg msg) (obj obj) (fname name) (location pos)))) ;*---------------------------------------------------------------------*/ ;* error-line ... */ ;*---------------------------------------------------------------------*/ (define (error-line c port) (let ((line (read-line port))) (string-append "{" (string c) "}" (if (string? line) line "")))) ;*---------------------------------------------------------------------*/ ;* special ... */ ;*---------------------------------------------------------------------*/ (define-struct special tag attributes body owner) ;*---------------------------------------------------------------------*/ ;* collect-up-to ... */ ;*---------------------------------------------------------------------*/ (define (collect-up-to start-pos ignore tag attributes port make specials strict content-pos decoder encoding) (define (collect ignore tags) (let ((name (input-port-name port)) (po (input-port-position port))) (let loop ((acc '()) (item (ignore))) (cond ((symbol? item) (cond ((eq? item tag) (make tag attributes (reverse! acc) start-pos)) (strict (xml-parse-error "Illegal closing tag" (format "`~a' expected, `~a' provided" tag item) name po)) (else (make tag attributes (reverse! acc) start-pos)))) ((special? item) (let ((nitem (make (special-tag item) (special-attributes item) (special-body item) start-pos))) (if (memq (special-tag item) tags) (loop acc nitem) (list (make tag attributes (reverse! acc) start-pos) nitem)))) ((eof-object? item) (if strict (xml-parse-error (format "Premature end of line, expecting tag `~a'" tag) item name po) (make tag attributes (reverse! acc) start-pos))) (else (let ((po (input-port-last-token-position port))) (loop (econs item acc (list 'at name po)) (ignore)))))))) (let ((spec (assq tag specials))) (cond ((not spec) (collect ignore '())) ((null? (cdr spec)) (make tag attributes '() start-pos)) ((procedure? (cdr spec)) (make tag attributes ((cdr spec) port) start-pos)) ((pair? (cdr spec)) (let ((ignore (lambda () (read/rp xml-grammar port (lambda (t a b) (special t a b tag)) make specials strict content-pos decoder encoding (input-port-position port))))) (collect ignore (cdr spec)))) (else (error "xml-parse" "Illegal special handler" spec))))) ;*---------------------------------------------------------------------*/ ;* attribute-value-grammar ... */ ;*---------------------------------------------------------------------*/ (define attribute-value-grammar (regular-grammar (strict tag) ((+ (in " \t\n\r")) (ignore)) ((: #\" (* (or (out #\\ #\") (: #\\ all))) #\") (the-substring 1 (-fx (the-length) 1))) ((: #\' (* (or (out #\\ #\') (: #\\ all))) #\') (the-substring 1 (-fx (the-length) 1))) ((: (+ digit) (? (or "%" "px" "cm" "em" "mm" "inch"))) (if strict (xml-parse-error (format "Illegal `~a' attribute value" tag) (the-string) (input-port-name (the-port)) (input-port-position (the-port))) (the-string))) ((+ (out " \t\n\r<>(){}[]@!\"'/")) (if strict (xml-parse-error (format "Illegal `~a' attribute character" tag) (the-string) (input-port-name (the-port)) (input-port-position (the-port))) (the-string))) (else (let ((c (the-failure))) (if (not (eof-object? c)) (if (or strict (not (or (char=? c #\space) (char=? c #\Newline) (char=? c #\>)))) (xml-parse-error (format "Illegal `~a' attribute character" tag) (error-line c (the-port)) (input-port-name (the-port)) (input-port-position (the-port))) " ") (xml-parse-error (format "Premature end of line for tag `~a' attribute" tag) c (input-port-name (the-port)) (-fx (input-port-position (the-port)) 1))))))) ;*---------------------------------------------------------------------*/ ;* attribute-grammar ... */ ;*---------------------------------------------------------------------*/ (define attribute-grammar (regular-grammar ((id (: (in ("azAZ") "_") (* (in ("azAZ09") ":_-")))) tag strict decoder) ((+ (in " \t\n\r")) (ignore)) ((: id "=") (let* ((key (the-substring 0 (-fx (the-length) 1))) (val (read/rp attribute-value-grammar (the-port) strict tag))) (cons (string->symbol (decoder key)) (decoder val)))) ((: id (+ blank) "=") (let* ((key (the-substring 0 (-fx (the-length) 2))) (val (read/rp attribute-value-grammar (the-port) strict tag))) (let loop ((i (-fx (string-length key) 1))) (case (string-ref key i) ((#\space #\tab #\Newline) (loop (-fx i 1))) (else (set! key (substring key 0 (+ i 1)))))) (cons (string->symbol (decoder key)) (decoder val)))) ((: id) (let* ((key (decoder (the-string)))) (cons (string->symbol key) key))) ((or "/>" ">") (the-symbol)) (else (let ((c (the-failure))) (if (not (eof-object? c)) (xml-parse-error "Illegal attribute character" (error-line c (the-port)) (input-port-name (the-port)) (input-port-position (the-port))) (xml-parse-error (format "Premature end of line, expecting tag `~a'" tag) c (input-port-name (the-port)) (-fx (input-port-position (the-port)) 1))))))) ;*---------------------------------------------------------------------*/ ;* cdata-grammar ... */ ;*---------------------------------------------------------------------*/ (define cdata-grammar (regular-grammar (decoder) ((* (out "]")) (let* ((res (decoder (the-string))) (rest (ignore))) (string-append res rest))) ("]" (string-append "]" (ignore))) ((: "]]>" (? "\n")) "") (else (let* ((c (the-failure)) (msg (if (not (eof-object? c)) "Illegal <![CDATA[ character" "Premature end of line, expecting tag `]]>'"))) (xml-parse-error msg c (input-port-name (the-port)) (input-port-position (the-port))))))) ;*---------------------------------------------------------------------*/ ;* get-decoder ... */ ;*---------------------------------------------------------------------*/ (define (get-decoder::procedure enc::bstring dst-enc) (let ((src-enc (string->symbol (string-upcase enc)))) (cond ((or (not src-enc) (eq? src-enc dst-enc)) (lambda (x) x)) ((eq? src-enc 'UTF-8) (cond ((memq dst-enc '(ISO-8859-1 ISO-8859-2 ISO-8859-15)) utf8->iso-latin) ((eq? dst-enc 'UCS-2) utf8-string->ucs2-string) (else (lambda (x) x)))) ((memq src-enc '(ISO-8859-1 ISO-8859-2 ISO-8859-15)) (cond ((eq? dst-enc 'UTF-8) iso-latin->utf8) ((eq? dst-enc 'UCS-2) (lambda (x) (utf8-string->ucs2-string (iso-latin->utf8 x)))) (else (lambda (x) x)))) (else (lambda (x) x))))) ;*---------------------------------------------------------------------*/ ;* xml-grammar ... */ ;*---------------------------------------------------------------------*/ (define xml-grammar (regular-grammar ((id (: (in ("azAZ") "!?") (* (in ("azAZ09") ":_-")))) next make specials strict content-pos decoder encoding pos) (define (update-start-position!) (set! pos (input-port-position (the-port)))) (define (ignore-w/-pos-update!) (update-start-position!) (ignore)) ((+ (in " \t\n\r")) (update-start-position!) (if content-pos (list (the-string) (- pos (the-length))) (the-string))) ((: "<!--" (* (or (out "-") (: "-" (out "-")) (: "--" (out ">")))) "-->") (update-start-position!) (cons 'comment (the-string))) ((: "<!" (: (or (out "[-") (: "-" (out "-"))) (* (out ">]")) (? (: "[" (* (out "]")) "]")) (* (out ">"))) ">") (update-start-position!) (cons 'declaration (the-string))) ("<![CDATA[" (let ((cdata (read/rp cdata-grammar (the-port) decoder))) (update-start-position!) (cons 'cdata cdata))) ((: "<?xml " (* (out "?>")) "?>") (let ((s (the-substring 6 (the-length)))) (string-set! s (-fx (string-length s) 2) #\space) (let ((p (open-input-string s))) (let loop ((attr '())) (let ((obj (read/rp attribute-grammar p 'xml #t decoder))) (cond ((pair? obj) (loop (cons obj attr))) ((eq? obj '>) (update-start-position!) (cons 'xml-decl attr)))))))) ((: "<?" (* (out ">")) ">") (update-start-position!) (cons 'instruction (the-string))) ((: "<" id ">") (let* ((t (the-substring 1 (-fx (the-length) 1))) (ts (string->symbol t)) (p (the-port))) (let ((element (collect-up-to pos ignore-w/-pos-update! ts '() p make specials strict content-pos decoder encoding))) (update-start-position!) element))) ((: "<" id "/>") (let ((t (the-substring 1 (-fx (the-length) 2)))) (let ((p pos)) (update-start-position!) (make (string->symbol t) '() '() p)))) ((: "<" id (in " \n\t\r")) (let* ((t (the-substring 1 (-fx (the-length) 1))) (ts (string->symbol t)) (p (the-port))) (let loop ((attr '())) (let ((obj (read/rp attribute-grammar p t strict decoder))) (cond ((pair? obj) (loop (cons obj attr))) ((eq? obj '>) (let ((element (collect-up-to pos ignore-w/-pos-update! ts (reverse! attr) p make specials strict content-pos decoder encoding))) (update-start-position!) element)) ((eq? obj '/>) (let ((p pos)) (update-start-position!) (make ts (reverse! attr) '() p)))))))) ((: "</" id ">") (update-start-position!) (string->symbol (the-substring 2 (-fx (the-length) 1)))) ((+ (out "<")) (update-start-position!) (if content-pos (let ((pos2 (- pos (the-length)))) (list (decoder (the-string)) pos2)) (decoder (the-string)))) (else (let ((c (the-failure))) (cond ((not (eof-object? c)) (xml-parse-error "Illegal character" (error-line c (the-port)) (input-port-name (the-port)) (input-port-position (the-port)))) (else c)))))) ;*---------------------------------------------------------------------*/ ;* char-hexnumeric? ... */ ;*---------------------------------------------------------------------*/ (define (char-hexnumeric? c) (or (char-numeric? c) (and (char>=? c #\A) (char<=? c #\F)) (and (char>=? c #\a) (char<=? c #\f)))) ;*---------------------------------------------------------------------*/ ;* xml-string-decode-inner! ... */ ;*---------------------------------------------------------------------*/ (define (xml-string-decode-inner! str ol nl res) (define (char-value c) (cond ((char-numeric? c) (-fx (char->integer c) (char->integer #\0))) ((char<=? c #\F) (+fx 10 (-fx (char->integer c) (char->integer #\A)))) (else (+fx 10 (-fx (char->integer c) (char->integer #\a)))))) (let ((ol-2 (-fx ol 2))) (let loop ((i 0) (j 0)) (if (=fx j nl) res (let ((c (string-ref str i))) (if (and (char=? c #\%) (<fx i ol-2)) (let ((c1 (string-ref str (+fx i 1))) (c2 (string-ref str (+fx i 2)))) (if (and (char-hexnumeric? c1) (char-hexnumeric? c2)) (let* ((v1 (char-value c1)) (v2 (char-value c2)) (d (integer->char (+fx (*fx v1 16) v2)))) (string-set! res j d) (loop (+fx i 3) (+fx j 1))) (begin (string-set! res j c) (loop (+fx i 1) (+fx j 1))))) (begin (string-set! res j c) (loop (+fx i 1) (+fx j 1))))))))) ;*---------------------------------------------------------------------*/ ;* xml-count ... */ ;*---------------------------------------------------------------------*/ (define (xml-count str ol) (let loop ((i 0) (c 0)) (cond ((=fx i ol) c) ((char=? (string-ref str i) #\&) (cond ((substring-at? str "<" i) (loop (+fx i 4) (+fx c 1))) ((substring-at? str ">" i) (loop (+fx i 4) (+fx c 1))) ((substring-at? str "&" i) (loop (+fx i 5) (+fx c 1))) ((substring-at? str """ i) (loop (+fx i 6) (+fx c 1))) ((substring-at? str " " i) (loop (+fx i 6) (+fx c 1))) ((substring-at? str "&#" i) (let liip ((i (+fx i 2))) (cond ((=fx i ol) c) ((char-numeric? (string-ref str i)) (liip (+fx i 1))) (else (loop (+fx i 1) (+fx c 1)))))) (else (loop (+fx i 1) (+fx c 1))))) (else (loop (+fx i 1) (+fx c 1)))))) ;*---------------------------------------------------------------------*/ ;* xml-decode ... */ ;*---------------------------------------------------------------------*/ (define (xml-decode! str res ol nl) (let loop ((i 0) (j 0)) (cond ((=fx i ol) res) ((char=? (string-ref str i) #\&) (cond ((substring-at? str "<" i) (string-set! res j #\<) (loop (+fx i 4) (+fx j 1))) ((substring-at? str ">" i) (string-set! res j #\>) (loop (+fx i 4) (+fx j 1))) ((substring-at? str "&" i) (string-set! res j #\&) (loop (+fx i 5) (+fx j 1))) ((substring-at? str """ i) (string-set! res j #\") (loop (+fx i 6) (+fx j 1))) ((substring-at? str " " i) (string-set! res j #\space) (loop (+fx i 6) (+fx j 1))) ((substring-at? str "&#" i) (let liip ((i (+fx i 2)) (n 0)) (if (=fx i ol) res (let ((c (string-ref str i))) (if (char-numeric? c) (liip (+fx i 1) (+fx (*fx n 10) (-fx (char->integer c) (char->integer #\0)))) (begin (string-set! res j (integer->char n)) (loop (+fx i 1) (+fx j 1)))))))) (else (string-set! res j (string-ref str i)) (loop (+fx i 1) (+fx j 1))))) (else (string-set! res j (string-ref str i)) (loop (+fx i 1) (+fx j 1)))))) ;*---------------------------------------------------------------------*/ ;* xml-string-decode ... */ ;*---------------------------------------------------------------------*/ (define (xml-string-decode str) (let ((ol (string-length str))) (if (>=fx ol 3) (let ((nl (xml-count str ol))) (if (=fx nl ol) (string-copy str) (let ((res (make-string nl))) (xml-decode! str res ol nl) res))) (string-copy str)))) ;*---------------------------------------------------------------------*/ ;* xml-string-decode! ... */ ;*---------------------------------------------------------------------*/ (define (xml-string-decode! str) (let ((ol (string-length str))) (if (>=fx ol 3) (let ((nl (xml-count str ol))) (if (=fx nl ol) str (begin (xml-decode! str str ol nl) (string-shrink! str nl)))) str))) ;*---------------------------------------------------------------------*/ ;* xml-string-encode ... */ ;*---------------------------------------------------------------------*/ (define (xml-string-encode str) (define (count str ol) (let loop ((i 0) (n 0)) (if (=fx i ol) n (let ((c (string-ref str i))) (case c ((#\") (loop (+fx i 1) (+fx n 6))) ((#\&) (loop (+fx i 1) (+fx n 5))) ((#\< #\>) (loop (+fx i 1) (+fx n 4))) (else (loop (+fx i 1) (+fx n 1)))))))) (define (encode str ol nl) (if (=fx nl ol) str (let ((res (make-string nl))) (let loop ((i 0) (j 0)) (if (=fx j nl) res (let ((c (string-ref str i))) (case c ((#\<) (blit-string! "<" 0 res j 4) (loop (+fx i 1) (+fx j 4))) ((#\>) (blit-string! ">" 0 res j 4) (loop (+fx i 1) (+fx j 4))) ((#\&) (blit-string! "&" 0 res j 5) (loop (+fx i 1) (+fx j 5))) ((#\") (blit-string! """ 0 res j 6) (loop (+fx i 1) (+fx j 6))) (else (string-set! res j c) (loop (+fx i 1) (+fx j 1)))))))))) (let ((ol (string-length str))) (encode str ol (count str ol)))) ;*---------------------------------------------------------------------*/ ;* read-xml ... */ ;*---------------------------------------------------------------------*/ (define (read-xml #!optional (port::input-port (current-input-port))) (xml-parse port)) ;*---------------------------------------------------------------------*/ ;* xml-metadata ... */ ;*---------------------------------------------------------------------*/ (define (xml-metadata xml) (let ((xml-ver #f) (xml-enc #f) (xml-lang #f) (root-ver 0.0) (xml-root #f) (xml-ns '())) (let loop1 ((l xml)) (when (pair? l) (match-case (car l) ((xml-decl . (and ?attr (?- . ?-))) (for-each (lambda (at) (case (car at) ((version) (set! xml-ver (cdr attr))) ((encoding) (set! xml-enc (cdr attr))))) attr)) ((?mark ?lattr . ?-) (let loop3 ((lattr lattr)) (unless xml-root (set! xml-root mark)) (when (pair? lattr) (let ((attr (car lattr))) (case (car attr) ((xml:lang) (set! xml-lang (cdr attr))) ((xmlns) (set! xml-root (cons (cdr attr) xml-root))) ((version) (set! root-ver (string->number (cdr attr)))) (else (let ((str (symbol->string (car attr)))) (when (substring=? str "xmlns:" 6) (let* ((l (string-length str)) (s (substring str 6 l)) (si (string->symbol s))) (set! xml-ns (cons (cons (cdr attr) si) xml-ns))))))) (loop3 (cdr lattr))))))) (loop1 (cdr l)))) (unless xml-root (error "xml-metadata" "Empty XML document !" xml)) ;; Values are : ;; - XML Version (1.0 or 1.1) or #f ;; - XML Encoding (#f if unknown) ;; - xml:lang value ;; - Pair, which car is the first data markup, and ;; cdr the default namespace ;; - xml first data markup version attribute (0 if unspecified) ;; - list of prefixed namespaces (prefix . path) (values xml-ver xml-enc xml-lang xml-root root-ver xml-ns)))