branch: externals/xr commit a24897795ed811ce58d593fb2472723268e27391 Author: Mattias Engdegård <matti...@acm.org> Commit: Mattias Engdegård <matti...@acm.org>
Parse and lint skip set strings Add functions to parse and lint skip set strings, which are the arguments to `skip-chars-forward' and `skip-chars-backward': xr-skip-set xr-skip-set-pp xr-skip-set-lint Increment the version to 1.7. --- xr-test.el | 53 +++++++++++++++ xr.el | 221 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 261 insertions(+), 13 deletions(-) diff --git a/xr-test.el b/xr-test.el index 4d197bf..860070c 100644 --- a/xr-test.el +++ b/xr-test.el @@ -362,6 +362,59 @@ nil)) ) +(ert-deftest xr-skip-set () + (should (equal (xr-skip-set "0-9a-fA-F+*") + '(any "0-9a-fA-F" "+*"))) + (should (equal (xr-skip-set "^ab-ex-") + '(not (any "b-e" "ax-")))) + (should (equal (xr-skip-set "-^][\\") + '(any "^][-"))) + (should (equal (xr-skip-set "\\^a\\-bc-\\fg") + '(any "c-f" "^abg-"))) + (should (equal (xr-skip-set "\\") + '(any))) + (should (equal (xr-skip-set "--3^Q-\\") + '(any "--3Q-\\" "^"))) + (should (equal (xr-skip-set "^Q-\\c-\\n") + '(not (any "Q-c" "n-")))) + (should (equal (xr-skip-set "\\\\A-") + '(any "\\A-"))) + (should (equal (xr-skip-set "[a-z]") + '(any "a-z" "[]"))) + (should (equal (xr-skip-set "[:ascii:]-[:digit:]") + '(any "-" ascii digit))) + (should (equal (xr-skip-set "A-[:blank:]") + '(any "A-[" ":blank]"))) + (should (equal (xr-skip-set "\\[:xdigit:]-b") + '(any "]-b" "[:xdigt"))) + (should (equal (xr-skip-set "^a-z+" 'terse) + '(not (in "a-z" "+")))) + (should-error (xr-skip-set "[::]")) + (should-error (xr-skip-set "[:whitespace:]")) + (should (equal (xr-skip-set ".") + "\\.")) + (should (equal (xr-skip-set "^") + 'anything)) + (should (equal (xr-skip-set "^[:print:]") + '(not print))) + ) + +(ert-deftest xr-skip-set-lint () + (should (equal (xr-skip-set-lint "A[:ascii:]B[:space:][:ascii:]") + '((20 . "Duplicated character class `[:ascii:]'")))) + (should (equal (xr-skip-set-lint "a\\bF-AM-M\\") + '((1 . "Unnecessarily escaped `b'") + (3 . "Reversed range `F-A'") + (6 . "Single-element range `M-M'") + (9 . "Stray `\\' at end of string")))) + (should (equal (xr-skip-set-lint "A-Fa-z3D-K!3-7\\!b") + '((7 . "Ranges `A-F' and `D-K' overlap") + (11 . "Range `3-7' includes character `3'") + (14 . "Duplicated character `!'") + (14 . "Unnecessarily escaped `!'") + (16 . "Character `b' included in range `a-z'")))) + ) + (provide 'xr-test) ;;; xr-test.el ends here diff --git a/xr.el b/xr.el index eba3a10..2d1eff8 100644 --- a/xr.el +++ b/xr.el @@ -3,7 +3,7 @@ ;; Copyright (C) 2019 Free Software Foundation, Inc. ;; Author: Mattias Engdegård <matti...@acm.org> -;; Version: 1.6 +;; Version: 1.7 ;; Keywords: lisp, maint, regexps ;; This program is free software; you can redistribute it and/or modify @@ -30,11 +30,24 @@ ;; ;; Please refer to `rx' for more information about the notation. ;; -;; The exported functions are: +;; In addition to Emacs regexps, this package can also parse and +;; troubleshoot skip set strings, which are arguments to +;; `skip-chars-forward' and `skip-chars-backward'. +;; +;; The exported functions for regexps are: +;; +;; `xr' - returns the converted rx expression +;; `xr-pp' - converts to rx and pretty-prints +;; `xr-lint' - finds mistakes in a regexp string +;; +;; For skip sets we also have: +;; +;; `xr-skip-set' - return the converted rx expression +;; `xr-skip-set-pp' - converts to rx and pretty-prints +;; `xr-skip-set-lint' - finds mistakes in a skip set string +;; +;; There is finally the generally useful: ;; -;; `xr' - returns the converted rx expression -;; `xr-pp' - pretty-prints the converted rx expression -;; `xr-lint' - finds deprecated syntax in a regexp string ;; `xr-pp-rx-to-str' - pretty-prints an rx expression to a string ;; ;; Suggested use is from an interactive elisp buffer. @@ -591,6 +604,153 @@ (error "Unbalanced \\)")) rx))) +;; Grammar for skip-set strings: +;; +;; skip-set ::= `^'? item* +;; item ::= range | single +;; range ::= single `-' end +;; single ::= (any char but `\') +;; | `\' (any char) +;; end ::= single | `\' +;; +;; The grammar is ambiguous, resolved left-to-right: +;; - a leading ^ is always a negation marker +;; - an item is always a range if possible +;; - an end is only `\' if last in the string + +(defun xr--parse-skip-set-buffer (warnings) + (let ((negated (looking-at (rx "^"))) + (ranges nil) + (classes nil)) + (when negated + (forward-char 1)) + (while (not (eobp)) + (cond + ((looking-at (rx "[:" (group (*? anything)) ":]")) + (let ((sym (intern (match-string 1)))) + (unless (memq sym + '(ascii alnum alpha blank cntrl digit graph + lower multibyte nonascii print punct space + unibyte upper word xdigit)) + (error "No character class `%s'" (match-string 0))) + (when (memq sym classes) + (xr--report warnings (point) + (format "Duplicated character class `%s'" + (match-string 0)))) + (push sym classes))) + + ((looking-at (rx (or (seq "\\" (group anything)) + (group (not (any "\\")))) + (opt "-" + (or (seq "\\" (group anything)) + (group anything))))) + (let ((start (string-to-char (or (match-string 1) + (match-string 2)))) + (end (or (and (match-beginning 3) + (string-to-char (match-string 3))) + (and (match-beginning 4) + (string-to-char (match-string 4)))))) + (when (and (match-beginning 1) + (not (memq start '(?^ ?- ?\\)))) + (xr--report warnings (point) + (xr--escape-string + (format "Unnecessarily escaped `%c'" start) nil))) + (if (and end (> start end)) + (xr--report warnings (point) + (xr--escape-string + (format "Reversed range `%c-%c'" start end) nil)) + (when (eq start end) + (xr--report warnings (point) + (xr--escape-string + (format "Single-element range `%c-%c'" start end) + nil)) + (setq end nil)) + (let ((tail ranges)) + (while tail + (let ((range (car tail))) + (if (and (<= (car range) (or end start)) + (<= start (cdr range))) + (let ((msg + (cond + ((and end (< start end) + (< (car range) (cdr range))) + (format "Ranges `%c-%c' and `%c-%c' overlap" + (car range) (cdr range) start end)) + ((and end (< start end)) + (format "Range `%c-%c' includes character `%c'" + start end (car range))) + ((< (car range) (cdr range)) + (format + "Character `%c' included in range `%c-%c'" + start (car range) (cdr range))) + (t + (format "Duplicated character `%c'" + start))))) + (xr--report warnings (point) + (xr--escape-string msg nil)) + ;; Expand previous interval to include this range. + (setcar range (min (car range) start)) + (setcdr range (max (cdr range) (or end start))) + (setq start nil) + (setq tail nil)) + (setq tail (cdr tail)))))) + (when start + (push (cons start (or end start)) ranges))))) + + ((looking-at (rx "\\" eos)) + (xr--report warnings (point) + "Stray `\\' at end of string"))) + + (goto-char (match-end 0))) + + (cond + ;; Single non-negated character, like "-": make a string. + ((and (not negated) + (null classes) + (= (length ranges) 1) + (eq (caar ranges) (cdar ranges))) + (regexp-quote (char-to-string (caar ranges)))) + ;; Negated empty set, like "^": anything. + ((and negated + (null classes) + (null ranges)) + 'anything) + ;; Single named class, like "[:nonascii:]": use the symbol. + ((and (= (length classes) 1) + (null ranges)) + (if negated + (list 'not (car classes)) + (car classes))) + ;; Anything else: produce (any ...) + (t + (let ((intervals nil) + (chars nil)) + (mapc (lambda (range) + (if (eq (car range) (cdr range)) + (push (car range) chars) + (push (string (car range) ?- (cdr range)) intervals))) + ranges) + ;; Put a single `-' last. + (when (memq ?- chars) + (setq chars (append (delq ?- chars) (list ?-)))) + (let ((set (cons 'any + (append + (and intervals + (list (apply #'concat intervals))) + (and chars + (list (apply #'string chars))) + (nreverse classes))))) + (if negated + (list 'not set) + set))))))) + +(defun xr--parse-skip-set (skip-string warnings) + (with-temp-buffer + (set-buffer-multibyte t) + (insert skip-string) + (goto-char (point-min)) + (xr--parse-skip-set-buffer warnings))) + ;; Substitute keywords in RX using HEAD-ALIST and BODY-ALIST in the ;; head and body positions, respectively. (defun xr--substitute-keywords (head-alist body-alist rx) @@ -631,6 +791,14 @@ (bow . word-start) (eow . word-end)))))) +(defun xr--in-dialect (rx dialect) + (let ((keywords (assq (or dialect 'medium) xr--keywords))) + (unless keywords + (error "Unknown dialect `%S'" dialect)) + (if (cdr keywords) + (xr--substitute-keywords (cadr keywords) (cddr keywords) rx) + rx))) + ;;;###autoload (defun xr (re-string &optional dialect) "Convert a regexp string to rx notation; the inverse of `rx'. @@ -641,17 +809,22 @@ and is one of: `brief' -- short keywords `terse' -- very short keywords `medium' or nil -- a compromise (the default)" - (let ((keywords (assq (or dialect 'medium) xr--keywords))) - (unless keywords - (error "Unknown dialect `%S'" dialect)) - (let ((rx (xr--parse re-string nil))) - (if (cdr keywords) - (xr--substitute-keywords (cadr keywords) (cddr keywords) rx) - rx)))) + (xr--in-dialect (xr--parse re-string nil) dialect)) + +;;;###autoload +(defun xr-skip-set (skip-set-string &optional dialect) + "Convert a skip set string argument to rx notation. +SKIP-SET-STRING is interpreted according to the syntax of +`skip-chars-forward' and `skip-chars-backward' and converted to +a character class on `rx' form. +If desired, `rx' can then be used to convert the result to an +ordinary regexp. +See `xr' for a description of the DIALECT argument." + (xr--in-dialect (xr--parse-skip-set skip-set-string nil) dialect)) ;;;###autoload (defun xr-lint (re-string) - "Detect dubious practices in RE-STRING. + "Detect dubious practices and possible mistakes in RE-STRING. This includes uses of tolerated but discouraged constructs. Outright regexp syntax violations are signalled as errors. Return a list of (OFFSET . COMMENT) where COMMENT applies at OFFSET @@ -660,6 +833,19 @@ in RE-STRING." (xr--parse re-string warnings) (sort (car warnings) #'car-less-than-car))) +;;;###autoload +(defun xr-skip-set-lint (skip-set-string) + "Detect dubious practices and possible mistakes in SKIP-SET-STRING. +This includes uses of tolerated but discouraged constructs. +Outright syntax violations are signalled as errors. +The argument is interpreted according to the syntax of +`skip-chars-forward' and `skip-chars-backward'. +Return a list of (OFFSET . COMMENT) where COMMENT applies at OFFSET +in SKIP-SET-STRING." + (let ((warnings (list nil))) + (xr--parse-skip-set skip-set-string warnings) + (sort (car warnings) #'car-less-than-car))) + ;; Escape non-printing characters in a string for maximum readability. ;; If ESCAPE-PRINTABLE, also escape \ and ", otherwise don't. (defun xr--escape-string (string escape-printable) @@ -738,6 +924,15 @@ It is intended for use from an interactive elisp session. See `xr' for a description of the DIALECT argument." (insert (xr-pp-rx-to-str (xr re-string dialect)))) +;;;###autoload +(defun xr-skip-set-pp (skip-set-string &optional dialect) + "Convert a skip set string to `rx' notation and pretty-print. +This function uses `xr-skip-set' to translate SKIP-SET-STRING +into DIALECT. +It is intended for use from an interactive elisp session. +See `xr' for a description of the DIALECT argument." + (insert (xr-pp-rx-to-str (xr-skip-set skip-set-string dialect)))) + (provide 'xr) ;;; xr.el ends here