branch: elpa/parseclj
commit 22f2eb106fd37272b64c4bfde6c388d308358463
Author: Arne Brasseur <[email protected]>
Commit: Arne Brasseur <[email protected]>
Support \uxxxx and \oxxx escape codes in strings
---
clj-lex-test.el | 7 ++++++-
clj-parse-test.el | 7 ++++++-
clj-parse.el | 42 ++++++++++++++++++++++++++----------------
3 files changed, 38 insertions(+), 18 deletions(-)
diff --git a/clj-lex-test.el b/clj-lex-test.el
index eeabcdc4db..426698303a 100644
--- a/clj-lex-test.el
+++ b/clj-lex-test.el
@@ -89,7 +89,12 @@
(insert "\\u0078\\o170")
(goto-char 1)
(should (equal (clj-lex-next) (clj-lex-token :character "\\u0078" 1)))
- (should (equal (clj-lex-next) (clj-lex-token :character "\\o170" 7)))))
+ (should (equal (clj-lex-next) (clj-lex-token :character "\\o170" 7))))
+
+ (with-temp-buffer
+ (insert "\"\\u0078\\o170\"")
+ (goto-char 1)
+ (should (equal (clj-lex-next) (clj-lex-token :string "\"\\u0078\\o170\""
1)))))
(ert-deftest clj-lex-test-at-number? ()
(dolist (str '("123" ".9" "+1" "0" "-456"))
diff --git a/clj-parse-test.el b/clj-parse-test.el
index 2c003cb17b..fae8cc03e5 100644
--- a/clj-parse-test.el
+++ b/clj-parse-test.el
@@ -64,7 +64,12 @@
(with-temp-buffer
(insert "(\\newline \\return \\space \\tab \\a \\b \\c \\u0078 \\o171)")
(goto-char 1)
- (should (equal (clj-parse) '((?\n ?\r ?\ ?\t ?a ?b ?c ?x ?y))))))
+ (should (equal (clj-parse) '((?\n ?\r ?\ ?\t ?a ?b ?c ?x ?y)))))
+
+ (with-temp-buffer
+ (insert "\"\\u0078 \\o171\"")
+ (goto-char 1)
+ (should (equal (clj-parse) '("x y")))))
(provide 'clj-parse-test)
diff --git a/clj-parse.el b/clj-parse.el
index 610a98eac0..004901090b 100644
--- a/clj-parse.el
+++ b/clj-parse.el
@@ -39,23 +39,33 @@
:character)
"Tokens that represent leaf nodes in the AST.")
-;; Java/JavaScript strings support other escape codes like "\u0111", but
-;; these are the only ones mentioned in the EDN spec.
-;; Although of course for bare characters
+;; The EDN spec is not clear about whether \u0123 and \o012 are supported in
+;; strings. They are described as character literals, but not as string escape
+;; codes. In practice all implementations support them (mostly with broken
+;; surrogate pair support), so we do the same. Sorry, emoji 🙁.
+;;
+;; Note that this is kind of broken: we don't correctly detect when \u or \o
+;; escapes are malformed.
(defun clj-parse-string (s)
- (replace-regexp-in-string "\\\\[tbnrf'\"\\]"
- (lambda (x)
- (cl-case (elt x 1)
- (?t "\t")
- (?f "\f")
- (?\" "\"")
- (?r "\r")
- (?n "\n")
- (?\\ "\\\\")
- (t (substring x 1 2))))
- (substring s 1 -1)))
-
-
+  "Return the text of the string literal S with its escape codes decoded.
+S still has its surrounding double quotes; they are stripped first.
+Decodes backslash followed by one of t b n r f ' \" or a backslash, by
+u and four hex digits, or by o and three octal digits.  Everything is
+decoded in one pass, so decoded text is never scanned for escapes again."
+  (let ((case-fold-search nil))         ; \N or \U0041 are not escape codes
+    (replace-regexp-in-string
+     "\\\\\\(?:u[0-9a-fA-F]\\{4\\}\\|o[0-7]\\{3\\}\\|[tbnrf'\"\\]\\)"
+     (lambda (x)
+       (cl-case (elt x 1)
+         (?u (string (string-to-number (substring x 2) 16)))
+         (?o (string (string-to-number (substring x 2) 8)))
+         (?t "\t")
+         (?b "\b")
+         (?f "\f")
+         (?r "\r")
+         (?n "\n")
+         (t (substring x 1))))          ; \' \" and \\ stand for themselves
+     (substring s 1 -1) t t)))          ; FIXEDCASE and LITERAL replacement
(defun clj-parse-character (c)
(let* ((form (cdr (assq 'form token)))