Huang Jing <[email protected]> writes: > CC'd include-yy who has helped test the patch so he can join > the discussion more easily. > > So basically after the patch (HTML) export works fine, but the display > in Emacs Org buffer has some issues (a regression?). > > | 中文 +English+ 中文 vs 中文+English+中文 > | ^ with space ^ without space > > The version without spaces will be displayed correctly (with the > strike-through), while the one with spaces inserted in between (which > used to work before) does not get displayed correctly [1].
This is to be expected. The patch was very rough. I have now updated it to a more proper patch, adding full-fledged Unicode support to the markup: "breakable" characters like Chinese, generic Unicode categories for opening/closing punctuation, generic Unicode dashes, etc. are allowed with the attached patch. Something similar might need to be done for radio targets/links and for plain links like 你好file:foo.org
From 8ec306d2015d00a7158098fd76b979248d6b7e07 Mon Sep 17 00:00:00 2001 Message-ID: <8ec306d2015d00a7158098fd76b979248d6b7e07.1766228647.git.yanta...@posteo.net> From: Ihor Radchenko <[email protected]> Date: Sat, 20 Dec 2025 11:58:16 +0100 Subject: [PATCH] WIP: Org markup: Allow Unicode punctuation and breakable symbols around emphasis * lisp/org-element.el (org-element-category-table): Define custom category table adding opening/closing punctuation, opening/closing quotes, dashes, and auxiliary punctuation. (org-element--parse-generic-emphasis): Extend allowed characters around emphasis to generic opening/closing punctuation, quote punctuation, dash-likes, and auxiliary ,-like punctuation. Also, allow breakable characters, like Chinese/Japanese symbols for languages that do not use spaces. * lisp/org.el (org-mode): Setup category table. (org-emphasis-regexp-components): Allow pre/post to be nil to follow the new defaults. Change the default values of pre/post to nil. (org-set-emph-re): (org-do-emphasis-faces): (org-emphasize): Fall back to parser defaults when pre/post in `org-emphasis-regexp-components' is nil. 
--- lisp/org-element.el | 50 +++++++++++++++++++++++++++++-- lisp/org.el | 71 ++++++++++++++++++++++++++++++++++----------- 2 files changed, 102 insertions(+), 19 deletions(-) diff --git a/lisp/org-element.el b/lisp/org-element.el index 0b51b4524..54df11d91 100644 --- a/lisp/org-element.el +++ b/lisp/org-element.el @@ -3323,6 +3323,38 @@ ;;; Objects ;;;; Bold +(defvar org-element-category-table + (let ((category-table (copy-category-table)) + (uniprop-table (unicode-property-table-internal 'general-category))) + ;; Define categories + (define-category ?{ "Opening punctuation" category-table) + (define-category ?} "Closing punctuation" category-table) + (define-category ?\[ "Initial quote" category-table) + (define-category ?\] "Final quote" category-table) + (define-category ?- "Dash" category-table) + (define-category ?, "Other punctuation" category-table) + ;; Map characters to categories according to their general-category + (map-char-table + (lambda (key val) + (pcase val + ('Ps (modify-category-entry key ?{ category-table)) + ('Pe (modify-category-entry key ?} category-table)) + ('Pi (modify-category-entry key ?\[ category-table)) + ('Pf (modify-category-entry key ?\] category-table)) + ('Pd (modify-category-entry key ?- category-table)) + ('Po (modify-category-entry key ?, category-table)))) + uniprop-table) + category-table) + "Category table for Org buffers. +The table defines additional Unicode categories: +- ?{ for opening punctuation +- ?} for closing punctuation +- ?[ for opening quote +- ?] for closing quote +- ?- for dash-like +- ?, for other punctuation. +These categories are necessary for parsing emphasis.") + (defun org-element--parse-generic-emphasis (mark type) "Parse emphasis object at point, if any. @@ -3336,7 +3368,14 @@ (defun org-element--parse-generic-emphasis (mark type) (unless (bolp) (forward-char -1)) (let ((opening-re (rx-to-string - `(seq (or line-start (any space ?- ?\( ?' 
?\" ?\{)) + `(seq (or line-start space + ;; opening punctuation + (category ?{) (category ?\[) + ;; dashes, other punctuation + (category ?-) (category ?,) + ;; Chinese, Japanese, and other breakable + ;; characters + (category ?|)) ,mark (not space))))) (when (looking-at-p opening-re) @@ -3346,7 +3385,14 @@ (defun org-element--parse-generic-emphasis (mark type) `(seq (not space) (group ,mark) - (or (any space ?- ?. ?, ?\; ?: ?! ?? ?' ?\" ?\) ?\} ?\\ ?\[) + (or space + ;; closing punctuation + (category ?}) (category ?\]) + ;; dashes, other punctuation + (category ?-) (category ?,) + ;; Chinese, Japanese, and other breakable + ;; characters + (category ?|) line-end))))) (when (re-search-forward closing-re nil t) (let ((closing (match-end 1))) diff --git a/lisp/org.el b/lisp/org.el index 910c075cd..720c8abf6 100644 --- a/lisp/org.el +++ b/lisp/org.el @@ -3858,10 +3858,22 @@ (defun org-set-emph-re (var val) (body (if (<= nl 0) body (format "%s*?\\(?:\n%s*?\\)\\{0,%d\\}" body body nl))) (template - (format (concat "\\([%s]\\|^\\)" ;before markers + ;; See `org-element--parse-generic-emphasis' + (format (concat "\\(%s\\)" ;before markers "\\(\\([%%s]\\)\\([^%s]\\|[^%s]%s[^%s]\\)\\3\\)" "\\([%s]\\|$\\)") ;after markers - pre border border body border post))) + (if pre (format "[%s]\\|^" pre) + (rx (or line-start space + (category ?{) (category ?\[) + (category ?-) (category ?,) + (category ?|)))) + border border body border + (if post (format "[%s]\\|$" post) + (rx (or space + (category ?}) (category ?\]) + (category ?-) (category ?,) + (category ?|) + line-end)))))) (setq org-emph-re (format template "*/_+")) (setq org-verbatim-re (format template "=~"))))) @@ -3869,7 +3881,7 @@ (defun org-set-emph-re (var val) ;; set this option proved cumbersome. See this message/thread: ;; https://orgmode.org/list/[email protected] (defvar org-emphasis-regexp-components - '("-[:space:]('\"{" "-[:space:].,:!?;'\")}\\[" "[:space:]" "." 1) + '(nil nil "[:space:]" "." 
1) "Components used to build the regular expression for FONTIFYING emphasis. WARNING: This variable only affects visual fontification, but does not change Org markup. For example, it does not affect how emphasis markup @@ -3882,7 +3894,9 @@ (defvar org-emphasis-regexp-components specify what is allowed/forbidden in each part: pre Chars allowed as prematch. Beginning of line will be allowed too. + nil means use parser defaults. post Chars allowed as postmatch. End of line will be allowed too. + nil means use parser defaults. border The chars *forbidden* as border characters. body-regexp A regexp like \".\" to match a body character. Don't use non-shy groups here, and don't allow newline here. @@ -5127,6 +5141,9 @@ (define-derived-mode org-mode outline-mode "Org" (org-install-agenda-files-menu)) (setq-local outline-regexp org-outline-regexp) (setq-local outline-level 'org-outline-level) + (require 'org-element) + (defvar org-element-category-table) ; org-element.el + (set-category-table org-element-category-table) ;; Initialize cache. (org-element-cache-reset) (when (and org-element-cache-persistent @@ -5402,8 +5419,14 @@ (defsubst org-rear-nonsticky-at (pos) (defun org-do-emphasis-faces (limit) "Run through the buffer and emphasize strings." - (let ((quick-re (format "\\([%s]\\|^\\)\\([~=*/_+]\\)" - (car org-emphasis-regexp-components)))) + (let ((quick-re (format "\\(%s\\)\\([~=*/_+]\\)" + (if (car org-emphasis-regexp-components) + (format "[%s]\\|^" (car org-emphasis-regexp-components)) + ;; See `org-element--parse-generic-emphasis' + (rx (or line-start space + (category ?{) (category ?\[) + (category ?-) (category ?,) + (category ?|))))))) (catch :exit (while (re-search-forward quick-re limit t) (let* ((marker (match-string 2)) @@ -5413,24 +5436,24 @@ (defun org-do-emphasis-faces (limit) (and ;; Do not match table hlines. 
(not (and (equal marker "+") - (org-match-line - "[ \t]*\\(|[-+]+|?\\|\\+[-+]+\\+\\)[ \t]*$"))) + (org-match-line + "[ \t]*\\(|[-+]+|?\\|\\+[-+]+\\+\\)[ \t]*$"))) ;; Do not match headline stars. Do not consider ;; stars of a headline as closing marker for bold ;; markup either. (not (and (equal marker "*") - (save-excursion - (forward-char) - (skip-chars-backward "*") - (looking-at-p org-outline-regexp-bol)))) + (save-excursion + (forward-char) + (skip-chars-backward "*") + (looking-at-p org-outline-regexp-bol)))) ;; Match full emphasis markup regexp. (looking-at (if verbatim? org-verbatim-re org-emph-re)) ;; Do not span over paragraph boundaries. (not (string-match-p org-element-paragraph-separate - (match-string 2))) + (match-string 2))) ;; Do not span over cells in table rows. (not (and (save-match-data (org-match-line "[ \t]*|")) - (string-match-p "|" (match-string 4)))))) + (string-match-p "|" (match-string 4)))))) (pcase-let ((`(,_ ,face ,_) (assoc marker org-emphasis-alist)) (m (if org-hide-emphasis-markers 4 2))) (font-lock-prepend-text-property @@ -5495,12 +5518,26 @@ (defun org-emphasize (&optional char) (setq string (concat s string s)) (when beg (delete-region beg end)) (unless (or (bolp) - (string-match (concat "[" (nth 0 erc) "\n]") - (char-to-string (char-before (point))))) + (string-match + (if (nth 0 erc) (concat "[" (nth 0 erc) "\n]") + ;; See `org-element--parse-generic-emphasis' + (rx (or space + (category ?{) (category ?\[) + (category ?-) (category ?,) + (category ?|) + "\n"))) + (char-to-string (char-before (point))))) (insert " ")) (unless (or (eobp) - (string-match (concat "[" (nth 1 erc) "\n]") - (char-to-string (char-after (point))))) + (string-match + ;; See `org-element--parse-generic-emphasis' + (if (nth 1 erc) (concat "[" (nth 1 erc) "\n]") + (rx (or space + (category ?}) (category ?\]) + (category ?-) (category ?,) + (category ?|) + "\n"))) + (char-to-string (char-after (point))))) (insert " ") (backward-char 1)) (insert string) (and 
move (backward-char 1)))) -- 2.50.1
-- Ihor Radchenko // yantar92, Org mode maintainer, Learn more about Org mode at <https://orgmode.org/>. Support Org development at <https://liberapay.com/org-mode>, or support my work at <https://liberapay.com/yantar92>
