Huang Jing <[email protected]> writes:

> CC'd include-yy who has helped testing out the patch so he can join
> the discussion more easily.
>
> So basically after the patch (HTML) export works fine, but the display
> in Emacs Org buffer has some issues (a regression?).
>
>    | 中文 +English+ 中文 vs 中文+English+中文
>    |    ^ with space          ^ without space
>
> The first version without spaces will be displayed correctly (with the
> crossover), while the second one with space inserted between (which
> used to work before) does not get displayed correctly [1].

This is to be expected. The patch was very rough.
I have now updated it to a more proper patch that adds full-fledged Unicode
support to the markup: "breakable" characters like Chinese, generic
Unicode categories for opening/closing punctuation, generic Unicode
dashes, etc. are allowed with the attached patch.

Something similar might need to be done for radio targets/links and for
plain links like 你好file:foo.org

From 8ec306d2015d00a7158098fd76b979248d6b7e07 Mon Sep 17 00:00:00 2001
Message-ID: <8ec306d2015d00a7158098fd76b979248d6b7e07.1766228647.git.yanta...@posteo.net>
From: Ihor Radchenko <[email protected]>
Date: Sat, 20 Dec 2025 11:58:16 +0100
Subject: [PATCH] WIP: Org markup: Allow Unicode punctuation and breakable
 symbols around emphasis

* lisp/org-element.el (org-element-category-table): Define custom
category table adding opening/closing punctuation, opening/closing
quotes, dashes, and auxiliary punctuation.
(org-element--parse-generic-emphasis): Extend allowed characters
around emphasis to generic opening/closing punctuation, quote
punctuation, dash-likes, and auxiliary ,-like punctuation.  Also,
allow breakable characters, like Chinese/Japanese symbols for
languages that do not use spaces.
* lisp/org.el (org-mode): Setup category table.
(org-emphasis-regexp-components): Allow pre/post to be nil to follow
the new defaults.  Change the default values of pre/post to nil.
(org-set-emph-re):
(org-do-emphasis-faces):
(org-emphasize): Fall back to parser defaults when pre/post in
`org-emphasis-regexp-components' is nil.
---
 lisp/org-element.el | 50 +++++++++++++++++++++++++++++--
 lisp/org.el         | 71 ++++++++++++++++++++++++++++++++++-----------
 2 files changed, 102 insertions(+), 19 deletions(-)

diff --git a/lisp/org-element.el b/lisp/org-element.el
index 0b51b4524..54df11d91 100644
--- a/lisp/org-element.el
+++ b/lisp/org-element.el
@@ -3323,6 +3323,38 @@ ;;; Objects
 
 ;;;; Bold
 
+(defvar org-element-category-table
+  (let ((category-table (copy-category-table))
+        (uniprop-table (unicode-property-table-internal 'general-category)))
+    ;; Define categories
+    (define-category ?{ "Opening punctuation" category-table)
+    (define-category ?} "Closing punctuation" category-table)
+    (define-category ?\[ "Initial quote" category-table)
+    (define-category ?\] "Final quote" category-table)
+    (define-category ?- "Dash" category-table)
+    (define-category ?, "Other punctuation" category-table)
+    ;; Map characters to categories according to their general-category
+    (map-char-table
+     (lambda (key val)
+       (pcase val
+         ('Ps (modify-category-entry key ?{ category-table))
+         ('Pe (modify-category-entry key ?} category-table))
+         ('Pi (modify-category-entry key ?\[ category-table))
+         ('Pf (modify-category-entry key ?\] category-table))
+         ('Pd (modify-category-entry key ?- category-table))
+         ('Po (modify-category-entry key ?, category-table))))
+     uniprop-table)
+    category-table)
+  "Category table for Org buffers.
+The table defines additional Unicode categories:
+- ?{ for opening punctuation
+- ?} for closing punctuation
+- ?[ for opening quote
+- ?] for closing quote
+- ?- for dash-like
+- ?, for other punctuation.
+These categories are necessary for parsing emphasis.")
+
 (defun org-element--parse-generic-emphasis (mark type)
   "Parse emphasis object at point, if any.
 
@@ -3336,7 +3368,14 @@ (defun org-element--parse-generic-emphasis (mark type)
       (unless (bolp) (forward-char -1))
       (let ((opening-re
              (rx-to-string
-              `(seq (or line-start (any space ?- ?\( ?' ?\" ?\{))
+              `(seq (or line-start space
+                        ;; opening punctuation
+                        (category ?{) (category ?\[)
+                        ;; dashes, other punctuation
+                        (category ?-) (category ?,)
+                        ;; Chinese, Japanese, and other breakable
+                        ;; characters
+                        (category ?|))
                     ,mark
                     (not space)))))
         (when (looking-at-p opening-re)
@@ -3346,7 +3385,14 @@ (defun org-element--parse-generic-emphasis (mark type)
                   `(seq
                     (not space)
                     (group ,mark)
-                    (or (any space ?- ?. ?, ?\; ?: ?! ?? ?' ?\" ?\) ?\} ?\\ ?\[)
+                    (or space
+                        ;; closing punctuation
+                        (category ?}) (category ?\])
+                        ;; dashes, other punctuation
+                        (category ?-) (category ?,)
+                        ;; Chinese, Japanese, and other breakable
+                        ;; characters
+                        (category ?|)
                         line-end)))))
             (when (re-search-forward closing-re nil t)
               (let ((closing (match-end 1)))
diff --git a/lisp/org.el b/lisp/org.el
index 910c075cd..720c8abf6 100644
--- a/lisp/org.el
+++ b/lisp/org.el
@@ -3858,10 +3858,22 @@ (defun org-set-emph-re (var val)
 	 (body (if (<= nl 0) body
 		 (format "%s*?\\(?:\n%s*?\\)\\{0,%d\\}" body body nl)))
 	 (template
-	  (format (concat "\\([%s]\\|^\\)" ;before markers
+          ;; See `org-element--parse-generic-emphasis'
+	  (format (concat "\\(%s\\)" ;before markers
 			  "\\(\\([%%s]\\)\\([^%s]\\|[^%s]%s[^%s]\\)\\3\\)"
 			  "\\([%s]\\|$\\)") ;after markers
-		  pre border border body border post)))
+		  (if pre (format "[%s]\\|^" pre)
+                    (rx (or line-start space
+                            (category ?{) (category ?\[)
+                            (category ?-) (category ?,)
+                            (category ?|))))
+                  border border body border
+                  (if post (format "[%s]\\|$" post)
+                    (rx (or space
+                            (category ?}) (category ?\])
+                            (category ?-) (category ?,)
+                            (category ?|)
+                            line-end))))))
       (setq org-emph-re (format template "*/_+"))
       (setq org-verbatim-re (format template "=~")))))
 
@@ -3869,7 +3881,7 @@ (defun org-set-emph-re (var val)
 ;; set this option proved cumbersome.  See this message/thread:
 ;; https://orgmode.org/list/[email protected]
 (defvar org-emphasis-regexp-components
-  '("-[:space:]('\"{" "-[:space:].,:!?;'\")}\\[" "[:space:]" "." 1)
+  '(nil nil "[:space:]" "." 1)
   "Components used to build the regular expression for FONTIFYING emphasis.
 WARNING: This variable only affects visual fontification, but does not
 change Org markup.  For example, it does not affect how emphasis markup
@@ -3882,7 +3894,9 @@ (defvar org-emphasis-regexp-components
 specify what is allowed/forbidden in each part:
 
 pre          Chars allowed as prematch.  Beginning of line will be allowed too.
+             nil means use parser defaults.
 post         Chars allowed as postmatch.  End of line will be allowed too.
+             nil means use parser defaults.
 border       The chars *forbidden* as border characters.
 body-regexp  A regexp like \".\" to match a body character.  Don't use
              non-shy groups here, and don't allow newline here.
@@ -5127,6 +5141,9 @@ (define-derived-mode org-mode outline-mode "Org"
     (org-install-agenda-files-menu))
   (setq-local outline-regexp org-outline-regexp)
   (setq-local outline-level 'org-outline-level)
+  (require 'org-element)
+  (defvar org-element-category-table) ; org-element.el
+  (set-category-table org-element-category-table)
   ;; Initialize cache.
   (org-element-cache-reset)
   (when (and org-element-cache-persistent
@@ -5402,8 +5419,14 @@ (defsubst org-rear-nonsticky-at (pos)
 
 (defun org-do-emphasis-faces (limit)
   "Run through the buffer and emphasize strings."
-  (let ((quick-re (format "\\([%s]\\|^\\)\\([~=*/_+]\\)"
-			  (car org-emphasis-regexp-components))))
+  (let ((quick-re (format "\\(%s\\)\\([~=*/_+]\\)"
+			  (if (car org-emphasis-regexp-components)
+                              (format "[%s]\\|^" (car org-emphasis-regexp-components))
+                            ;; See `org-element--parse-generic-emphasis'
+                            (rx (or line-start space
+                                    (category ?{) (category ?\[)
+                                    (category ?-) (category ?,)
+                                    (category ?|)))))))
     (catch :exit
       (while (re-search-forward quick-re limit t)
 	(let* ((marker (match-string 2))
@@ -5413,24 +5436,24 @@ (defun org-do-emphasis-faces (limit)
 		  (and
 		   ;; Do not match table hlines.
 		   (not (and (equal marker "+")
-			     (org-match-line
-			      "[ \t]*\\(|[-+]+|?\\|\\+[-+]+\\+\\)[ \t]*$")))
+			   (org-match-line
+			    "[ \t]*\\(|[-+]+|?\\|\\+[-+]+\\+\\)[ \t]*$")))
 		   ;; Do not match headline stars.  Do not consider
 		   ;; stars of a headline as closing marker for bold
 		   ;; markup either.
 		   (not (and (equal marker "*")
-			     (save-excursion
-			       (forward-char)
-			       (skip-chars-backward "*")
-			       (looking-at-p org-outline-regexp-bol))))
+			   (save-excursion
+			     (forward-char)
+			     (skip-chars-backward "*")
+			     (looking-at-p org-outline-regexp-bol))))
 		   ;; Match full emphasis markup regexp.
 		   (looking-at (if verbatim? org-verbatim-re org-emph-re))
 		   ;; Do not span over paragraph boundaries.
 		   (not (string-match-p org-element-paragraph-separate
-					(match-string 2)))
+				      (match-string 2)))
 		   ;; Do not span over cells in table rows.
 		   (not (and (save-match-data (org-match-line "[ \t]*|"))
-			     (string-match-p "|" (match-string 4))))))
+			   (string-match-p "|" (match-string 4))))))
 	    (pcase-let ((`(,_ ,face ,_) (assoc marker org-emphasis-alist))
 			(m (if org-hide-emphasis-markers 4 2)))
 	      (font-lock-prepend-text-property
@@ -5495,12 +5518,26 @@ (defun org-emphasize (&optional char)
     (setq string (concat s string s))
     (when beg (delete-region beg end))
     (unless (or (bolp)
-		(string-match (concat "[" (nth 0 erc) "\n]")
-			      (char-to-string (char-before (point)))))
+		(string-match
+                 (if (nth 0 erc) (concat "[" (nth 0 erc) "\n]")
+                   ;; See `org-element--parse-generic-emphasis'
+                   (rx (or space
+                           (category ?{) (category ?\[)
+                           (category ?-) (category ?,)
+                           (category ?|)
+                           "\n")))
+		 (char-to-string (char-before (point)))))
       (insert " "))
     (unless (or (eobp)
-		(string-match (concat "[" (nth 1 erc) "\n]")
-			      (char-to-string (char-after (point)))))
+		(string-match
+                 ;; See `org-element--parse-generic-emphasis'
+                 (if (nth 1 erc) (concat "[" (nth 1 erc) "\n]")
+                   (rx (or space
+                           (category ?}) (category ?\])
+                           (category ?-) (category ?,)
+                           (category ?|)
+                           "\n")))
+		 (char-to-string (char-after (point)))))
       (insert " ") (backward-char 1))
     (insert string)
     (and move (backward-char 1))))
-- 
2.50.1


-- 
Ihor Radchenko // yantar92,
Org mode maintainer,
Learn more about Org mode at <https://orgmode.org/>.
Support Org development at <https://liberapay.com/org-mode>,
or support my work at <https://liberapay.com/yantar92>

Reply via email to