Patch 8.2.0901
Problem: Formatting CJK text isn't optimal.
Solution: Properly break CJK lines. (closes #3875)
Files: runtime/doc/change.txt, src/mbyte.c, src/ops.c, src/option.h,
src/proto/mbyte.pro, src/testdir/Make_all.mak, src/textformat.c,
src/testdir/test_cjk_linebreak.vim
*** ../vim-8.2.0900/runtime/doc/change.txt 2019-12-17 21:27:14.686319918 +0100
--- runtime/doc/change.txt 2020-06-04 17:56:02.629097404 +0200
***************
*** 1686,1691 ****
--- 1688,1697 ----
characters. Overruled by the 'M' flag.
1 Don't break a line after a one-letter word. It's broken before it
instead (if possible).
+ ] Respect textwidth rigorously. With this flag set, no line can be
+ longer than textwidth, unless line-break-prohibition rules make this
+ impossible. Mainly for CJK scripts and works only if 'encoding' is
+ "utf-8".
j Where it makes sense, remove a comment leader when joining lines. For
example, joining:
int i; // the index ~
*** ../vim-8.2.0900/src/mbyte.c 2020-06-01 14:34:22.027462262 +0200
--- src/mbyte.c 2020-06-04 18:19:20.338981249 +0200
***************
*** 3843,3848 ****
--- 3843,4000 ----
}
/*
+ * Return non-zero when no space is allowed before/after "cc" (ranges below).
+ */
+ int
+ utf_eat_space(int cc)
+ {
+ return ((cc >= 0x2000 && cc <= 0x206F) // General punctuations
+ || (cc >= 0x2e00 && cc <= 0x2e7f) // Supplemental punctuations
+ || (cc >= 0x3000 && cc <= 0x303f) // CJK symbols and punctuations
+ || (cc >= 0xff01 && cc <= 0xff0f) // Full width ASCII punctuations
+ || (cc >= 0xff1a && cc <= 0xff20) // ..
+ || (cc >= 0xff3b && cc <= 0xff40) // ..
+ || (cc >= 0xff5b && cc <= 0xff65)); // ..
+ }
+
+ /*
+ * Return FALSE if "cc" is in the table below (no line break before it), else non-zero.
+ */
+ int
+ utf_allow_break_before(int cc)
+ {
+ static const int BOL_prohibition_punct[] = // ascending order, required by the binary search below
+ {
+ '!',
+ '%',
+ ')',
+ ',',
+ ':',
+ ';',
+ '>',
+ '?',
+ ']',
+ '}',
+ 0x2019, // ’ right single quotation mark
+ 0x201d, // ” right double quotation mark
+ 0x2020, // † dagger
+ 0x2021, // ‡ double dagger
+ 0x2026, // … horizontal ellipsis
+ 0x2030, // ‰ per mille sign
+ 0x2031, // ‱ per ten thousand sign
+ 0x203c, // ‼ double exclamation mark
+ 0x2047, // ⁇ double question mark
+ 0x2048, // ⁈ question exclamation mark
+ 0x2049, // ⁉ exclamation question mark
+ 0x2103, // ℃ degree celsius
+ 0x2109, // ℉ degree fahrenheit
+ 0x3001, // 、 ideographic comma
+ 0x3002, // 。 ideographic full stop
+ 0x3009, // 〉 right angle bracket
+ 0x300b, // 》 right double angle bracket
+ 0x300d, // 」 right corner bracket
+ 0x300f, // 』 right white corner bracket
+ 0x3011, // 】 right black lenticular bracket
+ 0x3015, // 〕 right tortoise shell bracket
+ 0x3017, // 〗 right white lenticular bracket
+ 0x3019, // 〙 right white tortoise shell bracket
+ 0x301b, // 〛 right white square bracket
+ 0xff01, // ! fullwidth exclamation mark
+ 0xff09, // ) fullwidth right parenthesis
+ 0xff0c, // , fullwidth comma
+ 0xff0e, // . fullwidth full stop
+ 0xff1a, // : fullwidth colon
+ 0xff1b, // ; fullwidth semicolon
+ 0xff1f, // ? fullwidth question mark
+ 0xff3d, // ] fullwidth right square bracket
+ 0xff5d, // } fullwidth right curly bracket
+ };
+
+ int first = 0;
+ int last = sizeof(BOL_prohibition_punct)/sizeof(int) - 1; // index of the last entry
+ int mid = 0;
+
+ while (first < last) // binary search for "cc"
+ {
+ mid = (first + last)/2;
+
+ if (cc == BOL_prohibition_punct[mid])
+ return FALSE;
+ else if (cc > BOL_prohibition_punct[mid])
+ first = mid + 1;
+ else
+ last = mid - 1;
+ }
+
+ return cc != BOL_prohibition_punct[first]; // no hit inside the loop: test the entry at "first"
+ }
+
+ /*
+ * Return FALSE if "cc" is in the table below (no line break after it), else non-zero.
+ */
+ static int
+ utf_allow_break_after(int cc)
+ {
+ static const int EOL_prohibition_punct[] = // ascending order, required by the binary search below
+ {
+ '(',
+ '<',
+ '[',
+ '`',
+ '{',
+ //0x2014, // — em dash
+ 0x2018, // ‘ left single quotation mark
+ 0x201c, // “ left double quotation mark
+ //0x2053, // ~ swung dash
+ 0x3008, // 〈 left angle bracket
+ 0x300a, // 《 left double angle bracket
+ 0x300c, // 「 left corner bracket
+ 0x300e, // 『 left white corner bracket
+ 0x3010, // 【 left black lenticular bracket
+ 0x3014, // 〔 left tortoise shell bracket
+ 0x3016, // 〖 left white lenticular bracket
+ 0x3018, // 〘 left white tortoise shell bracket
+ 0x301a, // 〚 left white square bracket
+ 0xff08, // ( fullwidth left parenthesis
+ 0xff3b, // [ fullwidth left square bracket
+ 0xff5b, // { fullwidth left curly bracket
+ };
+
+ int first = 0;
+ int last = sizeof(EOL_prohibition_punct)/sizeof(int) - 1; // index of the last entry
+ int mid = 0;
+
+ while (first < last) // binary search for "cc"
+ {
+ mid = (first + last)/2;
+
+ if (cc == EOL_prohibition_punct[mid])
+ return FALSE;
+ else if (cc > EOL_prohibition_punct[mid])
+ first = mid + 1;
+ else
+ last = mid - 1;
+ }
+
+ return cc != EOL_prohibition_punct[first]; // no hit inside the loop: test the entry at "first"
+ }
+
+ /*
+ * Return non-zero when a line break is allowed between "cc" and the following "ncc".
+ */
+ int
+ utf_allow_break(int cc, int ncc)
+ {
+ // don't split a doubled em dash or doubled ellipsis ("cc" repeated as "ncc")
+ if (cc == ncc
+ && (cc == 0x2014 // em dash
+ || cc == 0x2026)) // horizontal ellipsis
+ return FALSE;
+
+ return utf_allow_break_after(cc) && utf_allow_break_before(ncc);
+ }
+
+ /*
* Copy a character from "*fp" to "*tp" and advance the pointers.
*/
void
*** ../vim-8.2.0900/src/ops.c 2020-06-01 19:14:09.050505748 +0200
--- src/ops.c 2020-06-04 17:38:21.854286486 +0200
***************
*** 1967,1973 ****
&& (!has_format_option(FO_MBYTE_JOIN)
|| (mb_ptr2char(curr) < 0x100 && endcurr1 < 0x100))
&& (!has_format_option(FO_MBYTE_JOIN2)
! || mb_ptr2char(curr) < 0x100 || endcurr1 < 0x100)
)
{
// don't add a space if the line is ending in a space
--- 1967,1976 ----
&& (!has_format_option(FO_MBYTE_JOIN)
|| (mb_ptr2char(curr) < 0x100 && endcurr1 < 0x100))
&& (!has_format_option(FO_MBYTE_JOIN2)
! || (mb_ptr2char(curr) < 0x100
! && !(enc_utf8 && utf_eat_space(endcurr1)))
! || (endcurr1 < 0x100
! && !(enc_utf8 && utf_eat_space(mb_ptr2char(curr)))))
)
{
// don't add a space if the line is ending in a space
*** ../vim-8.2.0900/src/option.h 2020-05-31 23:11:02.082515688 +0200
--- src/option.h 2020-06-04 17:38:21.854286486 +0200
***************
*** 141,152 ****
#define FO_ONE_LETTER '1'
#define FO_WHITE_PAR 'w' // trailing white space continues paragr.
#define FO_AUTO 'a' // automatic formatting
#define FO_REMOVE_COMS 'j' // remove comment leaders when joining lines
#define FO_PERIOD_ABBR 'p' // don't break a single space after a period
#define DFLT_FO_VI "vt"
#define DFLT_FO_VIM "tcq"
! #define FO_ALL "tcroq2vlb1mMBn,awjp" // for do_set()
// characters for the p_cpo option:
#define CPO_ALTREAD 'a' // ":read" sets alternate file name
--- 141,153 ----
#define FO_ONE_LETTER '1'
#define FO_WHITE_PAR 'w' // trailing white space continues paragr.
#define FO_AUTO 'a' // automatic formatting
+ #define FO_RIGOROUS_TW ']' // respect textwidth rigorously
#define FO_REMOVE_COMS 'j' // remove comment leaders when joining lines
#define FO_PERIOD_ABBR 'p' // don't break a single space after a period
#define DFLT_FO_VI "vt"
#define DFLT_FO_VIM "tcq"
! #define FO_ALL "tcroq2vlb1mMBn,aw]jp" // for do_set()
// characters for the p_cpo option:
#define CPO_ALTREAD 'a' // ":read" sets alternate file name
*** ../vim-8.2.0900/src/proto/mbyte.pro 2020-06-01 14:34:22.027462262 +0200
--- src/proto/mbyte.pro 2020-06-04 18:00:03.751806728 +0200
***************
*** 52,57 ****
--- 52,60 ----
int latin_head_off(char_u *base, char_u *p);
int dbcs_screen_head_off(char_u *base, char_u *p);
int utf_head_off(char_u *base, char_u *p);
+ int utf_eat_space(int cc);
+ int utf_allow_break_before(int cc);
+ int utf_allow_break(int cc, int ncc);
void mb_copy_char(char_u **fp, char_u **tp);
int mb_off_next(char_u *base, char_u *p);
int mb_tail_off(char_u *base, char_u *p);
*** ../vim-8.2.0900/src/testdir/Make_all.mak 2020-06-04 15:52:06.095922759 +0200
--- src/testdir/Make_all.mak 2020-06-04 17:38:21.854286486 +0200
***************
*** 85,90 ****
--- 85,91 ----
test_charsearch_utf8 \
test_checkpath \
test_cindent \
+ test_cjk_linebreak \
test_clientserver \
test_close_count \
test_cmdline \
***************
*** 333,338 ****
--- 334,340 ----
test_charsearch.res \
test_checkpath.res \
test_cindent.res \
+ test_cjk_linebreak.res \
test_clientserver.res \
test_close_count.res \
test_cmdline.res \
*** ../vim-8.2.0900/src/textformat.c 2020-05-01 14:26:17.132949262 +0200
--- src/textformat.c 2020-06-04 18:16:11.963699002 +0200
***************
*** 45,54 ****
--- 45,56 ----
int c) // character to be inserted (can be NUL)
{
int cc;
+ int skip_pos;
int save_char = NUL;
int haveto_redraw = FALSE;
int fo_ins_blank = has_format_option(FO_INS_BLANK);
int fo_multibyte = has_format_option(FO_MBYTE_BREAK);
+ int fo_rigor_tw = has_format_option(FO_RIGOROUS_TW);
int fo_white_par = has_format_option(FO_WHITE_PAR);
int first_line = TRUE;
colnr_T leader_len;
***************
*** 125,130 ****
--- 127,133 ----
curwin->w_cursor.col = startcol;
foundcol = 0;
+ skip_pos = 0;
// Find position to break at.
// Stop at first entered white when 'formatoptions' has 'v'
***************
*** 189,196 ****
if (curwin->w_cursor.col <= (colnr_T)wantcol)
break;
}
! else if (cc >= 0x100 && fo_multibyte)
{
// Break after or before a multi-byte character.
if (curwin->w_cursor.col != startcol)
{
--- 192,202 ----
if (curwin->w_cursor.col <= (colnr_T)wantcol)
break;
}
! else if ((cc >= 0x100 || !utf_allow_break_before(cc)) && fo_multibyte)
{
+ int ncc;
+ int allow_break;
+
// Break after or before a multi-byte character.
if (curwin->w_cursor.col != startcol)
{
***************
*** 199,206 ****
break;
col = curwin->w_cursor.col;
inc_cursor();
! // Don't change end_foundcol if already set.
! if (foundcol != curwin->w_cursor.col)
{
foundcol = curwin->w_cursor.col;
end_foundcol = foundcol;
--- 205,218 ----
break;
col = curwin->w_cursor.col;
inc_cursor();
! ncc = gchar_cursor();
!
! allow_break =
! (enc_utf8 && utf_allow_break(cc, ncc))
! || enc_dbcs;
!
! // If we have already checked this position, skip!
! if (curwin->w_cursor.col != skip_pos && allow_break)
{
foundcol = curwin->w_cursor.col;
end_foundcol = foundcol;
***************
*** 213,218 ****
--- 225,231 ----
if (curwin->w_cursor.col == 0)
break;
+ ncc = cc;
col = curwin->w_cursor.col;
dec_cursor();
***************
*** 220,235 ****
if (WHITECHAR(cc))
continue; // break with space
! // Don't break until after the comment leader
if (curwin->w_cursor.col < leader_len)
break;
curwin->w_cursor.col = col;
! foundcol = curwin->w_cursor.col;
! end_foundcol = foundcol;
if (curwin->w_cursor.col <= (colnr_T)wantcol)
! break;
}
if (curwin->w_cursor.col == 0)
break;
--- 233,297 ----
if (WHITECHAR(cc))
continue; // break with space
! // Don't break until after the comment leader.
if (curwin->w_cursor.col < leader_len)
break;
curwin->w_cursor.col = col;
+ skip_pos = curwin->w_cursor.col;
! allow_break =
! (enc_utf8 && utf_allow_break(cc, ncc))
! || enc_dbcs;
!
! // Must handle this to respect line break prohibition.
! if (allow_break)
! {
! foundcol = curwin->w_cursor.col;
! end_foundcol = foundcol;
! }
if (curwin->w_cursor.col <= (colnr_T)wantcol)
! {
! int ncc_allow_break =
! (enc_utf8 && utf_allow_break_before(ncc)) || enc_dbcs;
!
! if (allow_break)
! break;
! if (!ncc_allow_break && !fo_rigor_tw)
! {
! // Enable at most 1 punct hang outside of textwidth.
! if (curwin->w_cursor.col == startcol)
! {
! // We are inserting a non-breakable char, postpone
! // line break check to next insert.
! end_foundcol = foundcol = 0;
! break;
! }
!
! // Neither cc nor ncc is NUL if we are here, so
! // it's safe to inc_cursor.
! col = curwin->w_cursor.col;
!
! inc_cursor();
! cc = ncc;
! ncc = gchar_cursor();
! // handle insert
! ncc = (ncc != NUL) ? ncc : c;
!
! allow_break =
! (enc_utf8 && utf_allow_break(cc, ncc))
! || enc_dbcs;
!
! if (allow_break)
! {
! // Break only when we are not at end of line.
! end_foundcol = foundcol =
! ncc == NUL? 0 : curwin->w_cursor.col;
! break;
! }
! curwin->w_cursor.col = col;
! }
! }
}
if (curwin->w_cursor.col == 0)
break;
*** ../vim-8.2.0900/src/testdir/test_cjk_linebreak.vim 2020-06-04 18:21:24.394514113 +0200
--- src/testdir/test_cjk_linebreak.vim 2020-06-04 18:10:04.553145403 +0200
***************
*** 0 ****
--- 1,91 ----
+ scriptencoding utf-8
+
+ func Run_cjk_linebreak_after() " every listed punct must stay at the end of line 1
+ set textwidth=12
+ for punct in [
+ \ '!', '%', ')', ',', ':', ';', '>', '?', ']', '}', '’', '”', '†', '‡',
+ \ '…', '‰', '‱', '‼', '⁇', '⁈', '⁉', '℃', '℉', '、', '。', '〉', '》',
+ \ '」', '』', '】', '〕', '〗', '〙', '〛', '!', ')', ',', '.', ':',
+ \ ';', '?', ']', '}']
+ call setline('.', '这是一个测试'.punct.'试试 CJK 行禁则补丁。')
+ normal gqq
+ call assert_equal('这是一个测试'.punct, getline(1)) " no break before punct
+ %d_
+ endfor
+ endfunc
+
+ func Test_cjk_linebreak_after() " 'formatoptions' without the ']' flag
+ set formatoptions=croqn2mB1j
+ call Run_cjk_linebreak_after()
+ endfunc
+
+ " TODO: this test fails
+ "func Test_cjk_linebreak_after_rigorous()
+ " set formatoptions=croqn2mB1j]
+ " call Run_cjk_linebreak_after()
+ "endfunc
+
+ func Run_cjk_linebreak_before() " every listed punct must move to line 2
+ set textwidth=12
+ for punct in [
+ \ '(', '<', '[', '`', '{', '‘', '“', '〈', '《', '「', '『', '【', '〔',
+ \ '〖', '〘', '〚', '(', '[', '{']
+ call setline('.', '这是个测试'.punct.'试试 CJK 行禁则补丁。')
+ normal gqq
+ call assert_equal('这是个测试', getline(1)) " line is broken before punct
+ %d_
+ endfor
+ endfunc
+
+ func Test_cjk_linebreak_before() " 'formatoptions' without the ']' flag
+ set formatoptions=croqn2mB1j
+ call Run_cjk_linebreak_before()
+ endfunc
+
+ func Test_cjk_linebreak_before_rigorous() " same checks with the ']' flag set
+ set formatoptions=croqn2mB1j]
+ call Run_cjk_linebreak_before()
+ endfunc
+
+ func Run_cjk_linebreak_nobetween() " doubled punctuation: …… versus ——
+ " a doubled ellipsis (……) must not start a line (nor be split)
+ call setline('.', '这是个测试……试试 CJK 行禁则补丁。')
+ set textwidth=12 ambiwidth=double
+ normal gqq
+ " TODO: this fails
+ " call assert_equal('这是个测试……', getline(1))
+ %d_
+
+ call setline('.', '这是一个测试……试试 CJK 行禁则补丁。')
+ set textwidth=12 ambiwidth=double
+ normal gqq
+ call assert_equal('这是一个测', getline(1))
+ %d_
+
+ " but a doubled em dash (——) may start a line
+ call setline('.', '这是个测试——试试 CJK 行禁则补丁。')
+ set textwidth=12 ambiwidth=double
+ normal gqq
+ call assert_equal('这是个测试', getline(1))
+ endfunc
+
+ func Test_cjk_linebreak_nobetween() " 'formatoptions' without the ']' flag
+ set formatoptions=croqn2mB1j
+ call Run_cjk_linebreak_nobetween()
+ endfunc
+
+ func Test_cjk_linebreak_nobetween_rigorous() " same checks with the ']' flag set
+ set formatoptions=croqn2mB1j]
+ call Run_cjk_linebreak_nobetween()
+ endfunc
+
+ func Test_cjk_linebreak_join_punct() " J must not insert a space after CJK punctuation
+ for punct in ['——', '〗', ',', '。', '……']
+ call setline(1, '文本文本'.punct)
+ call setline(2, 'English')
+ set formatoptions=croqn2mB1j
+ normal ggJ
+ call assert_equal('文本文本'.punct.'English', getline(1)) " joined without a space
+ %d_
+ endfor
+ endfunc
*** ../vim-8.2.0900/src/version.c 2020-06-04 17:19:01.581522349 +0200
--- src/version.c 2020-06-04 17:39:49.046032743 +0200
***************
*** 748,749 ****
--- 748,751 ----
{ /* Add new patch number below this line */
+ /**/
+ 901,
/**/
--
MAN: You don't frighten us, English pig-dog! Go and boil your bottoms,
son of a silly person. I blow my nose on you, so-called Arthur-king,
you and your silly English K...kaniggets.
He puts hands to his ears and blows a raspberry.
"Monty Python and the Holy Grail" PYTHON (MONTY) PICTURES LTD
/// Bram Moolenaar -- [email protected] -- http://www.Moolenaar.net \\\
/// sponsor Vim, vote for features -- http://www.Vim.org/sponsor/ \\\
\\\ an exciting new programming language -- http://www.Zimbu.org ///
\\\ help me help AIDS victims -- http://ICCF-Holland.org ///
--
--
You received this message from the "vim_dev" maillist.
Do not top-post! Type your reply below the text you are replying to.
For more information, visit http://www.vim.org/maillist.php
---
You received this message because you are subscribed to the Google Groups
"vim_dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
To view this discussion on the web visit
https://groups.google.com/d/msgid/vim_dev/202006041622.054GMdaK778208%40masaka.moolenaar.net.