Patch 8.2.0901
Problem:    Formatting CJK text isn't optimal.
Solution:   Properly break CJK lines. (closes #3875)
Files:      runtime/doc/change.txt, src/mbyte.c, src/ops.c, src/option.h,
            src/proto/mbyte.pro, src/testdir/Make_all.mak, src/textformat.c,
            src/testdir/test_cjk_linebreak.vim


*** ../vim-8.2.0900/runtime/doc/change.txt      2019-12-17 21:27:14.686319918 +0100
--- runtime/doc/change.txt      2020-06-04 17:56:02.629097404 +0200
***************
*** 1686,1691 ****
--- 1688,1697 ----
        characters.  Overruled by the 'M' flag.
  1     Don't break a line after a one-letter word.  It's broken before it
        instead (if possible).
+ ]     Respect textwidth rigorously. With this flag set, no line can be
+       longer than textwidth, unless line-break-prohibition rules make this
+       impossible.  Mainly for CJK scripts and works only if 'encoding' is
+       "utf-8".
  j     Where it makes sense, remove a comment leader when joining lines.  For
        example, joining:
                int i;   // the index ~
*** ../vim-8.2.0900/src/mbyte.c 2020-06-01 14:34:22.027462262 +0200
--- src/mbyte.c 2020-06-04 18:19:20.338981249 +0200
***************
*** 3843,3848 ****
--- 3843,4000 ----
  }
  
  /*
+  * Whether space is NOT allowed before/after "cc".
+  */
+     int
+ utf_eat_space(int cc)
+ {
+     return ((cc >= 0x2000 && cc <= 0x206F)    // General punctuations
+        || (cc >= 0x2e00 && cc <= 0x2e7f)      // Supplemental punctuations
+        || (cc >= 0x3000 && cc <= 0x303f)      // CJK symbols and punctuations
+        || (cc >= 0xff01 && cc <= 0xff0f)      // Full width ASCII punctuations
+        || (cc >= 0xff1a && cc <= 0xff20)      // ..
+        || (cc >= 0xff3b && cc <= 0xff40)      // ..
+        || (cc >= 0xff5b && cc <= 0xff65));    // ..
+ }
+ 
+ /*
+  * Whether line break is allowed before "cc".
+  */
+     int
+ utf_allow_break_before(int cc)
+ {
+     static const int BOL_prohibition_punct[] =
+     {
+       '!',
+       '%',
+       ')',
+       ',',
+       ':',
+       ';',
+       '>',
+       '?',
+       ']',
+       '}',
+       0x2019, // ’ right single quotation mark
+       0x201d, // ” right double quotation mark
+       0x2020, // † dagger
+       0x2021, // ‡ double dagger
+       0x2026, // … horizontal ellipsis
+       0x2030, // ‰ per mille sign
+       0x2031, // ‱ per ten thousand sign
+       0x203c, // ‼ double exclamation mark
+       0x2047, // ⁇ double question mark
+       0x2048, // ⁈ question exclamation mark
+       0x2049, // ⁉ exclamation question mark
+       0x2103, // ℃ degree celsius
+       0x2109, // ℉ degree fahrenheit
+       0x3001, // 、 ideographic comma
+       0x3002, // 。 ideographic full stop
+       0x3009, // 〉 right angle bracket
+       0x300b, // 》 right double angle bracket
+       0x300d, // 」 right corner bracket
+       0x300f, // 』 right white corner bracket
+       0x3011, // 】 right black lenticular bracket
+       0x3015, // 〕 right tortoise shell bracket
+       0x3017, // 〗 right white lenticular bracket
+       0x3019, // 〙 right white tortoise shell bracket
+       0x301b, // 〛 right white square bracket
+       0xff01, // ! fullwidth exclamation mark
+       0xff09, // ) fullwidth right parenthesis
+       0xff0c, // , fullwidth comma
+       0xff0e, // . fullwidth full stop
+       0xff1a, // : fullwidth colon
+       0xff1b, // ; fullwidth semicolon
+       0xff1f, // ? fullwidth question mark
+       0xff3d, // ] fullwidth right square bracket
+       0xff5d, // } fullwidth right curly bracket
+     };
+ 
+     int first = 0;
+     int last  = sizeof(BOL_prohibition_punct)/sizeof(int) - 1;
+     int mid   = 0;
+ 
+     while (first < last)
+     {
+       mid = (first + last)/2;
+ 
+       if (cc == BOL_prohibition_punct[mid])
+           return FALSE;
+       else if (cc > BOL_prohibition_punct[mid])
+           first = mid + 1;
+       else
+           last = mid - 1;
+     }
+ 
+     return cc != BOL_prohibition_punct[first];
+ }
+ 
+ /*
+  * Whether line break is allowed after "cc".
+  */
+     static int
+ utf_allow_break_after(int cc)
+ {
+     static const int EOL_prohibition_punct[] =
+     {
+       '(',
+       '<',
+       '[',
+       '`',
+       '{',
+       //0x2014, // — em dash
+       0x2018, // ‘ left single quotation mark
+       0x201c, // “ left double quotation mark
+       //0x2053, // ~ swung dash
+       0x3008, // 〈 left angle bracket
+       0x300a, // 《 left double angle bracket
+       0x300c, // 「 left corner bracket
+       0x300e, // 『 left white corner bracket
+       0x3010, // 【 left black lenticular bracket
+       0x3014, // 〔 left tortoise shell bracket
+       0x3016, // 〖 left white lenticular bracket
+       0x3018, // 〘 left white tortoise shell bracket
+       0x301a, // 〚 left white square bracket
+       0xff08, // ( fullwidth left parenthesis
+       0xff3b, // [ fullwidth left square bracket
+       0xff5b, // { fullwidth left curly bracket
+     };
+ 
+     int first = 0;
+     int last  = sizeof(EOL_prohibition_punct)/sizeof(int) - 1;
+     int mid   = 0;
+ 
+     while (first < last)
+     {
+       mid = (first + last)/2;
+ 
+       if (cc == EOL_prohibition_punct[mid])
+           return FALSE;
+       else if (cc > EOL_prohibition_punct[mid])
+           first = mid + 1;
+       else
+           last = mid - 1;
+     }
+ 
+     return cc != EOL_prohibition_punct[first];
+ }
+ 
+ /*
+  * Whether line break is allowed between "cc" and "ncc".
+  */
+     int
+ utf_allow_break(int cc, int ncc)
+ {
+     // don't break inside a doubled punctuation mark (—— or ……)
+     if (cc == ncc
+           && (cc == 0x2014 // em dash
+               || cc == 0x2026)) // horizontal ellipsis
+       return FALSE;
+ 
+     return utf_allow_break_after(cc) && utf_allow_break_before(ncc);
+ }
+ 
+ /*
   * Copy a character from "*fp" to "*tp" and advance the pointers.
   */
      void
*** ../vim-8.2.0900/src/ops.c   2020-06-01 19:14:09.050505748 +0200
--- src/ops.c   2020-06-04 17:38:21.854286486 +0200
***************
*** 1967,1973 ****
                    && (!has_format_option(FO_MBYTE_JOIN)
                        || (mb_ptr2char(curr) < 0x100 && endcurr1 < 0x100))
                    && (!has_format_option(FO_MBYTE_JOIN2)
!                       || mb_ptr2char(curr) < 0x100 || endcurr1 < 0x100)
               )
            {
                // don't add a space if the line is ending in a space
--- 1967,1976 ----
                    && (!has_format_option(FO_MBYTE_JOIN)
                        || (mb_ptr2char(curr) < 0x100 && endcurr1 < 0x100))
                    && (!has_format_option(FO_MBYTE_JOIN2)
!                       || (mb_ptr2char(curr) < 0x100
!                           && !(enc_utf8 && utf_eat_space(endcurr1)))
!                       || (endcurr1 < 0x100
!                           && !(enc_utf8 && utf_eat_space(mb_ptr2char(curr)))))
               )
            {
                // don't add a space if the line is ending in a space
*** ../vim-8.2.0900/src/option.h        2020-05-31 23:11:02.082515688 +0200
--- src/option.h        2020-06-04 17:38:21.854286486 +0200
***************
*** 141,152 ****
  #define FO_ONE_LETTER '1'
  #define FO_WHITE_PAR  'w'     // trailing white space continues paragr.
  #define FO_AUTO               'a'     // automatic formatting
  #define FO_REMOVE_COMS        'j'     // remove comment leaders when joining lines
  #define FO_PERIOD_ABBR        'p'     // don't break a single space after a period
  
  #define DFLT_FO_VI    "vt"
  #define DFLT_FO_VIM   "tcq"
! #define FO_ALL                "tcroq2vlb1mMBn,awjp"   // for do_set()
  
  // characters for the p_cpo option:
  #define CPO_ALTREAD   'a'     // ":read" sets alternate file name
--- 141,153 ----
  #define FO_ONE_LETTER '1'
  #define FO_WHITE_PAR  'w'     // trailing white space continues paragr.
  #define FO_AUTO               'a'     // automatic formatting
+ #define FO_RIGOROUS_TW        ']'     // respect textwidth rigorously
  #define FO_REMOVE_COMS        'j'     // remove comment leaders when joining lines
  #define FO_PERIOD_ABBR        'p'     // don't break a single space after a period
  
  #define DFLT_FO_VI    "vt"
  #define DFLT_FO_VIM   "tcq"
! #define FO_ALL                "tcroq2vlb1mMBn,aw]jp"  // for do_set()
  
  // characters for the p_cpo option:
  #define CPO_ALTREAD   'a'     // ":read" sets alternate file name
*** ../vim-8.2.0900/src/proto/mbyte.pro 2020-06-01 14:34:22.027462262 +0200
--- src/proto/mbyte.pro 2020-06-04 18:00:03.751806728 +0200
***************
*** 52,57 ****
--- 52,60 ----
  int latin_head_off(char_u *base, char_u *p);
  int dbcs_screen_head_off(char_u *base, char_u *p);
  int utf_head_off(char_u *base, char_u *p);
+ int utf_eat_space(int cc);
+ int utf_allow_break_before(int cc);
+ int utf_allow_break(int cc, int ncc);
  void mb_copy_char(char_u **fp, char_u **tp);
  int mb_off_next(char_u *base, char_u *p);
  int mb_tail_off(char_u *base, char_u *p);
*** ../vim-8.2.0900/src/testdir/Make_all.mak    2020-06-04 15:52:06.095922759 +0200
--- src/testdir/Make_all.mak    2020-06-04 17:38:21.854286486 +0200
***************
*** 85,90 ****
--- 85,91 ----
        test_charsearch_utf8 \
        test_checkpath \
        test_cindent \
+       test_cjk_linebreak \
        test_clientserver \
        test_close_count \
        test_cmdline \
***************
*** 333,338 ****
--- 334,340 ----
        test_charsearch.res \
        test_checkpath.res \
        test_cindent.res \
+       test_cjk_linebreak.res \
        test_clientserver.res \
        test_close_count.res \
        test_cmdline.res \
*** ../vim-8.2.0900/src/textformat.c    2020-05-01 14:26:17.132949262 +0200
--- src/textformat.c    2020-06-04 18:16:11.963699002 +0200
***************
*** 45,54 ****
--- 45,56 ----
      int               c) // character to be inserted (can be NUL)
  {
      int               cc;
+     int               skip_pos;
      int               save_char = NUL;
      int               haveto_redraw = FALSE;
      int               fo_ins_blank = has_format_option(FO_INS_BLANK);
      int               fo_multibyte = has_format_option(FO_MBYTE_BREAK);
+     int               fo_rigor_tw  = has_format_option(FO_RIGOROUS_TW);
      int               fo_white_par = has_format_option(FO_WHITE_PAR);
      int               first_line = TRUE;
      colnr_T   leader_len;
***************
*** 125,130 ****
--- 127,133 ----
  
        curwin->w_cursor.col = startcol;
        foundcol = 0;
+       skip_pos = 0;
  
        // Find position to break at.
        // Stop at first entered white when 'formatoptions' has 'v'
***************
*** 189,196 ****
                if (curwin->w_cursor.col <= (colnr_T)wantcol)
                    break;
            }
!           else if (cc >= 0x100 && fo_multibyte)
            {
                // Break after or before a multi-byte character.
                if (curwin->w_cursor.col != startcol)
                {
--- 192,202 ----
                if (curwin->w_cursor.col <= (colnr_T)wantcol)
                    break;
            }
!           else if ((cc >= 0x100 || !utf_allow_break_before(cc)) && fo_multibyte)
            {
+               int ncc;
+               int allow_break;
+ 
                // Break after or before a multi-byte character.
                if (curwin->w_cursor.col != startcol)
                {
***************
*** 199,206 ****
                        break;
                    col = curwin->w_cursor.col;
                    inc_cursor();
!                   // Don't change end_foundcol if already set.
!                   if (foundcol != curwin->w_cursor.col)
                    {
                        foundcol = curwin->w_cursor.col;
                        end_foundcol = foundcol;
--- 205,218 ----
                        break;
                    col = curwin->w_cursor.col;
                    inc_cursor();
!                   ncc = gchar_cursor();
! 
!                   allow_break =
!                       (enc_utf8 && utf_allow_break(cc, ncc))
!                       || enc_dbcs;
! 
!                   // If we have already checked this position, skip!
!                   if (curwin->w_cursor.col != skip_pos && allow_break)
                    {
                        foundcol = curwin->w_cursor.col;
                        end_foundcol = foundcol;
***************
*** 213,218 ****
--- 225,231 ----
                if (curwin->w_cursor.col == 0)
                    break;
  
+               ncc = cc;
                col = curwin->w_cursor.col;
  
                dec_cursor();
***************
*** 220,235 ****
  
                if (WHITECHAR(cc))
                    continue;           // break with space
!               // Don't break until after the comment leader
                if (curwin->w_cursor.col < leader_len)
                    break;
  
                curwin->w_cursor.col = col;
  
!               foundcol = curwin->w_cursor.col;
!               end_foundcol = foundcol;
                if (curwin->w_cursor.col <= (colnr_T)wantcol)
!                   break;
            }
            if (curwin->w_cursor.col == 0)
                break;
--- 233,297 ----
  
                if (WHITECHAR(cc))
                    continue;           // break with space
!               // Don't break until after the comment leader.
                if (curwin->w_cursor.col < leader_len)
                    break;
  
                curwin->w_cursor.col = col;
+               skip_pos = curwin->w_cursor.col;
  
!               allow_break =
!                   (enc_utf8 && utf_allow_break(cc, ncc))
!                   || enc_dbcs;
! 
!               // Must handle this to respect line break prohibition.
!               if (allow_break)
!               {
!                   foundcol = curwin->w_cursor.col;
!                   end_foundcol = foundcol;
!               }
                if (curwin->w_cursor.col <= (colnr_T)wantcol)
!               {
!                   int ncc_allow_break =
!                        (enc_utf8 && utf_allow_break_before(ncc)) || enc_dbcs;
! 
!                   if (allow_break)
!                       break;
!                   if (!ncc_allow_break && !fo_rigor_tw)
!                   {
!                       // Enable at most 1 punct hang outside of textwidth.
!                       if (curwin->w_cursor.col == startcol)
!                       {
!                           // We are inserting a non-breakable char, postpone
!                           // line break check to next insert.
!                           end_foundcol = foundcol = 0;
!                           break;
!                       }
! 
!                       // Neither cc nor ncc is NUL if we are here, so
!                       // it's safe to inc_cursor.
!                       col = curwin->w_cursor.col;
! 
!                       inc_cursor();
!                       cc  = ncc;
!                       ncc = gchar_cursor();
!                       // handle insert
!                       ncc = (ncc != NUL) ? ncc : c;
! 
!                       allow_break =
!                               (enc_utf8 && utf_allow_break(cc, ncc))
!                               || enc_dbcs;
! 
!                       if (allow_break)
!                       {
!                           // Break only when we are not at end of line.
!                           end_foundcol = foundcol =
!                                     ncc == NUL? 0 : curwin->w_cursor.col;
!                           break;
!                       }
!                       curwin->w_cursor.col = col;
!                   }
!               }
            }
            if (curwin->w_cursor.col == 0)
                break;
*** ../vim-8.2.0900/src/testdir/test_cjk_linebreak.vim  2020-06-04 18:21:24.394514113 +0200
--- src/testdir/test_cjk_linebreak.vim  2020-06-04 18:10:04.553145403 +0200
***************
*** 0 ****
--- 1,91 ----
+ scriptencoding utf-8
+ 
+ func Run_cjk_linebreak_after()
+   set textwidth=12
+   for punct in [
+         \ '!', '%', ')', ',', ':', ';', '>', '?', ']', '}', '’', '”', '†', '‡',
+         \ '…', '‰', '‱', '‼', '⁇', '⁈', '⁉', '℃', '℉', '、', '。', '〉', '》',
+         \ '」', '』', '】', '〕', '〗', '〙', '〛', '!', ')', ',', '.', ':',
+         \ ';', '?', ']', '}']
+     call setline('.', '这是一个测试'.punct.'试试 CJK 行禁则补丁。')
+     normal gqq
+     call assert_equal('这是一个测试'.punct, getline(1))
+     %d_
+   endfor
+ endfunc
+ 
+ func Test_cjk_linebreak_after()
+   set formatoptions=croqn2mB1j
+   call Run_cjk_linebreak_after()
+ endfunc
+ 
+ " TODO: this test fails
+ "func Test_cjk_linebreak_after_rigorous()
+ "  set formatoptions=croqn2mB1j]
+ "  call Run_cjk_linebreak_after()
+ "endfunc
+ 
+ func Run_cjk_linebreak_before()
+   set textwidth=12
+   for punct in [
+         \ '(', '<', '[', '`', '{', '‘', '“', '〈', '《', '「', '『', '【', '〔',
+         \ '〖', '〘', '〚', '(', '[', '{']
+     call setline('.', '这是个测试'.punct.'试试 CJK 行禁则补丁。')
+     normal gqq
+     call assert_equal('这是个测试', getline(1))
+     %d_
+   endfor
+ endfunc
+ 
+ func Test_cjk_linebreak_before()
+   set formatoptions=croqn2mB1j
+   call Run_cjk_linebreak_before()
+ endfunc
+ 
+ func Test_cjk_linebreak_before_rigorous()
+   set formatoptions=croqn2mB1j]
+   call Run_cjk_linebreak_before()
+ endfunc
+ 
+ func Run_cjk_linebreak_nobetween()
+   " …… must not start a line
+   call setline('.', '这是个测试……试试 CJK 行禁则补丁。')
+   set textwidth=12 ambiwidth=double
+   normal gqq
+   " TODO: this fails
+   " call assert_equal('这是个测试……', getline(1))
+   %d_
+ 
+   call setline('.', '这是一个测试……试试 CJK 行禁则补丁。')
+   set textwidth=12 ambiwidth=double
+   normal gqq
+   call assert_equal('这是一个测', getline(1))
+   %d_
+ 
+   " but —— can
+   call setline('.', '这是个测试——试试 CJK 行禁则补丁。')
+   set textwidth=12 ambiwidth=double
+   normal gqq
+   call assert_equal('这是个测试', getline(1))
+ endfunc
+ 
+ func Test_cjk_linebreak_nobetween()
+   set formatoptions=croqn2mB1j
+   call Run_cjk_linebreak_nobetween()
+ endfunc
+ 
+ func Test_cjk_linebreak_nobetween_rigorous()
+   set formatoptions=croqn2mB1j]
+   call Run_cjk_linebreak_nobetween()
+ endfunc
+ 
+ func Test_cjk_linebreak_join_punct()
+   for punct in ['——', '〗', ',', '。', '……']
+     call setline(1, '文本文本'.punct)
+     call setline(2, 'English')
+     set formatoptions=croqn2mB1j
+     normal ggJ
+     call assert_equal('文本文本'.punct.'English', getline(1))
+     %d_
+   endfor
+ endfunc
*** ../vim-8.2.0900/src/version.c       2020-06-04 17:19:01.581522349 +0200
--- src/version.c       2020-06-04 17:39:49.046032743 +0200
***************
*** 748,749 ****
--- 748,751 ----
  {   /* Add new patch number below this line */
+ /**/
+     901,
  /**/

-- 
MAN:     You don't frighten us, English pig-dog!  Go and boil your bottoms,
         son of a silly person.  I blow my nose on you, so-called Arthur-king,
         you and your silly English K...kaniggets.
   He puts hands to his ears and blows a raspberry.
                 "Monty Python and the Holy Grail" PYTHON (MONTY) PICTURES LTD

 /// Bram Moolenaar -- [email protected] -- http://www.Moolenaar.net   \\\
///        sponsor Vim, vote for features -- http://www.Vim.org/sponsor/ \\\
\\\  an exciting new programming language -- http://www.Zimbu.org        ///
 \\\            help me help AIDS victims -- http://ICCF-Holland.org    ///

-- 
-- 
You received this message from the "vim_dev" maillist.
Do not top-post! Type your reply below the text you are replying to.
For more information, visit http://www.vim.org/maillist.php

--- 
You received this message because you are subscribed to the Google Groups 
"vim_dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To view this discussion on the web visit 
https://groups.google.com/d/msgid/vim_dev/202006041622.054GMdaK778208%40masaka.moolenaar.net.

Raspunde prin e-mail lui