patch 9.2.0140: file reading performance can be improved

Commit: https://github.com/vim/vim/commit/2ca96b09d751c35189dd587d9b201e2aeb5559c0
Author: Yasuhiro Matsumoto <[email protected]>
Date:   Wed Mar 11 20:18:26 2026 +0000

    patch 9.2.0140: file reading performance can be improved
    
    Problem:  Reading large files is slow because UTF-8 validation and
              newline scanning are performed byte-by-byte. Initial file
              loading also triggers listener, netbeans and channel
              processing for every appended line.
    Solution: Use memchr(), which the C library typically implements with
              SIMD instructions, for newline scanning; skip runs of ASCII
              bytes a word at a time during UTF-8 validation using a
              bitmask; skip listener/netbeans/channel notifications
              when the ML_APPEND_NEW flag is set during readfile()
              (Yasuhiro Matsumoto).
    
    closes: #19612
    
    Co-authored-by: NRK <[email protected]>
    Signed-off-by: Yasuhiro Matsumoto <[email protected]>
    Signed-off-by: Christian Brabandt <[email protected]>

diff --git a/src/fileio.c b/src/fileio.c
index 0ba9905c3..8917a7e29 100644
--- a/src/fileio.c
+++ b/src/fileio.c
@@ -27,6 +27,10 @@
 // Is there any system that doesn't have access()?
 #define USE_MCH_ACCESS
 
+// Bitmask with 0x80 set in each byte of a long_u word, used to detect
+// non-ASCII bytes (high bit set) in multiple bytes at once.
+#define NONASCII_MASK (((long_u)-1 / 0xFF) * 0x80)
+
 #if defined(__hpux) && !defined(HAVE_DIRFD)
 # define dirfd(x) ((x)->__dd_fd)
 # define HAVE_DIRFD
@@ -2056,11 +2060,27 @@ retry:
                int  incomplete_tail = FALSE;
 
                // Reading UTF-8: Check if the bytes are valid UTF-8.
-               for (p = ptr; ; ++p)
+               for (p = ptr; ; )
                {
-                   int  todo = (int)((ptr + size) - p);
+                   int  todo;
                    int  l;
 
+                   // Skip ASCII bytes quickly using word-at-a-time check.
+                   {
+                       char_u *ascii_end = ptr + size;
+                       while (ascii_end - p >= (long)sizeof(long_u))
+                       {
+                           long_u word;
+                           memcpy(&word, p, sizeof(long_u));
+                           if (word & NONASCII_MASK)
+                               break;
+                           p += sizeof(long_u);
+                       }
+                       while (p < ascii_end && *p < 0x80)
+                           ++p;
+                   }
+
+                   todo = (int)((ptr + size) - p);
                    if (todo <= 0)
                        break;
                    if (*p >= 0x80)
@@ -2109,14 +2129,17 @@ retry:
                            if (bad_char_behavior == BAD_DROP)
                            {
                                mch_memmove(p, p + 1, todo - 1);
-                               --p;
                                --size;
                            }
-                           else if (bad_char_behavior != BAD_KEEP)
-                               *p = bad_char_behavior;
+                           else
+                           {
+                               if (bad_char_behavior != BAD_KEEP)
+                                   *p = bad_char_behavior;
+                               ++p;
+                           }
                        }
                        else
-                           p += l - 1;
+                           p += l;
                    }
                }
                if (p < ptr + size && !incomplete_tail)
@@ -2255,73 +2278,101 @@ rewind_retry:
        }
        else
        {
-           --ptr;
-           while (++ptr, --size >= 0)
+           // Use memchr() for SIMD-optimized newline scanning instead
+           // of scanning each byte individually.
+           char_u *end = ptr + size;
+
+           while (ptr < end)
            {
-               if ((c = *ptr) != NUL && c != NL)  // catch most common case
-                   continue;
-               if (c == NUL)
-                   *ptr = NL;  // NULs are replaced by newlines!
-               else
+               char_u *nl = (char_u *)memchr(ptr, NL, end - ptr);
+               char_u *nul_scan;
+
+               if (nl == NULL)
                {
-                   if (skip_count == 0)
+                   // No more newlines in buffer.
+                   // Replace any NUL bytes with NL in remaining data.
+                   while ((nul_scan = (char_u *)memchr(ptr, NUL,
+                                                     end - ptr)) != NULL)
+                   {
+                       *nul_scan = NL;
+                       ptr = nul_scan + 1;
+                   }
+                   ptr = end;
+                   break;
+               }
+
+               // Replace NUL bytes with NL before the newline.
+               {
+                   char_u *scan = ptr;
+                   while ((nul_scan = (char_u *)memchr(scan, NUL,
+                                                      nl - scan)) != NULL)
+                   {
+                       *nul_scan = NL;
+                       scan = nul_scan + 1;
+                   }
+               }
+
+               // Process the newline.
+               ptr = nl;
+               if (skip_count == 0)
+               {
+                   *ptr = NUL;         // end of line
+                   len = (colnr_T)(ptr - line_start + 1);
+                   if (fileformat == EOL_DOS)
                    {
-                       *ptr = NUL;             // end of line
-                       len = (colnr_T)(ptr - line_start + 1);
-                       if (fileformat == EOL_DOS)
+                       if (ptr > line_start && ptr[-1] == CAR)
                        {
-                           if (ptr > line_start && ptr[-1] == CAR)
-                           {
-                               // remove CR before NL
-                               ptr[-1] = NUL;
-                               --len;
-                           }
-                           /*
-                            * Reading in Dos format, but no CR-LF found!
-                            * When 'fileformats' includes "unix", delete all
-                            * the lines read so far and start all over again.
-                            * Otherwise give an error message later.
-                            */
-                           else if (ff_error != EOL_DOS)
-                           {
-                               if (   try_unix
-                                   && !read_stdin
-                                   && (read_buffer
-                                       || vim_lseek(fd, (off_T)0L, SEEK_SET)
-                                                                         == 0))
-                               {
-                                   fileformat = EOL_UNIX;
-                                   if (set_options)
-                                       set_fileformat(EOL_UNIX, OPT_LOCAL);
-                                   file_rewind = TRUE;
-                                   keep_fileformat = TRUE;
-                                   goto retry;
-                               }
-                               ff_error = EOL_DOS;
-                           }
+                           // remove CR before NL
+                           ptr[-1] = NUL;
+                           --len;
                        }
-                       if (ml_append(lnum, line_start, len, newfile) == FAIL)
+                       /*
+                        * Reading in Dos format, but no CR-LF found!
+                        * When 'fileformats' includes "unix", delete all
+                        * the lines read so far and start all over again.
+                        * Otherwise give an error message later.
+                        */
+                       else if (ff_error != EOL_DOS)
                        {
-                           error = TRUE;
-                           break;
+                           if (   try_unix
+                               && !read_stdin
+                               && (read_buffer
+                                   || vim_lseek(fd, (off_T)0L, SEEK_SET)
+                                                                     == 0))
+                           {
+                               fileformat = EOL_UNIX;
+                               if (set_options)
+                                   set_fileformat(EOL_UNIX, OPT_LOCAL);
+                               file_rewind = TRUE;
+                               keep_fileformat = TRUE;
+                               goto retry;
+                           }
+                           ff_error = EOL_DOS;
                        }
+                   }
+                   if (ml_append(lnum, line_start, len, newfile) == FAIL)
+                   {
+                       error = TRUE;
+                       break;
+                   }
 #ifdef FEAT_PERSISTENT_UNDO
-                       if (read_undo_file)
-                           sha256_update(&sha_ctx, line_start, len);
+                   if (read_undo_file)
+                       sha256_update(&sha_ctx, line_start, len);
 #endif
-                       ++lnum;
-                       if (--read_count == 0)
-                       {
-                           error = TRUE;           // break loop
-                           line_start = ptr;   // nothing left to write
-                           break;
-                       }
+                   ++lnum;
+                   if (--read_count == 0)
+                   {
+                       error = TRUE;       // break loop
+                       line_start = ptr;   // nothing left to write
+                       break;
                    }
-                   else
-                       --skip_count;
-                   line_start = ptr + 1;
                }
+               else
+                   --skip_count;
+               line_start = ptr + 1;
+               ++ptr;
            }
+           size = -1;
        }
        linerest = (long)(ptr - line_start);
        ui_breakcheck();
diff --git a/src/memline.c b/src/memline.c
index 2d92aadb6..58824ad11 100644
--- a/src/memline.c
+++ b/src/memline.c
@@ -3472,7 +3472,7 @@ ml_append_int(
 #endif
 
 #ifdef FEAT_NETBEANS_INTG
-    if (netbeans_active())
+    if (!(flags & ML_APPEND_NEW) && netbeans_active())
     {
        int line_len = (int)STRLEN(line);
        if (line_len > 0)
@@ -3481,7 +3481,7 @@ ml_append_int(
     }
 #endif
 #ifdef FEAT_JOB_CHANNEL
-    if (buf->b_write_to_channel)
+    if (!(flags & ML_APPEND_NEW) && buf->b_write_to_channel)
        channel_write_new_lines(buf);
 #endif
     ret = OK;
@@ -3512,11 +3512,15 @@ ml_append_flush(
        ml_flush_line(buf);
 
 #ifdef FEAT_EVAL
-    // When inserting above recorded changes: flush the changes before changing
-    // the text.  Then flush the cached line, it may become invalid.
-    may_invoke_listeners(buf, lnum + 1, lnum + 1, 1);
-    if (buf->b_ml.ml_line_lnum != 0)
-       ml_flush_line(buf);
+    if (!(flags & ML_APPEND_NEW))
+    {
+       // When inserting above recorded changes: flush the changes before
+       // changing the text.  Then flush the cached line, it may become
+       // invalid.  Skip during initial file read for performance.
+       may_invoke_listeners(buf, lnum + 1, lnum + 1, 1);
+       if (buf->b_ml.ml_line_lnum != 0)
+           ml_flush_line(buf);
+    }
 #endif
 
     return ml_append_int(buf, lnum, line, len, flags);
diff --git a/src/version.c b/src/version.c
index ad7d6ab32..8c1f6c30a 100644
--- a/src/version.c
+++ b/src/version.c
@@ -734,6 +734,8 @@ static char *(features[]) =
 
 static int included_patches[] =
 {   /* Add new patch number below this line */
+/**/
+    140,
 /**/
     139,
 /**/

-- 
-- 
You received this message from the "vim_dev" maillist.
Do not top-post! Type your reply below the text you are replying to.
For more information, visit http://www.vim.org/maillist.php

--- 
You received this message because you are subscribed to the Google Groups 
"vim_dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To view this discussion visit 
https://groups.google.com/d/msgid/vim_dev/E1w0QQR-000dV0-6V%40256bit.org.

Raspunde prin e-mail lui