Bram Moolenaar wrote: > > > Lee Naish wrote: > >> The "J" command seems to have a complexity bug when given a large numeric >> argument, eg 100000J >> >> eg: >> >> % yes | head -100000 > vimtest >> % time vim vimtest >> >> <Joined all lines using 99999J> >> >> 17.2u 0.0s 2:29.74 11.5% 0+0k 0+10232io 0pf+0w >> % yes | head -200000 > vimtest >> % time vim vimtest >> >> <Joined all lines using 199999J> >> >> 68.8u 0.4s 1:41.22 68.3% 0+0k 0+39648io 0pf+0w >> >> ie, doubling the number of lines increased the runtime by a factor of 4 >> exactly - seems like there is a O(N^2) component to the algorithm. > > This is a known problem. It works like doing J 100000 times. So near > the end you are appending a short line to a very long line, copying > everything into newly allocated memory. > > Allocating all the memory at once is a bit complicated. Perhaps an easy > solution would be to join two lines at a time.
I see at least 2 things that explain the O(n^2) complexity when joining many lines: 1/ STRLEN() is called every time on the current line. As the current line grows, STRLEN(..) becomes slow and has O(n^2) complexity when joining n lines. 2/ For each join, a new line gets allocated, and the current line + next line get copied into the new one. As the current line size increases, copying has complexity O(n^2). I have a patch which fixes at least 1/ (but not 2/). The speeds up is quite noticeable, but complexity is still O(n^2) since I only addressed 1 of the 2 issues. It works by letting the caller of do_join(...) specify the length of the current line (if it knows it), so that do_join(...) then does not need to call STRLEN(...) for every line. Here is the time it takes to join 100.000 lines on my laptop before and after patch. before patch: 28.228s after patch: 21.518s I measured several times with this test case: $ yes | head -100000 > yes.tmp $ time ./vim -u NONE -c '%join|q!' yes.tmp To speed up much more (and to get rid of O(n^2) complexity), we'd need to address the remaining point 2/. I don't understand yet enough how memory is managed for lines, but I'd like to try to look at it. Maybe we could realloc(...) the current line instead of doing a new alloc + copy each time. realloc(...) will often just increase the size of the block without actually copying anything (when there is free memory after the block). Or maybe we need to join all n lines at once rather than joining pairs of lines. Not sure yet what's best/easiest. -- Dominique --~--~---------~--~----~------------~-------~--~----~ You received this message from the "vim_dev" maillist. For more information, visit http://www.vim.org/maillist.php -~----------~----~----~----~------~----~------~--~---
Index: ops.c =================================================================== RCS file: /cvsroot/vim/vim7/src/ops.c,v retrieving revision 1.68 diff -c -r1.68 ops.c *** ops.c 3 Dec 2008 12:38:36 -0000 1.68 --- ops.c 10 Dec 2008 21:41:13 -0000 *************** *** 1919,1925 **** ); curwin->w_cursor = curpos; /* restore curwin->w_cursor */ ! (void)do_join(FALSE); } } --- 1919,1925 ---- ); curwin->w_cursor = curpos; /* restore curwin->w_cursor */ ! (void)do_join(FALSE, NULL); } } *************** *** 4111,4116 **** --- 4111,4117 ---- int insert_space; { colnr_T col = MAXCOL; + int len = -1; /* current line length not known yet */ if (u_save((linenr_T)(curwin->w_cursor.lnum - 1), (linenr_T)(curwin->w_cursor.lnum + count)) == FAIL) *************** *** 4119,4125 **** while (--count > 0) { line_breakcheck(); ! if (got_int || do_join(insert_space) == FAIL) { beep_flush(); break; --- 4120,4126 ---- while (--count > 0) { line_breakcheck(); ! if (got_int || do_join(insert_space, &len) == FAIL) { beep_flush(); break; *************** *** 4149,4156 **** * return FAIL for failure, OK otherwise */ int ! do_join(insert_space) int insert_space; { char_u *curr; char_u *next, *next_start; --- 4150,4158 ---- * return FAIL for failure, OK otherwise */ int ! do_join(insert_space, len) int insert_space; + int *len; { char_u *curr; char_u *next, *next_start; *************** *** 4165,4171 **** return FAIL; /* can't join on last line */ curr = ml_get_curline(); ! currsize = (int)STRLEN(curr); endcurr1 = endcurr2 = NUL; if (insert_space && currsize > 0) { --- 4167,4176 ---- return FAIL; /* can't join on last line */ curr = ml_get_curline(); ! ! /* Calculate line size only if not provided by caller to avoid ! * O(n^2) complexity when joining many lines */ ! currsize = (len == NULL || *len < 0) ? (int)STRLEN(curr) : *len; endcurr1 = endcurr2 = NUL; if (insert_space && currsize > 0) { *************** *** 4218,4223 **** --- 4223,4230 ---- } } nextsize = (int)STRLEN(next); + if (len != NULL) + *len = currsize + nextsize + spaces; newp = alloc_check((unsigned)(currsize + nextsize + spaces + 1)); if (newp == NULL) *************** *** 4229,4235 **** * invalidated it. */ mch_memmove(newp + currsize + spaces, next, (size_t)(nextsize + 1)); ! curr = ml_get_curline(); mch_memmove(newp, curr, (size_t)currsize); --- 4236,4246 ---- * invalidated it. */ mch_memmove(newp + currsize + spaces, next, (size_t)(nextsize + 1)); ! ! /* FIXME: slow (O(n^2)) when joining many lines. Perhaps it can be ! * speed up using a realloc() (which does not always need to copy) ! * instead of a alloc() + copy? ! */ curr = ml_get_curline(); mch_memmove(newp, curr, (size_t)currsize); *************** *** 4704,4710 **** (long)-next_leader_len); #endif curwin->w_cursor.lnum--; ! if (do_join(TRUE) == FAIL) { beep_flush(); break; --- 4715,4721 ---- (long)-next_leader_len); #endif curwin->w_cursor.lnum--; ! if (do_join(TRUE, NULL) == FAIL) { beep_flush(); break; Index: edit.c =================================================================== RCS file: /cvsroot/vim/vim7/src/edit.c,v retrieving revision 1.141 diff -c -r1.141 edit.c *** edit.c 6 Aug 2008 16:56:55 -0000 1.141 --- edit.c 10 Dec 2008 21:41:17 -0000 *************** *** 8206,8212 **** if (!can_bs(BS_EOL) /* only if "eol" included */ || u_save((linenr_T)(curwin->w_cursor.lnum - 1), (linenr_T)(curwin->w_cursor.lnum + 2)) == FAIL ! || do_join(FALSE) == FAIL) vim_beep(); else curwin->w_cursor.col = temp; --- 8206,8212 ---- if (!can_bs(BS_EOL) /* only if "eol" included */ || u_save((linenr_T)(curwin->w_cursor.lnum - 1), (linenr_T)(curwin->w_cursor.lnum + 2)) == FAIL ! || do_join(FALSE, NULL) == FAIL) vim_beep(); else curwin->w_cursor.col = temp; *************** *** 8386,8392 **** ptr[len - 1] = NUL; } ! (void)do_join(FALSE); if (temp == NUL && gchar_cursor() != NUL) inc_cursor(); } --- 8386,8392 ---- ptr[len - 1] = NUL; } ! (void)do_join(FALSE, NULL); if (temp == NUL && gchar_cursor() != NUL) inc_cursor(); } Index: proto/ops.pro =================================================================== RCS file: /cvsroot/vim/vim7/src/proto/ops.pro,v retrieving revision 1.13 diff -c -r1.13 ops.pro *** proto/ops.pro 12 Mar 2008 16:26:04 -0000 1.13 --- proto/ops.pro 10 Dec 2008 21:41:17 -0000 *************** *** 37,43 **** int get_register_name __ARGS((int num)); void ex_display __ARGS((exarg_T *eap)); void do_do_join __ARGS((long count, int insert_space)); ! int do_join __ARGS((int insert_space)); void op_format __ARGS((oparg_T *oap, int keep_cursor)); void op_formatexpr __ARGS((oparg_T *oap)); int fex_format __ARGS((linenr_T lnum, long count, int c)); --- 37,43 ---- int get_register_name __ARGS((int num)); void ex_display __ARGS((exarg_T *eap)); void do_do_join __ARGS((long count, int insert_space)); ! int do_join __ARGS((int insert_space, int *len)); void op_format __ARGS((oparg_T *oap, int keep_cursor)); void op_formatexpr __ARGS((oparg_T *oap)); int fex_format __ARGS((linenr_T lnum, long count, int c));