Bram Moolenaar wrote:
>
>
> Lee Naish wrote:
>
>> The "J" command seems to have a complexity bug when given a large numeric
>> argument, eg 100000J
>>
>> eg:
>>
>> %  yes | head -100000 > vimtest
>> %  time vim vimtest
>>
>> <Joined all lines using 99999J>
>>
>> 17.2u 0.0s 2:29.74 11.5% 0+0k 0+10232io 0pf+0w
>> % yes | head -200000 > vimtest
>> %  time vim vimtest
>>
>> <Joined all lines using 199999J>
>>
>> 68.8u 0.4s 1:41.22 68.3% 0+0k 0+39648io 0pf+0w
>>
>> ie, doubling the number of lines increased the runtime by a factor of 4
>> exactly - seems like there is a O(N^2) component to the algorithm.
>
> This is a known problem.  It works like doing J 100000 times.  So near
> the end you are appending a short line to a very long line, copying
> everything into newly allocated memory.
>
> Allocating all the memory at once is a bit complicated.  Perhaps an easy
> solution would be to join two lines at a time.


I see at least 2 things that explain the O(n^2) complexity when
joining many lines:

1/ STRLEN() is called every time on the current line.  As the current
   line grows, STRLEN(..) becomes slow and has O(n^2) complexity
   when joining n lines.

2/ For each join, a new line gets allocated, and the current line + next
  line get copied into the new one.  As the current line size increases,
  copying has complexity O(n^2).

I have a patch which fixes at least 1/ (but not 2/).  The speeds up
is quite noticeable, but complexity is still O(n^2) since I only
addressed 1 of the 2 issues.

It works by letting the caller of do_join(...) specify the length of the
current line (if it knows it), so that do_join(...) then does not need
to call STRLEN(...) for every line.

Here is the time it takes to join 100.000 lines on my laptop before
and after patch.

before patch:  28.228s
after patch: 21.518s

I measured several times with this test case:

$ yes | head -100000 > yes.tmp
$ time ./vim -u NONE -c '%join|q!' yes.tmp

To speed up much more (and to get rid of O(n^2) complexity),
we'd need to address the remaining point 2/.  I don't understand
yet enough how memory is managed for lines, but I'd like to try
to look at it. Maybe we could realloc(...) the current line instead
of doing a new alloc + copy each time.  realloc(...) will often just
increase the size of the block without actually copying anything
(when there is free memory after the block).  Or maybe we need
to join all n lines at once rather than joining pairs of lines. Not
sure yet what's best/easiest.

-- Dominique

--~--~---------~--~----~------------~-------~--~----~
You received this message from the "vim_dev" maillist.
For more information, visit http://www.vim.org/maillist.php
-~----------~----~----~----~------~----~------~--~---

Index: ops.c
===================================================================
RCS file: /cvsroot/vim/vim7/src/ops.c,v
retrieving revision 1.68
diff -c -r1.68 ops.c
*** ops.c	3 Dec 2008 12:38:36 -0000	1.68
--- ops.c	10 Dec 2008 21:41:13 -0000
***************
*** 1919,1925 ****
  							    );
  	    curwin->w_cursor = curpos;		/* restore curwin->w_cursor */
  
! 	    (void)do_join(FALSE);
  	}
      }
  
--- 1919,1925 ----
  							    );
  	    curwin->w_cursor = curpos;		/* restore curwin->w_cursor */
  
! 	    (void)do_join(FALSE, NULL);
  	}
      }
  
***************
*** 4111,4116 ****
--- 4111,4117 ----
      int	    insert_space;
  {
      colnr_T	col = MAXCOL;
+     int		len = -1;	/* current line length not known yet */
  
      if (u_save((linenr_T)(curwin->w_cursor.lnum - 1),
  		    (linenr_T)(curwin->w_cursor.lnum + count)) == FAIL)
***************
*** 4119,4125 ****
      while (--count > 0)
      {
  	line_breakcheck();
! 	if (got_int || do_join(insert_space) == FAIL)
  	{
  	    beep_flush();
  	    break;
--- 4120,4126 ----
      while (--count > 0)
      {
  	line_breakcheck();
! 	if (got_int || do_join(insert_space, &len) == FAIL)
  	{
  	    beep_flush();
  	    break;
***************
*** 4149,4156 ****
   * return FAIL for failure, OK otherwise
   */
      int
! do_join(insert_space)
      int		insert_space;
  {
      char_u	*curr;
      char_u	*next, *next_start;
--- 4150,4158 ----
   * return FAIL for failure, OK otherwise
   */
      int
! do_join(insert_space, len)
      int		insert_space;
+     int		*len;
  {
      char_u	*curr;
      char_u	*next, *next_start;
***************
*** 4165,4171 ****
  	return FAIL;		/* can't join on last line */
  
      curr = ml_get_curline();
!     currsize = (int)STRLEN(curr);
      endcurr1 = endcurr2 = NUL;
      if (insert_space && currsize > 0)
      {
--- 4167,4176 ----
  	return FAIL;		/* can't join on last line */
  
      curr = ml_get_curline();
!     
!     /* Calculate line size only if not provided by caller to avoid
!      * O(n^2) complexity when joining many lines */
!     currsize = (len == NULL || *len < 0) ? (int)STRLEN(curr) : *len;
      endcurr1 = endcurr2 = NUL;
      if (insert_space && currsize > 0)
      {
***************
*** 4218,4223 ****
--- 4223,4230 ----
  	}
      }
      nextsize = (int)STRLEN(next);
+     if (len != NULL)
+ 	*len = currsize + nextsize + spaces;
  
      newp = alloc_check((unsigned)(currsize + nextsize + spaces + 1));
      if (newp == NULL)
***************
*** 4229,4235 ****
       * invalidated it.
       */
      mch_memmove(newp + currsize + spaces, next, (size_t)(nextsize + 1));
! 
      curr = ml_get_curline();
      mch_memmove(newp, curr, (size_t)currsize);
  
--- 4236,4246 ----
       * invalidated it.
       */
      mch_memmove(newp + currsize + spaces, next, (size_t)(nextsize + 1));
!     
!     /* FIXME: slow (O(n^2)) when joining many lines. Perhaps it can be
!      * speed up using a realloc() (which does not always need to copy)
!      * instead of a alloc() + copy?
!      */
      curr = ml_get_curline();
      mch_memmove(newp, curr, (size_t)currsize);
  
***************
*** 4704,4710 ****
  						      (long)-next_leader_len);
  #endif
  		curwin->w_cursor.lnum--;
! 		if (do_join(TRUE) == FAIL)
  		{
  		    beep_flush();
  		    break;
--- 4715,4721 ----
  						      (long)-next_leader_len);
  #endif
  		curwin->w_cursor.lnum--;
! 		if (do_join(TRUE, NULL) == FAIL)
  		{
  		    beep_flush();
  		    break;
Index: edit.c
===================================================================
RCS file: /cvsroot/vim/vim7/src/edit.c,v
retrieving revision 1.141
diff -c -r1.141 edit.c
*** edit.c	6 Aug 2008 16:56:55 -0000	1.141
--- edit.c	10 Dec 2008 21:41:17 -0000
***************
*** 8206,8212 ****
  	if (!can_bs(BS_EOL)		/* only if "eol" included */
  		|| u_save((linenr_T)(curwin->w_cursor.lnum - 1),
  		    (linenr_T)(curwin->w_cursor.lnum + 2)) == FAIL
! 		|| do_join(FALSE) == FAIL)
  	    vim_beep();
  	else
  	    curwin->w_cursor.col = temp;
--- 8206,8212 ----
  	if (!can_bs(BS_EOL)		/* only if "eol" included */
  		|| u_save((linenr_T)(curwin->w_cursor.lnum - 1),
  		    (linenr_T)(curwin->w_cursor.lnum + 2)) == FAIL
! 		|| do_join(FALSE, NULL) == FAIL)
  	    vim_beep();
  	else
  	    curwin->w_cursor.col = temp;
***************
*** 8386,8392 ****
  			ptr[len - 1] = NUL;
  		}
  
! 		(void)do_join(FALSE);
  		if (temp == NUL && gchar_cursor() != NUL)
  		    inc_cursor();
  	    }
--- 8386,8392 ----
  			ptr[len - 1] = NUL;
  		}
  
! 		(void)do_join(FALSE, NULL);
  		if (temp == NUL && gchar_cursor() != NUL)
  		    inc_cursor();
  	    }
Index: proto/ops.pro
===================================================================
RCS file: /cvsroot/vim/vim7/src/proto/ops.pro,v
retrieving revision 1.13
diff -c -r1.13 ops.pro
*** proto/ops.pro	12 Mar 2008 16:26:04 -0000	1.13
--- proto/ops.pro	10 Dec 2008 21:41:17 -0000
***************
*** 37,43 ****
  int get_register_name __ARGS((int num));
  void ex_display __ARGS((exarg_T *eap));
  void do_do_join __ARGS((long count, int insert_space));
! int do_join __ARGS((int insert_space));
  void op_format __ARGS((oparg_T *oap, int keep_cursor));
  void op_formatexpr __ARGS((oparg_T *oap));
  int fex_format __ARGS((linenr_T lnum, long count, int c));
--- 37,43 ----
  int get_register_name __ARGS((int num));
  void ex_display __ARGS((exarg_T *eap));
  void do_do_join __ARGS((long count, int insert_space));
! int do_join __ARGS((int insert_space, int *len));
  void op_format __ARGS((oparg_T *oap, int keep_cursor));
  void op_formatexpr __ARGS((oparg_T *oap));
  int fex_format __ARGS((linenr_T lnum, long count, int c));

Raspunde prin e-mail lui