Hi

1) Vim is trying to allocate a negative number of bytes
when doing:

$ vim -u NONE \
  -c 'call search("\\%[123456789012345678901234567]\\{1,30000}")'

It reports:

E342: Out of memory!  (allocating 18446744071740188400 bytes)

(18446744071740188400 in hex is 0xFFFFFFFF8A9DE6F0)

It's admittedly an improbable regexp, but vim should not
allocate a negative number of bytes.  It happens because
allocated size for states in nfa_regmatch() uses 'int'
instead of 'size_t' so beyond 2Gb, vim tries to allocate
a negative number of bytes.

Fixed in attached patch: "int-overflow-regexp_nfa.c-7.4.591.patch"

Perhaps new regexp engine should not be used when it creates
so many states because it's slow and uses lots of memory,
but I did not try to change that.


2) Since some regexp can allocate many nfa_thread_T states,
I looked whether we can save memory and noticed that
sizeof(nfa_thread_T) is quite big: 1360 bytes on x86_64.

Patch "memory-opt-regexp_nfa.c-7.4.591.patch" avoids
wasting padding bytes in:

        struct multipos
        {
            lpos_T      start;
            lpos_T      end;
        } multi[NSUBEXP];

start.lnum is 64 bits (long) whereas start.col is 32 bits (int).
so 2x32 bits of padding are wasted inside the multipos struct.
We can avoid wasting 2x32 bits of padding by grouping start.lnum
and end.lnum next to each other using:

        struct multipos
        {
            linenr_T    start_lnum;
            linenr_T    end_lnum;
            colnr_T     start_col;
            colnr_T     end_col;
        } multi[NSUBEXP];

This makes sizeof(nfa_thread_T)=1040 bytes instead of 1360 bytes.

Using the same case as earlier...

$ vim -u NONE \
  -c 'call search("\\%[123456789012345678901234567]\\{1,30000}")'

Prior to patch, nfa_regmatch() was allocating:
  1,710,003 states * 1360 bytes = 2,325,604,080 bytes.

After patch, it allocates:
  1,710,003 states * 1040 bytes = 1,778,403,120 bytes

That saves 23.5% of memory for NFA states.

Regards
Dominique

-- 
-- 
You received this message from the "vim_dev" maillist.
Do not top-post! Type your reply below the text you are replying to.
For more information, visit http://www.vim.org/maillist.php

--- 
You received this message because you are subscribed to the Google Groups 
"vim_dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
For more options, visit https://groups.google.com/d/optout.
diff -r 3d2db5a7403f src/regexp_nfa.c
--- a/src/regexp_nfa.c	Thu Jan 22 22:41:56 2015 +0100
+++ b/src/regexp_nfa.c	Sun Jan 25 00:19:56 2015 +0100
@@ -5399,7 +5399,7 @@
     regsubs_T		*m;
 {
     int		result;
-    int		size = 0;
+    size_t	size = 0;
     int		flag = 0;
     int		go_to_nextline = FALSE;
     nfa_thread_T *t;
diff -r 3d2db5a7403f src/regexp_nfa.c
--- a/src/regexp_nfa.c	Thu Jan 22 22:41:56 2015 +0100
+++ b/src/regexp_nfa.c	Sun Jan 25 00:42:39 2015 +0100
@@ -1453,7 +1453,7 @@
 			 * matched an unlimited number of times. NFA_NOPEN is
 			 * added only once at a position, while NFA_SPLIT is
 			 * added multiple times.  This is more efficient than
-			 * not allowsing NFA_SPLIT multiple times, it is used
+			 * not allowing NFA_SPLIT multiple times, it is used
 			 * a lot. */
 			EMIT(NFA_NOPEN);
 			break;
@@ -3717,8 +3717,10 @@
     {
 	struct multipos
 	{
-	    lpos_T	start;
-	    lpos_T	end;
+	    linenr_T	start_lnum;
+	    linenr_T	end_lnum;
+	    colnr_T	start_col;
+	    colnr_T	end_col;
 	} multi[NSUBEXP];
 	struct linepos
 	{
@@ -3803,10 +3805,10 @@
 	if (REG_MULTI)
 	    fprintf(log_fd, "*** group %d, start: c=%d, l=%d, end: c=%d, l=%d\n",
 		    j,
-		    sub->list.multi[j].start.col,
-		    (int)sub->list.multi[j].start.lnum,
-		    sub->list.multi[j].end.col,
-		    (int)sub->list.multi[j].end.lnum);
+		    sub->list.multi[j].start_col,
+		    (int)sub->list.multi[j].start_lnum,
+		    sub->list.multi[j].end_col,
+		    (int)sub->list.multi[j].end_lnum);
 	else
 	{
 	    char *s = (char *)sub->list.line[j].start;
@@ -3943,8 +3945,11 @@
     {
 	if (REG_MULTI)
 	{
-	    if (from->list.multi[0].end.lnum >= 0)
-		to->list.multi[0].end = from->list.multi[0].end;
+	    if (from->list.multi[0].end_lnum >= 0)
+            {
+		to->list.multi[0].end_lnum = from->list.multi[0].end_lnum;
+		to->list.multi[0].end_col = from->list.multi[0].end_col;
+            }
 	}
 	else
 	{
@@ -3976,33 +3981,33 @@
 	for (i = 0; i < todo; ++i)
 	{
 	    if (i < sub1->in_use)
-		s1 = sub1->list.multi[i].start.lnum;
+		s1 = sub1->list.multi[i].start_lnum;
 	    else
 		s1 = -1;
 	    if (i < sub2->in_use)
-		s2 = sub2->list.multi[i].start.lnum;
+		s2 = sub2->list.multi[i].start_lnum;
 	    else
 		s2 = -1;
 	    if (s1 != s2)
 		return FALSE;
-	    if (s1 != -1 && sub1->list.multi[i].start.col
-					     != sub2->list.multi[i].start.col)
+	    if (s1 != -1 && sub1->list.multi[i].start_col
+					     != sub2->list.multi[i].start_col)
 		return FALSE;
 
 	    if (nfa_has_backref)
 	    {
 		if (i < sub1->in_use)
-		    s1 = sub1->list.multi[i].end.lnum;
+		    s1 = sub1->list.multi[i].end_lnum;
 		else
 		    s1 = -1;
 		if (i < sub2->in_use)
-		    s2 = sub2->list.multi[i].end.lnum;
+		    s2 = sub2->list.multi[i].end_lnum;
 		else
 		    s2 = -1;
 		if (s1 != s2)
 		    return FALSE;
-		if (s1 != -1 && sub1->list.multi[i].end.col
-					       != sub2->list.multi[i].end.col)
+		if (s1 != -1 && sub1->list.multi[i].end_col
+					       != sub2->list.multi[i].end_col)
 		return FALSE;
 	    }
 	}
@@ -4053,7 +4058,7 @@
     if (sub->in_use <= 0)
 	col = -1;
     else if (REG_MULTI)
-	col = sub->list.multi[0].start.col;
+	col = sub->list.multi[0].start_col;
     else
 	col = (int)(sub->list.line[0].start - regline);
     nfa_set_code(state->c);
@@ -4473,7 +4478,8 @@
 	    {
 		if (subidx < sub->in_use)
 		{
-		    save_lpos = sub->list.multi[subidx].start;
+		    save_lpos.lnum = sub->list.multi[subidx].start_lnum;
+		    save_lpos.col = sub->list.multi[subidx].start_col;
 		    save_in_use = -1;
 		}
 		else
@@ -4481,20 +4487,20 @@
 		    save_in_use = sub->in_use;
 		    for (i = sub->in_use; i < subidx; ++i)
 		    {
-			sub->list.multi[i].start.lnum = -1;
-			sub->list.multi[i].end.lnum = -1;
+			sub->list.multi[i].start_lnum = -1;
+			sub->list.multi[i].end_lnum = -1;
 		    }
 		    sub->in_use = subidx + 1;
 		}
 		if (off == -1)
 		{
-		    sub->list.multi[subidx].start.lnum = reglnum + 1;
-		    sub->list.multi[subidx].start.col = 0;
+		    sub->list.multi[subidx].start_lnum = reglnum + 1;
+		    sub->list.multi[subidx].start_col = 0;
 		}
 		else
 		{
-		    sub->list.multi[subidx].start.lnum = reglnum;
-		    sub->list.multi[subidx].start.col =
+		    sub->list.multi[subidx].start_lnum = reglnum;
+		    sub->list.multi[subidx].start_col =
 					  (colnr_T)(reginput - regline + off);
 		}
 	    }
@@ -4530,7 +4536,10 @@
 	    if (save_in_use == -1)
 	    {
 		if (REG_MULTI)
-		    sub->list.multi[subidx].start = save_lpos;
+                {
+		    sub->list.multi[subidx].start_lnum = save_lpos.lnum;
+		    sub->list.multi[subidx].start_col = save_lpos.col;
+                }
 		else
 		    sub->list.line[subidx].start = save_ptr;
 	    }
@@ -4540,7 +4549,7 @@
 
 	case NFA_MCLOSE:
 	    if (nfa_has_zend && (REG_MULTI
-			? subs->norm.list.multi[0].end.lnum >= 0
+			? subs->norm.list.multi[0].end_lnum >= 0
 			: subs->norm.list.line[0].end != NULL))
 	    {
 		/* Do not overwrite the position set by \ze. */
@@ -4594,16 +4603,17 @@
 		sub->in_use = subidx + 1;
 	    if (REG_MULTI)
 	    {
-		save_lpos = sub->list.multi[subidx].end;
+		save_lpos.lnum = sub->list.multi[subidx].end_lnum;
+		save_lpos.col = sub->list.multi[subidx].end_col;
 		if (off == -1)
 		{
-		    sub->list.multi[subidx].end.lnum = reglnum + 1;
-		    sub->list.multi[subidx].end.col = 0;
+		    sub->list.multi[subidx].end_lnum = reglnum + 1;
+		    sub->list.multi[subidx].end_col = 0;
 		}
 		else
 		{
-		    sub->list.multi[subidx].end.lnum = reglnum;
-		    sub->list.multi[subidx].end.col =
+		    sub->list.multi[subidx].end_lnum = reglnum;
+		    sub->list.multi[subidx].end_col =
 					  (colnr_T)(reginput - regline + off);
 		}
 		/* avoid compiler warnings */
@@ -4628,7 +4638,10 @@
 		sub = &subs->norm;
 
 	    if (REG_MULTI)
-		sub->list.multi[subidx].end = save_lpos;
+            {
+		sub->list.multi[subidx].end_lnum = save_lpos.lnum;
+		sub->list.multi[subidx].end_col = save_lpos.col;
+            }
 	    else
 		sub->list.line[subidx].end = save_ptr;
 	    sub->in_use = save_in_use;
@@ -4816,15 +4829,15 @@
 
     if (REG_MULTI)
     {
-	if (sub->list.multi[subidx].start.lnum < 0
-				       || sub->list.multi[subidx].end.lnum < 0)
+	if (sub->list.multi[subidx].start_lnum < 0
+				       || sub->list.multi[subidx].end_lnum < 0)
 	    goto retempty;
-	if (sub->list.multi[subidx].start.lnum == reglnum
-			       && sub->list.multi[subidx].end.lnum == reglnum)
+	if (sub->list.multi[subidx].start_lnum == reglnum
+			       && sub->list.multi[subidx].end_lnum == reglnum)
 	{
-	    len = sub->list.multi[subidx].end.col
-					  - sub->list.multi[subidx].start.col;
-	    if (cstrncmp(regline + sub->list.multi[subidx].start.col,
+	    len = sub->list.multi[subidx].end_col
+					  - sub->list.multi[subidx].start_col;
+	    if (cstrncmp(regline + sub->list.multi[subidx].start_col,
 							 reginput, &len) == 0)
 	    {
 		*bytelen = len;
@@ -4834,10 +4847,10 @@
 	else
 	{
 	    if (match_with_backref(
-			sub->list.multi[subidx].start.lnum,
-			sub->list.multi[subidx].start.col,
-			sub->list.multi[subidx].end.lnum,
-			sub->list.multi[subidx].end.col,
+			sub->list.multi[subidx].start_lnum,
+			sub->list.multi[subidx].start_col,
+			sub->list.multi[subidx].end_lnum,
+			sub->list.multi[subidx].end_col,
 			bytelen) == RA_MATCH)
 		return TRUE;
 	}
@@ -5399,7 +5412,7 @@
     regsubs_T		*m;
 {
     int		result;
-    int		size = 0;
+    size_t	size = 0;
     int		flag = 0;
     int		go_to_nextline = FALSE;
     nfa_thread_T *t;
@@ -5432,6 +5445,7 @@
 
     /* Allocate memory for the lists of nodes. */
     size = (nstate + 1) * sizeof(nfa_thread_T);
+
     list[0].t = (nfa_thread_T *)lalloc(size, TRUE);
     list[0].len = nstate + 1;
     list[1].t = (nfa_thread_T *)lalloc(size, TRUE);
@@ -5473,8 +5487,8 @@
     {
 	if (REG_MULTI)
 	{
-	    m->norm.list.multi[0].start.lnum = reglnum;
-	    m->norm.list.multi[0].start.col = (colnr_T)(reginput - regline);
+	    m->norm.list.multi[0].start_lnum = reglnum;
+	    m->norm.list.multi[0].start_col = (colnr_T)(reginput - regline);
 	}
 	else
 	    m->norm.list.line[0].start = reginput;
@@ -5571,7 +5585,7 @@
 		if (t->subs.norm.in_use <= 0)
 		    col = -1;
 		else if (REG_MULTI)
-		    col = t->subs.norm.list.multi[0].start.col;
+		    col = t->subs.norm.list.multi[0].start_col;
 		else
 		    col = (int)(t->subs.norm.list.line[0].start - regline);
 		nfa_set_code(t->state->c);
@@ -5852,7 +5866,7 @@
 		     * continue with what follows. */
 		    if (REG_MULTI)
 			/* TODO: multi-line match */
-			bytelen = m->norm.list.multi[0].end.col
+			bytelen = m->norm.list.multi[0].end_col
 						  - (int)(reginput - regline);
 		    else
 			bytelen = (int)(m->norm.list.line[0].end - reginput);
@@ -6732,7 +6746,7 @@
 		if (add)
 		{
 		    if (REG_MULTI)
-			m->norm.list.multi[0].start.col =
+			m->norm.list.multi[0].start_col =
 					 (colnr_T)(reginput - regline) + clen;
 		    else
 			m->norm.list.line[0].start = reginput + clen;
@@ -6845,8 +6859,11 @@
     {
 	for (i = 0; i < subs.norm.in_use; i++)
 	{
-	    reg_startpos[i] = subs.norm.list.multi[i].start;
-	    reg_endpos[i] = subs.norm.list.multi[i].end;
+	    reg_startpos[i].lnum = subs.norm.list.multi[i].start_lnum;
+	    reg_startpos[i].col = subs.norm.list.multi[i].start_col;
+
+	    reg_endpos[i].lnum = subs.norm.list.multi[i].end_lnum;
+	    reg_endpos[i].col = subs.norm.list.multi[i].end_col;
 	}
 
 	if (reg_startpos[0].lnum < 0)
@@ -6894,13 +6911,13 @@
 		struct multipos *mpos = &subs.synt.list.multi[i];
 
 		/* Only accept single line matches that are valid. */
-		if (mpos->start.lnum >= 0
-			&& mpos->start.lnum == mpos->end.lnum
-			&& mpos->end.col >= mpos->start.col)
+		if (mpos->start_lnum >= 0
+			&& mpos->start_lnum == mpos->end_lnum
+			&& mpos->end_col >= mpos->start_col)
 		    re_extmatch_out->matches[i] =
-			vim_strnsave(reg_getline(mpos->start.lnum)
-							    + mpos->start.col,
-					     mpos->end.col - mpos->start.col);
+			vim_strnsave(reg_getline(mpos->start_lnum)
+							    + mpos->start_col,
+					     mpos->end_col - mpos->start_col);
 	    }
 	    else
 	    {

Raspunde prin e-mail lui