[Patch] allow more than 10 capturing groups

Christian Brabandt Mon, 20 Dec 2010 14:26:36 -0800

Hi Bram,
when working with CSV files, I have often wished for more than 10
capturing groups. So here is a patch that allows up to 99 capturing groups
to be referenced in the replacement part. It uses the Perl-like syntax ${1}
through ${99} for the capturing groups in the replacement part. If the
referenced capturing group does not exist, it resolves to the empty string.


I have tested it locally and it works for me[1]. This obviously needs to
be tested very thoroughly before it is included.

[1] running make test as well as trying several different :s commands.

regards,
Christian

-- 
You received this message from the "vim_dev" maillist.
Do not top-post! Type your reply below the text you are replying to.
For more information, visit http://www.vim.org/maillist.php

diff --git a/runtime/doc/pattern.txt b/runtime/doc/pattern.txt
--- a/runtime/doc/pattern.txt
+++ b/runtime/doc/pattern.txt
@@ -960,6 +960,16 @@
 	in the pattern (going left to right), NOT based on what is matched
 	first.
 
+${1}    In the replacement part, replace with the first sub-   */${}*
+        expression, that was matched with \( and \).
+   ...
+${99}   In the replacement part, replace with the 99th sub-expression.
+	Note: If there are fewer than 99 sub-expressions, ${99} is
+	replaced with an empty string.
+	Note also, that the numbering of groups is done based on which
+	"\(" comes first in the pattern (going left to right), NOT based
+	on what is matched first.
+
 \%(\)	A pattern enclosed by escaped parentheses.	*/\%(\)* */\%(* *E53*
 	Just like \(\), but without counting it as a sub-expression.  This
 	allows using more groups and it's a little bit faster.
diff --git a/src/regexp.c b/src/regexp.c
--- a/src/regexp.c
+++ b/src/regexp.c
@@ -1116,8 +1116,10 @@
 	else if ((OP(scan) == BOW
 		    || OP(scan) == EOW
 		    || OP(scan) == NOTHING
-		    || OP(scan) == MOPEN + 0 || OP(scan) == NOPEN
-		    || OP(scan) == MCLOSE + 0 || OP(scan) == NCLOSE)
+		    || (OP(scan) == MOPEN && OPERAND_MIN(scan) == 0)
+		    || OP(scan) == NOPEN
+		    || (OP(scan) == MCLOSE && OPERAND_MIN(scan) == 0)
+		    || OP(scan) == NCLOSE)
 		 && OP(regnext(scan)) == EXACTLY)
 	{
 #ifdef FEAT_MBYTE
@@ -1245,7 +1247,11 @@
 	    EMSG_M_RET_NULL(_("E51: Too many %s("), reg_magic == MAGIC_ALL);
 	parno = regnpar;
 	++regnpar;
-	ret = regnode(MOPEN + parno);
+	ret = regnode(MOPEN);
+	if (ret == JUST_CALC_SIZE)
+	  regsize += 4;
+	else
+	    regcode = re_put_long(regcode,parno);
     }
     else if (paren == REG_NPAREN)
     {
@@ -1286,8 +1292,12 @@
 #ifdef FEAT_SYN_HL
 	    paren == REG_ZPAREN ? ZCLOSE + parno :
 #endif
-	    paren == REG_PAREN ? MCLOSE + parno :
+	    paren == REG_PAREN ? MCLOSE :
 	    paren == REG_NPAREN ? NCLOSE : END);
+    if (ender == JUST_CALC_SIZE && paren == REG_PAREN)
+      regsize += 4;
+    else if (paren == REG_PAREN)
+       regcode = re_put_long(regcode, parno);
     regtail(ret, ender);
 
     /* Hook the tails of the branches to the closing node. */
@@ -1794,9 +1804,19 @@
       case Magic('8'):
       case Magic('9'):
 	    {
-		int		    refnum;
+		int	refnum;
+		int	refnum2 = -1; 
+		int	d = getchr();
+		if ( '0' <= d && d <= '9')
+		{
+		  refnum2 = d - Magic('0');
+		}
+		else
+		    ungetchr();
 
 		refnum = c - Magic('0');
+		if (refnum2 >= 0 && refnum2 <= 9)
+		  refnum = refnum * 10 + refnum2;
 		/*
 		 * Check if the back reference is legal. We must have seen the
 		 * close brace.
@@ -1815,7 +1835,11 @@
 		    if (*p == NUL)
 			EMSG_RET_NULL(_("E65: Illegal back reference"));
 		}
-		ret = regnode(BACKREF + refnum);
+		ret = regnode(BACKREF);
+		if (ret == JUST_CALC_SIZE)
+		    regsize += 4;
+		else
+		    regcode = re_put_long(regcode, refnum);
 	    }
 	    break;
 
@@ -1851,10 +1875,18 @@
 			  break;
 #endif
 
-		case 's': ret = regnode(MOPEN + 0);
+		case 's': ret = regnode(MOPEN);
+			  if (ret == JUST_CALC_SIZE)
+			      regsize += 4;
+			  else
+			      regcode = re_put_long(regcode,0);
 			  break;
 
-		case 'e': ret = regnode(MCLOSE + 0);
+		case 'e': ret = regnode(MCLOSE);
+			  if (ret == JUST_CALC_SIZE)
+			      regsize += 4;
+			  else
+			      regcode = re_put_long(regcode,0);
 			  break;
 
 		default:  EMSG_RET_NULL(_("E68: Invalid character after \\z"));
@@ -4456,18 +4488,9 @@
 	    }
 	    break;
 
-	  case MOPEN + 0:   /* Match start: \zs */
-	  case MOPEN + 1:   /* \( */
-	  case MOPEN + 2:
-	  case MOPEN + 3:
-	  case MOPEN + 4:
-	  case MOPEN + 5:
-	  case MOPEN + 6:
-	  case MOPEN + 7:
-	  case MOPEN + 8:
-	  case MOPEN + 9:
+	  case MOPEN:   /* Match start: \zs, \( */
 	    {
-		no = op - MOPEN;
+		no = OPERAND_MIN(scan);
 		cleanup_subexpr();
 		rp = regstack_push(RS_MOPEN, scan);
 		if (rp == NULL)
@@ -4516,18 +4539,9 @@
 	    break;
 #endif
 
-	  case MCLOSE + 0:  /* Match end: \ze */
-	  case MCLOSE + 1:  /* \) */
-	  case MCLOSE + 2:
-	  case MCLOSE + 3:
-	  case MCLOSE + 4:
-	  case MCLOSE + 5:
-	  case MCLOSE + 6:
-	  case MCLOSE + 7:
-	  case MCLOSE + 8:
-	  case MCLOSE + 9:
+	  case MCLOSE:  /* Match end: \ze \) */
 	    {
-		no = op - MCLOSE;
+		no = OPERAND_MIN(scan);
 		cleanup_subexpr();
 		rp = regstack_push(RS_MCLOSE, scan);
 		if (rp == NULL)
@@ -4568,22 +4582,14 @@
 	    break;
 #endif
 
-	  case BACKREF + 1:
-	  case BACKREF + 2:
-	  case BACKREF + 3:
-	  case BACKREF + 4:
-	  case BACKREF + 5:
-	  case BACKREF + 6:
-	  case BACKREF + 7:
-	  case BACKREF + 8:
-	  case BACKREF + 9:
+	  case BACKREF:
 	    {
 		int		len;
 		linenr_T	clnum;
 		colnr_T		ccol;
 		char_u		*p;
 
-		no = op - BACKREF;
+		no = OPERAND_MIN(scan);
 		cleanup_subexpr();
 		if (!REG_MULTI)		/* Single-line regexp */
 		{
@@ -7055,6 +7061,19 @@
       {
 	if (c == '&' && magic)
 	    no = 0;
+	else if (c == '$'  &&  *src != NUL && *src == '{')
+	{
+	    int t=0;
+	    src++;
+	    while ( '0' <= *src && *src <= '9' && *src != NUL)
+	    {
+		 t = 10*t + *src - '0';
+		 ++src;
+	    }
+	    if (*src == '}')
+	      ++src;
+	    no=t;
+	}
 	else if (c == '\\' && *src != NUL)
 	{
 	    if (*src == '&' && !magic)
diff --git a/src/regexp.h b/src/regexp.h
--- a/src/regexp.h
+++ b/src/regexp.h
@@ -19,7 +19,7 @@
  * The second one (index 1) is the first sub-match, referenced with "\1".
  * This goes up to the tenth (index 9), referenced with "\9".
  */
-#define NSUBEXP  10
+#define NSUBEXP  100
 
 /*
  * Structure returned by vim_regcomp() to pass on to vim_regexec().

[Patch] allow more than 10 capturing groups

Raspunde prin e-mail lui