We have a couple of open bug reports in which the regex library takes
unreasonable amounts of time and/or memory to process a regexp.  While
I have hopes that both of the known issues can be improved, it seems
likely that there will always be cases where sufficiently complicated
regexps just take too long.  Right now, any such case is an effective
denial of service (DoS), since the regex code contains no provisions for
detecting a pending query cancel: the backend just locks up for however
long it takes, unless it runs out of memory.  I've resisted adding such provisions
because I have hopes of making that code into a standalone library someday
... but I think it's time to capitulate.  Arguably, other applications
making use of a regex library could want query-cancel support too, so as
long as there's an API for specifying a callback function to detect a
pending query cancel, this doesn't seem any worse than some of the other
things we've done to Spencer's code.

In the attached proposed patch, I've not actually invented such an API,
but I have encapsulated the cancel check in an internal callback function,
so that it wouldn't take much more work to add an API for setting a
different callback function.

I don't claim that the two places I added cancel checks necessarily
provide 100% coverage of long code paths in the library; but they're
pretty central to regex compilation and execution respectively, and
they do stop the two known problem cases quite quickly.  We can always
add more checks if found necessary.

Barring objection or better ideas, I propose to back-patch this into
all live branches.

                        regards, tom lane

diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c
index ae2dbe4..f6dad01 100644
*** a/src/backend/regex/regc_nfa.c
--- b/src/backend/regex/regc_nfa.c
*************** newstate(struct nfa * nfa)
*** 174,184 ****
--- 174,196 ----
  {
  	struct state *s;
  
+ 	/*
+ 	 * This is a handy place to check for operation cancel during regex
+ 	 * compilation, since no code path will go very long without making a new
+ 	 * state.
+ 	 */
+ 	if (CANCEL_REQUESTED(nfa->v->re))
+ 	{
+ 		NERR(REG_CANCEL);
+ 		return NULL;
+ 	}
+ 
  	if (TooManyStates(nfa))
  	{
  		NERR(REG_ETOOBIG);
  		return NULL;
  	}
+ 
  	if (nfa->free != NULL)
  	{
  		s = nfa->free;
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
index ca1ccc5..f10f7a8 100644
*** a/src/backend/regex/regcomp.c
--- b/src/backend/regex/regcomp.c
***************
*** 34,39 ****
--- 34,41 ----
  
  #include "regex/regguts.h"
  
+ #include "miscadmin.h"			/* needed by rcancelrequested() */
+ 
  /*
   * forward declarations, up here so forward datatypes etc. are defined early
   */
*************** static long nfanode(struct vars *, struc
*** 67,72 ****
--- 69,75 ----
  static int	newlacon(struct vars *, struct state *, struct state *, int);
  static void freelacons(struct subre *, int);
  static void rfree(regex_t *);
+ static int	rcancelrequested(void);
  
  #ifdef REG_DEBUG
  static void dump(regex_t *, FILE *);
*************** struct vars
*** 276,281 ****
--- 279,285 ----
  /* static function list */
  static const struct fns functions = {
  	rfree,						/* regfree insides */
+ 	rcancelrequested			/* check for cancel request */
  };
  
  
*************** rfree(regex_t *re)
*** 1893,1898 ****
--- 1897,1918 ----
  	}
  }
  
+ /*
+  * rcancelrequested - check for external request to cancel regex operation
+  *
+  * Return nonzero to fail the operation with error code REG_CANCEL,
+  * zero to keep going
+  *
+  * The current implementation is Postgres-specific.  If we ever get around
+  * to splitting this out as a standalone library, there will need to be some
+  * API to let applications provide a callback function for this.
+  */
+ static int
+ rcancelrequested(void)
+ {
+ 	return InterruptPending && (QueryCancelPending || ProcDiePending);
+ }
+ 
  #ifdef REG_DEBUG
  
  /*
diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c
index 78ebdee..0edb83c 100644
*** a/src/backend/regex/regexec.c
--- b/src/backend/regex/regexec.c
*************** cdissect(struct vars * v,
*** 595,600 ****
--- 595,604 ----
  	assert(t != NULL);
  	MDEBUG(("cdissect %ld-%ld %c\n", LOFF(begin), LOFF(end), t->op));
  
+ 	/* handy place to check for operation cancel */
+ 	if (CANCEL_REQUESTED(v->re))
+ 		return REG_CANCEL;
+ 
  	switch (t->op)
  	{
  		case '=':				/* terminal node */
diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c
index 588052c..7c5b0d5 100644
*** a/src/backend/utils/adt/regexp.c
--- b/src/backend/utils/adt/regexp.c
***************
*** 31,36 ****
--- 31,37 ----
  
  #include "catalog/pg_type.h"
  #include "funcapi.h"
+ #include "miscadmin.h"
  #include "regex/regex.h"
  #include "utils/array.h"
  #include "utils/builtins.h"
*************** RE_compile_and_cache(text *text_re, int 
*** 188,193 ****
--- 189,203 ----
  	if (regcomp_result != REG_OKAY)
  	{
  		/* re didn't compile (no need for pg_regfree, if so) */
+ 
+ 		/*
+ 		 * Here and in other places in this file, do CHECK_FOR_INTERRUPTS
+ 		 * before reporting a regex error.	This is so that if the regex
+ 		 * library aborts and returns REG_CANCEL, we don't print an error
+ 		 * message that implies the regex was invalid.
+ 		 */
+ 		CHECK_FOR_INTERRUPTS();
+ 
  		pg_regerror(regcomp_result, &re_temp.cre_re, errMsg, sizeof(errMsg));
  		ereport(ERROR,
  				(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
*************** RE_wchar_execute(regex_t *re, pg_wchar *
*** 268,273 ****
--- 278,284 ----
  	if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
  	{
  		/* re failed??? */
+ 		CHECK_FOR_INTERRUPTS();
  		pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
  		ereport(ERROR,
  				(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
*************** regexp_fixed_prefix(text *text_re, bool 
*** 1216,1221 ****
--- 1227,1233 ----
  
  		default:
  			/* re failed??? */
+ 			CHECK_FOR_INTERRUPTS();
  			pg_regerror(re_result, re, errMsg, sizeof(errMsg));
  			ereport(ERROR,
  					(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 8ac402b..cb07a06 100644
*** a/src/backend/utils/adt/varlena.c
--- b/src/backend/utils/adt/varlena.c
*************** replace_text_regexp(text *src_text, void
*** 3035,3040 ****
--- 3035,3041 ----
  		{
  			char		errMsg[100];
  
+ 			CHECK_FOR_INTERRUPTS();
  			pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
  			ereport(ERROR,
  					(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
diff --git a/src/include/regex/regerrs.h b/src/include/regex/regerrs.h
index f02711e..809b511 100644
*** a/src/include/regex/regerrs.h
--- b/src/include/regex/regerrs.h
***************
*** 81,83 ****
--- 81,87 ----
  {
  	REG_ECOLORS, "REG_ECOLORS", "too many colors"
  },
+ 
+ {
+ 	REG_CANCEL, "REG_CANCEL", "operation cancelled"
+ },
diff --git a/src/include/regex/regex.h b/src/include/regex/regex.h
index 3e87dff..2c7fa4d 100644
*** a/src/include/regex/regex.h
--- b/src/include/regex/regex.h
*************** typedef struct
*** 154,159 ****
--- 154,160 ----
  #define REG_BADOPT	18			/* invalid embedded option */
  #define REG_ETOOBIG 19			/* nfa has too many states */
  #define REG_ECOLORS 20			/* too many colors */
+ #define REG_CANCEL	21			/* operation cancelled */
  /* two specials for debugging and testing */
  #define REG_ATOI	101			/* convert error-code name to number */
  #define REG_ITOA	102			/* convert error-code number to name */
diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h
index 3a39755..5361411 100644
*** a/src/include/regex/regguts.h
--- b/src/include/regex/regguts.h
*************** struct subre
*** 446,453 ****
--- 446,456 ----
  struct fns
  {
  	void		FUNCPTR(free, (regex_t *));
+ 	int			FUNCPTR(cancel_requested, (void));
  };
  
+ #define CANCEL_REQUESTED(re)  \
+ 	((*((struct fns *) (re)->re_fns)->cancel_requested) ())
  
  
  /*
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to