Re: [PATCHES] WIP patch adding new regexp functions

Jeremy Drake Wed, 07 Feb 2007 16:21:12 -0800

On Wed, 7 Feb 2007, David Fetter wrote:

> On Wed, Feb 07, 2007 at 09:23:58AM -0500, Tom Lane wrote:
> > Jeremy Drake <[EMAIL PROTECTED]> writes:
> > > * Put together a patch to add these functions to core.  I could put them
> > >   directly in regexp.c, so the support functions could stay static.  My
> > >   concern here is that I don't know if there are any functions currently
> > >   in core with OUT parameters.
> >
> > As of 8.2 there are.
> >
> > If we are going to include these I would vote for core not contrib
> > status, exactly to avoid having to export those functions.
>
> +1 for core. :)


Here is a patch to add these functions to core.  Please take a look and
let me know what you think.  I had to jump through a bit of a hoop to
avoid opr_sanity test from breaking, may not have done that right.

Also, this patch allows regexp_replace to take more flags, since my two
new functions needed flags also, I split flag parsing into a seperate
function and made regexp_replace use it too.  It also results that the
error message for an invalid flag in regexp_replace changes, since it is
now more generic as it is called from multiple functions.

I still need to write documentation for the new functions before I
consider the patch completed, but please take a look at the code and see
if it is acceptable as I do not have any further changes planned for it.

-- 
Chicken Soup, n.:
        An ancient miracle drug containing equal parts of aureomycin,
        cocaine, interferon, and TLC.  The only ailment chicken soup
        can't cure is neurotic dependence on one's mother.
                -- Arthur Naiman, "Every Goy's Guide to Yiddish"

Index: src/backend/utils/adt/regexp.c
===================================================================
RCS file: 
/home/jeremyd/local/postgres/cvsuproot/pgsql/src/backend/utils/adt/regexp.c,v
retrieving revision 1.68
diff -c -r1.68 regexp.c
*** src/backend/utils/adt/regexp.c      5 Jan 2007 22:19:41 -0000       1.68
--- src/backend/utils/adt/regexp.c      8 Feb 2007 00:04:08 -0000
***************
*** 29,36 ****
--- 29,39 ----
   */
  #include "postgres.h"
  
+ #include "funcapi.h"
+ #include "access/heapam.h"
  #include "regex/regex.h"
  #include "utils/builtins.h"
+ #include "utils/lsyscache.h"
  #include "utils/guc.h"
  
  
***************
*** 75,80 ****
--- 78,127 ----
        regex_t         cre_re;                 /* the compiled regular 
expression */
  } cached_re_str;
  
+ typedef struct re_comp_flags {
+       bool              return_pre_and_post;
+       bool              glob;
+       int                       cflags;
+ } re_comp_flags;
+ 
+ typedef struct regexp_matches_ctx {
+       text            * orig_str;
+       size_t            orig_len;
+       pg_wchar        * wide_str;
+       size_t            wide_len;
+       regex_t         * cpattern;
+       size_t            offset;
+       regmatch_t      * pmatch;
+ 
+       /* flag options */
+       re_comp_flags flags;
+ 
+       /* return type info */
+       TupleDesc         rettupdesc;
+       Oid                       rettype;
+       TypeFuncClass typefunc;
+ 
+       /* text type info */
+       Oid               param_type;
+       int16             typlen;
+       bool              typbyval;
+       char              typalign;
+ } regexp_matches_ctx;
+ 
+ typedef struct regexp_split_ctx {
+       text            * orig_str;
+       size_t            orig_len;
+       pg_wchar        * wide_str;
+       size_t            wide_len;
+       regex_t         * cpattern;
+       regmatch_t        match;
+       size_t            offset;
+ 
+       /* flag options */
+       re_comp_flags flags;
+ } regexp_split_ctx;
+ 
+ 
  static int    num_res = 0;            /* # of cached re's */
  static cached_re_str re_array[MAX_CACHED_RES];        /* cached re's */
  
***************
*** 191,238 ****
  }
  
  /*
!  * RE_compile_and_execute - compile and execute a RE
   *
   * Returns TRUE on match, FALSE on no match
   *
!  *    text_re --- the pattern, expressed as an *untoasted* TEXT object
!  *    dat --- the data to match against (need not be null-terminated)
!  *    dat_len --- the length of the data string
!  *    cflags --- compile options for the pattern
   *    nmatch, pmatch  --- optional return area for match details
   *
!  * Both pattern and data are given in the database encoding.  We internally
!  * convert to array of pg_wchar which is what Spencer's regex package wants.
   */
  static bool
! RE_compile_and_execute(text *text_re, char *dat, int dat_len,
!                                          int cflags, int nmatch, regmatch_t 
*pmatch)
  {
-       pg_wchar   *data;
-       size_t          data_len;
        int                     regexec_result;
-       regex_t    *re;
        char            errMsg[100];
  
-       /* Convert data string to wide characters */
-       data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
-       data_len = pg_mb2wchar_with_len(dat, data, dat_len);
- 
-       /* Compile RE */
-       re = RE_compile_and_cache(text_re, cflags);
- 
        /* Perform RE match and return result */
        regexec_result = pg_regexec(re,
                                                                data,
                                                                data_len,
!                                                               0,
                                                                NULL,   /* no 
details */
                                                                nmatch,
                                                                pmatch,
                                                                0);
  
-       pfree(data);
- 
        if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
        {
                /* re failed??? */
--- 238,273 ----
  }
  
  /*
!  * RE_wchar_execute - execute a RE
   *
   * Returns TRUE on match, FALSE on no match
   *
!  *    re --- the compiled pattern as returned by RE_compile_and_cache
!  *    data --- the data to match against (need not be null-terminated)
!  *    data_len --- the length of the data string
!  *    start_search -- the offset in the data to start searching
   *    nmatch, pmatch  --- optional return area for match details
   *
!  * Data is given as array of pg_wchar which is what Spencer's regex package
!  * wants.
   */
  static bool
! RE_wchar_execute(regex_t *re, pg_wchar *data, int data_len, size_t 
start_search,
!                                          int nmatch, regmatch_t *pmatch)
  {
        int                     regexec_result;
        char            errMsg[100];
  
        /* Perform RE match and return result */
        regexec_result = pg_regexec(re,
                                                                data,
                                                                data_len,
!                                                               start_search,
                                                                NULL,   /* no 
details */
                                                                nmatch,
                                                                pmatch,
                                                                0);
  
        if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
        {
                /* re failed??? */
***************
*** 245,250 ****
--- 280,391 ----
        return (regexec_result == REG_OKAY);
  }
  
+ /*
+  * RE_execute - execute a RE
+  *
+  * Returns TRUE on match, FALSE on no match
+  *
+  *    re --- the compiled pattern as returned by RE_compile_and_cache
+  *    dat --- the data to match against (need not be null-terminated)
+  *    dat_len --- the length of the data string
+  *    nmatch, pmatch  --- optional return area for match details
+  *
+  * Data is given in the database encoding.  We internally
+  * convert to array of pg_wchar which is what Spencer's regex package wants.
+  */
+ static bool
+ RE_execute(regex_t *re, char *dat, int dat_len,
+                                          int nmatch, regmatch_t *pmatch)
+ {
+       pg_wchar   *data;
+       size_t          data_len;
+       bool            match;
+ 
+       /* Convert data string to wide characters */
+       data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
+       data_len = pg_mb2wchar_with_len(dat, data, dat_len);
+ 
+       /* Perform RE match and return result */
+       match = RE_wchar_execute(re, data, data_len, 0, nmatch, pmatch);
+       pfree(data);
+       return match;
+ }
+ 
+ /*
+  * RE_compile_and_execute - compile and execute a RE
+  *
+  * Returns TRUE on match, FALSE on no match
+  *
+  *    text_re --- the pattern, expressed as an *untoasted* TEXT object
+  *    dat --- the data to match against (need not be null-terminated)
+  *    dat_len --- the length of the data string
+  *    cflags --- compile options for the pattern
+  *    nmatch, pmatch  --- optional return area for match details
+  *
+  * Both pattern and data are given in the database encoding.  We internally
+  * convert to array of pg_wchar which is what Spencer's regex package wants.
+  */
+ static bool
+ RE_compile_and_execute(text *text_re, char *dat, int dat_len,
+                                          int cflags, int nmatch, regmatch_t 
*pmatch)
+ {
+       regex_t    *re;
+ 
+       /* Compile RE */
+       re = RE_compile_and_cache(text_re, cflags);
+ 
+       return RE_execute(re, dat, dat_len, nmatch, pmatch);
+ }
+ 
+ static void parse_re_comp_flags(re_comp_flags *flags, text *o)
+ {
+       memset(flags, 0, sizeof(re_comp_flags));
+       flags->cflags = regex_flavor;
+ 
+       if (o)
+       {
+               char * opt_p = VARDATA(o);
+               size_t opt_len = (VARSIZE(o) - VARHDRSZ);
+               size_t i;
+ 
+               for (i = 0; i < opt_len; i++)
+               {
+                       switch (opt_p[i])
+                       {
+                               case 'g':
+                                       flags->glob = true;
+                                       break;
+                               case 'i':
+                                       flags->cflags |= REG_ICASE;
+                                       break;
+                               case 'm':
+                               case 'n':
+                                       flags->cflags |= REG_NEWLINE;
+                                       break;
+                               case 'p':
+                                       flags->cflags |= REG_NLSTOP;
+                                       flags->cflags &= ~REG_NLANCH;
+                                       break;
+                               case 'r':
+                                       flags->return_pre_and_post = true;
+                                       break;
+                               case 'w':
+                                       flags->cflags &= ~REG_NLSTOP;
+                                       flags->cflags |= REG_NLANCH;
+                                       break;
+                               case 'x':
+                                       flags->cflags |= REG_EXPANDED;
+                                       break;
+                               default:
+                                       ereport(ERROR,
+                                                       
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                                        errmsg("invalid regexp 
option: %c", opt_p[i])));
+                                       break;
+                       }
+               }
+       }
+ }
+ 
  
  /*
   * assign_regex_flavor - GUC hook to validate and set REGEX_FLAVOR
***************
*** 469,507 ****
        text       *p = PG_GETARG_TEXT_P(1);
        text       *r = PG_GETARG_TEXT_P(2);
        text       *opt = PG_GETARG_TEXT_P(3);
-       char       *opt_p = VARDATA(opt);
-       int                     opt_len = (VARSIZE(opt) - VARHDRSZ);
-       int                     i;
-       bool            glob = false;
-       bool            ignorecase = false;
        regex_t    *re;
  
        /* parse options */
!       for (i = 0; i < opt_len; i++)
!       {
!               switch (opt_p[i])
!               {
!                       case 'i':
!                               ignorecase = true;
!                               break;
!                       case 'g':
!                               glob = true;
!                               break;
!                       default:
!                               ereport(ERROR,
!                                               
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
!                                                errmsg("invalid option of 
regexp_replace: %c",
!                                                               opt_p[i])));
!                               break;
!               }
!       }
  
!       if (ignorecase)
!               re = RE_compile_and_cache(p, regex_flavor | REG_ICASE);
!       else
!               re = RE_compile_and_cache(p, regex_flavor);
  
!       PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, glob));
  }
  
  /* similar_escape()
--- 610,629 ----
        text       *p = PG_GETARG_TEXT_P(1);
        text       *r = PG_GETARG_TEXT_P(2);
        text       *opt = PG_GETARG_TEXT_P(3);
        regex_t    *re;
+       re_comp_flags flags;
  
        /* parse options */
!       parse_re_comp_flags(&flags, opt);
  
!       if (flags.return_pre_and_post)
!               ereport(ERROR,
!                               (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
!                                errmsg("invalid option of regexp_replace: 
r")));
  
!       re = RE_compile_and_cache(p, flags.cflags);
! 
!       PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, flags.glob));
  }
  
  /* similar_escape()
***************
*** 625,630 ****
--- 747,1073 ----
        PG_RETURN_TEXT_P(result);
  }
  
+ #define PG_RETURN_DATUM_MAYBE_SRF(_result) \
+       do { \
+               if (doing_srf) \
+                       SRF_RETURN_NEXT(funcctx, _result); \
+               else \
+                       PG_RETURN_DATUM(_result); \
+       } while (0)
+ 
+ #define PG_RETURN_DONE_MAYBE_SRF() \
+       do { \
+               if (doing_srf) \
+                       SRF_RETURN_DONE(funcctx); \
+               else \
+                       PG_RETURN_NULL(); \
+       } while (0)
+ 
+ #define PG_GETARG_TEXT_P_COPY_IF_SRF(_argno) \
+       (doing_srf ? PG_GETARG_TEXT_P_COPY(_argno) : PG_GETARG_TEXT_P(_argno))
+ 
+ #define PG_GETARG_TEXT_P_IF_EXISTS(_n) \
+       (PG_NARGS() > _n ? PG_GETARG_TEXT_P(_n) : NULL)
+ 
+ Datum
+ regexp_matches(PG_FUNCTION_ARGS)
+ {
+       FuncCallContext         *funcctx = NULL;
+       MemoryContext            oldcontext = NULL;
+       regexp_matches_ctx      *matchctx = NULL;
+       re_comp_flags            flags;
+       bool doing_srf = (fcinfo->resultinfo != NULL && IsA(fcinfo->resultinfo, 
ReturnSetInfo));
+ 
+       if (!doing_srf || SRF_IS_FIRSTCALL())
+       {
+               text *p;
+               text *o = PG_GETARG_TEXT_P_IF_EXISTS(2);
+ 
+               parse_re_comp_flags(&flags, o);
+ 
+               /* it is questionable whether or not this is allowable:
+                * we were called as an SRF, but only want to return one row,
+                * this will cheat and return that in a normal rather than
+                * SRF way
+                */
+ #if 0
+               if (!flags.glob)
+                       doing_srf = false;
+ #endif
+ 
+               if (doing_srf)
+               {
+                       funcctx = SRF_FIRSTCALL_INIT();
+                       oldcontext = 
MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+               }
+ 
+               matchctx = palloc0 (sizeof(regexp_matches_ctx));
+ 
+               /* this needs to be visible so copy it if SRF */
+               matchctx->orig_str = PG_GETARG_TEXT_P_COPY_IF_SRF(0);
+               matchctx->orig_len = VARSIZE(matchctx->orig_str) - VARHDRSZ;
+ 
+               p = PG_GETARG_TEXT_P(1);
+ 
+               memcpy(&matchctx->flags, &flags, sizeof(re_comp_flags));
+ 
+               matchctx->cpattern = RE_compile_and_cache(p, 
matchctx->flags.cflags);
+               matchctx->pmatch = palloc0(sizeof(regmatch_t) * 
(matchctx->cpattern->re_nsub + 1));
+               matchctx->offset = 0;
+ 
+               /* get text type oid, too lazy to do it some other way */
+               matchctx->param_type = get_fn_expr_argtype(fcinfo->flinfo, 0);
+               get_typlenbyvalalign(matchctx->param_type, &matchctx->typlen, 
&matchctx->typbyval, &matchctx->typalign);
+ 
+               matchctx->typefunc = get_call_result_type(fcinfo, 
&matchctx->rettype, &matchctx->rettupdesc);
+ 
+               if (matchctx->typefunc == TYPEFUNC_COMPOSITE)
+                       matchctx->rettupdesc = 
BlessTupleDesc(matchctx->rettupdesc);
+ 
+               matchctx->wide_str = (pg_wchar *) palloc((matchctx->orig_len + 
1) * sizeof(pg_wchar));
+               matchctx->wide_len = 
pg_mb2wchar_with_len(VARDATA(matchctx->orig_str), matchctx->wide_str, 
matchctx->orig_len);
+ 
+               matchctx->pmatch[0].rm_so = -1;
+               /* both < 0 but not equal */
+               matchctx->pmatch[0].rm_eo = -2;
+ 
+               if (doing_srf)
+               {
+                       MemoryContextSwitchTo(oldcontext);
+                       funcctx->user_fctx = (void*)matchctx;
+ 
+                       /* Avoid run-away function by making sure we never 
iterate more than the length of the text */
+                       funcctx->max_calls = (matchctx->flags.glob ? (3 * 
matchctx->orig_len) : 1);
+               }
+       }
+ 
+       if (doing_srf)
+       {
+               funcctx = SRF_PERCALL_SETUP();
+               matchctx = (regexp_matches_ctx*)(funcctx->user_fctx);
+ 
+               if (funcctx->call_cntr >= funcctx->max_calls)
+               {
+                       if (funcctx->max_calls != 1)
+                               ereport(ERROR,
+                                               
(errcode(ERRCODE_INTERNAL_ERROR),
+                                                errmsg("set returning match 
function terminated after iterating %d times", funcctx->call_cntr)));
+                       SRF_RETURN_DONE(funcctx);
+               }
+       }
+ 
+       if (matchctx->offset < matchctx->orig_len)
+       {
+               if (matchctx->pmatch[0].rm_so == matchctx->pmatch[0].rm_eo)
+                       matchctx->offset++;
+ 
+               if (RE_wchar_execute(matchctx->cpattern,
+                                       matchctx->wide_str,
+                                       matchctx->wide_len,
+                                       matchctx->offset,
+                                       matchctx->cpattern->re_nsub + 1,
+                                       matchctx->pmatch))
+               {
+                       ArrayType * resultarray;
+                       Datum result;
+                       Datum elems[matchctx->cpattern->re_nsub];
+                       bool nulls[matchctx->cpattern->re_nsub];
+                       size_t i;
+                       int ndims = 1;
+                       int dims[1] = {matchctx->cpattern->re_nsub};
+                       int lbs[1] = {1};
+ 
+                       for (i = 0; i < matchctx->cpattern->re_nsub; ++i)
+                       {
+                               int so = matchctx->pmatch[i+1].rm_so,
+                                       eo = matchctx->pmatch[i+1].rm_eo;
+ 
+                               if (so < 0 || eo < 0)
+                               {
+                                       elems[i] = 0;
+                                       nulls[i] = true;
+                               }
+                               else
+                               {
+                                       elems[i] = 
DirectFunctionCall3(text_substr,
+                                                       
PointerGetDatum(matchctx->orig_str),
+                                                       Int32GetDatum(so + 1),
+                                                       Int32GetDatum(eo - so));
+                                       nulls[i] = false;
+                               }
+                       }
+ 
+                       resultarray = construct_md_array(elems, nulls, ndims, 
dims, lbs,
+                                       matchctx->param_type, matchctx->typlen, 
matchctx->typbyval, matchctx->typalign);
+ 
+                       if (matchctx->typefunc == TYPEFUNC_COMPOSITE)
+                       {
+                               Datum vals[4];
+                               bool  valnulls[4] = {true, true, true, true};
+ 
+                               if (matchctx->flags.return_pre_and_post)
+                               {
+                                       valnulls[0] = valnulls[3] = false;
+ 
+                                       vals[0] = 
DirectFunctionCall3(text_substr,
+                                                       
PointerGetDatum(matchctx->orig_str),
+                                                       Int32GetDatum(1),
+                                                       
Int32GetDatum(matchctx->pmatch[0].rm_so));
+ 
+                                       vals[3] = 
DirectFunctionCall2(text_substr_no_len,
+                                                       
PointerGetDatum(matchctx->orig_str),
+                                                       
Int32GetDatum(matchctx->pmatch[0].rm_eo + 1));
+                               }
+ 
+                               vals[1] = DirectFunctionCall3(text_substr,
+                                               
PointerGetDatum(matchctx->orig_str),
+                                               
Int32GetDatum(matchctx->pmatch[0].rm_so + 1),
+                                               
Int32GetDatum(matchctx->pmatch[0].rm_eo - matchctx->pmatch[0].rm_so));
+                               valnulls[1] = false;
+ 
+ 
+                               vals[2] = PointerGetDatum(resultarray);
+                               valnulls[2] = false;
+ 
+                               result = HeapTupleGetDatum(
+                                               heap_form_tuple 
(matchctx->rettupdesc, vals, valnulls));
+                       }
+                       else
+                       {
+                               result = PointerGetDatum(resultarray);
+                       }
+ 
+                       matchctx->offset = matchctx->pmatch[0].rm_eo;
+                       PG_RETURN_DATUM_MAYBE_SRF(result);
+               }
+               /* else fall through and return done (or null) */
+       }
+ 
+       PG_RETURN_DONE_MAYBE_SRF();
+ }
+ 
+ /* Hack to make oprsanity happy */
+ Datum regexp_matches_noopts(PG_FUNCTION_ARGS)
+ {
+       return regexp_matches(fcinfo);
+ }
+ 
+ Datum
+ regexp_split(PG_FUNCTION_ARGS)
+ {
+       FuncCallContext  *funcctx;
+       MemoryContext     oldcontext;
+       regexp_split_ctx *splitctx;
+ 
+       if (SRF_IS_FIRSTCALL())
+       {
+               text * o = PG_GETARG_TEXT_P_IF_EXISTS(2);
+ 
+               funcctx = SRF_FIRSTCALL_INIT();
+               oldcontext = 
MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+               splitctx = (regexp_split_ctx *) 
palloc(sizeof(regexp_split_ctx));
+ 
+               splitctx->orig_str = PG_GETARG_TEXT_P_COPY(0);
+               splitctx->orig_len = VARSIZE(splitctx->orig_str) - VARHDRSZ;
+ 
+               parse_re_comp_flags(&splitctx->flags, o);
+               if (splitctx->flags.glob)
+               {
+                       ereport(ERROR,
+                                       
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                        errmsg("regexp_split does not support 
the global option")));
+               }
+ 
+               if (splitctx->flags.return_pre_and_post)
+               {
+                       ereport(ERROR,
+                                       
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                        errmsg("regexp_split does not support 
the return pre and postmatch option")));
+               }
+ 
+ 
+               splitctx->cpattern = RE_compile_and_cache(PG_GETARG_TEXT_P(1), 
splitctx->flags.cflags);
+ 
+               splitctx->wide_str = (pg_wchar *) palloc((splitctx->orig_len + 
1) * sizeof(pg_wchar));
+               splitctx->wide_len = 
pg_mb2wchar_with_len(VARDATA(splitctx->orig_str), splitctx->wide_str, 
splitctx->orig_len);
+ 
+               splitctx->offset = 0;
+ 
+               splitctx->match.rm_so = -1;
+               /* both < 0 but not equal */
+               splitctx->match.rm_eo = -2;
+ 
+               MemoryContextSwitchTo(oldcontext);
+               funcctx->user_fctx = (void*)splitctx;
+ 
+               /* Avoid run-away function by making sure we never iterate more 
than the length of the text */
+               funcctx->max_calls = 3 * splitctx->orig_len;
+       }
+ 
+       funcctx = SRF_PERCALL_SETUP();
+       splitctx = (regexp_split_ctx*)(funcctx->user_fctx);
+ 
+       if (funcctx->call_cntr > funcctx->max_calls)
+       {
+               ereport(ERROR,
+                               (errcode(ERRCODE_INTERNAL_ERROR),
+                                errmsg("set returning split function 
terminated after iterating %d times", funcctx->call_cntr)));
+               SRF_RETURN_DONE(funcctx);
+       }
+ 
+       if (splitctx->offset < splitctx->orig_len)
+       {
+               regmatch_t * pmatch = &(splitctx->match);
+               bool first_match = (pmatch->rm_so < 0 || pmatch->rm_eo < 0);
+               bool incremented_offset = false;
+               do
+               {
+                       if (pmatch->rm_so == pmatch->rm_eo)
+                       {
+                               splitctx->offset++;
+                               incremented_offset = true;
+                       }
+ 
+                       if (RE_wchar_execute(splitctx->cpattern,
+                                               splitctx->wide_str,
+                                               splitctx->wide_len,
+                                               splitctx->offset,
+                                               1,
+                                               pmatch))
+                       {
+                               if (!((first_match && pmatch->rm_so == 
pmatch->rm_eo) || (pmatch->rm_so == splitctx->offset && !incremented_offset)))
+                               {
+                                       Datum result = 
DirectFunctionCall3(text_substr,
+                                                       
PointerGetDatum(splitctx->orig_str),
+                                                       
Int32GetDatum(splitctx->offset + (incremented_offset ? 0 : 1)),
+                                                       
Int32GetDatum(pmatch->rm_so - splitctx->offset + (incremented_offset ? 1 : 0)));
+                                       splitctx->offset = pmatch->rm_eo;
+                                       SRF_RETURN_NEXT(funcctx, result);
+                               }
+                       }
+                       else
+                       {
+                               /* no more matches, return rest of string */
+                               Datum result = 
DirectFunctionCall2(text_substr_no_len,
+                                               
PointerGetDatum(splitctx->orig_str),
+                                               Int32GetDatum(splitctx->offset 
+ (incremented_offset ? 0 : 1)));
+                               /* so we know we're done next time thru */
+                               splitctx->offset = splitctx->orig_len;
+                               SRF_RETURN_NEXT(funcctx, result);
+                       }
+                       first_match = false;
+               } while (1);
+       }
+ 
+       SRF_RETURN_DONE(funcctx);
+ }
+ 
+ /* Hack to make oprsanity happy */
+ Datum regexp_split_noopts(PG_FUNCTION_ARGS)
+ {
+       return regexp_split(fcinfo);
+ }
+ 
  /*
   * report whether regex_flavor is currently BASIC
   */
Index: src/include/catalog/pg_proc.h
===================================================================
RCS file: 
/home/jeremyd/local/postgres/cvsuproot/pgsql/src/include/catalog/pg_proc.h,v
retrieving revision 1.443
diff -c -r1.443 pg_proc.h
*** src/include/catalog/pg_proc.h       7 Feb 2007 23:11:30 -0000       1.443
--- src/include/catalog/pg_proc.h       7 Feb 2007 23:50:47 -0000
***************
*** 2259,2266 ****
--- 2259,2274 ----
  DESCR("replace text using regexp");
  DATA(insert OID =  2285 ( regexp_replace         PGNSP PGUID 12 1 0 f f t f i 
4 25 "25 25 25 25" _null_ _null_ _null_ textregexreplace - _null_ ));
  DESCR("replace text using regexp");
+ DATA(insert OID =  2760 ( regexp_matches   PGNSP PGUID 12 1 0 f f t f i 2 
1009 "25 25" _null_ _null_ _null_   regexp_matches_noopts - _null_ ));
+ DESCR("return all match groups for regexp");
+ DATA(insert OID =  2761 ( regexp_matches   PGNSP PGUID 12 1 10 f f t t i 3 
2249 "25 25 25" "{25,25,25,25,25,1009,25}" "{i,i,i,o,o,o,o}" 
"{str,pattern,flags,prematch,fullmatch,matches,postmatch}"    regexp_matches - 
_null_ ));
+ DESCR("return all match groups for regexp");
  DATA(insert OID =  2088 ( split_part   PGNSP PGUID 12 1 0 f f t f i 3 25 "25 
25 23" _null_ _null_ _null_      split_text - _null_ ));
  DESCR("split string by field_sep and return field_num");
+ DATA(insert OID =  2762 ( regexp_split PGNSP PGUID 12 1 1000 f f t t i 2 25 
"25 25" _null_ _null_ _null_      regexp_split_noopts - _null_ ));
+ DESCR("split string by pattern");
+ DATA(insert OID =  2763 ( regexp_split PGNSP PGUID 12 1 1000 f f t t i 3 25 
"25 25 25" _null_ _null_ _null_   regexp_split - _null_ ));
+ DESCR("split string by pattern");
  DATA(insert OID =  2089 ( to_hex         PGNSP PGUID 12 1 0 f f t f i 1 25 
"23" _null_ _null_ _null_  to_hex32 - _null_ ));
  DESCR("convert int4 number to hex");
  DATA(insert OID =  2090 ( to_hex         PGNSP PGUID 12 1 0 f f t f i 1 25 
"20" _null_ _null_ _null_  to_hex64 - _null_ ));
Index: src/include/utils/builtins.h
===================================================================
RCS file: 
/home/jeremyd/local/postgres/cvsuproot/pgsql/src/include/utils/builtins.h,v
retrieving revision 1.287
diff -c -r1.287 builtins.h
*** src/include/utils/builtins.h        28 Jan 2007 16:16:54 -0000      1.287
--- src/include/utils/builtins.h        8 Feb 2007 00:02:06 -0000
***************
*** 478,483 ****
--- 478,487 ----
  extern Datum textregexreplace_noopt(PG_FUNCTION_ARGS);
  extern Datum textregexreplace(PG_FUNCTION_ARGS);
  extern Datum similar_escape(PG_FUNCTION_ARGS);
+ extern Datum regexp_matches(PG_FUNCTION_ARGS);
+ extern Datum regexp_matches_noopts(PG_FUNCTION_ARGS);
+ extern Datum regexp_split(PG_FUNCTION_ARGS);
+ extern Datum regexp_split_noopts(PG_FUNCTION_ARGS);
  extern bool regex_flavor_is_basic(void);
  
  /* regproc.c */
Index: src/test/regress/expected/strings.out
===================================================================
RCS file: 
/home/jeremyd/local/postgres/cvsuproot/pgsql/src/test/regress/expected/strings.out,v
retrieving revision 1.29
diff -c -r1.29 strings.out
*** src/test/regress/expected/strings.out       11 May 2006 19:15:36 -0000      
1.29
--- src/test/regress/expected/strings.out       8 Feb 2007 00:00:05 -0000
***************
*** 217,225 ****
   Z Z
  (1 row)
  
! -- invalid option of REGEXP_REPLACE
  SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'z');
! ERROR:  invalid option of regexp_replace: z
  -- E021-11 position expression
  SELECT POSITION('4' IN '1234567890') = '4' AS "4";
   4 
--- 217,479 ----
   Z Z
  (1 row)
  
! -- invalid regexp option
  SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'z');
! ERROR:  invalid regexp option: z
! -- set so we can tell NULL from empty string
! \pset null '\\N'
! -- return all matches from regexp
! SELECT * FROM regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$);
!  regexp_matches 
! ----------------
!  {bar,beque}
! (1 row)
! 
! SELECT * FROM regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$, '');
!  prematch | fullmatch |   matches   | postmatch 
! ----------+-----------+-------------+-----------
!  \N       | barbeque  | {bar,beque} | \N
! (1 row)
! 
! SELECT * FROM regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$, 'r');
!  prematch | fullmatch |   matches   | postmatch 
! ----------+-----------+-------------+-----------
!  foo      | barbeque  | {bar,beque} | baz
! (1 row)
! 
! -- test case insensitive - first variant not possible
! SELECT * FROM regexp_matches('foObARbEqUEbAz', $re$(bar)(beque)$re$, 'i');
!  prematch | fullmatch |   matches   | postmatch 
! ----------+-----------+-------------+-----------
!  \N       | bARbEqUE  | {bAR,bEqUE} | \N
! (1 row)
! 
! SELECT * FROM regexp_matches('foObARbEqUEbAz', $re$(bar)(beque)$re$, 'ir');
!  prematch | fullmatch |   matches   | postmatch 
! ----------+-----------+-------------+-----------
!  foO      | bARbEqUE  | {bAR,bEqUE} | bAz
! (1 row)
! 
! -- global option - more than one match
! SELECT * FROM regexp_matches('foobarbequebazilbarfbonk', 
$re$(b[^b]+)(b[^b]+)$re$, 'gr');
!   prematch   | fullmatch |   matches    |   postmatch   
! -------------+-----------+--------------+---------------
!  foo         | barbeque  | {bar,beque}  | bazilbarfbonk
!  foobarbeque | bazilbarf | {bazil,barf} | bonk
! (2 rows)
! 
! -- empty capture group (matched empty string)
! SELECT * FROM regexp_matches('foobarbequebaz', $re$(bar)(.*)(beque)$re$);
!  regexp_matches 
! ----------------
!  {bar,"",beque}
! (1 row)
! 
! -- no match
! SELECT * FROM regexp_matches('foobarbequebaz', $re$(bar)(.+)(beque)$re$);
!  regexp_matches 
! ----------------
!  \N
! (1 row)
! 
! -- optional capture group did not match, null entry in array
! SELECT * FROM regexp_matches('foobarbequebaz', $re$(bar)(.+)?(beque)$re$);
!   regexp_matches  
! ------------------
!  {bar,NULL,beque}
! (1 row)
! 
! -- nothing before match, empty string
! SELECT * FROM regexp_matches('barbequebaz', $re$(bar)(beque)$re$, 'r');
!  prematch | fullmatch |   matches   | postmatch 
! ----------+-----------+-------------+-----------
!           | barbeque  | {bar,beque} | baz
! (1 row)
! 
! SELECT * FROM regexp_matches('barbequebaz', $re$^(bar)(beque)$re$, 'r');
!  prematch | fullmatch |   matches   | postmatch 
! ----------+-----------+-------------+-----------
!           | barbeque  | {bar,beque} | baz
! (1 row)
! 
! SELECT * FROM regexp_matches('barbequebaz', $re$(^bar)(beque)$re$, 'r');
!  prematch | fullmatch |   matches   | postmatch 
! ----------+-----------+-------------+-----------
!           | barbeque  | {bar,beque} | baz
! (1 row)
! 
! -- nothing after match, empty string
! SELECT * FROM regexp_matches('foobarbeque', $re$(bar)(beque)$re$, 'r');
!  prematch | fullmatch |   matches   | postmatch 
! ----------+-----------+-------------+-----------
!  foo      | barbeque  | {bar,beque} | 
! (1 row)
! 
! SELECT * FROM regexp_matches('foobarbeque', $re$(bar)(beque)$$re$, 'r');
!  prematch | fullmatch |   matches   | postmatch 
! ----------+-----------+-------------+-----------
!  foo      | barbeque  | {bar,beque} | 
! (1 row)
! 
! SELECT * FROM regexp_matches('foobarbeque', $re$(bar)(beque$)$re$, 'r');
!  prematch | fullmatch |   matches   | postmatch 
! ----------+-----------+-------------+-----------
!  foo      | barbeque  | {bar,beque} | 
! (1 row)
! 
! -- no capture groups, should first throw error?
! SELECT * FROM regexp_matches('foobarbequebaz', $re$barbeque$re$);
!  regexp_matches 
! ----------------
!  {}
! (1 row)
! 
! SELECT * FROM regexp_matches('foobarbequebaz', $re$barbeque$re$, '');
!  prematch | fullmatch | matches | postmatch 
! ----------+-----------+---------+-----------
!  \N       | barbeque  | {}      | \N
! (1 row)
! 
! SELECT * FROM regexp_matches('foobarbequebaz', $re$barbeque$re$, 'r');
!  prematch | fullmatch | matches | postmatch 
! ----------+-----------+---------+-----------
!  foo      | barbeque  | {}      | baz
! (1 row)
! 
! -- give me errors
! SELECT * FROM regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$, 
'zipper');
! ERROR:  invalid regexp option: z
! SELECT * FROM regexp_matches('foobarbequebaz', $re$(barbeque$re$, '');
! ERROR:  invalid regular expression: parentheses () not balanced
! SELECT * FROM regexp_matches('foobarbequebaz', $re$(bar)(beque){2,1}$re$, '');
! ERROR:  invalid regular expression: invalid repetition count(s)
! -- split string on regexp
! SELECT foo, length(foo) FROM regexp_split('the quick brown fox jumped over 
the lazy dog', $re$\s+$re$) AS foo;
!   foo   | length 
! --------+--------
!  the    |      3
!  quick  |      5
!  brown  |      5
!  fox    |      3
!  jumped |      6
!  over   |      4
!  the    |      3
!  lazy   |      4
!  dog    |      3
! (9 rows)
! 
! SELECT foo, length(foo) FROM regexp_split('the quick brown fox jumped over 
the lazy dog', $re$\s*$re$) AS foo;
!  foo | length 
! -----+--------
!  t   |      1
!  h   |      1
!  e   |      1
!  q   |      1
!  u   |      1
!  i   |      1
!  c   |      1
!  k   |      1
!  b   |      1
!  r   |      1
!  o   |      1
!  w   |      1
!  n   |      1
!  f   |      1
!  o   |      1
!  x   |      1
!  j   |      1
!  u   |      1
!  m   |      1
!  p   |      1
!  e   |      1
!  d   |      1
!  o   |      1
!  v   |      1
!  e   |      1
!  r   |      1
!  t   |      1
!  h   |      1
!  e   |      1
!  l   |      1
!  a   |      1
!  z   |      1
!  y   |      1
!  d   |      1
!  o   |      1
!  g   |      1
! (36 rows)
! 
! SELECT foo, length(foo) FROM regexp_split('the quick brown fox jumped over 
the lazy dog', '') AS foo;
!  foo | length 
! -----+--------
!  t   |      1
!  h   |      1
!  e   |      1
!      |      1
!  q   |      1
!  u   |      1
!  i   |      1
!  c   |      1
!  k   |      1
!      |      1
!  b   |      1
!  r   |      1
!  o   |      1
!  w   |      1
!  n   |      1
!      |      1
!  f   |      1
!  o   |      1
!  x   |      1
!      |      1
!  j   |      1
!  u   |      1
!  m   |      1
!  p   |      1
!  e   |      1
!  d   |      1
!      |      1
!  o   |      1
!  v   |      1
!  e   |      1
!  r   |      1
!      |      1
!  t   |      1
!  h   |      1
!  e   |      1
!      |      1
!  l   |      1
!  a   |      1
!  z   |      1
!  y   |      1
!      |      1
!  d   |      1
!  o   |      1
!  g   |      1
! (44 rows)
! 
! -- case insensitive
! SELECT foo, length(foo) FROM regexp_split('thE QUick bROWn FOx jUMPed ovEr 
THE lazy dOG', 'e', 'i') AS foo;
!           foo          | length 
! -----------------------+--------
!  th                    |      2
!   QUick bROWn FOx jUMP |     21
!  d ov                  |      4
!  r TH                  |      4
!   lazy dOG             |      9
! (5 rows)
! 
! -- errors
! SELECT foo, length(foo) FROM regexp_split('thE QUick bROWn FOx jUMPed ovEr 
THE lazy dOG', 'e', 'zippy') AS foo;
! ERROR:  invalid regexp option: z
! -- global option meaningless for regexp_split
! SELECT foo, length(foo) FROM regexp_split('thE QUick bROWn FOx jUMPed ovEr 
THE lazy dOG', 'e', 'g') AS foo;
! ERROR:  regexp_split does not support the global option
! -- return pre and post option meaningless for regexp_split
! SELECT foo, length(foo) FROM regexp_split('thE QUick bROWn FOx jUMPed ovEr 
THE lazy dOG', 'e', 'r') AS foo;
! ERROR:  regexp_split does not support the return pre and postmatch option
! -- change NULL-display back
! \pset null ''
  -- E021-11 position expression
  SELECT POSITION('4' IN '1234567890') = '4' AS "4";
   4 
Index: src/test/regress/sql/strings.sql
===================================================================
RCS file: 
/home/jeremyd/local/postgres/cvsuproot/pgsql/src/test/regress/sql/strings.sql,v
retrieving revision 1.18
diff -c -r1.18 strings.sql
*** src/test/regress/sql/strings.sql    6 Mar 2006 19:49:20 -0000       1.18
--- src/test/regress/sql/strings.sql    7 Feb 2007 23:58:23 -0000
***************
*** 85,93 ****
  SELECT regexp_replace('AAA   BBB   CCC   ', E'\\s+', ' ', 'g');
  SELECT regexp_replace('AAA', '^|$', 'Z', 'g');
  SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'gi');
! -- invalid option of REGEXP_REPLACE
  SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'z');
  
  -- E021-11 position expression
  SELECT POSITION('4' IN '1234567890') = '4' AS "4";
  
--- 85,153 ----
  SELECT regexp_replace('AAA   BBB   CCC   ', E'\\s+', ' ', 'g');
  SELECT regexp_replace('AAA', '^|$', 'Z', 'g');
  SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'gi');
! -- invalid regexp option
  SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'z');
  
+ -- set so we can tell NULL from empty string
+ \pset null '\\N'
+ 
+ -- return all matches from regexp
+ SELECT * FROM regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$);
+ SELECT * FROM regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$, '');
+ SELECT * FROM regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$, 'r');
+ 
+ -- test case insensitive - first variant not possible
+ SELECT * FROM regexp_matches('foObARbEqUEbAz', $re$(bar)(beque)$re$, 'i');
+ SELECT * FROM regexp_matches('foObARbEqUEbAz', $re$(bar)(beque)$re$, 'ir');
+ 
+ -- global option - more than one match
+ SELECT * FROM regexp_matches('foobarbequebazilbarfbonk', 
$re$(b[^b]+)(b[^b]+)$re$, 'gr');
+ 
+ -- empty capture group (matched empty string)
+ SELECT * FROM regexp_matches('foobarbequebaz', $re$(bar)(.*)(beque)$re$);
+ -- no match
+ SELECT * FROM regexp_matches('foobarbequebaz', $re$(bar)(.+)(beque)$re$);
+ -- optional capture group did not match, null entry in array
+ SELECT * FROM regexp_matches('foobarbequebaz', $re$(bar)(.+)?(beque)$re$);
+ 
+ -- nothing before match, empty string
+ SELECT * FROM regexp_matches('barbequebaz', $re$(bar)(beque)$re$, 'r');
+ SELECT * FROM regexp_matches('barbequebaz', $re$^(bar)(beque)$re$, 'r');
+ SELECT * FROM regexp_matches('barbequebaz', $re$(^bar)(beque)$re$, 'r');
+ 
+ -- nothing after match, empty string
+ SELECT * FROM regexp_matches('foobarbeque', $re$(bar)(beque)$re$, 'r');
+ SELECT * FROM regexp_matches('foobarbeque', $re$(bar)(beque)$$re$, 'r');
+ SELECT * FROM regexp_matches('foobarbeque', $re$(bar)(beque$)$re$, 'r');
+ 
+ -- no capture groups, should first throw error?
+ SELECT * FROM regexp_matches('foobarbequebaz', $re$barbeque$re$);
+ SELECT * FROM regexp_matches('foobarbequebaz', $re$barbeque$re$, '');
+ SELECT * FROM regexp_matches('foobarbequebaz', $re$barbeque$re$, 'r');
+ 
+ -- give me errors
+ SELECT * FROM regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$, 
'zipper');
+ SELECT * FROM regexp_matches('foobarbequebaz', $re$(barbeque$re$, '');
+ SELECT * FROM regexp_matches('foobarbequebaz', $re$(bar)(beque){2,1}$re$, '');
+ 
+ -- split string on regexp
+ SELECT foo, length(foo) FROM regexp_split('the quick brown fox jumped over 
the lazy dog', $re$\s+$re$) AS foo;
+ 
+ SELECT foo, length(foo) FROM regexp_split('the quick brown fox jumped over 
the lazy dog', $re$\s*$re$) AS foo;
+ SELECT foo, length(foo) FROM regexp_split('the quick brown fox jumped over 
the lazy dog', '') AS foo;
+ -- case insensitive
+ SELECT foo, length(foo) FROM regexp_split('thE QUick bROWn FOx jUMPed ovEr 
THE lazy dOG', 'e', 'i') AS foo;
+ 
+ -- errors
+ SELECT foo, length(foo) FROM regexp_split('thE QUick bROWn FOx jUMPed ovEr 
THE lazy dOG', 'e', 'zippy') AS foo;
+ -- global option meaningless for regexp_split
+ SELECT foo, length(foo) FROM regexp_split('thE QUick bROWn FOx jUMPed ovEr 
THE lazy dOG', 'e', 'g') AS foo;
+ -- return pre and post option meaningless for regexp_split
+ SELECT foo, length(foo) FROM regexp_split('thE QUick bROWn FOx jUMPed ovEr 
THE lazy dOG', 'e', 'r') AS foo;
+ 
+ -- change NULL-display back
+ \pset null ''
+ 
  -- E021-11 position expression
  SELECT POSITION('4' IN '1234567890') = '4' AS "4";

---------------------------(end of broadcast)---------------------------
TIP 6: explain analyze is your friend

Re: [PATCHES] WIP patch adding new regexp functions

Reply via email to