Re: [PATCHES] [HACKERS] writing new regexp functions

Jeremy Drake Thu, 01 Feb 2007 19:29:58 -0800

On Thu, 1 Feb 2007, Jeremy Drake wrote:

> On Thu, 1 Feb 2007, Tom Lane wrote:
>
> > Jeremy Drake <[EMAIL PROTECTED]> writes:
> > > Is there some specific reason that these functions are static,
> >
> > Yeah: not cluttering the global namespace.
>
> > Is there a reason for not putting your new code itself into regexp.c?
>
> Not really, I just figured it would be cleaner/easier to write it as an
> extension.  I also figure that it is unlikely that every regexp function
> that anyone could possibly want will be implemented in core in that one
> file.
<snip>


> Anyway, the particular thing I was writing was a function like
> substring(str FROM pattern) which instead of returning just the first
> match group, would return an array of text containing all of the match
> groups.  I exported the functions in my sandbox, and wrote a module with a
> function that does this.

I have attached the patch I have put together, which does the following:
* Expose the previously static RE_* functions from regexp.c which wrap
  the code in src/backend/regex with postgres-style errors, string
  conversion, and caching of patterns.

* Expose the regex_flavor GUC variable, which is needed to know how to
  interpret patterns when compiling them.

* Add a couple more RE_* functions in regexp.c to provide access
  to different levels of the process, which were necessary to avoid
  duplicating effort elsewhere.

* Update replace_text_regexp in varlena.c to use newly exposed functions
  from regexp.c instead of duplicating error handling code from there.

Also attached is the function I wrote to retrieve all of the capture
groups in a pattern match in a text[].  I also intend to put together a
function analogous to split_part which will take a string and a pattern to
split on, and return setof text.

Let me know whether I should work under the assumption of the attached patch
and write the functions for contrib or pgfoundry, or put the functions
in regexp.c and try to get them into core, or both.  (It made my life a lot
easier while working on the function not to have to restart the postmaster
every time I recompiled it; it may be nice in the future to be able to make
extensions like this...)

-- 
To err is human, to forgive, beyond the scope of the Operating System.

#include "postgres.h"
#include "fmgr.h"
#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/lsyscache.h"
#include "utils/regexp.h"

PG_MODULE_MAGIC;

Datum regexp_matches(PG_FUNCTION_ARGS);

PG_FUNCTION_INFO_V1(regexp_matches);
Datum
regexp_matches(PG_FUNCTION_ARGS)
{
        text *s = PG_GETARG_TEXT_P(0);
        text *p = PG_GETARG_TEXT_P(1);
        regex_t *cpat;

        cpat = RE_compile_and_cache(p, regex_flavor);
        if (cpat->re_nsub)
        {
                regmatch_t *pmatch = palloc0(sizeof(regmatch_t) * 
(cpat->re_nsub + 1));
                if (RE_execute(cpat,
                                           VARDATA(s),
                                           VARSIZE(s) - VARHDRSZ,
                                           cpat->re_nsub + 1,
                                           pmatch))
                {
                        ArrayType *result;
                        Datum elems[cpat->re_nsub];
                        bool nulls[cpat->re_nsub];
                        /* get text type oid, too lazy to do it some other way 
*/
                        Oid param_type = get_fn_expr_argtype(fcinfo->flinfo, 0);
                        size_t i;
                        int ndims = 1;
                        int dims[1] = {cpat->re_nsub};
                        int lbs[1] = {1};
                        int16 typlen;
                        bool typbyval;
                        char typalign;

                        get_typlenbyvalalign(param_type, &typlen, &typbyval, 
&typalign);

                        for (i = 0; i < cpat->re_nsub; ++i)
                        {
                                int so = pmatch[i+1].rm_so,
                                    eo = pmatch[i+1].rm_eo;

                                if (so < 0 || eo < 0)
                                {
                                        elems[i] = 0;
                                        nulls[i] = true;
                                }
                                else
                                {
                                        elems[i] = 
DirectFunctionCall3(text_substr,
                                                        PointerGetDatum(s),
                                                        Int32GetDatum(so + 1),
                                                        Int32GetDatum(eo - so));
                                        nulls[i] = false;
                                }
                        }

                        pfree(pmatch);
                        result = construct_md_array(elems, nulls, ndims, dims, 
lbs,
                                        param_type, typlen, typbyval, typalign);
                        PG_RETURN_ARRAYTYPE_P(result);
                }
                /* if no match, fall through and return null */
        }
        else
        {
                ereport(ERROR,
                        (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
                         errmsg("regular expression has no match groups")));
        }

        PG_RETURN_NULL();
}

Index: src/backend/utils/adt/regexp.c
===================================================================
RCS file: 
/home/jeremyd/local/postgres/cvsuproot/pgsql/src/backend/utils/adt/regexp.c,v
retrieving revision 1.68
diff -c -r1.68 regexp.c
*** src/backend/utils/adt/regexp.c      5 Jan 2007 22:19:41 -0000       1.68
--- src/backend/utils/adt/regexp.c      2 Feb 2007 02:45:32 -0000
***************
*** 29,41 ****
   */
  #include "postgres.h"
  
- #include "regex/regex.h"
  #include "utils/builtins.h"
  #include "utils/guc.h"
  
  
  /* GUC-settable flavor parameter */
! static int    regex_flavor = REG_ADVANCED;
  
  
  /*
--- 29,41 ----
   */
  #include "postgres.h"
  
  #include "utils/builtins.h"
  #include "utils/guc.h"
+ #include "utils/regexp.h"
  
  
  /* GUC-settable flavor parameter */
! int   regex_flavor = REG_ADVANCED;
  
  
  /*
***************
*** 90,96 ****
   * Pattern is given in the database encoding.  We internally convert to
   * array of pg_wchar which is what Spencer's regex package wants.
   */
! static regex_t *
  RE_compile_and_cache(text *text_re, int cflags)
  {
        int                     text_re_len = VARSIZE(text_re);
--- 90,96 ----
   * Pattern is given in the database encoding.  We internally convert to
   * array of pg_wchar which is what Spencer's regex package wants.
   */
! regex_t *
  RE_compile_and_cache(text *text_re, int cflags)
  {
        int                     text_re_len = VARSIZE(text_re);
***************
*** 191,238 ****
  }
  
  /*
!  * RE_compile_and_execute - compile and execute a RE
   *
   * Returns TRUE on match, FALSE on no match
   *
!  *    text_re --- the pattern, expressed as an *untoasted* TEXT object
!  *    dat --- the data to match against (need not be null-terminated)
!  *    dat_len --- the length of the data string
!  *    cflags --- compile options for the pattern
   *    nmatch, pmatch  --- optional return area for match details
   *
!  * Both pattern and data are given in the database encoding.  We internally
!  * convert to array of pg_wchar which is what Spencer's regex package wants.
   */
! static bool
! RE_compile_and_execute(text *text_re, char *dat, int dat_len,
!                                          int cflags, int nmatch, regmatch_t 
*pmatch)
  {
-       pg_wchar   *data;
-       size_t          data_len;
        int                     regexec_result;
-       regex_t    *re;
        char            errMsg[100];
  
-       /* Convert data string to wide characters */
-       data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
-       data_len = pg_mb2wchar_with_len(dat, data, dat_len);
- 
-       /* Compile RE */
-       re = RE_compile_and_cache(text_re, cflags);
- 
        /* Perform RE match and return result */
        regexec_result = pg_regexec(re,
                                                                data,
                                                                data_len,
!                                                               0,
                                                                NULL,   /* no 
details */
                                                                nmatch,
                                                                pmatch,
                                                                0);
  
-       pfree(data);
- 
        if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
        {
                /* re failed??? */
--- 191,226 ----
  }
  
  /*
!  * RE_wchar_execute - execute a RE
   *
   * Returns TRUE on match, FALSE on no match
   *
!  *    re --- the compiled pattern as returned by RE_compile_and_cache
!  *    data --- the data to match against (need not be null-terminated)
!  *    data_len --- the length of the data string
!  *    start_search -- the offset in the data to start searching
   *    nmatch, pmatch  --- optional return area for match details
   *
!  * Data is given as array of pg_wchar which is what Spencer's regex package
!  * wants.
   */
! bool
! RE_wchar_execute(regex_t *re, pg_wchar *data, int data_len, size_t 
start_search,
!                                          int nmatch, regmatch_t *pmatch)
  {
        int                     regexec_result;
        char            errMsg[100];
  
        /* Perform RE match and return result */
        regexec_result = pg_regexec(re,
                                                                data,
                                                                data_len,
!                                                               start_search,
                                                                NULL,   /* no 
details */
                                                                nmatch,
                                                                pmatch,
                                                                0);
  
        if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
        {
                /* re failed??? */
***************
*** 245,250 ****
--- 233,299 ----
        return (regexec_result == REG_OKAY);
  }
  
+ /*
+  * RE_execute - execute a RE
+  *
+  * Returns TRUE on match, FALSE on no match
+  *
+  *    re --- the compiled pattern as returned by RE_compile_and_cache
+  *    dat --- the data to match against (need not be null-terminated)
+  *    dat_len --- the length of the data string
+  *    nmatch, pmatch  --- optional return area for match details
+  *
+  * Data is given in the database encoding.  We internally
+  * convert to array of pg_wchar which is what Spencer's regex package wants.
+  */
+ bool
+ RE_execute(regex_t *re, char *dat, int dat_len,
+                                          int nmatch, regmatch_t *pmatch)
+ {
+       pg_wchar   *data;
+       size_t          data_len;
+       bool            match;
+ 
+       /* Convert data string to wide characters */
+       data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
+       data_len = pg_mb2wchar_with_len(dat, data, dat_len);
+ 
+       /* Perform RE match and return result */
+       match = RE_wchar_execute(re, data, data_len, 0, nmatch, pmatch);
+       pfree(data);
+       return match;
+ }
+ 
+ /*
+  * RE_compile_and_execute - compile and execute a RE
+  *
+  * Returns TRUE on match, FALSE on no match
+  *
+  *    text_re --- the pattern, expressed as an *untoasted* TEXT object
+  *    dat --- the data to match against (need not be null-terminated)
+  *    dat_len --- the length of the data string
+  *    cflags --- compile options for the pattern
+  *    nmatch, pmatch  --- optional return area for match details
+  *
+  * Both pattern and data are given in the database encoding.  We internally
+  * convert to array of pg_wchar which is what Spencer's regex package wants.
+  */
+ bool
+ RE_compile_and_execute(text *text_re, char *dat, int dat_len,
+                                          int cflags, int nmatch, regmatch_t 
*pmatch)
+ {
+       pg_wchar   *data;
+       size_t          data_len;
+       int                     regexec_result;
+       regex_t    *re;
+       char            errMsg[100];
+ 
+       /* Compile RE */
+       re = RE_compile_and_cache(text_re, cflags);
+ 
+       return RE_execute(re, dat, dat_len, nmatch, pmatch);
+ }
+ 
  
  /*
   * assign_regex_flavor - GUC hook to validate and set REGEX_FLAVOR
Index: src/backend/utils/adt/varlena.c
===================================================================
RCS file: 
/home/jeremyd/local/postgres/cvsuproot/pgsql/src/backend/utils/adt/varlena.c,v
retrieving revision 1.154
diff -c -r1.154 varlena.c
*** src/backend/utils/adt/varlena.c     5 Jan 2007 22:19:42 -0000       1.154
--- src/backend/utils/adt/varlena.c     2 Feb 2007 02:50:31 -0000
***************
*** 23,32 ****
  #include "libpq/pqformat.h"
  #include "miscadmin.h"
  #include "parser/scansup.h"
- #include "regex/regex.h"
  #include "utils/builtins.h"
  #include "utils/lsyscache.h"
  #include "utils/pg_locale.h"
  
  
  typedef struct varlena unknown;
--- 23,32 ----
  #include "libpq/pqformat.h"
  #include "miscadmin.h"
  #include "parser/scansup.h"
  #include "utils/builtins.h"
  #include "utils/lsyscache.h"
  #include "utils/pg_locale.h"
+ #include "utils/regexp.h"
  
  
  typedef struct varlena unknown;
***************
*** 2355,2386 ****
        search_start = 0;
        while (search_start <= data_len)
        {
-               int                     regexec_result;
- 
                CHECK_FOR_INTERRUPTS();
  
!               regexec_result = pg_regexec(re,
!                                                                       data,
!                                                                       
data_len,
!                                                                       
search_start,
!                                                                       NULL,   
        /* no details */
!                                                                       
REGEXP_REPLACE_BACKREF_CNT,
!                                                                       pmatch,
!                                                                       0);
! 
!               if (regexec_result == REG_NOMATCH)
                        break;
  
-               if (regexec_result != REG_OKAY)
-               {
-                       char            errMsg[100];
- 
-                       pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
-                       ereport(ERROR,
-                                       
(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
-                                        errmsg("regular expression failed: 
%s", errMsg)));
-               }
- 
                /*
                 * Copy the text to the left of the match position.  Note we are
                 * given character not byte indexes.
--- 2355,2366 ----
        search_start = 0;
        while (search_start <= data_len)
        {
                CHECK_FOR_INTERRUPTS();
  
!               if (!RE_wchar_execute (re, data, data_len, search_start,
!                                                       
REGEXP_REPLACE_BACKREF_CNT, pmatch))
                        break;
  
                /*
                 * Copy the text to the left of the match position.  Note we are
                 * given character not byte indexes.
*** ../pgsql-orig/src/include/utils/regexp.h    Wed Dec 31 16:00:00 1969
--- src/include/utils/regexp.h  Thu Feb  1 18:46:49 2007
***************
*** 0 ****
--- 1,29 ----
+ /*-------------------------------------------------------------------------
+  *
+  * regexp.h
+  *      Header file for regexp connector code.
+  *
+  * Copyright (c) 2007, PostgreSQL Global Development Group
+  *
+  * $PostgreSQL$
+  *
+  *-------------------------------------------------------------------------
+  */ 
+ #ifndef REGEXP_H
+ #define REGEXP_H
+ 
+ #include "regex/regex.h"
+ 
+ /* regexp support routines for PostgreSQL-izing regexp code */
+ extern regex_t * RE_compile_and_cache(text *text_re, int cflags);
+ extern bool RE_compile_and_execute(text *text_re, char *dat, int dat_len,
+                                          int cflags, int nmatch, regmatch_t 
*pmatch);
+ extern bool RE_wchar_execute(regex_t *re, pg_wchar *data, int data_len,
+                                          size_t start_search, int nmatch, 
regmatch_t *pmatch);
+ extern bool RE_execute(regex_t *re, char *dat, int dat_len,
+                                          int nmatch, regmatch_t *pmatch);
+ 
+ /* regexp flavor GUC variable */
+ extern int regex_flavor;
+ 
+ #endif   /* REGEXP_H */

---------------------------(end of broadcast)---------------------------
TIP 4: Have you searched our list archives?

               http://archives.postgresql.org

Re: [PATCHES] [HACKERS] writing new regexp functions

Reply via email to