Re: [PATCHES] [HACKERS] writing new regexp functions

Jeremy Drake Fri, 02 Feb 2007 19:02:19 -0800

On Fri, 2 Feb 2007, David Fetter wrote:

> On Fri, Feb 02, 2007 at 08:56:31PM -0500, Tom Lane wrote:
> >
> > All of SQL's pattern match operators have the pattern on the right,
> > so my advice is to stick with that and try not to think about Perl
> > ;-)
>
> Perl provides inspiration, but here, consistency would help more than
> orderly imitation of how it does what it does.   And besides, when
> people really need Perl, they can pull it in as a PL :)


Alright, here is my code to date.  I have put the pattern after the string
in the split function, as discussed above.  The .tar.gz file expects to be
untarred in contrib/.  I have made some regression tests that can be run
using 'make installcheck' as normal for contrib.  I think they exercise
the corner cases in the code, but I may very well have missed some.  It
requires the (previously submitted) attached patch to core to compile, as
it takes advantage of new exported functions from
src/backend/utils/adt/regexp.c.

Let me know if you see any bugs or issues with this code, and I am open to
suggestions for further regression tests ;)


Things that I still want to look into:
* regexp flags (a la regexp_replace).

* maybe make regexp_matches return a set (setof), so that when given a 'g'
  flag it returns all matches in the string.

* maybe a join function that works as an aggregate:
   SELECT join(',', col) FROM tbl
  which currently can be written as:
   SELECT array_to_string(ARRAY(SELECT col FROM tbl), ',')




-- 
It was a virgin forest, a place where the Hand of Man had never set
foot.

Index: src/backend/utils/adt/regexp.c
===================================================================
RCS file: /home/jeremyd/local/postgres/cvsuproot/pgsql/src/backend/utils/adt/regexp.c,v
retrieving revision 1.68
diff -c -r1.68 regexp.c
*** src/backend/utils/adt/regexp.c      5 Jan 2007 22:19:41 -0000       1.68
--- src/backend/utils/adt/regexp.c      2 Feb 2007 02:45:32 -0000
***************
*** 29,41 ****
   */
  #include "postgres.h"
  
- #include "regex/regex.h"
  #include "utils/builtins.h"
  #include "utils/guc.h"
  
  
  /* GUC-settable flavor parameter */
! static int    regex_flavor = REG_ADVANCED;
  
  
  /*
--- 29,41 ----
   */
  #include "postgres.h"
  
  #include "utils/builtins.h"
  #include "utils/guc.h"
+ #include "utils/regexp.h"
  
  
  /* GUC-settable flavor parameter */
! int   regex_flavor = REG_ADVANCED;
  
  
  /*
***************
*** 90,96 ****
   * Pattern is given in the database encoding.  We internally convert to
   * array of pg_wchar which is what Spencer's regex package wants.
   */
! static regex_t *
  RE_compile_and_cache(text *text_re, int cflags)
  {
        int                     text_re_len = VARSIZE(text_re);
--- 90,96 ----
   * Pattern is given in the database encoding.  We internally convert to
   * array of pg_wchar which is what Spencer's regex package wants.
   */
! regex_t *
  RE_compile_and_cache(text *text_re, int cflags)
  {
        int                     text_re_len = VARSIZE(text_re);
***************
*** 191,238 ****
  }
  
  /*
!  * RE_compile_and_execute - compile and execute a RE
   *
   * Returns TRUE on match, FALSE on no match
   *
!  *    text_re --- the pattern, expressed as an *untoasted* TEXT object
!  *    dat --- the data to match against (need not be null-terminated)
!  *    dat_len --- the length of the data string
!  *    cflags --- compile options for the pattern
   *    nmatch, pmatch  --- optional return area for match details
   *
!  * Both pattern and data are given in the database encoding.  We internally
!  * convert to array of pg_wchar which is what Spencer's regex package wants.
   */
! static bool
! RE_compile_and_execute(text *text_re, char *dat, int dat_len,
!                                          int cflags, int nmatch, regmatch_t *pmatch)
  {
-       pg_wchar   *data;
-       size_t          data_len;
        int                     regexec_result;
-       regex_t    *re;
        char            errMsg[100];
  
-       /* Convert data string to wide characters */
-       data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
-       data_len = pg_mb2wchar_with_len(dat, data, dat_len);
- 
-       /* Compile RE */
-       re = RE_compile_and_cache(text_re, cflags);
- 
        /* Perform RE match and return result */
        regexec_result = pg_regexec(re,
                                                                data,
                                                                data_len,
!                                                               0,
                                                                NULL,   /* no details */
                                                                nmatch,
                                                                pmatch,
                                                                0);
  
-       pfree(data);
- 
        if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
        {
                /* re failed??? */
--- 191,226 ----
  }
  
  /*
!  * RE_wchar_execute - execute a RE
   *
   * Returns TRUE on match, FALSE on no match
   *
!  *    re --- the compiled pattern as returned by RE_compile_and_cache
!  *    data --- the data to match against (need not be null-terminated)
!  *    data_len --- the length of the data string
!  *    start_search -- the offset in the data to start searching
   *    nmatch, pmatch  --- optional return area for match details
   *
!  * Data is given as array of pg_wchar which is what Spencer's regex package
!  * wants.
   */
! bool
! RE_wchar_execute(regex_t *re, pg_wchar *data, int data_len, size_t start_search,
!                                          int nmatch, regmatch_t *pmatch)
  {
        int                     regexec_result;
        char            errMsg[100];
  
        /* Perform RE match and return result */
        regexec_result = pg_regexec(re,
                                                                data,
                                                                data_len,
!                                                               start_search,
                                                                NULL,   /* no details */
                                                                nmatch,
                                                                pmatch,
                                                                0);
  
        if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
        {
                /* re failed??? */
***************
*** 245,250 ****
--- 233,299 ----
        return (regexec_result == REG_OKAY);
  }
  
+ /*
+  * RE_execute - execute a RE
+  *
+  * Returns TRUE on match, FALSE on no match
+  *
+  *    re --- the compiled pattern as returned by RE_compile_and_cache
+  *    dat --- the data to match against (need not be null-terminated)
+  *    dat_len --- the length of the data string
+  *    nmatch, pmatch  --- optional return area for match details
+  *
+  * Data is given in the database encoding.  We internally
+  * convert to array of pg_wchar which is what Spencer's regex package wants.
+  */
+ bool
+ RE_execute(regex_t *re, char *dat, int dat_len,
+                                          int nmatch, regmatch_t *pmatch)
+ {
+       pg_wchar   *data;
+       size_t          data_len;
+       bool            match;
+ 
+       /* Convert data string to wide characters */
+       data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
+       data_len = pg_mb2wchar_with_len(dat, data, dat_len);
+ 
+       /* Perform RE match and return result */
+       match = RE_wchar_execute(re, data, data_len, 0, nmatch, pmatch);
+       pfree(data);
+       return match;
+ }
+ 
+ /*
+  * RE_compile_and_execute - compile and execute a RE
+  *
+  * Returns TRUE on match, FALSE on no match
+  *
+  *    text_re --- the pattern, expressed as an *untoasted* TEXT object
+  *    dat --- the data to match against (need not be null-terminated)
+  *    dat_len --- the length of the data string
+  *    cflags --- compile options for the pattern
+  *    nmatch, pmatch  --- optional return area for match details
+  *
+  * Both pattern and data are given in the database encoding.  We internally
+  * convert to array of pg_wchar which is what Spencer's regex package wants.
+  */
+ bool
+ RE_compile_and_execute(text *text_re, char *dat, int dat_len,
+                                          int cflags, int nmatch, regmatch_t *pmatch)
+ {
+       pg_wchar   *data;
+       size_t          data_len;
+       int                     regexec_result;
+       regex_t    *re;
+       char            errMsg[100];
+ 
+       /* Compile RE */
+       re = RE_compile_and_cache(text_re, cflags);
+ 
+       return RE_execute(re, dat, dat_len, nmatch, pmatch);
+ }
+ 
  
  /*
   * assign_regex_flavor - GUC hook to validate and set REGEX_FLAVOR
Index: src/backend/utils/adt/varlena.c
===================================================================
RCS file: /home/jeremyd/local/postgres/cvsuproot/pgsql/src/backend/utils/adt/varlena.c,v
retrieving revision 1.154
diff -c -r1.154 varlena.c
*** src/backend/utils/adt/varlena.c     5 Jan 2007 22:19:42 -0000       1.154
--- src/backend/utils/adt/varlena.c     2 Feb 2007 02:50:31 -0000
***************
*** 23,32 ****
  #include "libpq/pqformat.h"
  #include "miscadmin.h"
  #include "parser/scansup.h"
- #include "regex/regex.h"
  #include "utils/builtins.h"
  #include "utils/lsyscache.h"
  #include "utils/pg_locale.h"
  
  
  typedef struct varlena unknown;
--- 23,32 ----
  #include "libpq/pqformat.h"
  #include "miscadmin.h"
  #include "parser/scansup.h"
  #include "utils/builtins.h"
  #include "utils/lsyscache.h"
  #include "utils/pg_locale.h"
+ #include "utils/regexp.h"
  
  
  typedef struct varlena unknown;
***************
*** 2355,2386 ****
        search_start = 0;
        while (search_start <= data_len)
        {
-               int                     regexec_result;
- 
                CHECK_FOR_INTERRUPTS();
  
!               regexec_result = pg_regexec(re,
!                                                                       data,
!                                                                       data_len,
!                                                                       search_start,
!                                                                       NULL,           /* no details */
!                                                                       REGEXP_REPLACE_BACKREF_CNT,
!                                                                       pmatch,
!                                                                       0);
! 
!               if (regexec_result == REG_NOMATCH)
                        break;
  
-               if (regexec_result != REG_OKAY)
-               {
-                       char            errMsg[100];
- 
-                       pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
-                       ereport(ERROR,
-                                       (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
-                                        errmsg("regular expression failed: %s", errMsg)));
-               }
- 
                /*
                 * Copy the text to the left of the match position.  Note we are
                 * given character not byte indexes.
--- 2355,2366 ----
        search_start = 0;
        while (search_start <= data_len)
        {
                CHECK_FOR_INTERRUPTS();
  
!               if (!RE_wchar_execute (re, data, data_len, search_start,
!                                                       REGEXP_REPLACE_BACKREF_CNT, pmatch))
                        break;
  
                /*
                 * Copy the text to the left of the match position.  Note we are
                 * given character not byte indexes.
*** ../pgsql-orig/src/include/utils/regexp.h    Wed Dec 31 16:00:00 1969
--- src/include/utils/regexp.h  Thu Feb  1 18:46:49 2007
***************
*** 0 ****
--- 1,29 ----
+ /*-------------------------------------------------------------------------
+  *
+  * regexp.h
+  *      Header file for regexp connector code.
+  *
+  * Copyright (c) 2007, PostgreSQL Global Development Group
+  *
+  * $PostgreSQL$
+  *
+  *-------------------------------------------------------------------------
+  */ 
+ #ifndef REGEXP_H
+ #define REGEXP_H
+ 
+ #include "regex/regex.h"
+ 
+ /* regexp support routines for PostgreSQL-izing regexp code */
+ extern regex_t * RE_compile_and_cache(text *text_re, int cflags);
+ extern bool RE_compile_and_execute(text *text_re, char *dat, int dat_len,
+                                          int cflags, int nmatch, regmatch_t *pmatch);
+ extern bool RE_wchar_execute(regex_t *re, pg_wchar *data, int data_len,
+                                          size_t start_search, int nmatch, regmatch_t *pmatch);
+ extern bool RE_execute(regex_t *re, char *dat, int dat_len,
+                                          int nmatch, regmatch_t *pmatch);
+ 
+ /* regexp flavor GUC variable */
+ extern int regex_flavor;
+ 
+ #endif   /* REGEXP_H */

regexp_ext.tar.gz
Description: Binary data

---------------------------(end of broadcast)---------------------------
TIP 1: if posting/reading through Usenet, please send an appropriate
       subscribe-nomail command to [EMAIL PROTECTED] so that your
       message can get through to the mailing list cleanly

Re: [PATCHES] [HACKERS] writing new regexp functions

Reply via email to