This patch provides a new function regexp_replace.
regexp_replace extends a replace function and enables text search
by the regular expression. And, a back reference can be used within
a replace string.
(This patch for PostgreSQL 7.4.3)

Function: regexp_replace(str, pattern, replace_str)
Retuen Type: text
Description: Replace all matched string in str.
             pattern is regular expression pattern.
             replace_str is replace string that can use '\1' - '\9', and
'\&'.
             '\1' - '\9' is back reference to the n'th subexpression.
             '\&' is matched string.

(example1)
select regexp_replace('ABC-DEF', '(\\w+)-(\\w+)', '\\2-\\1')
result: DEF-ABC

(example2)
update tab1 set col1 = regexp_replace(col1, '[A-Z]', '');

---
Atsushi Ogawa
[EMAIL PROTECTED]

--- cut here ---

*** ./src/backend/regex/regexec.c.orig  Tue Jul 20 08:45:39 2004
--- ./src/backend/regex/regexec.c       Tue Jul 20 08:49:36 2004
***************
*** 110,115 ****
--- 110,116 ----
        regmatch_t *pmatch;
        rm_detail_t *details;
        chr                *start;                      /* start of string */
+       chr                *search_start;       /* search start of string */
        chr                *stop;                       /* just past end of string */
        int                     err;                    /* error code if any (0 none) 
*/
        regoff_t   *mem;                        /* memory vector for backtracking */
***************
*** 168,173 ****
--- 169,175 ----
  pg_regexec(regex_t *re,
                   const chr *string,
                   size_t len,
+                  size_t search_start,
                   rm_detail_t *details,
                   size_t nmatch,
                   regmatch_t pmatch[],
***************
*** 219,224 ****
--- 221,227 ----
                v->pmatch = pmatch;
        v->details = details;
        v->start = (chr *) string;
+       v->search_start = (chr *) string + search_start;
        v->stop = (chr *) string + len;
        v->err = 0;
        if (backref)
***************
*** 288,294 ****
        NOERR();
        MDEBUG(("\nsearch at %ld\n", LOFF(v->start)));
        cold = NULL;
!       close = shortest(v, s, v->start, v->start, v->stop, &cold, (int *)
NULL);
        freedfa(s);
        NOERR();
        if (v->g->cflags & REG_EXPECT)
--- 291,298 ----
        NOERR();
        MDEBUG(("\nsearch at %ld\n", LOFF(v->start)));
        cold = NULL;
!       close = shortest(v, s, v->search_start, v->search_start, v->stop,
!                                        &cold, (int *) NULL);
        freedfa(s);
        NOERR();
        if (v->g->cflags & REG_EXPECT)
***************
*** 415,421 ****

        assert(d != NULL && s != NULL);
        cold = NULL;
!       close = v->start;
        do
        {
                MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
--- 419,425 ----

        assert(d != NULL && s != NULL);
        cold = NULL;
!       close = v->search_start;
        do
        {
                MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
*** ./src/backend/utils/adt/regexp.c.orig       Tue Jul 20 08:50:08 2004
--- ./src/backend/utils/adt/regexp.c    Tue Jul 20 09:00:05 2004
***************
*** 80,116 ****


  /*
!  * RE_compile_and_execute - compile and execute a RE, caching if possible
   *
!  * Returns TRUE on match, FALSE on no match
   *
!  *    text_re --- the pattern, expressed as an *untoasted* TEXT object
!  *    dat --- the data to match against (need not be null-terminated)
!  *    dat_len --- the length of the data string
!  *    cflags --- compile options for the pattern
!  *    nmatch, pmatch  --- optional return area for match details
   *
!  * Both pattern and data are given in the database encoding.  We
internally
!  * convert to array of pg_wchar which is what Spencer's regex package
wants.
   */
! static bool
! RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len,
!                                          int cflags, int nmatch, regmatch_t *pmatch)
  {
        int                     text_re_len = VARSIZE(text_re);
-       pg_wchar   *data;
-       size_t          data_len;
        pg_wchar   *pattern;
        size_t          pattern_len;
        int                     i;
        int                     regcomp_result;
-       int                     regexec_result;
        cached_re_str re_temp;

-       /* Convert data string to wide characters */
-       data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
-       data_len = pg_mb2wchar_with_len(dat, data, dat_len);
-
        /*
         * Look for a match among previously compiled REs.      Since the data
         * structure is self-organizing with most-used entries at the front,
--- 80,105 ----


  /*
!  * RE_compile_and_cache - compile a RE, caching if possible
   *
!  * Returns regex_t
   *
!  *  text_re --- the pattern, expressed as an *untoasted* TEXT object
!  *  cflags --- compile options for the pattern
   *
!  * Pattern is given in the database encoding.  We internally convert to
!  * array of pg_wchar which is what Spencer's regex package wants.
   */
! static regex_t
! RE_compile_and_cache(text *text_re, int cflags)
  {
        int                     text_re_len = VARSIZE(text_re);
        pg_wchar   *pattern;
        size_t          pattern_len;
        int                     i;
        int                     regcomp_result;
        cached_re_str re_temp;

        /*
         * Look for a match among previously compiled REs.      Since the data
         * structure is self-organizing with most-used entries at the front,
***************
*** 132,149 ****
                                re_array[0] = re_temp;
                        }

!                       /* Perform RE match and return result */
!                       regexec_result = pg_regexec(&re_array[0].cre_re,
!                                                                               data,
!                                                                               
data_len,
!                                                                               NULL,  
 /* no details */
!                                                                               nmatch,
!                                                                               pmatch,
!                                                                               0);
!
!                       pfree(data);
!
!                       return (regexec_result == 0);
                }
        }

--- 121,127 ----
                                re_array[0] = re_temp;
                        }

!                       return re_array[0].cre_re;
                }
        }

***************
*** 210,219 ****
--- 188,231 ----
        re_array[0] = re_temp;
        num_res++;

+       return re_array[0].cre_re;
+ }
+
+ /*
+  * RE_compile_and_execute - compile and execute a RE, caching if possible
+  *
+  * Returns TRUE on match, FALSE on no match
+  *
+  *    text_re --- the pattern, expressed as an *untoasted* TEXT object
+  *    dat --- the data to match against (need not be null-terminated)
+  *    dat_len --- the length of the data string
+  *    cflags --- compile options for the pattern
+  *    nmatch, pmatch  --- optional return area for match details
+  *
+  * Both pattern and data are given in the database encoding.  We
internally
+  * convert to array of pg_wchar which is what Spencer's regex package
wants.
+  */
+ static bool
+ RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len,
+                                          int cflags, int nmatch, regmatch_t *pmatch)
+ {
+       pg_wchar   *data;
+       size_t          data_len;
+       int                     regexec_result;
+       regex_t         re;
+
+       /* Convert data string to wide characters */
+       data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
+       data_len = pg_mb2wchar_with_len(dat, data, dat_len);
+
+       /* Compile RE */
+       re = RE_compile_and_cache(text_re, cflags);
+
        /* Perform RE match and return result */
        regexec_result = pg_regexec(&re_array[0].cre_re,
                                                                data,
                                                                data_len,
+                                                               0,
                                                                NULL,   /* no details 
*/
                                                                nmatch,
                                                                pmatch,
***************
*** 415,420 ****
--- 427,452 ----
        }

        PG_RETURN_NULL();
+ }
+
+ /*
+  * textregexreplace()
+  *      Return a replace string matched by a regular expression.
+  */
+ Datum
+ textregexreplace(PG_FUNCTION_ARGS)
+ {
+       text       *s = PG_GETARG_TEXT_P(0);
+       text       *p = PG_GETARG_TEXT_P(1);
+       text       *r = PG_GETARG_TEXT_P(2);
+       regex_t         re;
+
+       re = RE_compile_and_cache(p, regex_flavor);
+
+       return (DirectFunctionCall3(replace_text_regexp,
+                                                               PointerGetDatum(s),
+                                                               PointerGetDatum(&re),
+                                                               PointerGetDatum(r)));
  }

  /* similar_escape()
*** ./src/backend/utils/adt/varlena.c.orig      Tue Jul 20 09:00:17 2004
--- ./src/backend/utils/adt/varlena.c   Tue Jul 20 10:23:32 2004
***************
*** 28,33 ****
--- 28,34 ----
  #include "utils/builtins.h"
  #include "utils/lsyscache.h"
  #include "utils/pg_locale.h"
+ #include "regex/regex.h"


  typedef struct varlena unknown;
***************
*** 1971,1976 ****
--- 1972,2122 ----
        ret_text = PG_STR_GET_TEXT(str->data);
        pfree(str->data);
        pfree(str);
+
+       PG_RETURN_TEXT_P(ret_text);
+ }
+
+ /*
+  * have_escape_in_regexp_replace_str
+  * check replace string have escape char
+  */
+ static bool
+ have_escape_in_regexp_replace_str(const char *replace_str)
+ {
+       return (strchr(replace_str, '\\') != NULL);
+ }
+
+ #define REGEXP_REPLACE_BACKREF_CNT            10
+ /*
+  * appendStringInfoRegexpSubstr
+  * append string for regexp back references.
+  */
+ static void
+ appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
+     regmatch_t *pmatch, text *buf_text, int search_start)
+ {
+       const char *pstart = PG_TEXT_GET_STR(replace_text);
+       const char *p = pstart;
+       const char *pnext;
+
+       text       *add_text;
+       int                     so;
+       int                     eo;
+
+       for(;;) {
+               pnext = strchr(p, '\\');
+               if(pnext == NULL) break;
+
+               add_text = text_substring(PointerGetDatum(replace_text),
+                                                                 p - pstart + 1, 
pnext - p, false);
+               appendStringInfoString(str, PG_TEXT_GET_STR(add_text));
+               pfree(add_text);
+
+               p = pnext + 1;
+               so = eo = -1;
+
+               if(*p >= '1' && *p <= '9') {
+                       int             idx = *p - '0';
+                       so = pmatch[idx].rm_so - search_start;
+                       eo = pmatch[idx].rm_eo - search_start;
+                       p++;
+               } else {
+                       switch(*p) {
+                       case '&':
+                               so = pmatch[0].rm_so - search_start;
+                               eo = pmatch[0].rm_eo - search_start;
+                               p++;
+                               break;
+                       }
+               }
+
+               if(so != -1 && eo != -1) {
+                       add_text = text_substring(PointerGetDatum(buf_text),
+                                                                         so + 1, (eo 
- so), false);
+                       appendStringInfoString(str, PG_TEXT_GET_STR(add_text));
+                       pfree(add_text);
+               }
+       }
+
+       add_text = text_substring(PointerGetDatum(replace_text),
+                                                         p - pstart + 1, -1, true);
+       appendStringInfoString(str, PG_TEXT_GET_STR(add_text));
+       pfree(add_text);
+ }
+
+ /*
+  * replace_text_regexp
+  * replace text using regexp
+ */
+ Datum
+ replace_text_regexp(PG_FUNCTION_ARGS)
+ {
+       text       *left_text;
+       text       *right_text;
+       text       *buf_text;
+       text       *ret_text;
+       text       *src_text = PG_GETARG_TEXT_P(0);
+       char       *src_text_str = PG_TEXT_GET_STR(src_text);
+       int                     src_text_len = TEXTLEN(src_text);
+       regex_t    *re = (regex_t *)PG_GETARG_POINTER(1);
+       text       *replace_text = PG_GETARG_TEXT_P(2);
+       char       *replace_str = PG_TEXT_GET_STR(replace_text);
+       StringInfo      str = makeStringInfo();
+       int                     regexec_result;
+       regmatch_t      pmatch[REGEXP_REPLACE_BACKREF_CNT];
+       pg_wchar   *data;
+       size_t          data_len;
+       int                     search_start;
+       bool            have_escape;
+
+       buf_text = TEXTDUP(src_text);
+
+       /* Convert data string to wide characters */
+       data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
+       data_len = pg_mb2wchar_with_len(src_text_str, data,
strlen(src_text_str));
+
+       have_escape = have_escape_in_regexp_replace_str(replace_str);
+
+       for(search_start = 0; search_start <= data_len;) {
+               regexec_result = pg_regexec(re,
+                                                                       data,
+                                                                       data_len,
+                                                                       search_start,
+                                                                       NULL,   /* no 
details */
+                                                                       
REGEXP_REPLACE_BACKREF_CNT,
+                                                                       pmatch,
+                                                                       0);
+               if(regexec_result != 0) break;
+
+               left_text = text_substring(PointerGetDatum(buf_text),
+                                                                  1, pmatch[0].rm_so 
- search_start, false);
+               right_text = text_substring(PointerGetDatum(buf_text),
+                                                                  pmatch[0].rm_eo - 
search_start + 1,
+                                                                  -1, true);
+
+               appendStringInfoString(str, PG_TEXT_GET_STR(left_text));
+               if(have_escape) {
+                       appendStringInfoRegexpSubstr(str, replace_text, pmatch,
+                                                                                
buf_text, search_start);
+               } else {
+                       appendStringInfoString(str, replace_str);
+               }
+
+               pfree(buf_text);
+               pfree(left_text);
+               buf_text = right_text;
+
+               search_start = pmatch[0].rm_eo;
+               if(pmatch[0].rm_so == pmatch[0].rm_eo) search_start++;
+       }
+
+       appendStringInfoString(str, PG_TEXT_GET_STR(buf_text));
+       pfree(buf_text);
+
+       ret_text = PG_STR_GET_TEXT(str->data);
+       pfree(str->data);
+       pfree(str);
+       pfree(data);

        PG_RETURN_TEXT_P(ret_text);
  }
*** ./src/include/catalog/pg_proc.h.orig        Tue Jul 20 09:24:11 2004
--- ./src/include/catalog/pg_proc.h     Tue Jul 20 09:26:11 2004
***************
*** 2186,2191 ****
--- 2186,2193 ----
  DESCR("return portion of string");
  DATA(insert OID =  2087 ( replace        PGNSP PGUID 12 f f t f i 3 25 "25 25
25"  replace_text - _null_ ));
  DESCR("replace all occurrences of old_substr with new_substr in string");
+ DATA(insert OID =  2167 ( regexp_replace      PGNSP PGUID 12 f f t f i 3 25
"25 25 25"  textregexreplace - _null_ ));
+ DESCR("replace text using regexp");
  DATA(insert OID =  2088 ( split_part   PGNSP PGUID 12 f f t f i 3 25 "25
25 23"  split_text - _null_ ));
  DESCR("split string by field_sep and return field_num");
  DATA(insert OID =  2089 ( to_hex         PGNSP PGUID 12 f f t f i 1 25 "23"
to_hex32 - _null_ ));
*** ./src/include/regex/regex.h.orig    Tue Jul 20 08:51:06 2004
--- ./src/include/regex/regex.h Tue Jul 20 08:51:16 2004
***************
*** 163,169 ****
   * the prototypes for exported functions
   */
  extern int    pg_regcomp(regex_t *, const pg_wchar *, size_t, int);
! extern int    pg_regexec(regex_t *, const pg_wchar *, size_t, rm_detail_t *,
size_t, regmatch_t[], int);
  extern void pg_regfree(regex_t *);
  extern size_t pg_regerror(int, const regex_t *, char *, size_t);

--- 163,169 ----
   * the prototypes for exported functions
   */
  extern int    pg_regcomp(regex_t *, const pg_wchar *, size_t, int);
! extern int    pg_regexec(regex_t *, const pg_wchar *, size_t, size_t,
rm_detail_t *, size_t, regmatch_t[], int);
  extern void pg_regfree(regex_t *);
  extern size_t pg_regerror(int, const regex_t *, char *, size_t);

*** ./src/include/utils/builtins.h.orig Tue Jul 20 09:11:19 2004
--- ./src/include/utils/builtins.h      Tue Jul 20 09:11:46 2004
***************
*** 408,413 ****
--- 408,414 ----
  extern Datum texticregexeq(PG_FUNCTION_ARGS);
  extern Datum texticregexne(PG_FUNCTION_ARGS);
  extern Datum textregexsubstr(PG_FUNCTION_ARGS);
+ extern Datum textregexreplace(PG_FUNCTION_ARGS);
  extern Datum similar_escape(PG_FUNCTION_ARGS);
  extern const char *assign_regex_flavor(const char *value,
                                        bool doit, bool interactive);
***************
*** 537,542 ****
--- 538,544 ----
  extern bool SplitIdentifierString(char *rawstring, char separator,
                                          List **namelist);
  extern Datum replace_text(PG_FUNCTION_ARGS);
+ extern Datum replace_text_regexp(PG_FUNCTION_ARGS);
  extern Datum split_text(PG_FUNCTION_ARGS);
  extern Datum text_to_array(PG_FUNCTION_ARGS);
  extern Datum array_to_text(PG_FUNCTION_ARGS);

__________________________________________________
Do You Yahoo!?
http://bb.yahoo.co.jp/


---------------------------(end of broadcast)---------------------------
TIP 3: if posting/reading through Usenet, please send an appropriate
      subscribe-nomail command to [EMAIL PROTECTED] so that your
      message can get through to the mailing list cleanly

Reply via email to