Changeset: 431bfeaaa76b for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/431bfeaaa76b
Branch: default
Log Message:
Merges regexp branch
diffs (101 lines):
diff --git a/monetdb5/modules/mal/pcre.c b/monetdb5/modules/mal/pcre.c
--- a/monetdb5/modules/mal/pcre.c
+++ b/monetdb5/modules/mal/pcre.c
@@ -733,6 +733,7 @@ single_replace(pcre *pcre_code, pcre_ext
int offset = 0;
int len_result = 0;
int addlen;
+ int empty_match_correction = 0;
char *tmp;
do {
@@ -740,7 +741,12 @@ single_replace(pcre *pcre_code, pcre_ext
exec_options, ovector,
ovecsize);
if (j <= 0)
break;
- addlen = ovector[0] - offset + (nbackrefs == 0 ?
len_replacement : 0);
+
+ empty_match_correction = ovector[0] == ovector[1] ? 1 : 0;
+
+ // calculate the length of the string that will be appended to
result
+ addlen = ovector[0] - offset
+ + (nbackrefs == 0 ? len_replacement : 0) +
empty_match_correction;
if (len_result + addlen >= *max_result) {
tmp = GDKrealloc(result, len_result + addlen + 1);
if (tmp == NULL) {
@@ -750,11 +756,13 @@ single_replace(pcre *pcre_code, pcre_ext
result = tmp;
*max_result = len_result + addlen + 1;
}
+ // append to the result the parts of the original string that
are left unchanged
if (ovector[0] > offset) {
strncpy(result + len_result, origin_str + offset,
ovector[0] - offset);
len_result += ovector[0] - offset;
}
+ // append to the result the replacement of the matched string
if (nbackrefs == 0) {
strncpy(result + len_result, replacement,
len_replacement);
len_result += len_replacement;
@@ -807,8 +815,18 @@ single_replace(pcre *pcre_code, pcre_ext
len_result += addlen;
}
}
- offset = ovector[1];
- } while (offset < len_origin_str && global);
+ // In case of an empty match just advance the offset by 1
+ offset = ovector[1] + empty_match_correction;
+ // and copy the character that we just advanced over
+ if (empty_match_correction) {
+ strncpy(result + len_result, origin_str + ovector[1],
1);
+ ++len_result;
+ }
+ // before we loop around check with the offset - 1 if we had an
empty match
+ // since we manually advanced the offset by one. otherwise we
would skip a
+ // replacement at the end of the string
+ } while ((offset - empty_match_correction) < len_origin_str && global);
+
if (offset < len_origin_str) {
addlen = len_origin_str - offset;
if (len_result + addlen >= *max_result) {
diff --git a/sql/test/Tests/regexp.test b/sql/test/Tests/regexp.test
--- a/sql/test/Tests/regexp.test
+++ b/sql/test/Tests/regexp.test
@@ -54,9 +54,36 @@ select regexp_replace('foo', 'f o o', 'X
----
foo
-# regex option - not extended
+# regex option - extended
query T rowsort
select regexp_replace('foo', 'f o o', 'XYZ', 'x')
----
XYZ
+# regex option - not empty match
+query T rowsort
+select regexp_replace('foobar', 'k?', 'XY')
+----
+foobar
+
+# regex option - empty match
+query T rowsort
+select regexp_replace('foobar', 'k?', '-', 'e')
+----
+-f-o-o-b-a-r-
+
+# regex option - empty match w alternative v1
+query T rowsort
+select regexp_replace('abc', 'b|k?', '-', 'e')
+----
+-a--c-
+
+# regex option - empty match w alternative v2
+# even though you would expect -a--c- the pcre lib does not return
+# the longest match for this particular pattern in offset 1 ('b') but
+# an empty string match ¯\_(ツ)_/¯
+query T rowsort
+select regexp_replace('abc', 'k?|b', '-', 'e')
+----
+-a-b-c-
+
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]