Add a new sample converter that finds the first regex match and returns
the substring for that match, or a capture group, if an index is
provided.
---
 doc/configuration.txt            | 22 +++++++++++
 reg-tests/converter/regmatch.vtc | 39 +++++++++++++++++++
 src/sample.c                     | 66 ++++++++++++++++++++++++++++++++
 3 files changed, 127 insertions(+)
 create mode 100644 reg-tests/converter/regmatch.vtc

diff --git a/doc/configuration.txt b/doc/configuration.txt
index f21a29a68..e84395d23 100644
--- a/doc/configuration.txt
+++ b/doc/configuration.txt
@@ -16238,6 +16238,28 @@ protobuf(<field_number>,[<field_type>])
   More information may be found here about the protocol buffers message field types:
   https://developers.google.com/protocol-buffers/docs/encoding
 
+regmatch(<regex>[,<index>[,<flags>]])
+  This extracts a substring that matches the regex pattern. It will return the first
+  match in the input string. By default it returns the entire match, but if <index>
+  is supplied, then the capture group for that number will be returned instead. A
+  value of 0 returns the entire match. The regex can be made case insensitive by
+  adding the flag "i" in <flags>.
+
+  It is highly recommended to enclose the regex part using protected quotes to
+  improve clarity and never have a closing parenthesis from the regex mixed up with
+  the parenthesis from the function. Just like in Bourne shell, the first level of
+  quotes is processed when delimiting word groups on the line, a second level is
+  usable for argument. It is recommended to use single quotes outside since these
+  ones do not try to resolve backslashes nor dollar signs.
+
+  Examples:
+
+     # extract part of content-type
+     http-request set-var(txn.imtype) 'hdr(content-type),regmatch("image/(.*)",1)'
+
+     # extract cookie with certain pattern
+     http-request set-header x-test-cookie %[hdr(cookie),'regmatch(test-\w+=\d+)']
+
 regsub(<regex>,<subst>[,<flags>])
   Applies a regex-based substitution to the input string. It does the same
   operation as the well-known "sed" utility with "s/<regex>/<subst>/". By
diff --git a/reg-tests/converter/regmatch.vtc b/reg-tests/converter/regmatch.vtc
new file mode 100644
index 000000000..46df78ee0
--- /dev/null
+++ b/reg-tests/converter/regmatch.vtc
@@ -0,0 +1,39 @@
+varnishtest "regmatch converter Test"
+
+feature ignore_unknown_macro
+
+server s1 {
+       rxreq
+       txresp
+} -repeat 3 -start
+
+haproxy h1 -conf {
+       defaults
+       mode http
+       timeout connect 1s
+       timeout client 1s
+       timeout server 1s
+
+       frontend fe
+       bind "fd@${fe}"
+
+       # keep capture group 1 of the path match; the header is only added when set
+       http-request set-var(txn.match) "path,regmatch('test/(\d+)/',1)"
+       http-response set-header Found %[var(txn.match)] if { var(txn.match) -m found }
+
+       default_backend be
+
+       backend be
+       server s1 ${s1_addr}:${s1_port}
+} -start
+
+client c1 -connect ${h1_fe_sock} {
+       txreq -url "/test/123/456"
+       rxresp
+       expect resp.status == 200
+       expect resp.http.found == "123"
+       txreq -url "/foo"
+       rxresp
+       expect resp.status == 200
+       expect resp.http.found == "<undef>"
+} -run
diff --git a/src/sample.c b/src/sample.c
index 835a18115..66d674e3f 100644
--- a/src/sample.c
+++ b/src/sample.c
@@ -2671,6 +2671,71 @@ static int sample_conv_word(const struct arg *arg_p, 
struct sample *smp, void *p
        return 1;
 }
 
+/* Checks the arguments of the "regmatch" converter. The optional capture group
+ * index (arg 1) must lie between 0 and MAX_MATCH - 1, and the optional flags
+ * string (arg 2) may only contain 'i', which sets ARGF_REG_ICASE on the regex
+ * (arg 0). Returns 1 on success, or 0 after reporting the problem in <err>.
+ */
+static int sample_conv_regmatch_check(struct arg *args, struct sample_conv *conv,
+                                      const char *file, int line, char **err)
+{
+       struct arg *arg = args;
+       char *p;
+       int len;
+
+       /* MAX_MATCH is refused too: the converter's match array has MAX_MATCH slots */
+       if (arg[1].type == ARGT_SINT && (arg[1].data.sint < 0 || arg[1].data.sint >= MAX_MATCH)) {
+               memprintf(err, "invalid capture group number %lld. must be between 0 and %d", arg[1].data.sint, MAX_MATCH - 1);
+               return 0;
+       }
+
+       /* arg0 is a regex, it uses type_flag for ICASE */
+       arg[0].type_flags = 0;
+
+       if (arg[2].type != ARGT_STR)
+               return 1;
+
+       for (p = arg[2].data.str.area, len = arg[2].data.str.data; len; p++, len--) {
+               if (*p != 'i') {
+                       memprintf(err, "invalid regex flag '%c', only 'i' is supported", *p);
+                       return 0;
+               }
+               arg[0].type_flags |= ARGF_REG_ICASE;
+       }
+       return 1;
+}
+
+/* Sample converter "regmatch": narrows the input string down to the first
+ * match of the regex held in arg 0. When arg 1 is an integer it selects the
+ * capture group to keep, otherwise the whole match (group 0) is kept. Returns
+ * 0 with an emptied sample when the regex does not match, 1 otherwise.
+ */
+static int sample_conv_regmatch(const struct arg *arg_p, struct sample *smp, void *private)
+{
+       regmatch_t groups[MAX_MATCH];
+       const regmatch_t *sel;
+       int idx = 0;
+
+       if (arg_p[1].type == ARGT_SINT)
+               idx = arg_p[1].data.sint;
+
+       if (!regex_exec_match2(arg_p[0].data.reg, smp->data.u.str.area, smp->data.u.str.data, MAX_MATCH, groups, 0)) {
+               /* no match: hand back an empty sample and report the failure */
+               smp->data.u.str.data = 0;
+               return 0;
+       }
+
+       sel = &groups[idx];
+       smp->data.u.str.data = sel->rm_eo - sel->rm_so;
+       if (smp->data.u.str.data) {
+               /* non-empty: advance to the selected span, shrinking size if set */
+               smp->data.u.str.area += sel->rm_so;
+               if (smp->data.u.str.size)
+                       smp->data.u.str.size -= sel->rm_so;
+       }
+       return 1;
+}
+
 static int sample_conv_regsub_check(struct arg *args, struct sample_conv *conv,
                                     const char *file, int line, char **err)
 {
@@ -4116,6 +4181,7 @@ static struct sample_conv_kw_list sample_conv_kws = {ILH, 
{
        { "bytes",  sample_conv_bytes,     ARG2(1,SINT,SINT), NULL, SMP_T_BIN,  
SMP_T_BIN },
        { "field",  sample_conv_field,     ARG3(2,SINT,STR,SINT), 
sample_conv_field_check, SMP_T_STR,  SMP_T_STR },
        { "word",   sample_conv_word,      ARG3(2,SINT,STR,SINT), 
sample_conv_field_check, SMP_T_STR,  SMP_T_STR },
+       { "regmatch", sample_conv_regmatch, ARG3(1,REG,SINT,STR), 
sample_conv_regmatch_check, SMP_T_STR, SMP_T_STR },
        { "regsub", sample_conv_regsub,    ARG3(2,REG,STR,STR), 
sample_conv_regsub_check, SMP_T_STR, SMP_T_STR },
        { "sha1",   sample_conv_sha1,      0,            NULL, SMP_T_BIN,  
SMP_T_BIN  },
 #ifdef USE_OPENSSL
-- 
2.31.1


Reply via email to