Add a new sample converter that finds the first regex match and returns the substring for that match, or a capture group, if an index is provided. --- doc/configuration.txt | 22 +++++++++++ reg-tests/converter/regmatch.vtc | 39 +++++++++++++++++++ src/sample.c | 66 ++++++++++++++++++++++++++++++++ 3 files changed, 127 insertions(+) create mode 100644 reg-tests/converter/regmatch.vtc
diff --git a/doc/configuration.txt b/doc/configuration.txt index f21a29a68..e84395d23 100644 --- a/doc/configuration.txt +++ b/doc/configuration.txt @@ -16238,6 +16238,28 @@ protobuf(<field_number>,[<field_type>]) More information may be found here about the protocol buffers message field types: https://developers.google.com/protocol-buffers/docs/encoding +regmatch(<regex>[,<index>[,<flags>]]) + This extracts a substring that matches the regex pattern. It will return the first + match in the input string. By default it returns the entire match, but if <index> + is supplied, then the capture group for that number will be returned instead. A + value of 0 returns the entire match. The regex can be made case insensitive by + adding the flag "i" in <flags>. + + It is highly recommended to enclose the regex part using protected quotes to + improve clarity and never have a closing parenthesis from the regex mixed up with + the parenthesis from the function. Just like in Bourne shell, the first level of + quotes is processed when delimiting word groups on the line, a second level is + usable for argument. It is recommended to use single quotes outside since these + ones do not try to resolve backslashes nor dollar signs. + + Examples: + + # extract part of content-type + http-request set-var(txn.imtype) 'hdr(content-type),regmatch("image/(.*)",1)' + + # extract cookie with certain pattern + http-request set-header x-test-cookie %[hdr(cookie),'regmatch(test-\w+=\d+)'] + regsub(<regex>,<subst>[,<flags>]) Applies a regex-based substitution to the input string. It does the same operation as the well-known "sed" utility with "s/<regex>/<subst>/". 
By
diff --git a/reg-tests/converter/regmatch.vtc b/reg-tests/converter/regmatch.vtc
new file mode 100644
index 000000000..46df78ee0
--- /dev/null
+++ b/reg-tests/converter/regmatch.vtc
@@ -0,0 +1,39 @@
+varnishtest "regmatch converter Test"
+
+feature ignore_unknown_macro
+
+server s1 {
+        rxreq
+        txresp
+} -repeat 3 -start
+
+haproxy h1 -conf {
+    defaults
+        mode http
+        timeout connect 1s
+        timeout client 1s
+        timeout server 1s
+
+    frontend fe
+        bind "fd@${fe}"
+
+        #### requests
+        http-request set-var(txn.match) "path,regmatch('test/(\d+)/',1)"
+        http-response set-header Found %[var(txn.match)] if { var(txn.match) -m found }
+
+        default_backend be
+
+    backend be
+        server s1 ${s1_addr}:${s1_port}
+} -start
+
+client c1 -connect ${h1_fe_sock} {
+        txreq -url "/test/123/456"
+        rxresp
+        expect resp.status == 200
+        expect resp.http.found == "123"
+        txreq -url "/foo"
+        rxresp
+        expect resp.status == 200
+        expect resp.http.found == "<undef>"
+} -run
diff --git a/src/sample.c b/src/sample.c
index 835a18115..66d674e3f 100644
--- a/src/sample.c
+++ b/src/sample.c
@@ -2671,6 +2671,80 @@ static int sample_conv_word(const struct arg *arg_p, struct sample *smp, void *p
 	return 1;
 }
 
+/* Checks the arguments of the "regmatch" converter. The optional capture group
+ * index in args[1] must lie within [0, MAX_MATCH - 1] because it is later used
+ * to index an array of MAX_MATCH matches. The optional flags string in args[2]
+ * may only contain 'i', which sets ARGF_REG_ICASE in args[0].type_flags.
+ * Returns 1 on success, or 0 with an error message stored in <err>.
+ */
+static int sample_conv_regmatch_check(struct arg *args, struct sample_conv *conv,
+                                      const char *file, int line, char **err)
+{
+	struct arg *arg = args;
+	char *p;
+	int len;
+
+	if (arg[1].type == ARGT_SINT && (arg[1].data.sint < 0 || arg[1].data.sint >= MAX_MATCH)) {
+		memprintf(err, "invalid capture group number %lld. must be between 0 and %d", arg[1].data.sint, MAX_MATCH - 1);
+		return 0;
+	}
+
+	/* arg0 is a regex, it uses type_flag for ICASE */
+	arg[0].type_flags = 0;
+
+	if (arg[2].type != ARGT_STR)
+		return 1;
+
+	p = arg[2].data.str.area;
+	len = arg[2].data.str.data;
+	while (len) {
+		if (*p == 'i') {
+			arg[0].type_flags |= ARGF_REG_ICASE;
+		}
+		else {
+			memprintf(err, "invalid regex flag '%c', only 'i' is supported", *p);
+			return 0;
+		}
+		p++;
+		len--;
+	}
+	return 1;
+}
+
+/* This sample function is designed to find the first match of a regex in the input string.
+ * If arg1 is supplied, that is used as the capture group to return (or the whole match if 0).
+ * The index was validated to be below MAX_MATCH by sample_conv_regmatch_check(). Returns 0
+ * with an empty string when nothing matches, otherwise 1 with the sample narrowed in place
+ * to the selected span (no copy is made).
+ */
+static int sample_conv_regmatch(const struct arg *arg_p, struct sample *smp, void *private)
+{
+	struct my_regex *reg = arg_p[0].data.reg;
+	regmatch_t pmatch[MAX_MATCH];
+	regmatch_t capture;
+	int nmatch = (arg_p[1].type == ARGT_SINT) ? arg_p[1].data.sint : 0;
+	int found;
+
+	found = regex_exec_match2(reg, smp->data.u.str.area, smp->data.u.str.data, MAX_MATCH, pmatch, 0);
+	/* Error if no match is found */
+	if (!found) {
+		smp->data.u.str.data = 0;
+		return 0;
+	}
+	capture = pmatch[nmatch];
+	smp->data.u.str.data = capture.rm_eo - capture.rm_so;
+	/* If ret string is len 0, no need to change pointers or update size */
+	if (!smp->data.u.str.data)
+		return 1;
+
+	smp->data.u.str.area += capture.rm_so;
+	/* adjust size if necessary */
+	if (smp->data.u.str.size)
+		smp->data.u.str.size -= capture.rm_so;
+
+	return 1;
+}
+
 static int sample_conv_regsub_check(struct arg *args, struct sample_conv *conv,
                                     const char *file, int line, char **err)
 {
@@ -4116,6 +4190,7 @@ static struct sample_conv_kw_list sample_conv_kws = {ILH, {
 	{ "bytes", sample_conv_bytes, ARG2(1,SINT,SINT), NULL, SMP_T_BIN, SMP_T_BIN },
 	{ "field", sample_conv_field, ARG3(2,SINT,STR,SINT), sample_conv_field_check, SMP_T_STR, SMP_T_STR },
 	{ "word", sample_conv_word, ARG3(2,SINT,STR,SINT), sample_conv_field_check, SMP_T_STR, SMP_T_STR },
+	{ "regmatch", sample_conv_regmatch, ARG3(1,REG,SINT,STR), sample_conv_regmatch_check, SMP_T_STR, SMP_T_STR },
 	{ "regsub", sample_conv_regsub, ARG3(2,REG,STR,STR), sample_conv_regsub_check, SMP_T_STR, SMP_T_STR },
 	{ "sha1", sample_conv_sha1, 0, NULL, SMP_T_BIN, SMP_T_BIN },
 #ifdef USE_OPENSSL
-- 
2.31.1

