This is an automated email from the ASF dual-hosted git repository. bcall pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/trafficserver.git
The following commit(s) were added to refs/heads/master by this push: new 8e1c1b9faf Change Regex class to use PCRE2 (#11014) 8e1c1b9faf is described below commit 8e1c1b9fafb13df87ca431a1e855f64223b06ffb Author: Bryan Call <bc...@apache.org> AuthorDate: Thu Feb 29 12:06:24 2024 -0800 Change Regex class to use PCRE2 (#11014) --- CMakeLists.txt | 2 +- include/proxy/http/remap/UrlRewrite.h | 2 +- include/tsutil/Regex.h | 106 +++++--- plugins/experimental/tls_bridge/CMakeLists.txt | 2 +- src/proxy/http/remap/UrlRewrite.cc | 14 +- src/tsutil/CMakeLists.txt | 3 +- src/tsutil/Regex.cc | 329 +++++++++++++++++-------- src/tsutil/unit_tests/test_Regex.cc | 138 ++++++++++- 8 files changed, 440 insertions(+), 156 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 63ec25723a..a276d827cc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -253,7 +253,7 @@ if(LibLZMA_FOUND) endif() find_package(PCRE REQUIRED) -find_package(PCRE2 COMPONENTS 8BIT) +pkg_check_modules(PCRE2 REQUIRED IMPORTED_TARGET libpcre2-8) include(CheckOpenSSLIsBoringSSL) include(CheckOpenSSLIsQuictls) diff --git a/include/proxy/http/remap/UrlRewrite.h b/include/proxy/http/remap/UrlRewrite.h index 86dcb50a07..797ad94c47 100644 --- a/include/proxy/http/remap/UrlRewrite.h +++ b/include/proxy/http/remap/UrlRewrite.h @@ -232,7 +232,7 @@ private: int request_host_len); bool _regexMappingLookup(RegexMappingList &regex_mappings, URL *request_url, int request_port, const char *request_host, int request_host_len, int rank_ceiling, UrlMappingContainer &mapping_container); - int _expandSubstitutions(int *matches_info, const RegexMapping *reg_map, const char *matched_string, char *dest_buf, + int _expandSubstitutions(size_t *matches_info, const RegexMapping *reg_map, const char *matched_string, char *dest_buf, int dest_buf_size); void _destroyTable(std::unique_ptr<URLTable> &h_table); void _destroyList(RegexMappingList &regexes); diff --git a/include/tsutil/Regex.h b/include/tsutil/Regex.h index a1c51e3661..c4ca8feb03 
100644 --- a/include/tsutil/Regex.h +++ b/include/tsutil/Regex.h @@ -28,24 +28,60 @@ #include <vector> #include <memory> -#include "swoc/MemSpan.h" +#define PCRE2_CODE_UNIT_WIDTH 8 +#include <pcre2.h> -/// Match flags for regular expression evaluation. +/// @brief Match flags for regular expression evaluation. enum REFlags { - RE_CASE_INSENSITIVE = 0x0001, ///< Ignore case (default: case sensitive). - RE_UNANCHORED = 0x0002, ///< Unanchored (DFA defaults to anchored). - RE_ANCHORED = 0x0004, ///< Anchored (Regex defaults to unanchored). + RE_CASE_INSENSITIVE = PCRE2_CASELESS, ///< Ignore case (default: case sensitive). + RE_UNANCHORED = PCRE2_MULTILINE, ///< Unanchored (DFA defaults to anchored). + RE_ANCHORED = PCRE2_ANCHORED, ///< Anchored (Regex defaults to unanchored). }; -/** Wrapper for PCRE evaluation. - * - */ -class Regex +/// @brief Wrapper for PCRE2 match data. +class RegexMatches { + friend class Regex; + public: - /// Default number of capture groups. - static constexpr size_t DEFAULT_GROUP_COUNT = 10; + /** Construct a new RegexMatches object. + * + * @param size The number of matches to allocate space for. + */ + RegexMatches(uint32_t size = DEFAULT_MATCHES); + ~RegexMatches(); + + /** Get the match at the given index. + * + * @return The match at the given index. + */ + std::string_view operator[](size_t index) const; + /** Get the ovector pointer for the capture groups. Don't use this unless you know what you are doing. + * + * @return ovector pointer. + */ + size_t *get_ovector_pointer(); + int32_t size() const; + +protected: + pcre2_match_data *get_match_data(); + void set_subject(std::string_view subject); + void set_size(int32_t size); + +private: + constexpr static uint32_t DEFAULT_MATCHES = 10; + static void *malloc(size_t size, void *caller); + pcre2_match_data *_match_data = nullptr; + std::string_view _subject; + char _buffer[24 + 96 + 16 * DEFAULT_MATCHES]; // 24 bytes for the general context, 96 bytes overhead, 16 bytes per match. 
+ size_t _buffer_bytes_used = 0; + int32_t _size = 0; +}; +/// @brief Wrapper for PCRE2 regular expression. +class Regex +{ +public: Regex() = default; Regex(Regex const &) = delete; // No copying. Regex(Regex &&that) noexcept; @@ -59,46 +95,43 @@ public: * * @a flags should be the bitwise @c or of @c REFlags values. */ - bool compile(const char *pattern, unsigned flags = 0); + bool compile(std::string_view pattern, uint32_t flags = 0); - /** Execute the regular expression. + /** Compile the @a pattern into a regular expression. * - * @param str String to match against. - * @return @c true if the pattern matched, @a false if not. + * @param pattern Source pattern for regular expression (null terminated). + * @param error String to receive error message. + * @param erroffset Pointer to integer to receive error offset. + * @param flags Compilation flags. + * @return @a true if compiled successfully, @a false otherwise. * - * It is safe to call this method concurrently on the same instance of @a this. + * @a flags should be the bitwise @c or of @c REFlags values. */ - bool exec(std::string_view const &str) const; + bool compile(std::string_view pattern, std::string &error, int &erroffset, unsigned flags = 0); /** Execute the regular expression. * - * @param str String to match against. - * @param ovector Capture results. - * @param ovecsize Number of elements in @a ovector. + * @param subject String to match against. * @return @c true if the pattern matched, @a false if not. * * It is safe to call this method concurrently on the same instance of @a this. - * - * Each capture group takes 3 elements of @a ovector, therefore @a ovecsize must - * be a multiple of 3 and at least three times the number of desired capture groups. */ - bool exec(std::string_view const &str, int *ovector, int ovecsize) const; + bool exec(std::string_view subject) const; /** Execute the regular expression. * - * @param str String to match against. - * @param ovector Capture results. 
- * @param ovecsize Number of elements in @a ovector. - * @return @c true if the pattern matched, @a false if not. + * @param subject String to match against. + * @param matches Place to store the capture groups. + * @return @c The number of capture groups. < 0 if an error occurred. 0 if the number of Matches is too small. * * It is safe to call this method concurrently on the same instance of @a this. * * Each capture group takes 3 elements of @a ovector, therefore @a ovecsize must * be a multiple of 3 and at least three times the number of desired capture groups. */ - bool exec(std::string_view str, swoc::MemSpan<int> groups) const; + int exec(std::string_view subject, RegexMatches &matches) const; - /// @return The number of groups captured in the last call to @c exec. + /// @return The number of capture groups in the compiled pattern. int get_capture_count(); private: @@ -106,8 +139,7 @@ private: // enough to use as pointers. For some reason the header defines in name only a struct and // then aliases it to the standard name, rather than simply declare the latter in name only. // The goal is completely wrap PCRE and not include that header in client code. - void *regex = nullptr; ///< Compiled expression. - void *regex_extra = nullptr; ///< Extra information about the expression. + pcre2_code *_code = nullptr; }; /** Deterministic Finite state Automata container. @@ -122,18 +154,18 @@ public: ~DFA(); /// @return The number of patterns successfully compiled. - int compile(std::string_view const &pattern, unsigned flags = 0); + int32_t compile(std::string_view pattern, unsigned flags = 0); /// @return The number of patterns successfully compiled. - int compile(std::string_view *patterns, int npatterns, unsigned flags = 0); + int32_t compile(std::string_view *patterns, int npatterns, unsigned flags = 0); /// @return The number of patterns successfully compiled. 
- int compile(const char **patterns, int npatterns, unsigned flags = 0); + int32_t compile(const char **patterns, int npatterns, unsigned flags = 0); /** Match @a str against the internal patterns. * * @param str String to match. * @return Index of the matched pattern, -1 if no match. */ - int match(std::string_view const &str) const; + int32_t match(std::string_view str) const; private: struct Pattern { @@ -148,7 +180,7 @@ private: * @param flags Regular expression compilation flags. * @return @c true if @a pattern was successfully compiled, @c false if not. */ - bool build(std::string_view const &pattern, unsigned flags = 0); + bool build(std::string_view pattern, unsigned flags = 0); std::vector<Pattern> _patterns; }; diff --git a/plugins/experimental/tls_bridge/CMakeLists.txt b/plugins/experimental/tls_bridge/CMakeLists.txt index e3de8b9249..5430505431 100644 --- a/plugins/experimental/tls_bridge/CMakeLists.txt +++ b/plugins/experimental/tls_bridge/CMakeLists.txt @@ -17,5 +17,5 @@ add_atsplugin(tls_bridge tls_bridge.cc) -target_link_libraries(tls_bridge PRIVATE libswoc::libswoc) +target_link_libraries(tls_bridge PRIVATE ts::tsutil libswoc::libswoc) verify_global_plugin(tls_bridge) diff --git a/src/proxy/http/remap/UrlRewrite.cc b/src/proxy/http/remap/UrlRewrite.cc index 1f8176e97c..e377451921 100644 --- a/src/proxy/http/remap/UrlRewrite.cc +++ b/src/proxy/http/remap/UrlRewrite.cc @@ -857,7 +857,7 @@ UrlRewrite::_mappingLookup(MappingsStore &mappings, URL *request_url, int reques // does not null terminate return string int -UrlRewrite::_expandSubstitutions(int *matches_info, const RegexMapping *reg_map, const char *matched_string, char *dest_buf, +UrlRewrite::_expandSubstitutions(size_t *matches_info, const RegexMapping *reg_map, const char *matched_string, char *dest_buf, int dest_buf_size) { int cur_buf_size = 0; @@ -908,6 +908,7 @@ UrlRewrite::_regexMappingLookup(RegexMappingList &regex_mappings, URL *request_u int request_host_len, int rank_ceiling, 
UrlMappingContainer &mapping_container) { bool retval = false; + RegexMatches matches; if (rank_ceiling == -1) { // we will now look at all regex mappings rank_ceiling = INT_MAX; @@ -959,11 +960,9 @@ UrlRewrite::_regexMappingLookup(RegexMappingList &regex_mappings, URL *request_u continue; } - int matches_info[MAX_REGEX_SUBS * 3]; - bool match_result = - list_iter->regular_expression.exec(std::string_view(request_host, request_host_len), matches_info, countof(matches_info)); + int match_result = list_iter->regular_expression.exec(std::string_view(request_host, request_host_len), matches); - if (match_result == true) { + if (match_result > 0) { Debug("url_rewrite_regex", "Request URL host [%.*s] matched regex in mapping of rank %d " "with %d possible substitutions", @@ -975,8 +974,9 @@ UrlRewrite::_regexMappingLookup(RegexMappingList &regex_mappings, URL *request_u int buf_len; // Expand substitutions in the host field from the stored template - buf_len = _expandSubstitutions(matches_info, list_iter, request_host, buf, sizeof(buf)); - URL *expanded_url = mapping_container.createNewToURL(); + size_t *matches_info = matches.get_ovector_pointer(); + buf_len = _expandSubstitutions(matches_info, list_iter, request_host, buf, sizeof(buf)); + URL *expanded_url = mapping_container.createNewToURL(); expanded_url->copy(&((list_iter->url_map)->toURL)); expanded_url->host_set(buf, buf_len); diff --git a/src/tsutil/CMakeLists.txt b/src/tsutil/CMakeLists.txt index 44b83448fd..a747431daa 100644 --- a/src/tsutil/CMakeLists.txt +++ b/src/tsutil/CMakeLists.txt @@ -50,9 +50,10 @@ add_library( ts_unit_parser.cc Regex.cc ) + add_library(ts::tsutil ALIAS tsutil) set_target_properties(tsutil PROPERTIES POSITION_INDEPENDENT_CODE TRUE PUBLIC_HEADER "${TSUTIL_PUBLIC_HEADERS}") -target_link_libraries(tsutil PUBLIC libswoc::libswoc yaml-cpp::yaml-cpp PCRE::PCRE) +target_link_libraries(tsutil PUBLIC libswoc::libswoc yaml-cpp::yaml-cpp PkgConfig::PCRE2) install( TARGETS tsutil diff --git 
a/src/tsutil/Regex.cc b/src/tsutil/Regex.cc index 42d9d27c28..faea3b8546 100644 --- a/src/tsutil/Regex.cc +++ b/src/tsutil/Regex.cc @@ -26,160 +26,281 @@ #include <array> #include <assert.h> -#if __has_include(<pcre/pcre.h>) -#include <pcre/pcre.h> -#else -#include <pcre.h> -#endif - +//---------------------------------------------------------------------------- namespace { -inline pcre * -as_pcre(void *p) +void * +my_malloc(size_t size, void * /*caller*/) { - return static_cast<pcre *>(p); + void *ptr = malloc(size); + return ptr; } -inline pcre_extra * -as_extra(void *p) + +void +my_free(void *ptr, void * /*caller*/) { - return static_cast<pcre_extra *>(p); + free(ptr); } } // namespace -#ifdef PCRE_CONFIG_JIT -/* -Using two thread locals avoids the deadlock because without the thread local object access, get_jit_stack doesn't call -the TLS init function which ends up calling __cxx_thread_atexit(which locks the dl_whatever mutex). Since the raw -pointer doesn't have a destructor to call, it doesn't need to call this. Interestingly, get_jit_stack was calling the -TLS init function to setup the destructor call at thread exit whether or not the class was declared in the function -body. 
-*/ -namespace +//---------------------------------------------------------------------------- +class RegexContext { -thread_local pcre_jit_stack *jit_stack; - -struct JitStackCleanup { - ~JitStackCleanup() +public: + static RegexContext * + get_instance() + { + if (!_regex_context) { + _regex_context = new RegexContext(); + } + return _regex_context; + } + ~RegexContext() { - if (jit_stack) { - pcre_jit_stack_free(jit_stack); + if (_general_context != nullptr) { + pcre2_general_context_free(_general_context); + } + if (_compile_context != nullptr) { + pcre2_compile_context_free(_compile_context); + } + if (_match_context != nullptr) { + pcre2_match_context_free(_match_context); + } + if (_jit_stack != nullptr) { + pcre2_jit_stack_free(_jit_stack); } } + pcre2_general_context * + get_general_context() + { + return _general_context; + } + pcre2_compile_context * + get_compile_context() + { + return _compile_context; + } + pcre2_match_context * + get_match_context() + { + return _match_context; + } + +private: + RegexContext() + { + _general_context = pcre2_general_context_create(my_malloc, my_free, nullptr); + _compile_context = pcre2_compile_context_create(_general_context); + _match_context = pcre2_match_context_create(_general_context); + _jit_stack = pcre2_jit_stack_create(4096, 1024 * 1024, nullptr); // 1 page min and 1MB max + pcre2_jit_stack_assign(_match_context, nullptr, _jit_stack); + } + pcre2_general_context *_general_context = nullptr; + pcre2_compile_context *_compile_context = nullptr; + pcre2_match_context *_match_context = nullptr; + pcre2_jit_stack *_jit_stack = nullptr; + thread_local static RegexContext *_regex_context; }; -thread_local JitStackCleanup jsc; +thread_local RegexContext *RegexContext::_regex_context = nullptr; -pcre_jit_stack * -get_jit_stack(void *) +//---------------------------------------------------------------------------- +namespace { - if (!jit_stack) { - jit_stack = pcre_jit_stack_alloc(4096, 1024 * 1024); // 1 page min and 
1MB max - } - return jit_stack; -} - -} // end anonymous namespace -#endif // def PCRE_CONFIG_JIT +struct RegexContextCleanup { + ~RegexContextCleanup() { delete RegexContext::get_instance(); } +}; +thread_local RegexContextCleanup cleanup; +} // namespace -Regex::Regex(Regex &&that) noexcept : regex(that.regex), regex_extra(that.regex_extra) +//---------------------------------------------------------------------------- +RegexMatches::RegexMatches(uint32_t size) { - that.regex = nullptr; - that.regex_extra = nullptr; + pcre2_general_context *ctx = pcre2_general_context_create( + &RegexMatches::malloc, [](void *, void *) -> void {}, static_cast<void *>(this)); + + _match_data = pcre2_match_data_create(size, ctx); } -bool -Regex::compile(const char *pattern, const unsigned flags) +//---------------------------------------------------------------------------- +void * +RegexMatches::malloc(size_t size, void *caller) { - const char *error = nullptr; - int erroffset = 0; - int options = 0; - int study_opts = 0; + auto *matches = static_cast<RegexMatches *>(caller); - if (regex) { - return false; + // allocate from the buffer if possible + if (size <= sizeof(matches->_buffer) - matches->_buffer_bytes_used) { + void *ptr = matches->_buffer + matches->_buffer_bytes_used; + matches->_buffer_bytes_used += size; + return ptr; } - if (flags & RE_CASE_INSENSITIVE) { - options |= PCRE_CASELESS; - } + // otherwise use system malloc if the buffer is too small + void *ptr = ::malloc(size); + return ptr; +} - if (flags & RE_ANCHORED) { - options |= PCRE_ANCHORED; +//---------------------------------------------------------------------------- +RegexMatches::~RegexMatches() +{ + if (_match_data != nullptr) { + pcre2_match_data_free(_match_data); } +} - regex = pcre_compile(pattern, options, &error, &erroffset, nullptr); - if (error) { - regex = nullptr; - return false; - } +//---------------------------------------------------------------------------- +size_t * 
+RegexMatches::get_ovector_pointer() +{ + return pcre2_get_ovector_pointer(_match_data); +} + +//---------------------------------------------------------------------------- +int32_t +RegexMatches::size() const +{ + return _size; +} + +//---------------------------------------------------------------------------- +pcre2_match_data * +RegexMatches::get_match_data() +{ + return _match_data; +} -#ifdef PCRE_CONFIG_JIT - study_opts |= PCRE_STUDY_JIT_COMPILE; -#endif +//---------------------------------------------------------------------------- +void +RegexMatches::set_size(int32_t size) +{ + _size = size; +} - regex_extra = pcre_study(as_pcre(regex), study_opts, &error); +//---------------------------------------------------------------------------- +void +RegexMatches::set_subject(std::string_view subject) +{ + _subject = subject; +} -#ifdef PCRE_CONFIG_JIT - if (regex_extra) { - pcre_assign_jit_stack(as_extra(regex_extra), &get_jit_stack, nullptr); +//---------------------------------------------------------------------------- +std::string_view +RegexMatches::operator[](size_t index) const +{ + // check if the index is valid + if (index >= pcre2_get_ovector_count(_match_data)) { + return std::string_view(); } -#endif - return true; + PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(_match_data); + return std::string_view(_subject.data() + ovector[2 * index], ovector[2 * index + 1] - ovector[2 * index]); } -int -Regex::get_capture_count() +//---------------------------------------------------------------------------- +Regex::Regex(Regex &&that) noexcept { - int captures = -1; - if (pcre_fullinfo(as_pcre(regex), as_extra(regex_extra), PCRE_INFO_CAPTURECOUNT, &captures) != 0) { - return -1; - } + _code = that._code; + that._code = nullptr; +} - return captures; +//---------------------------------------------------------------------------- +Regex::~Regex() +{ + if (_code != nullptr) { + pcre2_code_free(_code); + } } 
+//---------------------------------------------------------------------------- bool -Regex::exec(std::string_view const &str) const +Regex::compile(std::string_view pattern, uint32_t flags) { - std::array<int, DEFAULT_GROUP_COUNT * 3> ovector = {{0}}; - return this->exec(str, ovector); + std::string error; + int erroroffset; + + return this->compile(pattern, error, erroroffset, flags); } +//---------------------------------------------------------------------------- bool -Regex::exec(std::string_view const &str, int *ovector, int ovecsize) const +Regex::compile(std::string_view pattern, std::string &error, int &erroroffset, uint32_t flags) { - int rv; + if (_code != nullptr) { + pcre2_code_free(_code); + } + PCRE2_SIZE error_offset; + int error_code; + _code = pcre2_compile(reinterpret_cast<PCRE2_SPTR>(pattern.data()), pattern.size(), flags, &error_code, &error_offset, + RegexContext::get_instance()->get_compile_context()); + if (!_code) { + erroroffset = error_offset; + + // get pcre2 error message + PCRE2_UCHAR buffer[256]; + pcre2_get_error_message(error_code, buffer, sizeof(buffer)); + error.assign((char *)buffer); + return false; + } + + // support for JIT + pcre2_jit_compile(_code, PCRE2_JIT_COMPLETE); - rv = pcre_exec(as_pcre(regex), as_extra(regex_extra), str.data(), static_cast<int>(str.size()), 0, 0, ovector, ovecsize); - return rv > 0; + return true; } +//---------------------------------------------------------------------------- bool -Regex::exec(std::string_view str, swoc::MemSpan<int> groups) const +Regex::exec(std::string_view subject) const { - return 0 < - pcre_exec(as_pcre(regex), as_extra(regex_extra), str.data(), int(str.size()), 0, 0, groups.data(), int(groups.count())); + if (_code == nullptr) { + return false; + } + RegexMatches matches; + + int count = this->exec(subject, matches); + return count > 0; } -Regex::~Regex() +//---------------------------------------------------------------------------- +int32_t +Regex::exec(std::string_view 
subject, RegexMatches &matches) const { - if (regex_extra) { -#ifdef PCRE_CONFIG_JIT - pcre_free_study(as_extra(regex_extra)); -#else - pcre_free(regex_extra); -#endif + if (_code == nullptr) { + return 0; + } + int count = pcre2_match(_code, reinterpret_cast<PCRE2_SPTR>(subject.data()), subject.size(), 0, 0, matches.get_match_data(), + RegexContext::get_instance()->get_match_context()); + + matches.set_size(count); + + if (count < 0) { + return count; } - if (regex) { - pcre_free(regex); + + if (count > 0) { + matches.set_subject(subject); + } + + return count; +} + +//---------------------------------------------------------------------------- +int32_t +Regex::get_capture_count() +{ + int captures = -1; + if (pcre2_pattern_info(_code, PCRE2_INFO_CAPTURECOUNT, &captures) != 0) { + return -1; } + return captures; } +//---------------------------------------------------------------------------- DFA::~DFA() {} +//---------------------------------------------------------------------------- bool -DFA::build(std::string_view const &pattern, unsigned flags) +DFA::build(const std::string_view pattern, unsigned flags) { Regex rxp; std::string string{pattern}; @@ -188,22 +309,24 @@ DFA::build(std::string_view const &pattern, unsigned flags) flags |= RE_ANCHORED; } - if (!rxp.compile(string.c_str(), flags)) { + if (!rxp.compile(pattern, flags)) { return false; } _patterns.emplace_back(std::move(rxp), std::move(string)); return true; } -int -DFA::compile(std::string_view const &pattern, unsigned flags) +//---------------------------------------------------------------------------- +int32_t +DFA::compile(std::string_view pattern, unsigned flags) { assert(_patterns.empty()); this->build(pattern, flags); return _patterns.size(); } -int +//---------------------------------------------------------------------------- +int32_t DFA::compile(std::string_view *patterns, int npatterns, unsigned flags) { _patterns.reserve(npatterns); // try to pre-allocate. 
@@ -213,7 +336,8 @@ DFA::compile(std::string_view *patterns, int npatterns, unsigned flags) return _patterns.size(); } -int +//---------------------------------------------------------------------------- +int32_t DFA::compile(const char **patterns, int npatterns, unsigned flags) { _patterns.reserve(npatterns); // try to pre-allocate. @@ -223,8 +347,9 @@ DFA::compile(const char **patterns, int npatterns, unsigned flags) return _patterns.size(); } -int -DFA::match(std::string_view const &str) const +//---------------------------------------------------------------------------- +int32_t +DFA::match(std::string_view str) const { for (auto spot = _patterns.begin(), limit = _patterns.end(); spot != limit; ++spot) { if (spot->_re.exec(str)) { diff --git a/src/tsutil/unit_tests/test_Regex.cc b/src/tsutil/unit_tests/test_Regex.cc index 16f327dff6..f17d2b17c8 100644 --- a/src/tsutil/unit_tests/test_Regex.cc +++ b/src/tsutil/unit_tests/test_Regex.cc @@ -20,8 +20,8 @@ limitations under the License. 
*/ -#include <array> #include <string_view> +#include <vector> #include "tscore/ink_assert.h" #include "tscore/ink_defs.h" @@ -35,22 +35,148 @@ struct subject_match_t { struct test_t { std::string_view regex; - std::array<subject_match_t, 4> tests; + std::vector<subject_match_t> tests; }; -std::array<test_t, 2> test_data{ - {{{"^foo"}, {{{{"foo"}, true}, {{"bar"}, false}, {{"foobar"}, true}, {{"foobarbaz"}, true}}}}, - {{"foo$"}, {{{{"foo"}, true}, {{"bar"}, false}, {{"foobar"}, false}, {{"foobarbaz"}, false}}}}} +std::vector<test_t> test_data{ + { + {{R"(^foo)"}, {{{{"foo"}, true}, {{"bar"}, false}, {{"foobar"}, true}, {{"foobarbaz"}, true}}}}, + {{R"(foo$)"}, {{{{"foo"}, true}, {{"bar"}, false}, {{"foobar"}, false}, {{"foobarbaz"}, false}}}}, + // url regular expression + {{R"(^(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)*\/?$)"}, + {{{{"http://www.example.com"}, true}, + {{"https://www.example.com"}, true}, + {{"http://~example.com"}, false}, + {{"http://www.example.com/foo/bar"}, true}}}}, + // ip address regular expression + {R"(^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$)", + {{{{"1.2.3.4"}, true}, {{"127.0.0.1"}, true}, {{"256.256.256.256"}, false}, {{".1.1.1.1"}, false}}}}, + } +}; + +// test case insensitive test data +std::vector<test_t> test_data_case_insensitive{ + { + {{R"(^foo)"}, {{{{"FoO"}, true}, {{"bar"}, false}, {{"foObar"}, true}, {{"foobaRbaz"}, true}}}}, + {{R"(foo$)"}, {{{{"foO"}, true}, {{"bar"}, false}, {{"foobar"}, false}, {{"foobarbaz"}, false}}}}, + } +}; + +// test case for anchored flag +std::vector<test_t> test_data_anchored{ + { + {{R"(foo)"}, {{{{"foo"}, true}, {{"bar"}, false}, {{"foobar"}, true}, {{"foobarbaz"}, true}}}}, + {{R"(bar)"}, {{{{"foo"}, false}, {{"bar"}, true}, {{"foobar"}, false}, {{"foobarbaz"}, false}}}}, + } +}; + +struct submatch_t { + std::string_view subject; + int32_t count; + std::vector<std::string_view> submatches; +}; + +struct submatch_test_t { + 
std::string_view regex; + int capture_count; + std::vector<submatch_t> tests; +}; + +std::vector<submatch_test_t> submatch_test_data{ + { + {{R"(^foo)"}, 0, {{{{"foo"}, 1, {{"foo"}}}, {{"bar"}, -1, {}}, {{"foobar"}, 1, {{"foo"}}}, {{"foobarbaz"}, 1, {{"foo"}}}}}}, + {{R"(foo$)"}, 0, {{{{"foo"}, 1, {{"foo"}}}, {{"bar"}, -1, {}}, {{"foobar"}, -1, {}}, {{"foobarbaz"}, -1, {}}}}}, + {{R"(^(foo)(bar))"}, 2, {{{{"foobar"}, 3, {{"foobar", "foo", "bar"}}}, {{"barfoo"}, -1, {}}, {{"foo"}, -1, {}}}}}, + } }; TEST_CASE("Regex", "[libts][Regex]") { + // case sensitive test for (auto &item : test_data) { Regex r; - r.compile(item.regex.data()); + REQUIRE(r.compile(item.regex.data()) == true); for (auto &test : item.tests) { REQUIRE(r.exec(test.subject.data()) == test.match); } } + + // case insensitive test + for (auto &item : test_data_case_insensitive) { + Regex r; + REQUIRE(r.compile(item.regex.data(), RE_CASE_INSENSITIVE) == true); + + for (auto &test : item.tests) { + REQUIRE(r.exec(test.subject.data()) == test.match); + } + } + + // case anchored test + for (auto &item : test_data_anchored) { + Regex r; + REQUIRE(r.compile(item.regex.data(), RE_ANCHORED) == true); + + for (auto &test : item.tests) { + REQUIRE(r.exec(test.subject.data()) == test.match); + } + } + + // test getting submatches with operator[] + for (auto &item : submatch_test_data) { + Regex r; + REQUIRE(r.compile(item.regex.data()) == true); + REQUIRE(r.get_capture_count() == item.capture_count); + + for (auto &test : item.tests) { + RegexMatches matches; + REQUIRE(r.exec(test.subject.data(), matches) == test.count); + REQUIRE(matches.size() == test.count); + + for (int32_t i = 0; i < test.count; i++) { + REQUIRE(matches[i] == test.submatches[i]); + } + } + } + + // test getting submatches with ovector pointer + for (auto &item : submatch_test_data) { + Regex r; + REQUIRE(r.compile(item.regex.data()) == true); + REQUIRE(r.get_capture_count() == item.capture_count); + + for (auto &test : item.tests) { + 
RegexMatches matches; + REQUIRE(r.exec(test.subject.data(), matches) == test.count); + REQUIRE(matches.size() == test.count); + + size_t *ovector = matches.get_ovector_pointer(); + for (int32_t i = 0; i < test.count; i++) { + REQUIRE(test.submatches[i] == std::string_view{test.subject.data() + ovector[i * 2], ovector[i * 2 + 1] - ovector[i * 2]}); + } + } + } + + // test for invalid regular expression + { + Regex r; + REQUIRE(r.compile(R"((\d+)", RE_CASE_INSENSITIVE) == false); + } + + // test for not compiling regular expression + { + Regex r; + RegexMatches matches; + REQUIRE(r.exec("foo") == false); + REQUIRE(r.exec("foo", matches) == 0); + } + + // test for recompiling the regular expression + { + Regex r; + REQUIRE(r.compile(R"(foo)") == true); + REQUIRE(r.exec("foo") == true); + REQUIRE(r.compile(R"(bar)") == true); + REQUIRE(r.exec("bar") == true); + } }