On Tue, Jun 10, 2014 at 1:56 PM, Tim Shen <timshe...@gmail.com> wrote: > On Tue, Jun 10, 2014 at 9:54 AM, Jonathan Wakely <jwak...@redhat.com> wrote: >> I'm sure this is because I still don't understand all the regex code, >> but doesn't this change mean that for an "extended" mode regex with >> backrefs, the user could define _GLIBCXX_REGEX_USE_THOMPSON_NFA and >> backrefs wouldn't work? > > Sorry I missed that basic POSIX (BRE) has back-references (damn!), but > extended POSIX (ERE) doesn't. So it should look like: > - if (!__re._M_automaton->_M_has_backref > + if (!(__re._M_automaton->_M_has_backref || (__re._M_flags & > regex_constants::ECMAScript)) > ...and all deleted _M_has_backref lines should be undeleted. > > This patch is a temporary (I'm not sure how long though) workaround; > BFS's support for ECMAScript with no back-references shall be done > finally.
Sorry for late; this is the complete patch. Bootstrapped and tested. Thanks! -- Regards, Tim Shen
commit 10c1e55884525ff4c3ad959d29181b85966630c1 Author: Tim Shen <tims...@google.com> Date: Fri Jun 27 19:24:29 2014 -0700 PR libstdc++/61424 * include/bits/regex.tcc (__regex_algo_impl<>): Use DFS for ECMAScript, not just regex containing back-references. * include/bits/regex_compiler.tcc (_Compiler<>::_M_disjunction): exchange _M_next and _M_alt for alternative operator, making matching from left to right. * include/bits/regex_executor.h (_State_info<>::_M_get_sol_pos): Add position tracking fom DFS. * include/bits/regex_executor.tcc (_Executor<>::_M_main_dispatch, _Executor<>::_M_dfs): Likewise. * include/bits/regex_scanner.h: Remove unused enum entry. * testsuite/28_regex/algorithms/regex_search/61424.cc: New testcase from PR. diff --git a/libstdc++-v3/include/bits/regex.tcc b/libstdc++-v3/include/bits/regex.tcc index a81f517..3322379 100644 --- a/libstdc++-v3/include/bits/regex.tcc +++ b/libstdc++-v3/include/bits/regex.tcc @@ -71,6 +71,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION // _GLIBCXX_REGEX_USE_THOMPSON_NFA if they need to use this approach. bool __ret; if (!__re._M_automaton->_M_has_backref + && !(__re._M_flags & regex_constants::ECMAScript) #ifndef _GLIBCXX_REGEX_USE_THOMPSON_NFA && __policy == _RegexExecutorPolicy::_S_alternate #endif diff --git a/libstdc++-v3/include/bits/regex_compiler.tcc b/libstdc++-v3/include/bits/regex_compiler.tcc index 0df10cc..f15f7dd 100644 --- a/libstdc++-v3/include/bits/regex_compiler.tcc +++ b/libstdc++-v3/include/bits/regex_compiler.tcc @@ -103,9 +103,12 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION auto __end = _M_nfa._M_insert_dummy(); __alt1._M_append(__end); __alt2._M_append(__end); + // __alt2 is state._M_next, __alt1 is state._M_alt. The executor + // executes _M_alt before _M_next, as well as executing left + // alternative before right one. _M_stack.push(_StateSeqT(_M_nfa, - _M_nfa._M_insert_alt(__alt1._M_start, - __alt2._M_start, false), + _M_nfa._M_insert_alt(__alt2._M_start, + __alt1._M_start, false), __end)); } } diff --git a/libstdc++-v3/include/bits/regex_executor.h b/libstdc++-v3/include/bits/regex_executor.h index 1991c00..e02fa65 100644 --- a/libstdc++-v3/include/bits/regex_executor.h +++ b/libstdc++-v3/include/bits/regex_executor.h @@ -173,6 +173,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION void _M_queue(_StateIdT __i, const _ResultsVec& __res) { _M_match_queue.emplace_back(__i, __res); } + _BiIter* _M_get_sol_pos() { return nullptr; } + // Saves states that need to be considered for the next character. vector<pair<_StateIdT, _ResultsVec>> _M_match_queue; // Indicates which states are already visited. @@ -191,12 +193,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION // Dummy implementations for DFS mode. bool _M_visited(_StateIdT) const { return false; } void _M_queue(_StateIdT, const _ResultsVec&) { } + _BiIter* _M_get_sol_pos() { return &_M_sol_pos; } // To record current solution. _StateIdT _M_start; + _BiIter _M_sol_pos; }; - public: _ResultsVec _M_cur_results; _BiIter _M_current; diff --git a/libstdc++-v3/include/bits/regex_executor.tcc b/libstdc++-v3/include/bits/regex_executor.tcc index aefa8f4..38b8ff2 100644 --- a/libstdc++-v3/include/bits/regex_executor.tcc +++ b/libstdc++-v3/include/bits/regex_executor.tcc @@ -82,6 +82,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _M_main_dispatch(_Match_mode __match_mode, __dfs) { _M_has_sol = false; + *_M_states._M_get_sol_pos() = _BiIter(); _M_cur_results = _M_results; _M_dfs(__match_mode, _M_states._M_start); return _M_has_sol; @@ -338,7 +339,29 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION && (_M_flags & regex_constants::match_not_null)) _M_has_sol = false; if (_M_has_sol) - _M_results = _M_cur_results; + { + if (_M_nfa._M_flags & regex_constants::ECMAScript) + _M_results = _M_cur_results; + else // POSIX + { + _GLIBCXX_DEBUG_ASSERT(_M_states._M_get_sol_pos()); + // Here's POSIX's logic: match the longest one. However + // we never know which one (lhs or rhs of "|") is longer + // unless we try both of them and compare the results. + // The member variable _M_sol_pos records the end + // position of the last successful match. It's better + // to be larger, because POSIX regex is always greedy. + // TODO: This could be slow. + if (*_M_states._M_get_sol_pos() == _BiIter() + || std::distance(_M_begin, + *_M_states._M_get_sol_pos()) + < std::distance(_M_begin, _M_current)) + { + *_M_states._M_get_sol_pos() = _M_current; + _M_results = _M_cur_results; + } + } + } } else { @@ -354,9 +377,25 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } break; case _S_opcode_alternative: - _M_dfs(__match_mode, __state._M_alt); - if (!__dfs_mode || !_M_has_sol) - _M_dfs(__match_mode, __state._M_next); + if (_M_nfa._M_flags & regex_constants::ECMAScript) + { + // TODO: Let DFS support ECMAScript's alternative operation. + _GLIBCXX_DEBUG_ASSERT(!__dfs_mode); + _M_dfs(__match_mode, __state._M_alt); + // Pick lhs if it matches. Only try rhs if it doesn't. + if (!_M_has_sol) + _M_dfs(__match_mode, __state._M_next); + } + else + { + // Try both and compare the result. + // See "case _S_opcode_accept:" handling above. + _M_dfs(__match_mode, __state._M_alt); + auto __has_sol = _M_has_sol; + _M_has_sol = false; + _M_dfs(__match_mode, __state._M_next); + _M_has_sol |= __has_sol; + } break; default: _GLIBCXX_DEBUG_ASSERT(false); diff --git a/libstdc++-v3/include/bits/regex_scanner.h b/libstdc++-v3/include/bits/regex_scanner.h index 6627db9..5552226 100644 --- a/libstdc++-v3/include/bits/regex_scanner.h +++ b/libstdc++-v3/include/bits/regex_scanner.h @@ -67,7 +67,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _S_token_or, _S_token_closure0, _S_token_closure1, - _S_token_ungreedy, _S_token_line_begin, _S_token_line_end, _S_token_word_bound, // neg if _M_value[0] == 'n' diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_search/61424.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_search/61424.cc new file mode 100644 index 0000000..bdccb4a --- /dev/null +++ b/libstdc++-v3/testsuite/28_regex/algorithms/regex_search/61424.cc @@ -0,0 +1,52 @@ +// { dg-options "-std=gnu++11" } + +// Copyright (C) 2014 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. + +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. + +// PR libstdc++/61424 + +#include <regex> +#include <testsuite_hooks.h> +#include <testsuite_regex.h> + +using namespace std; +using namespace __gnu_test; + +int main() +{ + regex_constants::syntax_option_type grammar[] = { + regex_constants::ECMAScript, regex_constants::extended, + regex_constants::awk, regex_constants::egrep + }; + + string sol[] = { + "tour", + "tournament", + "tournament", + "tournament", + }; + int i = 0; + for (auto g : grammar) + { + regex re("tour|tournament|tourn", g); + const char str[] = "tournament"; + cmatch m; + VERIFY(regex_search_debug(str, m, re)); + VERIFY(sol[i] == m[0]); + i++; + } +}