Try to use regex with char_type

Kornel Benko Mon, 21 Dec 2020 08:15:44 -0800

This is for Adv-Find only.

Yuriy, could you please check whether this meets your requirements for Cyrillic
characters?


        Kornel

diff --git a/src/lyxfind.cpp b/src/lyxfind.cpp
index 20e1104aa6..e73c6d2a2d 100644
--- a/src/lyxfind.cpp
+++ b/src/lyxfind.cpp
@@ -791,17 +791,17 @@ bool regex_replace(string const & s, string & t, string const & searchstr,
  ** \frac{.*}{x} matches \frac{x+\frac{y}{x}}{z} with .* being 'x+\frac{y'.
  **
  ** @param unmatched
  ** Number of open braces that must remain open at the end for the verification to succeed.
  **/
-bool braces_match(string::const_iterator const & beg,
-		  string::const_iterator const & end,
+bool braces_match(docstring::const_iterator const & beg,
+		  docstring::const_iterator const & end,
 		  int unmatched = 0)
 {
 	int open_pars = 0;
-	string::const_iterator it = beg;
-	LYXERR(Debug::FIND, "Checking " << unmatched << " unmatched braces in '" << string(beg, end) << "'");
+	docstring::const_iterator it = beg;
+	LYXERR(Debug::FIND, "Checking " << unmatched << " unmatched braces in '" << docstring(beg, end) << "'");
 	for (; it != end; ++it) {
 		// Skip escaped braces in the count
 		if (*it == '\\') {
 			++it;
 			if (it == end)
@@ -835,10 +835,13 @@ public:
 	MatchResult(): match_len(0),match2end(0), pos(0) {};
 };
 
 /** The class performing a match between a position in the document and the FindAdvOptions.
  **/
+typedef basic_regex<char_type> docregex;
+typedef regex_iterator<docstring::const_iterator> docregex_iterator;
+
 class MatchStringAdv {
 public:
 	MatchStringAdv(lyx::Buffer & buf, FindAndReplaceOptions const & opt);
 
 	/** Tests if text starting at the supplied position matches with the one provided to the MatchStringAdv
@@ -880,13 +883,13 @@ private:
 	 **/
 	string normalize(docstring const & s, bool hack_braces) const;
 	// normalized string to search
 	string par_as_string;
 	// regular expression to use for searching
-	regex regexp;
+	docregex regexp;
 	// same as regexp, but prefixed with a ".*?"
-	regex regexp2;
+	docregex regexp2;
 	// leading format material as string
 	string lead_as_string;
 	// par_as_string after removal of lead_as_string
 	string par_as_string_nolead;
 	// unmatched open braces in the search string/regexp
@@ -1318,10 +1321,26 @@ static void buildAccentsMap()
   accents["negthinspace"]  = u8"\uf0003";	// to omit backslashed latex macros
   accents["medspace"]      = u8"\uf0004";	// See https://en.wikipedia.org/wiki/Private_Use_Areas
   accents["negmedspace"]   = u8"\uf0005";
   accents["thickspace"]    = u8"\uf0006";
   accents["negthickspace"] = u8"\uf0007";
+  accents["lyx"]           = u8"\uf0010";	// Used logos
+  accents["LyX"]           = u8"\uf0010";
+  accents["tex"]           = u8"\uf0011";
+  accents["TeX"]           = u8"\uf0011";
+  accents["latex"]         = u8"\uf0012";
+  accents["LaTeX"]         = u8"\uf0012";
+  accents["latexe"]        = u8"\uf0013";
+  accents["LaTeXe"]        = u8"\uf0013";
+  accents["backslash lyx"]           = u8"\uf0010";	// Used logos inserted with starting \backslash
+  accents["backslash LyX"]           = u8"\uf0010";
+  accents["backslash tex"]           = u8"\uf0011";
+  accents["backslash TeX"]           = u8"\uf0011";
+  accents["backslash latex"]         = u8"\uf0012";
+  accents["backslash LaTeX"]         = u8"\uf0012";
+  accents["backslash latexe"]        = u8"\uf0013";
+  accents["backslash LaTeXe"]        = u8"\uf0013";
   accents["ddot{\\imath}"] = "ï";
   buildaccent("ddot", "aAeEhHiIioOtuUwWxXyY",
                       "äÄëËḧḦïÏïöÖẗüÜẅẄẍẌÿŸ");	// umlaut
   buildaccent("dot|.", "aAbBcCdDeEfFGghHIimMnNoOpPrRsStTwWxXyYzZ",
                        "ȧȦḃḂċĊḋḊėĖḟḞĠġḣḢİİṁṀṅṄȯȮṗṖṙṘṡṠṫṪẇẆẋẊẏẎżŻ");	// dot{i} can only happen if ignoring case, but there is no lowercase of 'İ'
@@ -1381,11 +1400,13 @@ static void buildAccentsMap()
  */
 void Intervall::removeAccents()
 {
   if (accents.empty())
     buildAccentsMap();
-  static regex const accre("\\\\(([\\S]|grave|breve|ddot|dot|acute|dacute|mathring|check|hat|bar|tilde|subdot|ogonek|cedilla|subring|textsubring|subhat|textsubcircum|subtilde|textsubtilde|dgrave|textdoublegrave|rcap|textroundcap|slashed)\\{[^\\{\\}]+\\}|(i|imath|jmath|cdot|[a-z]+space)(?![a-zA-Z]))");
+  static regex const accre("\\\\(([\\S]|grave|breve|ddot|dot|acute|dacute|mathring|check|hat|bar|tilde|subdot|ogonek|"
+  		"cedilla|subring|textsubring|subhat|textsubcircum|subtilde|textsubtilde|dgrave|textdoublegrave|rcap|textroundcap|slashed)\\{[^\\{\\}]+\\}"
+  	"|((i|imath|jmath|cdot|[a-z]+space)|((backslash )?([lL]y[xX]|[tT]e[xX]|[lL]a[tT]e[xX]e?)))(?![a-zA-Z]))");
   smatch sub;
   for (sregex_iterator itacc(par.begin(), par.end(), accre), end; itacc != end; ++itacc) {
     sub = *itacc;
     string key = sub.str(1);
     if (accents.find(key) != accents.end()) {
@@ -2089,11 +2110,11 @@ void LatexInfo::buildKeys(bool isPatternString)
   makeKey("parbox", KeyInfo(KeyInfo::doRemove, 1, true), isPatternString);
   // like ('tiny{}' or '\tiny ' ... )
   makeKey("footnotesize|tiny|scriptsize|small|large|Large|LARGE|huge|Huge", KeyInfo(KeyInfo::isSize, 0, false), isPatternString);
 
   // Survives, like known character
-  makeKey("lyx|LyX|latex|LaTeX|latexe|LaTeXe|tex|TeX", KeyInfo(KeyInfo::isChar, 0, false), isPatternString);
+  // makeKey("lyx|LyX|latex|LaTeX|latexe|LaTeXe|tex|TeX", KeyInfo(KeyInfo::isChar, 0, false), isPatternString);
   makeKey("item|listitem", KeyInfo(KeyInfo::isList, 1, false), isPatternString);
 
   makeKey("begin|end", KeyInfo(KeyInfo::isMath, 1, false), isPatternString);
   makeKey("[|]", KeyInfo(KeyInfo::isMath, 1, false), isPatternString);
   makeKey("$", KeyInfo(KeyInfo::isMath, 1, false), isPatternString);
@@ -2863,24 +2884,24 @@ MatchStringAdv::MatchStringAdv(lyx::Buffer & buf, FindAndReplaceOptions const &
 			}
 			regexp_str = "(" + lead_as_regexp + ")" + par_as_string;
 			regexp2_str = "(" + lead_as_regexp + ").*?" + par_as_string;
 		}
 		LYXERR(Debug::FIND, "Setting regexp to : '" << regexp_str << "'");
-		regexp = regex(regexp_str);
+		regexp = docregex(from_utf8(regexp_str));
 
 		LYXERR(Debug::FIND, "Setting regexp2 to: '" << regexp2_str << "'");
-		regexp2 = regex(regexp2_str);
+		regexp2 = docregex(from_utf8(regexp2_str));
 	}
 }
 
 
 // Count number of characters in string
 // {]} ==> 1
 // \&  ==> 1
 // --- ==> 1
 // \\[a-zA-Z]+ ==> 1
-static int computeSize(string s, int len)
+static int computeSize(docstring s, int len)
 {
 	if (len == 0)
 		return 0;
 	int skip = 1;
 	int count = 0;
@@ -2943,32 +2964,26 @@ MatchResult MatchStringAdv::findAux(DocIterator const & cur, int len, bool at_be
 		return mres;
 	}
 	LYXERR(Debug::FIND, "Matching against     '" << lyx::to_utf8(docstr) << "'");
 	LYXERR(Debug::FIND, "After normalization: '" << str << "'");
 
+	docstr = from_utf8(str);
 	if (use_regexp) {
 		LYXERR(Debug::FIND, "Searching in regexp mode: at_begin=" << at_begin);
-		regex const *p_regexp;
+		docregex const *p_regexp;
 		regex_constants::match_flag_type flags;
 		if (at_begin) {
 			flags = regex_constants::match_continuous;
 			p_regexp = &regexp;
 		} else {
 			flags = regex_constants::match_default;
 			p_regexp = &regexp2;
 		}
-		sregex_iterator re_it(str.begin(), str.end(), *p_regexp, flags);
-		if (re_it == sregex_iterator())
+		docregex_iterator re_it(docstr.begin(), docstr.end(), *p_regexp, flags);
+		if (re_it == docregex_iterator())
 			return mres;
-		match_results<string::const_iterator> const & m = *re_it;
-
-		if (0) { // Kornel Benko: DO NOT CHECKK
-			// Check braces on the segment that matched the entire regexp expression,
-			// plus the last subexpression, if a (.*?) was inserted in the constructor.
-			if (!braces_match(m[0].first, m[0].second, open_braces))
-				return mres;
-		}
+		match_results<docstring::const_iterator> const & m = *re_it;
 
 		// Check braces on segments that matched all (.*?) subexpressions,
 		// except the last "padding" one inserted by lyx.
 		for (size_t i = 1; i < m.size() - 1; ++i)
 			if (!braces_match(m[i].first, m[i].second, open_braces))
@@ -2994,21 +3009,21 @@ MatchResult MatchStringAdv::findAux(DocIterator const & cur, int len, bool at_be
 			result =  m[m.size() - close_wildcards].first - m[0].first;
 
 		size_t pos = m.position(size_t(0));
 		// Ignore last closing characters
 		while (result > 0) {
-			if (str[pos+result-1] == '}')
+			if (docstr[pos+result-1] == '}')
 				--result;
 			else
 				break;
 		}
 		if (result > leadingsize)
 			result -= leadingsize;
 		else
 			result = 0;
-		mres.match_len = computeSize(str.substr(pos+leadingsize,result), result);
-		mres.match2end = str.size() - pos - leadingsize;
+		mres.match_len = computeSize(docstr.substr(pos+leadingsize,result), result);
+		mres.match2end = docstr.size() - pos - leadingsize;
 		mres.pos = pos+leadingsize;
 		return mres;
 	}
 
 	// else !use_regexp: but all code paths above return

pgp4um6LBeUEu.pgp
Description: Digitale Signatur von OpenPGP

-- 
lyx-devel mailing list
lyx-devel@lists.lyx.org
http://lists.lyx.org/mailman/listinfo/lyx-devel

Try to use regex with char_type

Reply via email to