BUG: Lucene.Net.Core.Util.Automaton.RegExp.Peek(): Method not taking into account surrogate pairs. Created an IndexOf extension method overload for string that accepts a codePoint, similar to Java's String class.
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/b3940f2e Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/b3940f2e Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/b3940f2e Branch: refs/heads/api-work Commit: b3940f2e41076a67b9588170be9a0017a328b9e4 Parents: fbfcb81 Author: Shad Storhaug <[email protected]> Authored: Sun Mar 26 06:33:27 2017 +0700 Committer: Shad Storhaug <[email protected]> Committed: Sun Mar 26 08:54:09 2017 +0700 ---------------------------------------------------------------------- src/Lucene.Net.Core/Support/StringExtensions.cs | 34 ++++++++++++++++++++ src/Lucene.Net.Core/Util/Automaton/RegExp.cs | 10 +++--- .../Search/TestDocTermOrdsRewriteMethod.cs | 8 ++--- 3 files changed, 41 insertions(+), 11 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b3940f2e/src/Lucene.Net.Core/Support/StringExtensions.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Core/Support/StringExtensions.cs b/src/Lucene.Net.Core/Support/StringExtensions.cs index 8405a90..41aa6a3 100644 --- a/src/Lucene.Net.Core/Support/StringExtensions.cs +++ b/src/Lucene.Net.Core/Support/StringExtensions.cs @@ -85,5 +85,39 @@ namespace Lucene.Net.Support { return new StringCharSequenceWrapper(str); } + + /// <summary> + /// Returns the index within this string of the first occurrence of the + /// specified <paramref name="codePoint"/>. 
+ /// </summary> + /// <param name="str">this string</param> + /// <param name="codePoint">a codePoint representing a single character or surrogate pair</param> + /// <returns>the index of the first occurrence of the character (or surrogate pair) in the string, + /// or <c>-1</c> if the character (or surrogate pair) doesn't occur.</returns> + public static int IndexOf(this string str, int codePoint) + { + if (codePoint >= 0 && codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT) + { + // handle most cases here (codePoint is a BMP code point) + return str.IndexOf((char)codePoint); + } + else if (codePoint >= Character.MIN_CODE_POINT && codePoint <= Character.MAX_CODE_POINT) + { + // codePoint is a surrogate pair + char[] pair = Character.ToChars(codePoint); + char hi = pair[0]; + char lo = pair[1]; + for (int i = 0; i < str.Length - 1; i++) + { + if (str[i] == hi && str[i + 1] == lo) + { + return i; + } + } + } + + // codePoint is negative or not found in string + return -1; + } } } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b3940f2e/src/Lucene.Net.Core/Util/Automaton/RegExp.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Core/Util/Automaton/RegExp.cs b/src/Lucene.Net.Core/Util/Automaton/RegExp.cs index 868f8f0..fc07d45 100644 --- a/src/Lucene.Net.Core/Util/Automaton/RegExp.cs +++ b/src/Lucene.Net.Core/Util/Automaton/RegExp.cs @@ -701,11 +701,11 @@ namespace Lucene.Net.Util.Automaton break; case Kind.REGEXP_CHAR: - b.Append("\\").Append(Character.ToChars(c)); + b.Append("\\").AppendCodePoint(c); break; case Kind.REGEXP_CHAR_RANGE: - b.Append("[\\").Append(Character.ToChars(from)).Append("-\\").Append(Character.ToChars(to)).Append("]"); + b.Append("[\\").AppendCodePoint(from).Append("-\\").AppendCodePoint(to).Append("]"); break; case Kind.REGEXP_ANYCHAR: @@ -835,7 +835,7 @@ namespace Lucene.Net.Util.Automaton } else { - b.Append(Character.ToChars(exp1.c)); + 
b.AppendCodePoint(exp1.c); } if (exp2.kind == Kind.REGEXP_STRING) { @@ -843,7 +843,7 @@ namespace Lucene.Net.Util.Automaton } else { - b.Append(Character.ToChars(exp2.c)); + b.AppendCodePoint(exp2.c); } return MakeString(b.ToString()); } @@ -970,7 +970,7 @@ namespace Lucene.Net.Util.Automaton private bool Peek(string s) { - return More() && s.IndexOf((char)Character.CodePointAt(b, pos)) != -1; + return More() && s.IndexOf(b.CodePointAt(pos)) != -1; } private bool Match(int c) http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b3940f2e/src/Lucene.Net.Tests/Search/TestDocTermOrdsRewriteMethod.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests/Search/TestDocTermOrdsRewriteMethod.cs b/src/Lucene.Net.Tests/Search/TestDocTermOrdsRewriteMethod.cs index 2923d51..9273365 100644 --- a/src/Lucene.Net.Tests/Search/TestDocTermOrdsRewriteMethod.cs +++ b/src/Lucene.Net.Tests/Search/TestDocTermOrdsRewriteMethod.cs @@ -1,4 +1,4 @@ -using System; +using System; using System.Collections.Generic; using Lucene.Net.Attributes; using Lucene.Net.Documents; @@ -116,11 +116,7 @@ namespace Lucene.Net.Search /// <summary> /// test a bunch of random regular expressions </summary> -#if !NETSTANDARD - // LUCENENET: There is no Timeout on NUnit for .NET Core. - [Timeout(60000)] -#endif - [Test, HasTimeout] + [Test] public virtual void TestRegexps() { int num = AtLeast(1000);
