Lucene.Net.Core.Support.StringBuilderExtensions: Reverted the Reverse() method to the original Java implementation (faster), and added the CodePointCount() and GetChars() methods
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/8bb1cc92 Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/8bb1cc92 Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/8bb1cc92 Branch: refs/heads/api-work Commit: 8bb1cc9209b39ab97b938f2c491a803d9f66b882 Parents: 6032fd7 Author: Shad Storhaug <[email protected]> Authored: Tue Jan 31 12:48:02 2017 +0700 Committer: Shad Storhaug <[email protected]> Committed: Tue Jan 31 12:48:02 2017 +0700 ---------------------------------------------------------------------- .../Support/StringBuilderExtensions.cs | 111 +++++++++++++++---- .../Suggest/Jaspell/JaspellTernarySearchTrie.cs | 5 +- 2 files changed, 92 insertions(+), 24 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucenenet/blob/8bb1cc92/src/Lucene.Net.Core/Support/StringBuilderExtensions.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Core/Support/StringBuilderExtensions.cs b/src/Lucene.Net.Core/Support/StringBuilderExtensions.cs index 1eb18f1..5292e0c 100644 --- a/src/Lucene.Net.Core/Support/StringBuilderExtensions.cs +++ b/src/Lucene.Net.Core/Support/StringBuilderExtensions.cs @@ -1,34 +1,63 @@ -using System.Globalization; +using System; using System.Text; namespace Lucene.Net.Support { public static class StringBuilderExtensions { - public static StringBuilder Reverse(this StringBuilder text) // LUCENENET TODO: The reverse is in-place. Returning the StringBuilder makes this confusing. + /// <summary> + /// Causes this character sequence to be replaced by the reverse of + /// the sequence. If there are any surrogate pairs included in the + /// sequence, these are treated as single characters for the + /// reverse operation. Thus, the order of the high-low surrogates + /// is never reversed. 
+ /// <para/> + /// Let <c>n</c> be the character length of this character sequence + /// (not the length in <see cref="char"/> values) just prior to + /// execution of the <see cref="Reverse"/> method. Then the + /// character at index <c>k</c> in the new character sequence is + /// equal to the character at index <c>n-k-1</c> in the old + /// character sequence. + /// <para/> + /// Note that the reverse operation may result in producing + /// surrogate pairs that were unpaired low-surrogates and + /// high-surrogates before the operation. For example, reversing + /// "\uDC00\uD800" produces "\uD800\uDC00" which is + /// a valid surrogate pair. + /// </summary> + /// <param name="text">this <see cref="StringBuilder"/></param> + /// <returns>a reference to this <see cref="StringBuilder"/>.</returns> + public static StringBuilder Reverse(this StringBuilder text) { - int textLength = text.Length; - if (textLength > 1) + bool hasSurrogate = false; + int codePointCount = text.Length; + int n = text.Length - 1; + for (int j = (n - 1) >> 1; j >= 0; --j) { - // Pull the string out of the StringBuilder so we - // can work with the various text elements (chars, glyphs, graphemes, etc) - // and reverse the order of the string without reversing chars that need to be - // in a specific order to represent the same text as the forward string. 
- // Reference: http://stackoverflow.com/a/36310993/181087 - int offset = textLength; - var enumerator = StringInfo.GetTextElementEnumerator(text.ToString()); - while (enumerator.MoveNext()) + char temp = text[j]; + char temp2 = text[n - j]; + if (!hasSurrogate) { - string element = enumerator.GetTextElement(); - - // Back up the current offset by the length of the element - offset -= element.Length; - - for (int i = 0; i < element.Length; i++) + hasSurrogate = (temp >= Character.MIN_SURROGATE && temp <= Character.MAX_SURROGATE) + || (temp2 >= Character.MIN_SURROGATE && temp2 <= Character.MAX_SURROGATE); + } + text[j] = temp2; + text[n - j] = temp; + } + if (hasSurrogate) + { + // Reverse back all valid surrogate pairs + for (int i = 0; i < text.Length - 1; i++) + { + char c2 = text[i]; + if (char.IsLowSurrogate(c2)) { - // Write the chars in forward order from the element - // to the StringBuilder based on the offset. - text[i + offset] = element[i]; + char c1 = text[i + 1]; + if (char.IsHighSurrogate(c1)) + { + text[i++] = c1; + text[i] = c2; + } } } } @@ -37,6 +66,46 @@ namespace Lucene.Net.Support } /// <summary> + /// Returns the number of Unicode code points in the specified text + /// range of this <see cref="StringBuilder"/>. The text range begins at the specified + /// <paramref name="beginIndex"/> and extends to the <see cref="char"/> at + /// index <c>endIndex - 1</c>. Thus the length (in + /// <see cref="char"/>s) of the text range is + /// <c>endIndex-beginIndex</c>. Unpaired surrogates within + /// this sequence count as one code point each. 
+ /// </summary> + /// <param name="text">this <see cref="StringBuilder"/></param> + /// <param name="beginIndex">the index to the first <see cref="char"/> of the text range.</param> + /// <param name="endIndex">the index after the last <see cref="char"/> of the text range.</param> + /// <returns>the number of Unicode code points in the specified text range.</returns> + /// <exception cref="IndexOutOfRangeException"> + /// if the <paramref name="beginIndex"/> is negative, or <paramref name="endIndex"/> + /// is larger than the length of this sequence, or + /// <paramref name="beginIndex"/> is larger than <paramref name="endIndex"/>. + /// </exception> + public static int CodePointCount(this StringBuilder text, int beginIndex, int endIndex) + { + if (beginIndex < 0 || endIndex > text.Length || beginIndex > endIndex) + { + throw new IndexOutOfRangeException(); + } + return Character.CodePointCountImpl(text.GetChars(), beginIndex, endIndex - beginIndex); + } + + /// <summary> + /// Copies the array from the <see cref="StringBuilder"/> into a new array + /// and returns it. + /// </summary> + /// <param name="text">this <see cref="StringBuilder"/></param> + /// <returns></returns> + public static char[] GetChars(this StringBuilder text) + { + char[] chars = new char[text.Length]; + text.CopyTo(0, chars, 0, text.Length); + return chars; + } + + /// <summary> /// Appends the string representation of the <paramref name="codePoint"/> /// argument to this sequence. 
/// http://git-wip-us.apache.org/repos/asf/lucenenet/blob/8bb1cc92/src/Lucene.Net.Suggest/Suggest/Jaspell/JaspellTernarySearchTrie.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Suggest/Suggest/Jaspell/JaspellTernarySearchTrie.cs b/src/Lucene.Net.Suggest/Suggest/Jaspell/JaspellTernarySearchTrie.cs index 11fb8ca..98a91fb 100644 --- a/src/Lucene.Net.Suggest/Suggest/Jaspell/JaspellTernarySearchTrie.cs +++ b/src/Lucene.Net.Suggest/Suggest/Jaspell/JaspellTernarySearchTrie.cs @@ -549,9 +549,8 @@ namespace Lucene.Net.Search.Suggest.Jaspell currentNode = currentNode.relatives[TSTNode.PARENT]; } - // LUCENENET NOTE: Reverse doesn't happen in place in a .NET StringBuilder, - // so we need to return the reversed result. - return getKeyBuffer.Reverse().ToString(); + getKeyBuffer.Reverse(); + return getKeyBuffer.ToString(); } /// <summary>
