BUG: Lucene.Net.Core.Util.UnicodeUtil: the counter for the char array size was not accurate, causing Lucene.Net.Misc.Util.Fst.TestFSTsMisc.TestRandomWords() to fail. Changed the initial array size to count * 2 and removed the resize logic, since we trim off the excess at the end anyway. Also added a threshold of 1024: if the count is greater than this, we do a pre-loop to determine the exact amount of memory to allocate.
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/631cfa7a Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/631cfa7a Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/631cfa7a Branch: refs/heads/api-work Commit: 631cfa7ad37187c918b068fef77797fffd0bfff8 Parents: f5d02d6 Author: Shad Storhaug <[email protected]> Authored: Fri Mar 24 06:24:41 2017 +0700 Committer: Shad Storhaug <[email protected]> Committed: Fri Mar 24 06:28:21 2017 +0700 ---------------------------------------------------------------------- src/Lucene.Net.Core/Util/UnicodeUtil.cs | 35 +++++++++++++++++----------- 1 file changed, 22 insertions(+), 13 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucenenet/blob/631cfa7a/src/Lucene.Net.Core/Util/UnicodeUtil.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Core/Util/UnicodeUtil.cs b/src/Lucene.Net.Core/Util/UnicodeUtil.cs index c3bb61e..31a998c 100644 --- a/src/Lucene.Net.Core/Util/UnicodeUtil.cs +++ b/src/Lucene.Net.Core/Util/UnicodeUtil.cs @@ -715,15 +715,34 @@ namespace Lucene.Net.Util /// <returns> a char array representing the code points between offset and count </returns> // LUCENENET NOTE: This code was originally in the NewString() method (above). // It has been refactored from the original to remove the exception throw/catch and - // instead proactively resizes the array instead of relying on excpetions. 
+ // instead proactively resizes the array instead of relying on excpetions + copy operations public static char[] ToCharArray(int[] codePoints, int offset, int count) { if (count < 0) { throw new System.ArgumentException(); } - // LUCENENET: as a first approximation, assume each codepoint is 1 character - char[] chars = new char[count]; + int countThreashhold = 1024; // If the number of chars exceeds this, we count them instead of allocating count * 2 + // LUCENENET: as a first approximation, assume each codepoint + // is 2 characters (since it cannot be longer than this) + int arrayLength = count * 2; + // LUCENENET: if we go over the threashhold, count the number of + // chars we will need so we can allocate the precise amount of memory + if (count > countThreashhold) + { + arrayLength = 0; + for (int r = offset, e = offset + count; r < e; ++r) + { + arrayLength += codePoints[r] < 0x010000 ? 1 : 2; + } + if (arrayLength < 1) + { + arrayLength = count * 2; + } + } + // Initialize our array to our exact or oversized length. + // It is now safe to assume we have enough space for all of the characters. + char[] chars = new char[arrayLength]; int w = 0; for (int r = offset, e = offset + count; r < e; ++r) { @@ -739,18 +758,8 @@ namespace Lucene.Net.Util else { chars[w++] = (char)(LEAD_SURROGATE_OFFSET_ + (cp >> LEAD_SURROGATE_SHIFT_)); - // LUCENENET: resize to the exact length: it's slightly faster to check if the resize is needed - if (w >= chars.Length) - { - Array.Resize(ref chars, chars.Length + (e - r) * 2 - 1); - } chars[w++] = (char)(TRAIL_SURROGATE_MIN_VALUE + (cp & TRAIL_SURROGATE_MASK_)); } - // LUCENENET: resize to the exact length: it's slightly faster to check if the resize is needed - if (w != chars.Length) - { - Array.Resize(ref chars, w); - } } var result = new char[w];
