This is an automated email from the ASF dual-hosted git repository. nightowl888 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/lucenenet.git
commit c4b9987c08806fbe535423617c63cc1c16cd5dbe Author: Shad Storhaug <[email protected]> AuthorDate: Mon Jul 8 04:58:24 2019 +0700 BUG: Intermittent failures of Lucene.Net.Facet.Taxonomy.WriterCache.TestCharBlockArray.TestArray(). The test was not set up to work with encoders that fall back to '?' for unmapped/invalid characters. Also, the BinaryReader/BinaryWriter was too strict with regard to validating surrogate pairs for this type of serialization, so implemented custom extension methods over Stream that do not use encoding. --- .../Taxonomy/WriterCache/CharBlockArray.cs | 91 +++++++++++------- .../Taxonomy/WriterCache/TestCharBlockArray.cs | 25 ++++- .../WriterCache/TestCompactLabelToOrdinal.cs | 14 ++- src/Lucene.Net/Support/IO/StreamExtensions.cs | 102 +++++++++++++++++++++ 4 files changed, 192 insertions(+), 40 deletions(-) diff --git a/src/Lucene.Net.Facet/Taxonomy/WriterCache/CharBlockArray.cs b/src/Lucene.Net.Facet/Taxonomy/WriterCache/CharBlockArray.cs index 151ea8a..98ae751 100644 --- a/src/Lucene.Net.Facet/Taxonomy/WriterCache/CharBlockArray.cs +++ b/src/Lucene.Net.Facet/Taxonomy/WriterCache/CharBlockArray.cs @@ -1,4 +1,5 @@ using Lucene.Net.Support; +using Lucene.Net.Support.IO; using System; using System.Collections.Generic; using System.IO; @@ -35,7 +36,7 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache // BinaryFormatter is not implemented in .NET Standard 1.x. 
internal class CharBlockArray : ICharSequence { - //private const long serialVersionUID = 1L; // LUCENENET: Not used + private const long serialVersionUID = 1L; private const int DEFAULT_BLOCK_SIZE = 32 * 1024; // 32 KB default size @@ -44,7 +45,7 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache : System.ICloneable #endif { - //internal const long serialVersionUID = 1L; // LUCENENET: Not used + internal const long serialVersionUID = 1L; internal readonly char[] chars; internal int length; @@ -64,8 +65,9 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache } // LUCENENET specific - public void Serialize(BinaryWriter writer) + public void Serialize(Stream writer) { + writer.Write(serialVersionUID); // Version of this object to use when deserializing writer.Write(chars.Length); writer.Write(chars); writer.Write(length); @@ -73,11 +75,23 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache // LUCENENET specific // Deserialization constructor - public Block(BinaryReader reader) + public Block(Stream reader) { - int charsLength = reader.ReadInt32(); - this.chars = reader.ReadChars(charsLength); - this.length = reader.ReadInt32(); + long serialVersion = reader.ReadInt64(); + + switch (serialVersion) + { + case serialVersionUID: + int charsLength = reader.ReadInt32(); + this.chars = reader.ReadChars(charsLength); + this.length = reader.ReadInt32(); + break; + + // case 1L: + // LUCENENET TODO: When object fields change, increment serialVersionUID and move the above block here for legacy support... 
+ default: + throw new InvalidDataException($"Version {serialVersion} of {this.GetType().ToString()} deserialization is not supported."); + } } } @@ -252,49 +266,56 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache internal virtual void Flush(Stream @out) { - using (var writer = new BinaryWriter(@out, new UTF8Encoding(false, true), true)) + @out.Write(serialVersionUID); // version of this object to use when deserializing + @out.Write(blocks.Count); + int currentIndex = 0; + for (int i = 0; i < blocks.Count; i++) { - writer.Write(blocks.Count); - int currentIndex = 0; - for (int i = 0; i < blocks.Count; i++) + var block = blocks[i]; + block.Serialize(@out); + if (block == current) { - var block = blocks[i]; - block.Serialize(writer); - if (block == current) - { - currentIndex = i; - } + currentIndex = i; } - // Write the index of the current block so we can - // set the reference when deserializing - writer.Write(currentIndex); - writer.Write(blockSize); - writer.Write(length); - writer.Flush(); } + // Write the index of the current block so we can + // set the reference when deserializing + @out.Write(currentIndex); + @out.Write(blockSize); + @out.Write(length); + @out.Flush(); } // LUCENENET specific // Deserialization constructor - internal CharBlockArray(BinaryReader reader) + internal CharBlockArray(Stream reader) { - var blocksCount = reader.ReadInt32(); - this.blocks = new List<Block>(blocksCount); - for (int i = 0; i < blocksCount; i++) + long serialVersion = reader.ReadInt64(); + + switch (serialVersion) { - blocks.Add(new Block(reader)); + case serialVersionUID: + var blocksCount = reader.ReadInt32(); + this.blocks = new List<Block>(blocksCount); + for (int i = 0; i < blocksCount; i++) + { + blocks.Add(new Block(reader)); + } + this.current = blocks[reader.ReadInt32()]; + this.blockSize = reader.ReadInt32(); + this.length = reader.ReadInt32(); + break; + + // case 1L: + // LUCENENET TODO: When object fields change, increment serialVersionUID and move the 
above block here for legacy support... + default: + throw new InvalidDataException($"Version {serialVersion} of {this.GetType().ToString()} deserialization is not supported."); } - this.current = blocks[reader.ReadInt32()]; - this.blockSize = reader.ReadInt32(); - this.length = reader.ReadInt32(); } public static CharBlockArray Open(Stream @in) { - using (var writer = new BinaryReader(@in, new UTF8Encoding(false, true), true)) - { - return new CharBlockArray(writer); - } + return new CharBlockArray(@in); } } } \ No newline at end of file diff --git a/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCharBlockArray.cs b/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCharBlockArray.cs index 1249ff6..3041c2c 100644 --- a/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCharBlockArray.cs +++ b/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCharBlockArray.cs @@ -35,14 +35,25 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache byte[] buffer = new byte[50]; + // This is essentially the equivalent of + // CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder() + // .onUnmappableCharacter(CodingErrorAction.REPLACE) + // .onMalformedInput(CodingErrorAction.REPLACE); + // + // Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage, + // new EncoderReplacementFallback("?"), + // new DecoderReplacementFallback("?")); + for (int i = 0; i < n; i++) { Random().NextBytes(buffer); int size = 1 + Random().Next(50); // This test is turning random bytes into a string, // this is asking for trouble. 
- - string s = Encoding.UTF8.GetString(buffer, 0, size); + Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage, + new EncoderReplacementFallback("?"), + new DecoderReplacementFallback("?")); + string s = decoder.GetString(buffer, 0, size); array.Append(s); builder.Append(s); } @@ -53,7 +64,10 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache int size = 1 + Random().Next(50); // This test is turning random bytes into a string, // this is asking for trouble. - string s = Encoding.UTF8.GetString(buffer, 0, size); + Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage, + new EncoderReplacementFallback("?"), + new DecoderReplacementFallback("?")); + string s = decoder.GetString(buffer, 0, size); array.Append(s); builder.Append(s); } @@ -64,7 +78,10 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache int size = 1 + Random().Next(50); // This test is turning random bytes into a string, // this is asking for trouble. - string s = Encoding.UTF8.GetString(buffer, 0, size); + Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage, + new EncoderReplacementFallback("?"), + new DecoderReplacementFallback("?")); + string s = decoder.GetString(buffer, 0, size); for (int j = 0; j < s.Length; j++) { array.Append(s[j]); diff --git a/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCompactLabelToOrdinal.cs b/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCompactLabelToOrdinal.cs index 852638b..c2cfeb0 100644 --- a/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCompactLabelToOrdinal.cs +++ b/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCompactLabelToOrdinal.cs @@ -42,6 +42,15 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache string[] uniqueValues = new string[numUniqueValues]; byte[] buffer = new byte[50]; + // This is essentially the equivalent of + // CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder() + // .onUnmappableCharacter(CodingErrorAction.REPLACE) + // .onMalformedInput(CodingErrorAction.REPLACE); + // + // Encoding 
decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage, + // new EncoderReplacementFallback("?"), + // new DecoderReplacementFallback("?")); + Random random = Random(); for (int i = 0; i < numUniqueValues;) { @@ -50,7 +59,10 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache // This test is turning random bytes into a string, // this is asking for trouble. - uniqueValues[i] = Encoding.UTF8.GetString(buffer, 0, size); + Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage, + new EncoderReplacementFallback("?"), + new DecoderReplacementFallback("?")); + uniqueValues[i] = decoder.GetString(buffer, 0, size); // we cannot have empty path components, so eliminate all prefix as well // as middle consecutive delimiter chars. uniqueValues[i] = Regex.Replace(uniqueValues[i], "/+", "/"); diff --git a/src/Lucene.Net/Support/IO/StreamExtensions.cs b/src/Lucene.Net/Support/IO/StreamExtensions.cs new file mode 100644 index 0000000..134245d --- /dev/null +++ b/src/Lucene.Net/Support/IO/StreamExtensions.cs @@ -0,0 +1,102 @@ +using System.IO; + +namespace Lucene.Net.Support.IO +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + /// <summary> + /// Extension methods that make a <see cref="Stream"/> effectively into a + /// binary serializer with no encoding. We simply convert types into bytes + /// and write them without any concern whether surrogate pairs are respected, + /// similar to what BinaryFormatter does. + /// This makes it possible to serialize/deserialize raw character arrays + /// and get the data back in the same order without any exceptions warning + /// that the order is not valid and without the need for BinaryFormatter. + /// <para/> + /// Byte order is little-endian (same as <see cref="BinaryReader"/> and <see cref="BinaryWriter"/>). + /// </summary> + public static class StreamExtensions + { + public static void Write(this Stream stream, char[] chars) + { + byte[] newBytes = new byte[chars.Length * 2]; + for (int index = 0; index < chars.Length; index++) + { + int newIndex = index == 0 ? index : index * 2; + newBytes[newIndex] = (byte)chars[index]; + newBytes[newIndex + 1] = (byte)(chars[index] >> 8); + } + stream.Write(newBytes, 0, newBytes.Length); + } + + public static char[] ReadChars(this Stream stream, int count) + { + byte[] buff = new byte[2]; + char[] newChars = new char[count]; + for (int i = 0; i < count; i++) + { + stream.Read(buff, 0, 2); + newChars[i] = (char)((buff[0] & 0xff) | ((buff[1] & 0xff) << 8)); + } + return newChars; + } + + public static void Write(this Stream stream, int value) + { + byte[] buff = new byte[4]; + buff[0] = (byte)(value); + buff[1] = (byte)(value >> 8); + buff[2] = (byte)(value >> 16); + buff[3] = (byte)(value >> 24); + stream.Write(buff, 0, buff.Length); + } + + public static int ReadInt32(this Stream stream) + { + byte[] buff = new byte[4]; + stream.Read(buff, 0, buff.Length); + return (buff[0] & 0xff) | ((buff[1] & 0xff) << 8) | + ((buff[2] & 0xff) << 16) | ((buff[3] & 0xff) << 24); + } + + public static void Write(this Stream stream, long value) + { + byte[] buff = new byte[8]; + buff[0] = (byte)value; + buff[1] = 
(byte)(value >> 8); + buff[2] = (byte)(value >> 16); + buff[3] = (byte)(value >> 24); + buff[4] = (byte)(value >> 32); + buff[5] = (byte)(value >> 40); + buff[6] = (byte)(value >> 48); + buff[7] = (byte)(value >> 56); + stream.Write(buff, 0, buff.Length); + } + + public static long ReadInt64(this Stream stream) + { + byte[] buff = new byte[8]; + stream.Read(buff, 0, buff.Length); + uint lo = (uint)(buff[0] | buff[1] << 8 | + buff[2] << 16 | buff[3] << 24); + uint hi = (uint)(buff[4] | buff[5] << 8 | + buff[6] << 16 | buff[7] << 24); + return (long)((ulong)hi) << 32 | lo; + } + } +}
