This is an automated email from the ASF dual-hosted git repository.

nightowl888 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git

commit c4b9987c08806fbe535423617c63cc1c16cd5dbe
Author: Shad Storhaug <[email protected]>
AuthorDate: Mon Jul 8 04:58:24 2019 +0700

    BUG: Intermittent failures of 
Lucene.Net.Facet.Taxonomy.WriterCache.TestCharBlockArray.TestArray(). The test 
was not set up to work with encoders that fall back to '?' for unmapped/invalid 
characters. Also, the BinaryReader/BinaryWriter was too strict with regard to 
validating surrogate pairs for this type of serialization, so implemented 
custom extension methods over Stream that do not use encoding.
---
 .../Taxonomy/WriterCache/CharBlockArray.cs         |  91 +++++++++++-------
 .../Taxonomy/WriterCache/TestCharBlockArray.cs     |  25 ++++-
 .../WriterCache/TestCompactLabelToOrdinal.cs       |  14 ++-
 src/Lucene.Net/Support/IO/StreamExtensions.cs      | 102 +++++++++++++++++++++
 4 files changed, 192 insertions(+), 40 deletions(-)

diff --git a/src/Lucene.Net.Facet/Taxonomy/WriterCache/CharBlockArray.cs 
b/src/Lucene.Net.Facet/Taxonomy/WriterCache/CharBlockArray.cs
index 151ea8a..98ae751 100644
--- a/src/Lucene.Net.Facet/Taxonomy/WriterCache/CharBlockArray.cs
+++ b/src/Lucene.Net.Facet/Taxonomy/WriterCache/CharBlockArray.cs
@@ -1,4 +1,5 @@
 using Lucene.Net.Support;
+using Lucene.Net.Support.IO;
 using System;
 using System.Collections.Generic;
 using System.IO;
@@ -35,7 +36,7 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache
     // BinaryFormatter is not implemented in .NET Standard 1.x.
     internal class CharBlockArray : ICharSequence
     {
-        //private const long serialVersionUID = 1L; // LUCENENET: Not used
+        private const long serialVersionUID = 1L;
 
         private const int DEFAULT_BLOCK_SIZE = 32 * 1024; // 32 KB default size
 
@@ -44,7 +45,7 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache
             : System.ICloneable
 #endif
         {
-            //internal const long serialVersionUID = 1L; // LUCENENET: Not used
+            internal const long serialVersionUID = 1L;
 
             internal readonly char[] chars;
             internal int length;
@@ -64,8 +65,9 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache
             }
 
             // LUCENENET specific
-            public void Serialize(BinaryWriter writer)
+            public void Serialize(Stream writer)
             {
+                writer.Write(serialVersionUID); // Version of this object to 
use when deserializing
                 writer.Write(chars.Length);
                 writer.Write(chars);
                 writer.Write(length);
@@ -73,11 +75,23 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache
 
             // LUCENENET specific
             // Deserialization constructor
-            public Block(BinaryReader reader)
+            public Block(Stream reader)
             {
-                int charsLength = reader.ReadInt32();
-                this.chars = reader.ReadChars(charsLength);
-                this.length = reader.ReadInt32();
+                long serialVersion = reader.ReadInt64();
+
+                switch (serialVersion)
+                {
+                    case serialVersionUID:
+                        int charsLength = reader.ReadInt32();
+                        this.chars = reader.ReadChars(charsLength);
+                        this.length = reader.ReadInt32();
+                        break;
+
+                    // case 1L:
+                    // LUCENENET TODO: When object fields change, increment 
serialVersionUID and move the above block here for legacy support...
+                    default:
+                        throw new InvalidDataException($"Version 
{serialVersion} of {this.GetType().ToString()} deserialization is not 
supported.");
+                }
             }
         }
 
@@ -252,49 +266,56 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache
 
         internal virtual void Flush(Stream @out)
         {
-            using (var writer = new BinaryWriter(@out, new UTF8Encoding(false, 
true), true))
+            @out.Write(serialVersionUID); // version of this object to use 
when deserializing
+            @out.Write(blocks.Count);
+            int currentIndex = 0;
+            for (int i = 0; i < blocks.Count; i++)
             {
-                writer.Write(blocks.Count);
-                int currentIndex = 0;
-                for (int i = 0; i < blocks.Count; i++)
+                var block = blocks[i];
+                block.Serialize(@out);
+                if (block == current)
                 {
-                    var block = blocks[i];
-                    block.Serialize(writer);
-                    if (block == current)
-                    {
-                        currentIndex = i;
-                    }
+                    currentIndex = i;
                 }
-                // Write the index of the current block so we can
-                // set the reference when deserializing
-                writer.Write(currentIndex);
-                writer.Write(blockSize);
-                writer.Write(length);
-                writer.Flush();
             }
+            // Write the index of the current block so we can
+            // set the reference when deserializing
+            @out.Write(currentIndex);
+            @out.Write(blockSize);
+            @out.Write(length);
+            @out.Flush();
         }
 
         // LUCENENET specific
         // Deserialization constructor
-        internal CharBlockArray(BinaryReader reader)
+        internal CharBlockArray(Stream reader)
         {
-            var blocksCount = reader.ReadInt32();
-            this.blocks = new List<Block>(blocksCount);
-            for (int i = 0; i < blocksCount; i++)
+            long serialVersion = reader.ReadInt64();
+
+            switch (serialVersion)
             {
-                blocks.Add(new Block(reader));
+                case serialVersionUID:
+                    var blocksCount = reader.ReadInt32();
+                    this.blocks = new List<Block>(blocksCount);
+                    for (int i = 0; i < blocksCount; i++)
+                    {
+                        blocks.Add(new Block(reader));
+                    }
+                    this.current = blocks[reader.ReadInt32()];
+                    this.blockSize = reader.ReadInt32();
+                    this.length = reader.ReadInt32();
+                    break;
+
+                // case 1L:
+                // LUCENENET TODO: When object fields change, increment 
serialVersionUID and move the above block here for legacy support...
+                default:
+                    throw new InvalidDataException($"Version {serialVersion} 
of {this.GetType().ToString()} deserialization is not supported.");
             }
-            this.current = blocks[reader.ReadInt32()];
-            this.blockSize = reader.ReadInt32();
-            this.length = reader.ReadInt32();
         }
 
         public static CharBlockArray Open(Stream @in)
         {
-            using (var writer = new BinaryReader(@in, new UTF8Encoding(false, 
true), true))
-            {
-                return new CharBlockArray(writer);
-            }
+            return new CharBlockArray(@in);
         }
     }
 }
\ No newline at end of file
diff --git 
a/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCharBlockArray.cs 
b/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCharBlockArray.cs
index 1249ff6..3041c2c 100644
--- a/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCharBlockArray.cs
+++ b/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCharBlockArray.cs
@@ -35,14 +35,25 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache
 
             byte[] buffer = new byte[50];
 
+            // This is essentially the equivalent of
+            // CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
+            //     .onUnmappableCharacter(CodingErrorAction.REPLACE)
+            //     .onMalformedInput(CodingErrorAction.REPLACE);
+            // 
+            // Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage, 
+            //     new EncoderReplacementFallback("?"), 
+            //     new DecoderReplacementFallback("?"));
+
             for (int i = 0; i < n; i++)
             {
                 Random().NextBytes(buffer);
                 int size = 1 + Random().Next(50);
                 // This test is turning random bytes into a string,
                 // this is asking for trouble.
-
-                string s = Encoding.UTF8.GetString(buffer, 0, size);
+                Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage,
+                    new EncoderReplacementFallback("?"),
+                    new DecoderReplacementFallback("?"));
+                string s = decoder.GetString(buffer, 0, size);
                 array.Append(s);
                 builder.Append(s);
             }
@@ -53,7 +64,10 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache
                 int size = 1 + Random().Next(50);
                 // This test is turning random bytes into a string,
                 // this is asking for trouble.
-                string s = Encoding.UTF8.GetString(buffer, 0, size);
+                Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage,
+                    new EncoderReplacementFallback("?"),
+                    new DecoderReplacementFallback("?"));
+                string s = decoder.GetString(buffer, 0, size);
                 array.Append(s);
                 builder.Append(s);
             }
@@ -64,7 +78,10 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache
                 int size = 1 + Random().Next(50);
                 // This test is turning random bytes into a string,
                 // this is asking for trouble.
-                string s = Encoding.UTF8.GetString(buffer, 0, size);
+                Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage,
+                    new EncoderReplacementFallback("?"),
+                    new DecoderReplacementFallback("?"));
+                string s = decoder.GetString(buffer, 0, size);
                 for (int j = 0; j < s.Length; j++)
                 {
                     array.Append(s[j]);
diff --git 
a/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCompactLabelToOrdinal.cs 
b/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCompactLabelToOrdinal.cs
index 852638b..c2cfeb0 100644
--- 
a/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCompactLabelToOrdinal.cs
+++ 
b/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCompactLabelToOrdinal.cs
@@ -42,6 +42,15 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache
             string[] uniqueValues = new string[numUniqueValues];
             byte[] buffer = new byte[50];
 
+            // This is essentially the equivalent of
+            // CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
+            //     .onUnmappableCharacter(CodingErrorAction.REPLACE)
+            //     .onMalformedInput(CodingErrorAction.REPLACE);
+            // 
+            // Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage, 
+            //     new EncoderReplacementFallback("?"), 
+            //     new DecoderReplacementFallback("?"));
+
             Random random = Random();
             for (int i = 0; i < numUniqueValues;)
             {
@@ -50,7 +59,10 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache
 
                 // This test is turning random bytes into a string,
                 // this is asking for trouble.
-                uniqueValues[i] = Encoding.UTF8.GetString(buffer, 0, size);
+                Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage,
+                    new EncoderReplacementFallback("?"),
+                    new DecoderReplacementFallback("?"));
+                uniqueValues[i] = decoder.GetString(buffer, 0, size);
                 // we cannot have empty path components, so eliminate all 
prefix as well
                 // as middle consecutive delimiter chars.
                 uniqueValues[i] = Regex.Replace(uniqueValues[i], "/+", "/");
diff --git a/src/Lucene.Net/Support/IO/StreamExtensions.cs 
b/src/Lucene.Net/Support/IO/StreamExtensions.cs
new file mode 100644
index 0000000..134245d
--- /dev/null
+++ b/src/Lucene.Net/Support/IO/StreamExtensions.cs
@@ -0,0 +1,102 @@
+using System.IO;
+
+namespace Lucene.Net.Support.IO
+{
+    /*
+        * Licensed to the Apache Software Foundation (ASF) under one or more
+        * contributor license agreements.  See the NOTICE file distributed with
+        * this work for additional information regarding copyright ownership.
+        * The ASF licenses this file to You under the Apache License, Version 
2.0
+        * (the "License"); you may not use this file except in compliance with
+        * the License.  You may obtain a copy of the License at
+        *
+        *     http://www.apache.org/licenses/LICENSE-2.0
+        *
+        * Unless required by applicable law or agreed to in writing, software
+        * distributed under the License is distributed on an "AS IS" BASIS,
+        * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 
implied.
+        * See the License for the specific language governing permissions and
+        * limitations under the License.
+        */
+
+    /// <summary>
+    /// Extension methods that make a <see cref="Stream"/> effectively into a
+    /// binary serializer with no encoding. We simply convert types into bytes
+    /// and write them without any concern whether surrogate pairs are 
respected,
+    /// similar to what BinaryFormatter does.
+    /// This makes it possible to serialize/deserialize raw character arrays
+    /// and get the data back in the same order without any exceptions warning
+    /// that the order is not valid and without the need for BinaryFormatter.
+    /// <para/>
+    /// Byte order is little-endian (same as <see cref="BinaryReader"/> and 
<see cref="BinaryWriter"/>).
+    /// </summary>
+    public static class StreamExtensions
+    {
+        public static void Write(this Stream stream, char[] chars)
+        {
+            byte[] newBytes = new byte[chars.Length * 2];
+            for (int index = 0; index < chars.Length; index++)
+            {
+                int newIndex = index == 0 ? index : index * 2;
+                newBytes[newIndex] = (byte)chars[index];
+                newBytes[newIndex + 1] = (byte)(chars[index] >> 8);
+            }
+            stream.Write(newBytes, 0, newBytes.Length);
+        }
+
+        public static char[] ReadChars(this Stream stream, int count)
+        {
+            byte[] buff = new byte[2];
+            char[] newChars = new char[count];
+            for (int i = 0; i < count; i++)
+            {
+                stream.Read(buff, 0, 2);
+                newChars[i] = (char)((buff[0] & 0xff) | ((buff[1] & 0xff) << 
8));
+            }
+            return newChars;
+        }
+
+        public static void Write(this Stream stream, int value)
+        {
+            byte[] buff = new byte[4];
+            buff[0] = (byte)(value);
+            buff[1] = (byte)(value >> 8);
+            buff[2] = (byte)(value >> 16);
+            buff[3] = (byte)(value >> 24);
+            stream.Write(buff, 0, buff.Length);
+        }
+
+        public static int ReadInt32(this Stream stream)
+        {
+            byte[] buff = new byte[4];
+            stream.Read(buff, 0, buff.Length);
+            return (buff[0] & 0xff) | ((buff[1] & 0xff) << 8) |
+                ((buff[2] & 0xff) << 16) | ((buff[3] & 0xff) << 24);
+        }
+
+        public static void Write(this Stream stream, long value)
+        {
+            byte[] buff = new byte[8];
+            buff[0] = (byte)value;
+            buff[1] = (byte)(value >> 8);
+            buff[2] = (byte)(value >> 16);
+            buff[3] = (byte)(value >> 24);
+            buff[4] = (byte)(value >> 32);
+            buff[5] = (byte)(value >> 40);
+            buff[6] = (byte)(value >> 48);
+            buff[7] = (byte)(value >> 56);
+            stream.Write(buff, 0, buff.Length);
+        }
+
+        public static long ReadInt64(this Stream stream)
+        {
+            byte[] buff = new byte[8];
+            stream.Read(buff, 0, buff.Length);
+            uint lo = (uint)(buff[0] | buff[1] << 8 |
+                             buff[2] << 16 | buff[3] << 24);
+            uint hi = (uint)(buff[4] | buff[5] << 8 |
+                             buff[6] << 16 | buff[7] << 24);
+            return (long)((ulong)hi) << 32 | lo;
+        }
+    }
+}

Reply via email to