This is an automated email from the ASF dual-hosted git repository. nightowl888 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/lucenenet.git
commit c4b9987c08806fbe535423617c63cc1c16cd5dbe Author: Shad Storhaug <[email protected]> AuthorDate: Mon Jul 8 04:58:24 2019 +0700 BUG: Intermittent failures of Lucene.Net.Facet.Taxonomy.WriterCache.TestCharBlockArray.TestArray(). The test was not set up to work with encoders that fall back to '?' for unmapped/invalid characters. Also, the BinaryReader/BinaryWriter was too strict with regard to validating surrogate pairs for this type of serialization, so implemented custom extension methods over Stream that do not use encoding. --- .../Taxonomy/WriterCache/CharBlockArray.cs | 91 +++++++++++------- .../Taxonomy/WriterCache/TestCharBlockArray.cs | 25 ++++- .../WriterCache/TestCompactLabelToOrdinal.cs | 14 ++- src/Lucene.Net/Support/IO/StreamExtensions.cs | 102 +++++++++++++++++++++ 4 files changed, 192 insertions(+), 40 deletions(-) diff --git a/src/Lucene.Net.Facet/Taxonomy/WriterCache/CharBlockArray.cs b/src/Lucene.Net.Facet/Taxonomy/WriterCache/CharBlockArray.cs index 151ea8a..98ae751 100644 --- a/src/Lucene.Net.Facet/Taxonomy/WriterCache/CharBlockArray.cs +++ b/src/Lucene.Net.Facet/Taxonomy/WriterCache/CharBlockArray.cs @@ -1,4 +1,5 @@ using Lucene.Net.Support; +using Lucene.Net.Support.IO; using System; using System.Collections.Generic; using System.IO; @@ -35,7 +36,7 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache // BinaryFormatter is not implemented in .NET Standard 1.x. 
internal class CharBlockArray : ICharSequence { - //private const long serialVersionUID = 1L; // LUCENENET: Not used + private const long serialVersionUID = 1L; private const int DEFAULT_BLOCK_SIZE = 32 * 1024; // 32 KB default size @@ -44,7 +45,7 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache : System.ICloneable #endif { - //internal const long serialVersionUID = 1L; // LUCENENET: Not used + internal const long serialVersionUID = 1L; internal readonly char[] chars; internal int length; @@ -64,8 +65,9 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache } // LUCENENET specific - public void Serialize(BinaryWriter writer) + public void Serialize(Stream writer) { + writer.Write(serialVersionUID); // Version of this object to use when deserializing writer.Write(chars.Length); writer.Write(chars); writer.Write(length); @@ -73,11 +75,23 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache // LUCENENET specific // Deserialization constructor - public Block(BinaryReader reader) + public Block(Stream reader) { - int charsLength = reader.ReadInt32(); - this.chars = reader.ReadChars(charsLength); - this.length = reader.ReadInt32(); + long serialVersion = reader.ReadInt64(); + + switch (serialVersion) + { + case serialVersionUID: + int charsLength = reader.ReadInt32(); + this.chars = reader.ReadChars(charsLength); + this.length = reader.ReadInt32(); + break; + + // case 1L: + // LUCENENET TODO: When object fields change, increment serialVersionUID and move the above block here for legacy support... 
+ default: + throw new InvalidDataException($"Version {serialVersion} of {this.GetType().ToString()} deserialization is not supported."); + } } } @@ -252,49 +266,56 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache internal virtual void Flush(Stream @out) { - using (var writer = new BinaryWriter(@out, new UTF8Encoding(false, true), true)) + @out.Write(serialVersionUID); // version of this object to use when deserializing + @out.Write(blocks.Count); + int currentIndex = 0; + for (int i = 0; i < blocks.Count; i++) { - writer.Write(blocks.Count); - int currentIndex = 0; - for (int i = 0; i < blocks.Count; i++) + var block = blocks[i]; + block.Serialize(@out); + if (block == current) { - var block = blocks[i]; - block.Serialize(writer); - if (block == current) - { - currentIndex = i; - } + currentIndex = i; } - // Write the index of the current block so we can - // set the reference when deserializing - writer.Write(currentIndex); - writer.Write(blockSize); - writer.Write(length); - writer.Flush(); } + // Write the index of the current block so we can + // set the reference when deserializing + @out.Write(currentIndex); + @out.Write(blockSize); + @out.Write(length); + @out.Flush(); } // LUCENENET specific // Deserialization constructor - internal CharBlockArray(BinaryReader reader) + internal CharBlockArray(Stream reader) { - var blocksCount = reader.ReadInt32(); - this.blocks = new List<Block>(blocksCount); - for (int i = 0; i < blocksCount; i++) + long serialVersion = reader.ReadInt64(); + + switch (serialVersion) { - blocks.Add(new Block(reader)); + case serialVersionUID: + var blocksCount = reader.ReadInt32(); + this.blocks = new List<Block>(blocksCount); + for (int i = 0; i < blocksCount; i++) + { + blocks.Add(new Block(reader)); + } + this.current = blocks[reader.ReadInt32()]; + this.blockSize = reader.ReadInt32(); + this.length = reader.ReadInt32(); + break; + + // case 1L: + // LUCENENET TODO: When object fields change, increment serialVersionUID and move the 
above block here for legacy support... + default: + throw new InvalidDataException($"Version {serialVersion} of {this.GetType().ToString()} deserialization is not supported."); } - this.current = blocks[reader.ReadInt32()]; - this.blockSize = reader.ReadInt32(); - this.length = reader.ReadInt32(); } public static CharBlockArray Open(Stream @in) { - using (var writer = new BinaryReader(@in, new UTF8Encoding(false, true), true)) - { - return new CharBlockArray(writer); - } + return new CharBlockArray(@in); } } } \ No newline at end of file diff --git a/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCharBlockArray.cs b/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCharBlockArray.cs index 1249ff6..3041c2c 100644 --- a/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCharBlockArray.cs +++ b/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCharBlockArray.cs @@ -35,14 +35,25 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache byte[] buffer = new byte[50]; + // This is essentially the equivalent of + // CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder() + // .onUnmappableCharacter(CodingErrorAction.REPLACE) + // .onMalformedInput(CodingErrorAction.REPLACE); + // + // Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage, + // new EncoderReplacementFallback("?"), + // new DecoderReplacementFallback("?")); + for (int i = 0; i < n; i++) { Random().NextBytes(buffer); int size = 1 + Random().Next(50); // This test is turning random bytes into a string, // this is asking for trouble. 
- - string s = Encoding.UTF8.GetString(buffer, 0, size); + Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage, + new EncoderReplacementFallback("?"), + new DecoderReplacementFallback("?")); + string s = decoder.GetString(buffer, 0, size); array.Append(s); builder.Append(s); } @@ -53,7 +64,10 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache int size = 1 + Random().Next(50); // This test is turning random bytes into a string, // this is asking for trouble. - string s = Encoding.UTF8.GetString(buffer, 0, size); + Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage, + new EncoderReplacementFallback("?"), + new DecoderReplacementFallback("?")); + string s = decoder.GetString(buffer, 0, size); array.Append(s); builder.Append(s); } @@ -64,7 +78,10 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache int size = 1 + Random().Next(50); // This test is turning random bytes into a string, // this is asking for trouble. - string s = Encoding.UTF8.GetString(buffer, 0, size); + Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage, + new EncoderReplacementFallback("?"), + new DecoderReplacementFallback("?")); + string s = decoder.GetString(buffer, 0, size); for (int j = 0; j < s.Length; j++) { array.Append(s[j]); diff --git a/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCompactLabelToOrdinal.cs b/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCompactLabelToOrdinal.cs index 852638b..c2cfeb0 100644 --- a/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCompactLabelToOrdinal.cs +++ b/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCompactLabelToOrdinal.cs @@ -42,6 +42,15 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache string[] uniqueValues = new string[numUniqueValues]; byte[] buffer = new byte[50]; + // This is essentially the equivalent of + // CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder() + // .onUnmappableCharacter(CodingErrorAction.REPLACE) + // .onMalformedInput(CodingErrorAction.REPLACE); + // + // Encoding 
decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage, + // new EncoderReplacementFallback("?"), + // new DecoderReplacementFallback("?")); + Random random = Random(); for (int i = 0; i < numUniqueValues;) { @@ -50,7 +59,10 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache // This test is turning random bytes into a string, // this is asking for trouble. - uniqueValues[i] = Encoding.UTF8.GetString(buffer, 0, size); + Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage, + new EncoderReplacementFallback("?"), + new DecoderReplacementFallback("?")); + uniqueValues[i] = decoder.GetString(buffer, 0, size); // we cannot have empty path components, so eliminate all prefix as well // as middle consecutive delimiter chars. uniqueValues[i] = Regex.Replace(uniqueValues[i], "/+", "/"); diff --git a/src/Lucene.Net/Support/IO/StreamExtensions.cs b/src/Lucene.Net/Support/IO/StreamExtensions.cs new file mode 100644 index 0000000..134245d --- /dev/null +++ b/src/Lucene.Net/Support/IO/StreamExtensions.cs @@ -0,0 +1,102 @@ +using System.IO; + +namespace Lucene.Net.Support.IO +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + /// <summary> + /// Extension methods that make a <see cref="Stream"/> effectively into a + /// binary serializer with no encoding. We simply convert types into bytes + /// and write them without any concern whether surrogate pairs are respected, + /// similar to what BinaryFormatter does. + /// This makes it possible to serialize/deserialize raw character arrays + /// and get the data back in the same order without any exceptions warning + /// that the order is not valid and without the need for BinaryFormatter. + /// <para/> + /// Byte order is little-endian (same as <see cref="BinaryReader"/> and <see cref="BinaryWriter"/>). + /// </summary> + public static class StreamExtensions + { + public static void Write(this Stream stream, char[] chars) + { + byte[] newBytes = new byte[chars.Length * 2]; + for (int index = 0; index < chars.Length; index++) + { + int newIndex = index == 0 ? index : index * 2; + newBytes[newIndex] = (byte)chars[index]; + newBytes[newIndex + 1] = (byte)(chars[index] >> 8); + } + stream.Write(newBytes, 0, newBytes.Length); + } + + public static char[] ReadChars(this Stream stream, int count) + { + byte[] buff = new byte[2]; + char[] newChars = new char[count]; + for (int i = 0; i < count; i++) + { + stream.Read(buff, 0, 2); + newChars[i] = (char)((buff[0] & 0xff) | ((buff[1] & 0xff) << 8)); + } + return newChars; + } + + public static void Write(this Stream stream, int value) + { + byte[] buff = new byte[4]; + buff[0] = (byte)(value); + buff[1] = (byte)(value >> 8); + buff[2] = (byte)(value >> 16); + buff[3] = (byte)(value >> 24); + stream.Write(buff, 0, buff.Length); + } + + public static int ReadInt32(this Stream stream) + { + byte[] buff = new byte[4]; + stream.Read(buff, 0, buff.Length); + return (buff[0] & 0xff) | ((buff[1] & 0xff) << 8) | + ((buff[2] & 0xff) << 16) | ((buff[3] & 0xff) << 24); + } + + public static void Write(this Stream stream, long value) + { + byte[] buff = new byte[8]; + buff[0] = (byte)value; + buff[1] = 
(byte)(value >> 8); + buff[2] = (byte)(value >> 16); + buff[3] = (byte)(value >> 24); + buff[4] = (byte)(value >> 32); + buff[5] = (byte)(value >> 40); + buff[6] = (byte)(value >> 48); + buff[7] = (byte)(value >> 56); + stream.Write(buff, 0, buff.Length); + } + + public static long ReadInt64(this Stream stream) + { + byte[] buff = new byte[8]; + stream.Read(buff, 0, buff.Length); + uint lo = (uint)(buff[0] | buff[1] << 8 | + buff[2] << 16 | buff[3] << 24); + uint hi = (uint)(buff[4] | buff[5] << 8 | + buff[6] << 16 | buff[7] << 24); + return (long)((ulong)hi) << 32 | lo; + } + } +}
