(lucenenet) branch master updated: Use DecoderFallback.ExceptionFallback to match Java's CodingErrorAction.REPORT, #1076 (#1089)

paulirwin Sat, 11 Jan 2025 20:16:41 -0800

This is an automated email from the ASF dual-hosted git repository.

paulirwin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git



The following commit(s) were added to refs/heads/master by this push:
     new 4bf492c3b Use DecoderFallback.ExceptionFallback to match Java's CodingErrorAction.REPORT, #1076 (#1089)
4bf492c3b is described below

commit 4bf492c3b55fd94e9bd00822f6980c34b8c794b2
Author: Paul Irwin <[email protected]>
AuthorDate: Sat Jan 11 21:16:27 2025 -0700

    Use DecoderFallback.ExceptionFallback to match Java's CodingErrorAction.REPORT, #1076 (#1089)
    
    * Use DecoderFallback.ExceptionFallback to match Java behavior, #1076
    
    * Add unit test for WithDecoderExceptionFallback
    
    * Fix unit test namespace and doc comment
    
    * Lucene.Net.Support.Buffers: Added ArrayPoolExtensions class to simplify returning arrays that might be null
    
    * Lucene.Net.Index.Term::ToString(): Optimized writing UTF8 string on target frameworks that support System.Text.Unicode.Utf8. Added tests to verify fallback is working.
    
    * Cache decoder fallback encoding lookup, #1076
    
    * Treat Encoder/DecoderFallbackExceptions as IOExceptions to match Java, #1076
    
    * Fix translation of replacement fallback test code, IOException/RuntimeException tests
    
    * Use Encoding.Default instead of GetEncoding(0), #1076
    
    * Cache GB2312 encoding lookup, #1076
    
    * Replace StandardCharsets.UTF_8 with Encoding.UTF8 in two tests, #1076
    
    * Fix test extension method for detecting IllegalArgumentException, #1076
    
    * Cascade call from IsIllegalArgumentException
    
    ---------
    
    Co-authored-by: Shad Storhaug <[email protected]>
---
 .gitignore                                         |  3 +-
 Directory.Build.targets                            |  7 +++
 .../Analysis/Hunspell/Dictionary.cs                |  2 +-
 .../Analysis/Hunspell/ISO8859_14Decoder.cs         |  7 ++-
 .../Analysis/Synonym/FSTSynonymFilterFactory.cs    |  5 +-
 .../Analysis/Util/AbstractAnalysisFactory.cs       |  4 +-
 .../JapaneseTokenizerFactory.cs                    |  3 +-
 .../Tools/ConnectionCostsBuilder.cs                |  4 +-
 .../Tools/TokenInfoDictionaryBuilder.cs            |  3 +-
 .../Tools/UnknownDictionaryBuilder.cs              |  3 +-
 .../Hhmm/AbstractDictionary.cs                     | 11 ++--
 .../Hhmm/BigramDictionary.cs                       |  2 +-
 .../Hhmm/WordDictionary.cs                         |  2 +-
 .../ByTask/Feeds/ContentItemsSource.cs             |  6 +--
 .../ByTask/Tasks/CreateIndexTask.cs                |  2 +-
 .../Quality/Trec/QueryDriver.cs                    |  9 ++--
 .../ExceptionHandling/ExceptionExtensions.cs       | 10 ++--
 src/Lucene.Net.TestFramework/Util/LineFileDocs.cs  |  6 ++-
 .../ExceptionHandling/ExceptionScanningTestCase.cs |  9 ++--
 .../ExceptionHandling/TestExceptionExtensions.cs   | 27 +++++-----
 .../ByTask/Tasks/CreateIndexTaskTest.cs            |  4 +-
 src/Lucene.Net.Tests.Demo/TestDemo.cs              |  6 +--
 .../Taxonomy/WriterCache/TestCharBlockArray.cs     | 22 ++------
 .../WriterCache/TestCompactLabelToOrdinal.cs       | 18 ++-----
 src/Lucene.Net.Tests/Index/TestTerm.cs             | 60 +++++++++++++++++++++-
 src/Lucene.Net.Tests/Support/TestApiConsistency.cs |  2 +-
 .../Text/TestEncodingExtensions.cs}                | 35 ++++++-------
 src/Lucene.Net/Index/Term.cs                       | 52 +++++++++++++++++--
 .../Support/Buffers/ArrayPoolExtensions.cs         | 43 ++++++++++++++++
 .../ExceptionHandling/ExceptionExtensions.cs       | 16 ++++--
 src/Lucene.Net/Support/Text/EncodingExtensions.cs  | 58 +++++++++++++++++++++
 src/Lucene.Net/Util/IOUtils.cs                     |  4 +-
 32 files changed, 327 insertions(+), 118 deletions(-)

diff --git a/.gitignore b/.gitignore
index 7446dd01a..6f8520af1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -65,4 +65,5 @@ websites/apidocs/api/**/*.manifest
 svn-*/
 
 # vscode files
-.vscode/
\ No newline at end of file
+.vscode/
+.idea/**/misc.xml
diff --git a/Directory.Build.targets b/Directory.Build.targets
index fd71ab055..1dc7daa03 100644
--- a/Directory.Build.targets
+++ b/Directory.Build.targets
@@ -37,6 +37,13 @@
 
   </PropertyGroup>
 
+  <!-- Features in .NET 8.x and .NET 9.x only -->
+  <PropertyGroup Condition=" $(TargetFramework.StartsWith('net8.')) Or 
$(TargetFramework.StartsWith('net9.')) ">
+
+    <DefineConstants>$(DefineConstants);FEATURE_UTF8_TOUTF16</DefineConstants>
+
+  </PropertyGroup>
+
   <!-- Features in .NET 6.x, .NET 7.x, .NET 8.x, and .NET 9.x only -->
   <PropertyGroup Condition=" $(TargetFramework.StartsWith('net6.')) Or 
$(TargetFramework.StartsWith('net7.')) Or 
$(TargetFramework.StartsWith('net8.')) Or 
$(TargetFramework.StartsWith('net9.')) ">
 
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/Dictionary.cs 
b/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/Dictionary.cs
index ceeb7eb53..b790df5e6 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/Dictionary.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/Dictionary.cs
@@ -746,7 +746,7 @@ namespace Lucene.Net.Analysis.Hunspell
             }
             if ("ISO8859-14".Equals(encoding, 
StringComparison.OrdinalIgnoreCase))
             {
-                return new ISO8859_14Encoding();
+                return ISO8859_14Encoding.Default;
             }
             // .NET doesn't recognize the encoding without a dash between ISO 
and the number
             // 
https://msdn.microsoft.com/en-us/library/system.text.encodinginfo.getencoding(v=vs.110).aspx
diff --git 
a/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/ISO8859_14Decoder.cs 
b/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/ISO8859_14Decoder.cs
index 607895404..7b7eb59c1 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/ISO8859_14Decoder.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/ISO8859_14Decoder.cs
@@ -28,6 +28,11 @@ namespace Lucene.Net.Analysis.Hunspell
     [ExceptionToClassNameConvention]
     internal sealed class ISO8859_14Encoding : Encoding
     {
+        /// <summary>
+        /// The default singleton instance of the <see 
cref="ISO8859_14Encoding"/> class.
+        /// </summary>
+        public static new ISO8859_14Encoding Default { get; } = new 
ISO8859_14Encoding();
+
         private static readonly Decoder decoder = new ISO8859_14Decoder();
         public override Decoder GetDecoder()
         {
@@ -119,4 +124,4 @@ namespace Lucene.Net.Analysis.Hunspell
             return writeCount;
         }
     }
-}
\ No newline at end of file
+}
diff --git 
a/src/Lucene.Net.Analysis.Common/Analysis/Synonym/FSTSynonymFilterFactory.cs 
b/src/Lucene.Net.Analysis.Common/Analysis/Synonym/FSTSynonymFilterFactory.cs
index 41303cb1b..c862dce16 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Synonym/FSTSynonymFilterFactory.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Synonym/FSTSynonymFilterFactory.cs
@@ -1,6 +1,7 @@
 // Lucene version compatibility level 4.8.1
 using Lucene.Net.Analysis.Core;
 using Lucene.Net.Analysis.Util;
+using Lucene.Net.Support.Text;
 using Lucene.Net.Util;
 using System;
 using System.Collections.Generic;
@@ -117,7 +118,7 @@ namespace Lucene.Net.Analysis.Synonym
         /// </summary>
         private SynonymMap LoadSynonyms(IResourceLoader loader, string cname, 
bool dedup, Analyzer analyzer)
         {
-            Encoding decoder = Encoding.UTF8;
+            Encoding decoder = Encoding.UTF8.WithDecoderExceptionFallback();
 
             SynonymMap.Parser parser;
             Type clazz = loader.FindType(cname /*, typeof(SynonymMap.Parser) 
*/);
@@ -165,4 +166,4 @@ namespace Lucene.Net.Analysis.Synonym
             }
         }
     }
-}
\ No newline at end of file
+}
diff --git 
a/src/Lucene.Net.Analysis.Common/Analysis/Util/AbstractAnalysisFactory.cs 
b/src/Lucene.Net.Analysis.Common/Analysis/Util/AbstractAnalysisFactory.cs
index 9839b027d..874c020fd 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Util/AbstractAnalysisFactory.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Util/AbstractAnalysisFactory.cs
@@ -1,6 +1,7 @@
 // Lucene version compatibility level 4.8.1
 using Lucene.Net.Analysis.Core;
 using Lucene.Net.Support;
+using Lucene.Net.Support.Text;
 using Lucene.Net.Util;
 using System;
 using System.Collections.Generic;
@@ -385,8 +386,9 @@ namespace Lucene.Net.Analysis.Util
                 words = new CharArraySet(m_luceneMatchVersion, files.Count * 
10, ignoreCase);
                 foreach (string file in files)
                 {
+                    Encoding decoder = 
Encoding.UTF8.WithDecoderExceptionFallback();
                     using (Stream stream = loader.OpenResource(file.Trim()))
-                    using (TextReader reader = new StreamReader(stream, 
Encoding.UTF8))
+                    using (TextReader reader = new StreamReader(stream, 
decoder))
                     {
                         WordlistLoader.GetSnowballWordSet(reader, words);
                     }
diff --git a/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizerFactory.cs 
b/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizerFactory.cs
index 5595375e4..6e6d402d1 100644
--- a/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizerFactory.cs
+++ b/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizerFactory.cs
@@ -1,5 +1,6 @@
 using Lucene.Net.Analysis.Ja.Dict;
 using Lucene.Net.Analysis.Util;
+using Lucene.Net.Support.Text;
 using Lucene.Net.Util;
 using System;
 using System.Collections.Generic;
@@ -88,7 +89,7 @@ namespace Lucene.Net.Analysis.Ja
                 {
                     encoding = Encoding.UTF8.WebName;
                 }
-                Encoding decoder = Encoding.GetEncoding(encoding);
+                Encoding decoder = 
Encoding.GetEncoding(encoding).WithDecoderExceptionFallback();
                 TextReader reader = new StreamReader(stream, decoder);
                 userDictionary = new UserDictionary(reader);
             }
diff --git a/src/Lucene.Net.Analysis.Kuromoji/Tools/ConnectionCostsBuilder.cs 
b/src/Lucene.Net.Analysis.Kuromoji/Tools/ConnectionCostsBuilder.cs
index 5d5f1d4c7..ef8b482f1 100644
--- a/src/Lucene.Net.Analysis.Kuromoji/Tools/ConnectionCostsBuilder.cs
+++ b/src/Lucene.Net.Analysis.Kuromoji/Tools/ConnectionCostsBuilder.cs
@@ -1,5 +1,6 @@
 using J2N.Text;
 using Lucene.Net.Diagnostics;
+using Lucene.Net.Support.Text;
 using System.Globalization;
 using System.IO;
 using System.Text;
@@ -31,7 +32,8 @@ namespace Lucene.Net.Analysis.Ja.Util
         public static ConnectionCostsWriter Build(string filename)
         {
             using Stream inputStream = new FileStream(filename, FileMode.Open, 
FileAccess.Read);
-            using StreamReader streamReader = new StreamReader(inputStream, 
Encoding.ASCII, detectEncodingFromByteOrderMarks: true, bufferSize: 1024, 
leaveOpen: true); // LUCENENET: CA2000: Use using statement
+            Encoding decoder = Encoding.ASCII.WithDecoderExceptionFallback();
+            using StreamReader streamReader = new StreamReader(inputStream, 
decoder, detectEncodingFromByteOrderMarks: true, bufferSize: 1024, leaveOpen: 
true); // LUCENENET: CA2000: Use using statement
 
             string line = streamReader.ReadLine();
             string[] dimensions = whiteSpaceRegex.Split(line).TrimEnd();
diff --git 
a/src/Lucene.Net.Analysis.Kuromoji/Tools/TokenInfoDictionaryBuilder.cs 
b/src/Lucene.Net.Analysis.Kuromoji/Tools/TokenInfoDictionaryBuilder.cs
index 737182961..7e0df4ad7 100644
--- a/src/Lucene.Net.Analysis.Kuromoji/Tools/TokenInfoDictionaryBuilder.cs
+++ b/src/Lucene.Net.Analysis.Kuromoji/Tools/TokenInfoDictionaryBuilder.cs
@@ -1,5 +1,6 @@
 using J2N.Text;
 using Lucene.Net.Support;
+using Lucene.Net.Support.Text;
 using Lucene.Net.Util;
 using Lucene.Net.Util.Fst;
 using Lucene.Net.Util.Packed;
@@ -71,7 +72,7 @@ namespace Lucene.Net.Analysis.Ja.Util
             foreach (string file in csvFiles)
             {
                 using Stream inputStream = new FileStream(file, FileMode.Open, 
FileAccess.Read);
-                Encoding decoder = Encoding.GetEncoding(encoding);
+                Encoding decoder = 
Encoding.GetEncoding(encoding).WithDecoderExceptionFallback();
                 using TextReader reader = new StreamReader(inputStream, 
decoder, detectEncodingFromByteOrderMarks: true, bufferSize: 1024, leaveOpen: 
true); // LUCENENET: CA2000: Use using statement
 
                 string line = null;
diff --git a/src/Lucene.Net.Analysis.Kuromoji/Tools/UnknownDictionaryBuilder.cs 
b/src/Lucene.Net.Analysis.Kuromoji/Tools/UnknownDictionaryBuilder.cs
index 3fde184c0..b2023e462 100644
--- a/src/Lucene.Net.Analysis.Kuromoji/Tools/UnknownDictionaryBuilder.cs
+++ b/src/Lucene.Net.Analysis.Kuromoji/Tools/UnknownDictionaryBuilder.cs
@@ -1,5 +1,6 @@
 using J2N.Text;
 using Lucene.Net.Analysis.Ja.Dict;
+using Lucene.Net.Support.Text;
 using System;
 using System.Collections.Generic;
 using System.Globalization;
@@ -55,7 +56,7 @@ namespace Lucene.Net.Analysis.Ja.Util
             UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 
* 1024 * 1024);
 
             JCG.List<string[]> lines = new JCG.List<string[]>();
-            Encoding decoder = Encoding.GetEncoding(encoding);
+            Encoding decoder = 
Encoding.GetEncoding(encoding).WithDecoderExceptionFallback();
             using (Stream inputStream = new FileStream(filename, 
FileMode.Open, FileAccess.Read))
             using (TextReader reader = new StreamReader(inputStream, decoder))
             {
diff --git a/src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs 
b/src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs
index 5e2139018..1d5da6d3a 100644
--- a/src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs
+++ b/src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs
@@ -32,6 +32,9 @@ namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
     /// </summary>
     internal abstract class AbstractDictionary
     {
+        // LUCENENET specific: cached GB2312 encoding to avoid repeated calls 
to Encoding.GetEncoding("GB2312")
+        protected static readonly Encoding gb2312Encoding = 
Encoding.GetEncoding("GB2312");
+
         /// <summary>
         /// First Chinese Character in GB2312 (15 * 94)
         /// Characters in GB2312 are arranged in a grid of 94 * 94, 0-14 are 
unassigned or punctuation.
@@ -39,7 +42,7 @@ namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
         public const int GB2312_FIRST_CHAR = 1410;
 
         /// <summary>
-        /// Last Chinese Character in GB2312 (87 * 94). 
+        /// Last Chinese Character in GB2312 (87 * 94).
         /// Characters in GB2312 are arranged in a grid of 94 * 94, 88-94 are 
unassigned.
         /// </summary>
         public const int GB2312_CHAR_NUM = 87 * 94;
@@ -98,7 +101,7 @@ namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
             try
             {
                 //String cchar = new String(buffer, "GB2312");
-                string cchar = 
Encoding.GetEncoding("GB2312").GetString(buffer);
+                string cchar = gb2312Encoding.GetString(buffer); // LUCENENET 
specific: use cached encoding instance
                 return cchar;
             }
             catch (Exception e) when (e.IsUnsupportedEncodingException()) // 
Encoding is not supported by the platform
@@ -117,7 +120,7 @@ namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
             try
             {
                 //byte[] buffer = Character.ToString(ch).getBytes("GB2312");
-                byte[] buffer = 
Encoding.GetEncoding("GB2312").GetBytes(ch.ToString());
+                byte[] buffer = gb2312Encoding.GetBytes(ch.ToString()); // 
LUCENENET specific: use cached encoding instance
                 //byte[] buffer = 
Encoding.GetEncoding("hz-gb-2312").GetBytes(ch.ToString());
                 if (buffer.Length != 2)
                 {
@@ -125,7 +128,7 @@ namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
                     return -1;
                 }
                 int b0 = (buffer[0] & 0x0FF) - 161; // Code starts from A1, 
therefore subtract 0xA1=161
-                int b1 = (buffer[1] & 0x0FF) - 161; // There is no Chinese 
char for the first and last symbol. 
+                int b1 = (buffer[1] & 0x0FF) - 161; // There is no Chinese 
char for the first and last symbol.
                                                     // Therefore, each code 
page only has 16*6-2=94 characters.
                 return (short)(b0 * 94 + b1);
             }
diff --git a/src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs 
b/src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs
index b9d16273a..da712cb0d 100644
--- a/src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs
+++ b/src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs
@@ -302,7 +302,7 @@ namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
                         byte[] lchBuffer = new byte[length];
                         dctFile.Read(lchBuffer, 0, lchBuffer.Length);
                         //tmpword = new String(lchBuffer, "GB2312");
-                        tmpword = 
Encoding.GetEncoding("GB2312").GetString(lchBuffer);
+                        tmpword = gb2312Encoding.GetString(lchBuffer); // 
LUCENENET specific: use cached encoding instance from base class
                         //tmpword = 
Encoding.GetEncoding("hz-gb-2312").GetString(lchBuffer);
                         if (i != 3755 + GB2312_FIRST_CHAR)
                         {
diff --git a/src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs 
b/src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs
index b8cd7cbbf..b6e42be52 100644
--- a/src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs
+++ b/src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs
@@ -395,7 +395,7 @@ namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
                         {
                             byte[] lchBuffer = new byte[length];
                             dctFile.Read(lchBuffer, 0, lchBuffer.Length);
-                            tmpword = 
Encoding.GetEncoding("GB2312").GetString(lchBuffer);
+                            tmpword = gb2312Encoding.GetString(lchBuffer); // 
LUCENENET specific: use cached encoding instance from base class
                             wordItem_charArrayTable[i][j] = 
tmpword.ToCharArray();
                         }
                         else
diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/ContentItemsSource.cs 
b/src/Lucene.Net.Benchmark/ByTask/Feeds/ContentItemsSource.cs
index 7932f749e..4b50076cd 100644
--- a/src/Lucene.Net.Benchmark/ByTask/Feeds/ContentItemsSource.cs
+++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/ContentItemsSource.cs
@@ -32,7 +32,7 @@ namespace Lucene.Net.Benchmarks.ByTask.Feeds
     /// Base class for source of data for benchmarking.
     /// </summary>
     /// <remarks>
-    /// Keeps track of various statistics, such as how many data items were 
generated, 
+    /// Keeps track of various statistics, such as how many data items were 
generated,
     /// size in bytes etc.
     /// <para/>
     /// Supports the following configuration parameters:
@@ -40,7 +40,7 @@ namespace Lucene.Net.Benchmarks.ByTask.Feeds
     ///     <item><term>content.source.forever</term><description>specifies 
whether to generate items forever (<b>default=true</b>).</description></item>
     ///     <item><term>content.source.verbose</term><description>specifies 
whether messages should be output by the content source 
(<b>default=false</b>).</description></item>
     ///     <item><term>content.source.encoding</term><description>
-    ///         specifies which encoding to use when 
+    ///         specifies which encoding to use when
     ///         reading the files of that content source. Certain 
implementations may define
     ///         a default value if this parameter is not specified. 
(<b>default=null</b>).
     ///     </description></item>
@@ -199,7 +199,7 @@ namespace Lucene.Net.Benchmarks.ByTask.Feeds
             }
             else
             {
-                m_encoding = Encoding.GetEncoding(0); // Default system 
encoding
+                m_encoding = Encoding.Default; // Default system encoding
             }
         }
 
diff --git a/src/Lucene.Net.Benchmark/ByTask/Tasks/CreateIndexTask.cs 
b/src/Lucene.Net.Benchmark/ByTask/Tasks/CreateIndexTask.cs
index 6fd4cba20..1e8ed8336 100644
--- a/src/Lucene.Net.Benchmark/ByTask/Tasks/CreateIndexTask.cs
+++ b/src/Lucene.Net.Benchmark/ByTask/Tasks/CreateIndexTask.cs
@@ -215,7 +215,7 @@ namespace Lucene.Net.Benchmarks.ByTask.Tasks
                 else
                 {
                     FileInfo f = new FileInfo(infoStreamVal);
-                    iwc.SetInfoStream(new StreamWriter(new 
FileStream(f.FullName, FileMode.Create, FileAccess.Write), 
Encoding.GetEncoding(0)));
+                    iwc.SetInfoStream(new StreamWriter(new 
FileStream(f.FullName, FileMode.Create, FileAccess.Write), Encoding.Default));
                 }
             }
             IndexWriter writer = new IndexWriter(runData.Directory, iwc);
diff --git a/src/Lucene.Net.Benchmark/Quality/Trec/QueryDriver.cs 
b/src/Lucene.Net.Benchmark/Quality/Trec/QueryDriver.cs
index d08dab75a..a223653dc 100644
--- a/src/Lucene.Net.Benchmark/Quality/Trec/QueryDriver.cs
+++ b/src/Lucene.Net.Benchmark/Quality/Trec/QueryDriver.cs
@@ -77,10 +77,13 @@ namespace Lucene.Net.Benchmarks.Quality.Trec
             string fieldSpec = args.Length == 5 ? args[4] : "T"; // default to 
Title-only if not specified.
             IndexSearcher searcher = new IndexSearcher(reader);
 
-            int maxResults = 1000;
-            string docNameField = "docname";
+            const int maxResults = 1000;
+            const string docNameField = "docname";
 
-            TextWriter logger = Console.Out; //new StreamWriter(Console, 
Encoding.GetEncoding(0));
+            using TextWriter logger = new 
StreamWriter(System.Console.OpenStandardOutput(), Encoding.Default)
+            {
+                AutoFlush = true,
+            };
 
             // use trec utilities to read trec topics into quality queries
             TrecTopicsReader qReader = new TrecTopicsReader();
diff --git 
a/src/Lucene.Net.TestFramework/Support/ExceptionHandling/ExceptionExtensions.cs 
b/src/Lucene.Net.TestFramework/Support/ExceptionHandling/ExceptionExtensions.cs
index f2d39de48..424869718 100644
--- 
a/src/Lucene.Net.TestFramework/Support/ExceptionHandling/ExceptionExtensions.cs
+++ 
b/src/Lucene.Net.TestFramework/Support/ExceptionHandling/ExceptionExtensions.cs
@@ -1,5 +1,6 @@
 using System;
 using System.Runtime.CompilerServices;
+using System.Text;
 
 namespace Lucene.Net
 {
@@ -53,12 +54,9 @@ namespace Lucene.Net
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static bool IsIllegalArgumentException(this Exception e)
         {
-            // If our exception implements IError and subclasses 
ArgumentException, we will ignore it.
-            if (e is null || e.IsError() || e.IsAlwaysIgnored()) return false;
-
-            return e is ArgumentException &&
-                e is not ArgumentNullException &&     // Corresponds to 
NullPointerException, so we don't catch it here.
-                e is not ArgumentOutOfRangeException; // Corresponds to 
IndexOutOfBoundsException (and subclasses), so we don't catch it here.
+            return Lucene.ExceptionExtensions.IsIllegalArgumentException(e)
+                && e is not ArgumentNullException // Corresponds to 
NullPointerException, so we don't catch it here.
+                and not ArgumentOutOfRangeException; // Corresponds to 
IndexOutOfBoundsException (and subclasses), so we don't catch it here.
         }
     }
 }
diff --git a/src/Lucene.Net.TestFramework/Util/LineFileDocs.cs 
b/src/Lucene.Net.TestFramework/Util/LineFileDocs.cs
index 991cdb628..168d0e746 100644
--- a/src/Lucene.Net.TestFramework/Util/LineFileDocs.cs
+++ b/src/Lucene.Net.TestFramework/Util/LineFileDocs.cs
@@ -2,6 +2,7 @@
 using J2N.Threading.Atomic;
 using Lucene.Net.Documents;
 using Lucene.Net.Support.IO;
+using Lucene.Net.Support.Text;
 using Lucene.Net.Support.Threading;
 using RandomizedTesting.Generators;
 using System;
@@ -236,7 +237,8 @@ namespace Lucene.Net.Util
                     } while (b >= 0 && b != 13 && b != 10);
                 }
 
-                reader = new StreamReader(@is, Encoding.UTF8, 
detectEncodingFromByteOrderMarks: false, bufferSize: BUFFER_SIZE);
+                Encoding decoder = 
Encoding.UTF8.WithDecoderExceptionFallback();
+                reader = new StreamReader(@is, decoder, 
detectEncodingFromByteOrderMarks: false, bufferSize: BUFFER_SIZE);
 
                 if (seekTo > 0L)
                 {
@@ -399,4 +401,4 @@ namespace Lucene.Net.Util
             return result;
         }
     }
-}
\ No newline at end of file
+}
diff --git 
a/src/Lucene.Net.Tests.AllProjects/Support/ExceptionHandling/ExceptionScanningTestCase.cs
 
b/src/Lucene.Net.Tests.AllProjects/Support/ExceptionHandling/ExceptionScanningTestCase.cs
index 8fa941f42..8cd3942d6 100644
--- 
a/src/Lucene.Net.Tests.AllProjects/Support/ExceptionHandling/ExceptionScanningTestCase.cs
+++ 
b/src/Lucene.Net.Tests.AllProjects/Support/ExceptionHandling/ExceptionScanningTestCase.cs
@@ -13,6 +13,7 @@ using System.Linq;
 using System.Reflection;
 using System.Resources;
 using System.Security;
+using System.Text;
 using Assert = Lucene.Net.TestFramework.Assert;
 
 namespace Lucene.Net.Support.ExceptionHandling
@@ -184,6 +185,8 @@ namespace Lucene.Net.Support.ExceptionHandling
                 typeof(UnauthorizedAccessException),
                 typeof(ObjectDisposedException),
                 typeof(Lucene.AlreadyClosedException),
+                typeof(EncoderFallbackException), // In Java, 
CharacterCodingException subclasses IOException
+                typeof(DecoderFallbackException),
             }.Union(AllIOExceptionTypes)
             // .NET Framework only - Subclasses UnauthorizedAccessException
             .Union(new[] { PrivilegeNotHeldExceptionType });
@@ -221,8 +224,6 @@ namespace Lucene.Net.Support.ExceptionHandling
             // Subclasses
             typeof(System.DuplicateWaitObjectException),
             typeof(System.Globalization.CultureNotFoundException),
-            typeof(System.Text.DecoderFallbackException),
-            typeof(System.Text.EncoderFallbackException),
         };
 
         public static readonly IEnumerable<Type> 
KnownIllegalArgumentExceptionTypes_TestEnvironment = new Type[] {
@@ -234,8 +235,6 @@ namespace Lucene.Net.Support.ExceptionHandling
             // Subclasses
             typeof(System.DuplicateWaitObjectException),
             typeof(System.Globalization.CultureNotFoundException),
-            typeof(System.Text.DecoderFallbackException),
-            typeof(System.Text.EncoderFallbackException),
         };
 
         public static readonly IEnumerable<Type> KnownRuntimeExceptionTypes = 
LoadKnownRuntimeExceptionTypes();
@@ -367,8 +366,6 @@ namespace Lucene.Net.Support.ExceptionHandling
                 typeof(System.Runtime.Serialization.SerializationException),
                 typeof(System.Security.Cryptography.CryptographicException),
                 typeof(System.Security.VerificationException),
-                typeof(System.Text.DecoderFallbackException), // LUCENENET 
TODO: Need to be sure about this one
-                typeof(System.Text.EncoderFallbackException), // LUCENENET 
TODO: Need to be sure about this one
                 typeof(System.Threading.AbandonedMutexException),
                 typeof(System.Threading.SemaphoreFullException),
                 typeof(System.Threading.SynchronizationLockException),
diff --git 
a/src/Lucene.Net.Tests.AllProjects/Support/ExceptionHandling/TestExceptionExtensions.cs
 
b/src/Lucene.Net.Tests.AllProjects/Support/ExceptionHandling/TestExceptionExtensions.cs
index 8525684ac..9efc1c5b2 100644
--- 
a/src/Lucene.Net.Tests.AllProjects/Support/ExceptionHandling/TestExceptionExtensions.cs
+++ 
b/src/Lucene.Net.Tests.AllProjects/Support/ExceptionHandling/TestExceptionExtensions.cs
@@ -1,5 +1,4 @@
-using J2N.Text;
-using Lucene.Net.Attributes;
+using Lucene.Net.Attributes;
 using NUnit.Framework;
 using System;
 using System.Collections.Generic;
@@ -195,7 +194,7 @@ namespace Lucene.Net.Support.ExceptionHandling
         }
 
         [Test]
-        [TestCaseSource("ThrowableTypeExpressions")]
+        [TestCaseSource(nameof(ThrowableTypeExpressions))]
         public void TestIsThrowable(Type exceptionType, bool expectedToThrow, 
Action expression) // LUCENENET NOTE: exceptionType is only here to make NUnit 
display them all
         {
             static bool extensionMethod(Exception e) => e.IsThrowable();
@@ -211,7 +210,7 @@ namespace Lucene.Net.Support.ExceptionHandling
         }
 
         [Test]
-        [TestCaseSource("ErrorTypeExpressions")]
+        [TestCaseSource(nameof(ErrorTypeExpressions))]
         public void TestIsError(Type exceptionType, bool expectedToThrow, 
Action expression) // LUCENENET NOTE: exceptionType is only here to make NUnit 
display them all
         {
             static bool extensionMethod(Exception e) => e.IsError();
@@ -229,7 +228,7 @@ namespace Lucene.Net.Support.ExceptionHandling
         // This test ensures that all known Error types from Java are not 
caught by
         // our IsException() handler.
         [Test]
-        [TestCaseSource("ExceptionTypeExpressions")]
+        [TestCaseSource(nameof(ExceptionTypeExpressions))]
         public void TestIsException(Type exceptionType, bool expectedToThrow, 
Action expression) // LUCENENET NOTE: exceptionType is only here to make NUnit 
display them all
         {
             static bool extensionMethod(Exception e) => e.IsException();
@@ -247,7 +246,7 @@ namespace Lucene.Net.Support.ExceptionHandling
         // This test ensures that all known Error types from Java are not 
caught by
         // our IsRuntimeException() handler.
         [Test]
-        [TestCaseSource("RuntimeExceptionTypeExpressions")]
+        [TestCaseSource(nameof(RuntimeExceptionTypeExpressions))]
         public void TestIsRuntimeException(Type exceptionType, bool 
expectedToThrow, Action expression) // LUCENENET NOTE: exceptionType is only 
here to make NUnit display them all
         {
             static bool extensionMethod(Exception e) => e.IsRuntimeException();
@@ -263,7 +262,7 @@ namespace Lucene.Net.Support.ExceptionHandling
         }
 
         [Test]
-        [TestCaseSource("IOExceptionTypeExpressions")]
+        [TestCaseSource(nameof(IOExceptionTypeExpressions))]
         public void TestIsIOException(Type exceptionType, bool 
expectedToThrow, Action expression) // LUCENENET NOTE: exceptionType is only 
here to make NUnit display them all
         {
             static bool extensionMethod(Exception e) => e.IsIOException();
@@ -282,7 +281,7 @@ namespace Lucene.Net.Support.ExceptionHandling
         // NUnit's AssertionException and MultipleAssertException types are 
all treated as if they were AssertionError
         // in Java.
         [Test]
-        [TestCaseSource("AssertionErrorTypeExpressions")]
+        [TestCaseSource(nameof(AssertionErrorTypeExpressions))]
         public void TestIsAssertionError(Type exceptionType, bool 
expectedToThrow, Action expression) // LUCENENET NOTE: exceptionType is only 
here to make NUnit display them all
         {
             static bool extensionMethod(Exception e) => e.IsAssertionError();
@@ -302,7 +301,7 @@ namespace Lucene.Net.Support.ExceptionHandling
         // Java has 2 other types ArrayIndexOutOfBoundsException and 
StringIndexOutOfBoundsException, whose alias
         // exception types are also part of the test.
         [Test]
-        [TestCaseSource("IndexOutOfBoundsExceptionTypeExpressions")]
+        [TestCaseSource(nameof(IndexOutOfBoundsExceptionTypeExpressions))]
         public void TestIsIndexOutOfBoundsException(Type exceptionType, bool 
expectedToThrow, Action expression) // LUCENENET NOTE: exceptionType is only 
here to make NUnit display them all
         {
             static bool extensionMethod(Exception e) => 
e.IsIndexOutOfBoundsException();
@@ -320,7 +319,7 @@ namespace Lucene.Net.Support.ExceptionHandling
         // This test ensures that ArgumentNullException and 
NullReferenceException are both caught by our
         // NullPointerException handler, because they both correspond to 
NullPointerException in Java
         [Test]
-        [TestCaseSource("NullPointerExceptionTypeExpressions")]
+        [TestCaseSource(nameof(NullPointerExceptionTypeExpressions))]
         public void TestIsNullPointerException(Type exceptionType, bool 
expectedToThrow, Action expression) // LUCENENET NOTE: exceptionType is only 
here to make NUnit display them all
         {
             static bool extensionMethod(Exception e) => 
e.IsNullPointerException();
@@ -339,7 +338,7 @@ namespace Lucene.Net.Support.ExceptionHandling
         // We do it this way in production to ensure that if we "upgrade" to a 
.NET
         // ArgumentNullException or ArgumentOutOfRangeException it won't break 
the code.
         [Test]
-        [TestCaseSource("IllegalArgumentExceptionTypeExpressions")]
+        [TestCaseSource(nameof(IllegalArgumentExceptionTypeExpressions))]
         public void TestIsIllegalArgumentException(Type exceptionType, bool 
expectedToThrow, Action expression) // LUCENENET NOTE: exceptionType is only 
here to make NUnit display them all
         {
             // Make sure we are testing the production code
@@ -360,7 +359,7 @@ namespace Lucene.Net.Support.ExceptionHandling
         // in the test environment to ensure that if a test is specified wrong 
it will fail and should be updated
         // and commented to indicate we diverged from Lucene.
         [Test]
-        
[TestCaseSource("IllegalArgumentExceptionTypeExpressions_TestEnvironment")]
+        
[TestCaseSource(nameof(IllegalArgumentExceptionTypeExpressions_TestEnvironment))]
         public void TestIsIllegalArgumentException_TestEnvironment(Type 
exceptionType, bool expectedToThrow, Action expression) // LUCENENET NOTE: 
exceptionType is only here to make NUnit display them all
         {
             // Make sure we are testing the test environment code
@@ -376,7 +375,7 @@ namespace Lucene.Net.Support.ExceptionHandling
             }
         }
 
-        private void AssertCatches(Action action, Func<Exception, bool> 
extensionMethodExpression)
+        private static void AssertCatches(Action action, Func<Exception, bool> 
extensionMethodExpression)
         {
             try
             {
@@ -397,7 +396,7 @@ namespace Lucene.Net.Support.ExceptionHandling
             }
         }
 
-        private void AssertDoesNotCatch(Action action, Func<Exception, bool> 
extensionMethodExpression)
+        private static void AssertDoesNotCatch(Action action, Func<Exception, 
bool> extensionMethodExpression)
         {
             try
             {
diff --git a/src/Lucene.Net.Tests.Benchmark/ByTask/Tasks/CreateIndexTaskTest.cs 
b/src/Lucene.Net.Tests.Benchmark/ByTask/Tasks/CreateIndexTaskTest.cs
index 00f85106e..169b78150 100644
--- a/src/Lucene.Net.Tests.Benchmark/ByTask/Tasks/CreateIndexTaskTest.cs
+++ b/src/Lucene.Net.Tests.Benchmark/ByTask/Tasks/CreateIndexTaskTest.cs
@@ -56,7 +56,7 @@ namespace Lucene.Net.Benchmarks.ByTask.Tasks
 
             TextWriter curOut = Console.Out;
             ByteArrayOutputStream baos = new ByteArrayOutputStream();
-            Console.Out = new StreamWriter(baos, Encoding.GetEncoding(0));
+            Console.Out = new StreamWriter(baos, Encoding.Default);
             try
             {
                 PerfRunData runData = createPerfRunData("SystemOut");
@@ -72,7 +72,7 @@ namespace Lucene.Net.Benchmarks.ByTask.Tasks
 
             TextWriter curErr = Console.Error;
             baos = new ByteArrayOutputStream();
-            Console.Error = new StreamWriter(baos, Encoding.GetEncoding(0));
+            Console.Error = new StreamWriter(baos, Encoding.Default);
             try
             {
                 PerfRunData runData = createPerfRunData("SystemErr");
diff --git a/src/Lucene.Net.Tests.Demo/TestDemo.cs 
b/src/Lucene.Net.Tests.Demo/TestDemo.cs
index 16b2379a5..9957f7b71 100644
--- a/src/Lucene.Net.Tests.Demo/TestDemo.cs
+++ b/src/Lucene.Net.Tests.Demo/TestDemo.cs
@@ -33,8 +33,7 @@ namespace Lucene.Net.Demo
             try
             {
                 MemoryStream bytes = new MemoryStream();
-                // .NET NOTE: GetEncoding(0) returns the current system's 
default encoding
-                var fakeSystemOut = new StreamWriter(bytes, 
Encoding.GetEncoding(0));
+                var fakeSystemOut = new StreamWriter(bytes, Encoding.Default);
                 Console.SetOut(fakeSystemOut);
                 // LUCENENET specific: changed the arguments to act more like 
the dotnet.exe commands.
                 // * only optional arguments start with -
@@ -44,8 +43,7 @@ namespace Lucene.Net.Demo
                 // it consistent with the lucene-cli utility.
                 SearchFiles.Main(new string[] { indexPath.FullName, "--query", 
query });
                 fakeSystemOut.Flush();
-                // .NET NOTE: GetEncoding(0) returns the current system's 
default encoding
-                string output = 
Encoding.GetEncoding(0).GetString(bytes.ToArray()); // intentionally use 
default encoding
+                string output = Encoding.Default.GetString(bytes.ToArray()); 
// intentionally use default encoding
                 assertTrue("output=" + output, 
output.Contains(expectedHitCount + " total matching documents"));
             }
             finally
diff --git 
a/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCharBlockArray.cs 
b/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCharBlockArray.cs
index ecf2d317b..0c9dab11d 100644
--- a/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCharBlockArray.cs
+++ b/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCharBlockArray.cs
@@ -2,6 +2,7 @@
 using J2N.IO;
 using J2N.Text;
 using Lucene.Net.Attributes;
+using Lucene.Net.Support;
 using NUnit.Framework;
 using System;
 using System.IO;
@@ -40,24 +41,13 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache
 
             byte[] buffer = new byte[50];
 
-            // This is essentially the equivalent of
-            // CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
-            //     .onUnmappableCharacter(CodingErrorAction.REPLACE)
-            //     .onMalformedInput(CodingErrorAction.REPLACE);
-            //
-            // Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage,
-            //     new EncoderReplacementFallback("?"),
-            //     new DecoderReplacementFallback("?"));
-
             for (int i = 0; i < n; i++)
             {
                 Random.NextBytes(buffer);
                 int size = 1 + Random.Next(50);
                 // This test is turning random bytes into a string,
                 // this is asking for trouble.
-                Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage,
-                    new EncoderReplacementFallback("?"),
-                    new DecoderReplacementFallback("?"));
+                Encoding decoder = Encoding.UTF8; // LUCENENET specific: no 
need to set decoder fallback, because Encoding.UTF8 already replaces by default
                 string s = decoder.GetString(buffer, 0, size);
                 array.Append(s);
                 builder.Append(s);
@@ -69,9 +59,7 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache
                 int size = 1 + Random.Next(50);
                 // This test is turning random bytes into a string,
                 // this is asking for trouble.
-                Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage,
-                    new EncoderReplacementFallback("?"),
-                    new DecoderReplacementFallback("?"));
+                Encoding decoder = Encoding.UTF8; // LUCENENET specific: no 
need to set decoder fallback, because Encoding.UTF8 already replaces by default
                 string s = decoder.GetString(buffer, 0, size);
                 array.Append(s);
                 builder.Append(s);
@@ -83,9 +71,7 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache
                 int size = 1 + Random.Next(50);
                 // This test is turning random bytes into a string,
                 // this is asking for trouble.
-                Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage,
-                    new EncoderReplacementFallback("?"),
-                    new DecoderReplacementFallback("?"));
+                Encoding decoder = Encoding.UTF8; // LUCENENET specific: no 
need to set decoder fallback, because Encoding.UTF8 already replaces by default
                 string s = decoder.GetString(buffer, 0, size);
                 for (int j = 0; j < s.Length; j++)
                 {
diff --git 
a/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCompactLabelToOrdinal.cs 
b/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCompactLabelToOrdinal.cs
index b8d013a6d..d38b1fd45 100644
--- 
a/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCompactLabelToOrdinal.cs
+++ 
b/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCompactLabelToOrdinal.cs
@@ -1,5 +1,6 @@
 // Lucene version compatibility level 4.8.1
 using Lucene.Net.Attributes;
+using Lucene.Net.Support;
 using NUnit.Framework;
 using System;
 using System.Collections.Generic;
@@ -43,15 +44,6 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache
             string[] uniqueValues = new string[numUniqueValues];
             byte[] buffer = new byte[50];
 
-            // This is essentially the equivalent of
-            // CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
-            //     .onUnmappableCharacter(CodingErrorAction.REPLACE)
-            //     .onMalformedInput(CodingErrorAction.REPLACE);
-            // 
-            // Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage, 
-            //     new EncoderReplacementFallback("?"), 
-            //     new DecoderReplacementFallback("?"));
-
             Random random = Random;
             for (int i = 0; i < numUniqueValues;)
             {
@@ -60,9 +52,7 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache
 
                 // This test is turning random bytes into a string,
                 // this is asking for trouble.
-                Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage,
-                    new EncoderReplacementFallback("?"),
-                    new DecoderReplacementFallback("?"));
+                Encoding decoder = Encoding.UTF8; // LUCENENET specific: no 
need to set decoder fallback, because Encoding.UTF8 already replaces by default
                 uniqueValues[i] = decoder.GetString(buffer, 0, size);
                 // we cannot have empty path components, so eliminate all 
prefix as well
                 // as middle consecutive delimiter chars.
@@ -292,6 +282,6 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache
                 }
                 return LabelToOrdinal.INVALID_ORDINAL;
             }
-        } 
+        }
     }
-}
\ No newline at end of file
+}
diff --git a/src/Lucene.Net.Tests/Index/TestTerm.cs 
b/src/Lucene.Net.Tests/Index/TestTerm.cs
index 425670dcd..577a3781e 100644
--- a/src/Lucene.Net.Tests/Index/TestTerm.cs
+++ b/src/Lucene.Net.Tests/Index/TestTerm.cs
@@ -1,3 +1,5 @@
+using Lucene.Net.Attributes;
+using Lucene.Net.Util;
 using NUnit.Framework;
 using Assert = Lucene.Net.TestFramework.Assert;
 
@@ -39,5 +41,61 @@ namespace Lucene.Net.Index
             Assert.IsFalse(@base.Equals(differentText));
             Assert.IsFalse(@base.Equals(differentType));
         }
+
+        [Test, LuceneNetSpecific]
+        public void TestToString_ValidUtf8Data()
+        {
+            // Arrange
+            var validUtf8 = new byte[] { 0x48, 0x65, 0x6C, 0x6C, 0x6F }; // 
"Hello"
+            var bytesRef = new BytesRef(validUtf8, 0, validUtf8.Length);
+
+            // Act
+            string result = Term.ToString(bytesRef);
+
+            // Assert
+            Assert.AreEqual("Hello", result);
+        }
+
+        [Test, LuceneNetSpecific]
+        public void TestToString_InvalidUtf8Data()
+        {
+            // Arrange
+            var invalidUtf8 = new byte[] { 0xC3, 0x28 }; // Invalid UTF-8 
sequence
+            var bytesRef = new BytesRef(invalidUtf8, 0, invalidUtf8.Length);
+
+            // Act
+            string result = Term.ToString(bytesRef);
+
+            // Assert
+            Assert.AreEqual("[c3 28]", result); // Should match 
BytesRef.ToString()
+        }
+
+        [Test, LuceneNetSpecific]
+        public void TestToString_Utf8WithBom()
+        {
+            // Arrange
+            var utf8WithBom = new byte[] { 0xEF, 0xBB, 0xBF, 0x48, 0x69 }; // 
BOM + "Hi"
+            var bytesRef = new BytesRef(utf8WithBom, 0, utf8WithBom.Length);
+
+            // Act
+            string result = Term.ToString(bytesRef);
+
+            // Assert
+            Assert.AreEqual("\uFEFFHi", result); // BOM is preserved in the 
string
+        }
+
+        [Test, LuceneNetSpecific]
+        public void TestToString_Utf8WithoutBom()
+        {
+            // Arrange
+            var utf8WithoutBom = new byte[] { 0x48, 0x69 }; // "Hi"
+            var bytesRef = new BytesRef(utf8WithoutBom, 0, 
utf8WithoutBom.Length);
+
+            // Act
+            string result = Term.ToString(bytesRef);
+
+            // Assert
+            Assert.AreEqual("Hi", result);
+        }
     }
-}
\ No newline at end of file
+}
diff --git a/src/Lucene.Net.Tests/Support/TestApiConsistency.cs 
b/src/Lucene.Net.Tests/Support/TestApiConsistency.cs
index f565676ac..04b96b91b 100644
--- a/src/Lucene.Net.Tests/Support/TestApiConsistency.cs
+++ b/src/Lucene.Net.Tests/Support/TestApiConsistency.cs
@@ -38,7 +38,7 @@ namespace Lucene.Net
         [TestCase(typeof(Lucene.Net.Analysis.Analyzer))]
         public override void TestPrivateFieldNames(Type typeFromTargetAssembly)
         {
-            base.TestPrivateFieldNames(typeFromTargetAssembly, 
@"^Lucene\.Net\.Support\.(?:ConcurrentHashSet|PlatformHelper|DateTimeOffsetUtil|Arrays|IO\.FileSupport)|^Lucene\.ExceptionExtensions|^Lucene\.Net\.Util\.Constants\.MaxStackByteLimit|^Lucene\.Net\.Search\.TopDocs\.ShardByteSize|^Lucene\.Net\.Store\.BaseDirectory\.(?:True|False)");
+            base.TestPrivateFieldNames(typeFromTargetAssembly, 
@"^Lucene\.Net\.Support\.(?:ConcurrentHashSet|PlatformHelper|DateTimeOffsetUtil|Arrays|IO\.FileSupport)|^Lucene\.ExceptionExtensions|^Lucene\.Net\.Util\.Constants\.MaxStackByteLimit|^Lucene\.Net\.Search\.TopDocs\.ShardByteSize|^Lucene\.Net\.Store\.BaseDirectory\.(?:True|False)|CharStackBufferSize$");
         }
 
         [Test, LuceneNetSpecific]
diff --git a/src/Lucene.Net.Tests/Index/TestTerm.cs 
b/src/Lucene.Net.Tests/Support/Text/TestEncodingExtensions.cs
similarity index 54%
copy from src/Lucene.Net.Tests/Index/TestTerm.cs
copy to src/Lucene.Net.Tests/Support/Text/TestEncodingExtensions.cs
index 425670dcd..55123917e 100644
--- a/src/Lucene.Net.Tests/Index/TestTerm.cs
+++ b/src/Lucene.Net.Tests/Support/Text/TestEncodingExtensions.cs
@@ -1,7 +1,9 @@
+using Lucene.Net.Attributes;
+using Lucene.Net.Util;
 using NUnit.Framework;
-using Assert = Lucene.Net.TestFramework.Assert;
+using System.Text;
 
-namespace Lucene.Net.Index
+namespace Lucene.Net.Support.Text
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -20,24 +22,21 @@ namespace Lucene.Net.Index
      * limitations under the License.
      */
 
-    using LuceneTestCase = Lucene.Net.Util.LuceneTestCase;
-
     [TestFixture]
-    public class TestTerm : LuceneTestCase
+    public class TestEncodingExtensions : LuceneTestCase
     {
-        [Test]
-        public virtual void TestEquals()
+        [Test, LuceneNetSpecific]
+        public void TestWithDecoderExceptionFallback()
         {
-            Term @base = new Term("same", "same");
-            Term same = new Term("same", "same");
-            Term differentField = new Term("different", "same");
-            Term differentText = new Term("same", "different");
-            const string differentType = "AString";
-            Assert.AreEqual(@base, @base);
-            Assert.AreEqual(@base, same);
-            Assert.IsFalse(@base.Equals(differentField));
-            Assert.IsFalse(@base.Equals(differentText));
-            Assert.IsFalse(@base.Equals(differentType));
+            Encoding encoding = Encoding.UTF8;
+            Encoding newEncoding = encoding.WithDecoderExceptionFallback();
+            Assert.AreNotSame(encoding, newEncoding);
+            Assert.AreEqual(DecoderFallback.ExceptionFallback, 
newEncoding.DecoderFallback);
+
+            Assert.Throws<DecoderFallbackException>(() =>
+            {
+                _ = newEncoding.GetString(new byte[] { 0xF0 });
+            });
         }
     }
-}
\ No newline at end of file
+}
diff --git a/src/Lucene.Net/Index/Term.cs b/src/Lucene.Net/Index/Term.cs
index 38eda37a3..6930fa543 100644
--- a/src/Lucene.Net/Index/Term.cs
+++ b/src/Lucene.Net/Index/Term.cs
@@ -1,6 +1,9 @@
 using J2N.Text;
 using Lucene.Net.Support;
+using Lucene.Net.Support.Buffers;
+using Lucene.Net.Support.Text;
 using System;
+using System.Buffers;
 using System.Text;
 
 namespace Lucene.Net.Index
@@ -34,6 +37,8 @@ namespace Lucene.Net.Index
     /// </summary>
     public sealed class Term : IComparable<Term>, IEquatable<Term> // 
LUCENENET specific - class implements IEquatable<T>
     {
+        private const int CharStackBufferSize = 64;
+
         /// <summary>
         /// Constructs a <see cref="Term"/> with the given field and bytes.
         /// <para/>Note that a null field or null bytes value results in 
undefined
@@ -84,24 +89,65 @@ namespace Lucene.Net.Index
         /// </summary>
         public string Text => ToString(Bytes); // LUCENENET: Changed to a 
property. While this calls a method internally, its expected usage is that it 
will return a deterministic value.
 
+#nullable enable
         /// <summary>
         /// Returns a human-readable form of the term text. If the term is not 
valid UTF-8,
         /// the raw bytes will be returned in hexadecimal form instead.
         /// </summary>
         public static string ToString(BytesRef termText)
         {
+            if (termText is null)
+                throw new ArgumentNullException(nameof(termText)); // 
LUCENENET: Added guard clause
+#if FEATURE_UTF8_TOUTF16
+            // View the relevant portion of the byte array
+            ReadOnlySpan<byte> utf8Span = new 
ReadOnlySpan<byte>(termText.Bytes, termText.Offset, termText.Length);
+
+            // Allocate a buffer for the maximum possible UTF-16 output
+            int maxChars = utf8Span.Length; // Worst case: 1 byte -> 1 char 
(ASCII)
+            char[]? arrayToReturnToPool = null;
+
+            Span<char> charBuffer = maxChars > CharStackBufferSize
+                ? (arrayToReturnToPool = ArrayPool<char>.Shared.Rent(maxChars))
+                : stackalloc char[CharStackBufferSize];
+            try
+            {
+                // Decode the UTF-8 bytes to UTF-16 chars
+                OperationStatus status = System.Text.Unicode.Utf8.ToUtf16(
+                    utf8Span,
+                    charBuffer,
+                    out int bytesConsumed,
+                    out int charsWritten,
+                    replaceInvalidSequences: false); // Report 
OperationStatus.InvalidData for malformed input rather than replacing it
+
+                // NOTE: We handle OperationStatus.InvalidData below in the 
fallback path.
+                if (status == OperationStatus.Done)
+                {
+                    // Successfully decoded the UTF-8 input
+                    return charBuffer.Slice(0, charsWritten).ToString();
+                }
+            }
+            finally
+            {
+                // Return the buffer to the pool
+                ArrayPool<char>.Shared.ReturnIfNotNull(arrayToReturnToPool);
+            }
+
+            // Fallback to the default string representation if decoding fails
+            return termText.ToString();
+#else
             // the term might not be text, but usually is. so we make a best 
effort
-            // LUCENENET TODO: determine if we should use 
DecoderFallback.ExceptionFallback here
-            Encoding decoder = StandardCharsets.UTF_8;
+            Encoding decoder = 
StandardCharsets.UTF_8.WithDecoderExceptionFallback();
             try
             {
                 return decoder.GetString(termText.Bytes, termText.Offset, 
termText.Length);
             }
-            catch
+            catch (DecoderFallbackException)
             {
                 return termText.ToString();
             }
+#endif
         }
+#nullable restore
 
         /// <summary>
         /// Returns the bytes of this term.
diff --git a/src/Lucene.Net/Support/Buffers/ArrayPoolExtensions.cs 
b/src/Lucene.Net/Support/Buffers/ArrayPoolExtensions.cs
new file mode 100644
index 000000000..baad58540
--- /dev/null
+++ b/src/Lucene.Net/Support/Buffers/ArrayPoolExtensions.cs
@@ -0,0 +1,43 @@
+using System.Buffers;
+using System.Runtime.CompilerServices;
+#nullable enable
+
+namespace Lucene.Net.Support.Buffers
+{
+    /// <summary>
+    /// Extensions to <see cref="ArrayPool{T}"/>
+    /// </summary>
+    internal static class ArrayPoolExtensions
+    {
+        /// <summary>
+        /// Returns to the pool an array that was previously obtained via <see 
cref="ArrayPool{T}.Rent"/> on the same
+        /// <see cref="ArrayPool{T}"/> instance. This method is a no-op if 
<paramref name="array"/> is <c>null</c>.
+        /// </summary>
+        /// <param name="pool">This <see cref="ArrayPool{T}"/>.</param>
+        /// <param name="array">
+        /// The buffer previously obtained from <see 
cref="ArrayPool{T}.Rent"/> to return to the pool. If <c>null</c>,
+        /// no operation will take place.
+        /// </param>
+        /// <param name="clearArray">
+        /// If <c>true</c> and if the pool will store the buffer to enable 
subsequent reuse, <see cref="ReturnIfNotNull"/>
+        /// will clear <paramref name="array"/> of its contents so that a 
subsequent consumer via <see cref="ArrayPool{T}.Rent"/>
+        /// will not see the previous consumer's content.  If <c>false</c> or 
if the pool will release the buffer,
+        /// the array's contents are left unchanged.
+        /// </param>
+        /// <remarks>
+        /// Once a buffer has been returned to the pool, the caller gives up 
all ownership of the buffer
+        /// and must not use it. The reference returned from a given call to 
<see cref="ArrayPool{T}.Rent"/> must only be
+        /// returned via <see cref="ReturnIfNotNull"/> once.  The default <see 
cref="ArrayPool{T}"/>
+        /// may hold onto the returned buffer in order to rent it again, or it 
may release the returned buffer
+        /// if it's determined that the pool already has enough buffers stored.
+        /// </remarks>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static void ReturnIfNotNull<T>(this ArrayPool<T> pool, T[]? 
array, bool clearArray = false)
+        {
+            if (array != null)
+            {
+                pool.Return(array, clearArray);
+            }
+        }
+    }
+}
diff --git a/src/Lucene.Net/Support/ExceptionHandling/ExceptionExtensions.cs 
b/src/Lucene.Net/Support/ExceptionHandling/ExceptionExtensions.cs
index f7b9c1e51..09ba5bdf7 100644
--- a/src/Lucene.Net/Support/ExceptionHandling/ExceptionExtensions.cs
+++ b/src/Lucene.Net/Support/ExceptionHandling/ExceptionExtensions.cs
@@ -6,6 +6,7 @@ using System.Reflection;
 using System.Resources;
 using System.Runtime.CompilerServices;
 using System.Security;
+using System.Text;
 using System.Threading;
 
 namespace Lucene
@@ -213,8 +214,11 @@ namespace Lucene
             if (e is null || e.IsAlwaysIgnored()) return false;
 
             return e is IOException ||
-                e.IsAlreadyClosedException() || // In Lucene, 
AlreadyClosedException subclass IOException instead of 
InvalidOperationException, so we need a special case here
-                e is UnauthorizedAccessException; // In Java, 
java.nio.file.AccessDeniedException subclasses IOException
+                   e.IsAlreadyClosedException() || // In Lucene, 
AlreadyClosedException subclasses IOException instead of 
InvalidOperationException, so we need a special case here
+                   e is
+                       UnauthorizedAccessException // In Java, 
java.nio.file.AccessDeniedException subclasses IOException
+                       or DecoderFallbackException // In Java, 
CharacterCodingException subclasses IOException
+                       or EncoderFallbackException;
         }
 
         /// <summary>
@@ -368,9 +372,11 @@ namespace Lucene
             // LUCENENET: In production, there is a chance that we will 
upgrade to ArgumentNullException or ArgumentOutOfRangeException
             // and it is still important that those are caught. However, we 
have a copy of this method in the test environment
             // where this is done more strictly to catch ArgumentException 
without its known subclasses so we can be more explicit in tests.
-            return e is ArgumentException;
-                //!(e is ArgumentNullException) &&     // Corresponds to 
NullPointerException, so we don't catch it here.
-                //!(e is ArgumentOutOfRangeException); // Corresponds to 
IndexOutOfBoundsException (and subclasses), so we don't catch it here.
+            return e is ArgumentException
+                and not DecoderFallbackException // In Java, 
CharacterCodingException subclasses IOException, not ArgumentException
+                and not EncoderFallbackException;
+            //!(e is ArgumentNullException) &&     // Corresponds to 
NullPointerException, so we don't catch it here.
+            //!(e is ArgumentOutOfRangeException); // Corresponds to 
IndexOutOfBoundsException (and subclasses), so we don't catch it here.
         }
 
         /// <summary>
diff --git a/src/Lucene.Net/Support/Text/EncodingExtensions.cs 
b/src/Lucene.Net/Support/Text/EncodingExtensions.cs
new file mode 100644
index 000000000..5e1c3574c
--- /dev/null
+++ b/src/Lucene.Net/Support/Text/EncodingExtensions.cs
@@ -0,0 +1,58 @@
+using System.Collections.Concurrent;
+using System.Text;
+#nullable enable
+
+namespace Lucene.Net.Support.Text
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Extension methods for <see cref="Encoding"/>.
+    /// </summary>
+    internal static class EncodingExtensions
+    {
+        private static readonly ConcurrentDictionary<Encoding, Encoding> 
decoderExceptionFallbackCache = new();
+
+        /// <summary>
+        /// Returns an <see cref="Encoding"/> instance with the <see 
cref="DecoderFallback"/> set to throw
+        /// an exception when an invalid byte sequence is encountered.
+        /// <para />
+        /// This is equivalent to Java's <c>CodingErrorAction.REPORT</c> for 
both <c>onMalformedInput</c> and
+        /// <c>onUnmappableCharacter</c> and will throw a <see 
cref="DecoderFallbackException"/> when failing
+        /// to decode a string. This exception is equivalent to Java's 
<c>CharacterCodingException</c>, which is
+        /// a base exception type for both <c>MalformedInputException</c> and 
<c>UnmappableCharacterException</c>.
+        /// Thus, to translate Java code that catches any of those exceptions, 
you can catch
+        /// <see cref="DecoderFallbackException"/>.
+        /// </summary>
+        /// <param name="encoding">The encoding to clone and set the fallback 
on.</param>
+        /// <returns>An <see cref="Encoding"/> instance with the fallback set 
to throw an exception.</returns>
+        /// <remarks>
+        /// Note that it is necessary to clone the <see cref="Encoding"/> 
instance because
+        /// the <see cref="Encoding.DecoderFallback"/> property is read-only 
without cloning.
+        /// </remarks>
+        public static Encoding WithDecoderExceptionFallback(this Encoding 
encoding)
+        {
+            return decoderExceptionFallbackCache.GetOrAdd(encoding, static e =>
+            {
+                Encoding newEncoding = (Encoding)e.Clone();
+                newEncoding.DecoderFallback = 
DecoderFallback.ExceptionFallback;
+                return newEncoding;
+            });
+        }
+    }
+}
diff --git a/src/Lucene.Net/Util/IOUtils.cs b/src/Lucene.Net/Util/IOUtils.cs
index c3141b00d..624336ecf 100644
--- a/src/Lucene.Net/Util/IOUtils.cs
+++ b/src/Lucene.Net/Util/IOUtils.cs
@@ -2,6 +2,7 @@
 using Lucene.Net.Diagnostics;
 using Lucene.Net.Support;
 using Lucene.Net.Support.IO;
+using Lucene.Net.Support.Text;
 using System;
 using System.Collections.Generic;
 using System.Diagnostics;
@@ -378,7 +379,8 @@ namespace Lucene.Net.Util
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static TextReader GetDecodingReader(Stream stream, Encoding 
charSet)
         {
-            return new StreamReader(stream, charSet);
+            var charSetDecoder = charSet.WithDecoderExceptionFallback();
+            return new StreamReader(stream, charSetDecoder);
         }
 
         /// <summary>

(lucenenet) branch master updated: Use DecoderFallback.ExceptionFallback to match Java's CodingErrorAction.REPORT, #1076 (#1089)

Reply via email to