This is an automated email from the ASF dual-hosted git repository.
paulirwin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git
The following commit(s) were added to refs/heads/master by this push:
new 4bf492c3b Use DecoderFallback.ExceptionFallback to match Java's CodingErrorAction.REPORT, #1076 (#1089)
4bf492c3b is described below
commit 4bf492c3b55fd94e9bd00822f6980c34b8c794b2
Author: Paul Irwin <[email protected]>
AuthorDate: Sat Jan 11 21:16:27 2025 -0700
Use DecoderFallback.ExceptionFallback to match Java's CodingErrorAction.REPORT, #1076 (#1089)
* Use DecoderFallback.ExceptionFallback to match Java behavior, #1076
* Add unit test for WithDecoderExceptionFallback
* Fix unit test namespace and doc comment
* Lucene.Net.Support.Buffers: Added ArrayPoolExtensions class to simplify returning arrays that might be null
* Lucene.Net.Index.Term::ToString(): Optimized writing UTF8 string on target frameworks that support System.Text.Unicode.Utf8. Added tests to verify fallback is working.
* Cache decoder fallback encoding lookup, #1076
* Treat Encoder/DecoderFallbackExceptions as IOExceptions to match Java, #1076
* Fix translation of replacement fallback test code, IOException/RuntimeException tests
* Use Encoding.Default instead of GetEncoding(0), #1076
* Cache GB2312 encoding lookup, #1076
* Replace StandardCharsets.UTF_8 with Encoding.UTF8 in two tests, #1076
* Fix test extension method for detecting IllegalArgumentException, #1076
* Cascade call from IsIllegalArgumentException
---------
Co-authored-by: Shad Storhaug <[email protected]>
---
.gitignore | 3 +-
Directory.Build.targets | 7 +++
.../Analysis/Hunspell/Dictionary.cs | 2 +-
.../Analysis/Hunspell/ISO8859_14Decoder.cs | 7 ++-
.../Analysis/Synonym/FSTSynonymFilterFactory.cs | 5 +-
.../Analysis/Util/AbstractAnalysisFactory.cs | 4 +-
.../JapaneseTokenizerFactory.cs | 3 +-
.../Tools/ConnectionCostsBuilder.cs | 4 +-
.../Tools/TokenInfoDictionaryBuilder.cs | 3 +-
.../Tools/UnknownDictionaryBuilder.cs | 3 +-
.../Hhmm/AbstractDictionary.cs | 11 ++--
.../Hhmm/BigramDictionary.cs | 2 +-
.../Hhmm/WordDictionary.cs | 2 +-
.../ByTask/Feeds/ContentItemsSource.cs | 6 +--
.../ByTask/Tasks/CreateIndexTask.cs | 2 +-
.../Quality/Trec/QueryDriver.cs | 9 ++--
.../ExceptionHandling/ExceptionExtensions.cs | 10 ++--
src/Lucene.Net.TestFramework/Util/LineFileDocs.cs | 6 ++-
.../ExceptionHandling/ExceptionScanningTestCase.cs | 9 ++--
.../ExceptionHandling/TestExceptionExtensions.cs | 27 +++++-----
.../ByTask/Tasks/CreateIndexTaskTest.cs | 4 +-
src/Lucene.Net.Tests.Demo/TestDemo.cs | 6 +--
.../Taxonomy/WriterCache/TestCharBlockArray.cs | 22 ++------
.../WriterCache/TestCompactLabelToOrdinal.cs | 18 ++-----
src/Lucene.Net.Tests/Index/TestTerm.cs | 60 +++++++++++++++++++++-
src/Lucene.Net.Tests/Support/TestApiConsistency.cs | 2 +-
.../Text/TestEncodingExtensions.cs} | 35 ++++++-------
src/Lucene.Net/Index/Term.cs | 52 +++++++++++++++++--
.../Support/Buffers/ArrayPoolExtensions.cs | 43 ++++++++++++++++
.../ExceptionHandling/ExceptionExtensions.cs | 16 ++++--
src/Lucene.Net/Support/Text/EncodingExtensions.cs | 58 +++++++++++++++++++++
src/Lucene.Net/Util/IOUtils.cs | 4 +-
32 files changed, 327 insertions(+), 118 deletions(-)
diff --git a/.gitignore b/.gitignore
index 7446dd01a..6f8520af1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -65,4 +65,5 @@ websites/apidocs/api/**/*.manifest
svn-*/
# vscode files
-.vscode/
\ No newline at end of file
+.vscode/
+.idea/**/misc.xml
diff --git a/Directory.Build.targets b/Directory.Build.targets
index fd71ab055..1dc7daa03 100644
--- a/Directory.Build.targets
+++ b/Directory.Build.targets
@@ -37,6 +37,13 @@
</PropertyGroup>
+ <!-- Features in .NET 8.x and .NET 9.x only -->
+ <PropertyGroup Condition=" $(TargetFramework.StartsWith('net8.')) Or
$(TargetFramework.StartsWith('net9.')) ">
+
+ <DefineConstants>$(DefineConstants);FEATURE_UTF8_TOUTF16</DefineConstants>
+
+ </PropertyGroup>
+
<!-- Features in .NET 6.x, .NET 7.x, .NET 8.x, and .NET 9.x only -->
<PropertyGroup Condition=" $(TargetFramework.StartsWith('net6.')) Or
$(TargetFramework.StartsWith('net7.')) Or
$(TargetFramework.StartsWith('net8.')) Or
$(TargetFramework.StartsWith('net9.')) ">
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/Dictionary.cs
b/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/Dictionary.cs
index ceeb7eb53..b790df5e6 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/Dictionary.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/Dictionary.cs
@@ -746,7 +746,7 @@ namespace Lucene.Net.Analysis.Hunspell
}
if ("ISO8859-14".Equals(encoding,
StringComparison.OrdinalIgnoreCase))
{
- return new ISO8859_14Encoding();
+ return ISO8859_14Encoding.Default;
}
// .NET doesn't recognize the encoding without a dash between ISO
and the number
//
https://msdn.microsoft.com/en-us/library/system.text.encodinginfo.getencoding(v=vs.110).aspx
diff --git
a/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/ISO8859_14Decoder.cs
b/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/ISO8859_14Decoder.cs
index 607895404..7b7eb59c1 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/ISO8859_14Decoder.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/ISO8859_14Decoder.cs
@@ -28,6 +28,11 @@ namespace Lucene.Net.Analysis.Hunspell
[ExceptionToClassNameConvention]
internal sealed class ISO8859_14Encoding : Encoding
{
+ /// <summary>
+ /// The default singleton instance of the <see
cref="ISO8859_14Encoding"/> class.
+ /// </summary>
+ public static new ISO8859_14Encoding Default { get; } = new
ISO8859_14Encoding();
+
private static readonly Decoder decoder = new ISO8859_14Decoder();
public override Decoder GetDecoder()
{
@@ -119,4 +124,4 @@ namespace Lucene.Net.Analysis.Hunspell
return writeCount;
}
}
-}
\ No newline at end of file
+}
diff --git
a/src/Lucene.Net.Analysis.Common/Analysis/Synonym/FSTSynonymFilterFactory.cs
b/src/Lucene.Net.Analysis.Common/Analysis/Synonym/FSTSynonymFilterFactory.cs
index 41303cb1b..c862dce16 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Synonym/FSTSynonymFilterFactory.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Synonym/FSTSynonymFilterFactory.cs
@@ -1,6 +1,7 @@
// Lucene version compatibility level 4.8.1
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.Util;
+using Lucene.Net.Support.Text;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
@@ -117,7 +118,7 @@ namespace Lucene.Net.Analysis.Synonym
/// </summary>
private SynonymMap LoadSynonyms(IResourceLoader loader, string cname,
bool dedup, Analyzer analyzer)
{
- Encoding decoder = Encoding.UTF8;
+ Encoding decoder = Encoding.UTF8.WithDecoderExceptionFallback();
SynonymMap.Parser parser;
Type clazz = loader.FindType(cname /*, typeof(SynonymMap.Parser)
*/);
@@ -165,4 +166,4 @@ namespace Lucene.Net.Analysis.Synonym
}
}
}
-}
\ No newline at end of file
+}
diff --git
a/src/Lucene.Net.Analysis.Common/Analysis/Util/AbstractAnalysisFactory.cs
b/src/Lucene.Net.Analysis.Common/Analysis/Util/AbstractAnalysisFactory.cs
index 9839b027d..874c020fd 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Util/AbstractAnalysisFactory.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Util/AbstractAnalysisFactory.cs
@@ -1,6 +1,7 @@
// Lucene version compatibility level 4.8.1
using Lucene.Net.Analysis.Core;
using Lucene.Net.Support;
+using Lucene.Net.Support.Text;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
@@ -385,8 +386,9 @@ namespace Lucene.Net.Analysis.Util
words = new CharArraySet(m_luceneMatchVersion, files.Count *
10, ignoreCase);
foreach (string file in files)
{
+ Encoding decoder =
Encoding.UTF8.WithDecoderExceptionFallback();
using (Stream stream = loader.OpenResource(file.Trim()))
- using (TextReader reader = new StreamReader(stream,
Encoding.UTF8))
+ using (TextReader reader = new StreamReader(stream,
decoder))
{
WordlistLoader.GetSnowballWordSet(reader, words);
}
diff --git a/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizerFactory.cs
b/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizerFactory.cs
index 5595375e4..6e6d402d1 100644
--- a/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizerFactory.cs
+++ b/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizerFactory.cs
@@ -1,5 +1,6 @@
using Lucene.Net.Analysis.Ja.Dict;
using Lucene.Net.Analysis.Util;
+using Lucene.Net.Support.Text;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
@@ -88,7 +89,7 @@ namespace Lucene.Net.Analysis.Ja
{
encoding = Encoding.UTF8.WebName;
}
- Encoding decoder = Encoding.GetEncoding(encoding);
+ Encoding decoder =
Encoding.GetEncoding(encoding).WithDecoderExceptionFallback();
TextReader reader = new StreamReader(stream, decoder);
userDictionary = new UserDictionary(reader);
}
diff --git a/src/Lucene.Net.Analysis.Kuromoji/Tools/ConnectionCostsBuilder.cs
b/src/Lucene.Net.Analysis.Kuromoji/Tools/ConnectionCostsBuilder.cs
index 5d5f1d4c7..ef8b482f1 100644
--- a/src/Lucene.Net.Analysis.Kuromoji/Tools/ConnectionCostsBuilder.cs
+++ b/src/Lucene.Net.Analysis.Kuromoji/Tools/ConnectionCostsBuilder.cs
@@ -1,5 +1,6 @@
using J2N.Text;
using Lucene.Net.Diagnostics;
+using Lucene.Net.Support.Text;
using System.Globalization;
using System.IO;
using System.Text;
@@ -31,7 +32,8 @@ namespace Lucene.Net.Analysis.Ja.Util
public static ConnectionCostsWriter Build(string filename)
{
using Stream inputStream = new FileStream(filename, FileMode.Open,
FileAccess.Read);
- using StreamReader streamReader = new StreamReader(inputStream,
Encoding.ASCII, detectEncodingFromByteOrderMarks: true, bufferSize: 1024,
leaveOpen: true); // LUCENENET: CA2000: Use using statement
+ Encoding decoder = Encoding.ASCII.WithDecoderExceptionFallback();
+ using StreamReader streamReader = new StreamReader(inputStream,
decoder, detectEncodingFromByteOrderMarks: true, bufferSize: 1024, leaveOpen:
true); // LUCENENET: CA2000: Use using statement
string line = streamReader.ReadLine();
string[] dimensions = whiteSpaceRegex.Split(line).TrimEnd();
diff --git
a/src/Lucene.Net.Analysis.Kuromoji/Tools/TokenInfoDictionaryBuilder.cs
b/src/Lucene.Net.Analysis.Kuromoji/Tools/TokenInfoDictionaryBuilder.cs
index 737182961..7e0df4ad7 100644
--- a/src/Lucene.Net.Analysis.Kuromoji/Tools/TokenInfoDictionaryBuilder.cs
+++ b/src/Lucene.Net.Analysis.Kuromoji/Tools/TokenInfoDictionaryBuilder.cs
@@ -1,5 +1,6 @@
using J2N.Text;
using Lucene.Net.Support;
+using Lucene.Net.Support.Text;
using Lucene.Net.Util;
using Lucene.Net.Util.Fst;
using Lucene.Net.Util.Packed;
@@ -71,7 +72,7 @@ namespace Lucene.Net.Analysis.Ja.Util
foreach (string file in csvFiles)
{
using Stream inputStream = new FileStream(file, FileMode.Open,
FileAccess.Read);
- Encoding decoder = Encoding.GetEncoding(encoding);
+ Encoding decoder =
Encoding.GetEncoding(encoding).WithDecoderExceptionFallback();
using TextReader reader = new StreamReader(inputStream,
decoder, detectEncodingFromByteOrderMarks: true, bufferSize: 1024, leaveOpen:
true); // LUCENENET: CA2000: Use using statement
string line = null;
diff --git a/src/Lucene.Net.Analysis.Kuromoji/Tools/UnknownDictionaryBuilder.cs
b/src/Lucene.Net.Analysis.Kuromoji/Tools/UnknownDictionaryBuilder.cs
index 3fde184c0..b2023e462 100644
--- a/src/Lucene.Net.Analysis.Kuromoji/Tools/UnknownDictionaryBuilder.cs
+++ b/src/Lucene.Net.Analysis.Kuromoji/Tools/UnknownDictionaryBuilder.cs
@@ -1,5 +1,6 @@
using J2N.Text;
using Lucene.Net.Analysis.Ja.Dict;
+using Lucene.Net.Support.Text;
using System;
using System.Collections.Generic;
using System.Globalization;
@@ -55,7 +56,7 @@ namespace Lucene.Net.Analysis.Ja.Util
UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5
* 1024 * 1024);
JCG.List<string[]> lines = new JCG.List<string[]>();
- Encoding decoder = Encoding.GetEncoding(encoding);
+ Encoding decoder =
Encoding.GetEncoding(encoding).WithDecoderExceptionFallback();
using (Stream inputStream = new FileStream(filename,
FileMode.Open, FileAccess.Read))
using (TextReader reader = new StreamReader(inputStream, decoder))
{
diff --git a/src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs
b/src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs
index 5e2139018..1d5da6d3a 100644
--- a/src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs
+++ b/src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs
@@ -32,6 +32,9 @@ namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
/// </summary>
internal abstract class AbstractDictionary
{
+ // LUCENENET specific: cached GB2312 encoding to avoid repeated calls
to Encoding.GetEncoding("GB2312")
+ protected static readonly Encoding gb2312Encoding =
Encoding.GetEncoding("GB2312");
+
/// <summary>
/// First Chinese Character in GB2312 (15 * 94)
/// Characters in GB2312 are arranged in a grid of 94 * 94, 0-14 are
unassigned or punctuation.
@@ -39,7 +42,7 @@ namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
public const int GB2312_FIRST_CHAR = 1410;
/// <summary>
- /// Last Chinese Character in GB2312 (87 * 94).
+ /// Last Chinese Character in GB2312 (87 * 94).
/// Characters in GB2312 are arranged in a grid of 94 * 94, 88-94 are
unassigned.
/// </summary>
public const int GB2312_CHAR_NUM = 87 * 94;
@@ -98,7 +101,7 @@ namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
try
{
//String cchar = new String(buffer, "GB2312");
- string cchar =
Encoding.GetEncoding("GB2312").GetString(buffer);
+ string cchar = gb2312Encoding.GetString(buffer); // LUCENENET
specific: use cached encoding instance
return cchar;
}
catch (Exception e) when (e.IsUnsupportedEncodingException()) //
Encoding is not supported by the platform
@@ -117,7 +120,7 @@ namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
try
{
//byte[] buffer = Character.ToString(ch).getBytes("GB2312");
- byte[] buffer =
Encoding.GetEncoding("GB2312").GetBytes(ch.ToString());
+ byte[] buffer = gb2312Encoding.GetBytes(ch.ToString()); //
LUCENENET specific: use cached encoding instance
//byte[] buffer =
Encoding.GetEncoding("hz-gb-2312").GetBytes(ch.ToString());
if (buffer.Length != 2)
{
@@ -125,7 +128,7 @@ namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
return -1;
}
int b0 = (buffer[0] & 0x0FF) - 161; // Code starts from A1,
therefore subtract 0xA1=161
- int b1 = (buffer[1] & 0x0FF) - 161; // There is no Chinese
char for the first and last symbol.
+ int b1 = (buffer[1] & 0x0FF) - 161; // There is no Chinese
char for the first and last symbol.
// Therefore, each code
page only has 16*6-2=94 characters.
return (short)(b0 * 94 + b1);
}
diff --git a/src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs
b/src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs
index b9d16273a..da712cb0d 100644
--- a/src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs
+++ b/src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs
@@ -302,7 +302,7 @@ namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
byte[] lchBuffer = new byte[length];
dctFile.Read(lchBuffer, 0, lchBuffer.Length);
//tmpword = new String(lchBuffer, "GB2312");
- tmpword =
Encoding.GetEncoding("GB2312").GetString(lchBuffer);
+ tmpword = gb2312Encoding.GetString(lchBuffer); //
LUCENENET specific: use cached encoding instance from base class
//tmpword =
Encoding.GetEncoding("hz-gb-2312").GetString(lchBuffer);
if (i != 3755 + GB2312_FIRST_CHAR)
{
diff --git a/src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs
b/src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs
index b8cd7cbbf..b6e42be52 100644
--- a/src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs
+++ b/src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs
@@ -395,7 +395,7 @@ namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
{
byte[] lchBuffer = new byte[length];
dctFile.Read(lchBuffer, 0, lchBuffer.Length);
- tmpword =
Encoding.GetEncoding("GB2312").GetString(lchBuffer);
+ tmpword = gb2312Encoding.GetString(lchBuffer); //
LUCENENET specific: use cached encoding instance from base class
wordItem_charArrayTable[i][j] =
tmpword.ToCharArray();
}
else
diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/ContentItemsSource.cs
b/src/Lucene.Net.Benchmark/ByTask/Feeds/ContentItemsSource.cs
index 7932f749e..4b50076cd 100644
--- a/src/Lucene.Net.Benchmark/ByTask/Feeds/ContentItemsSource.cs
+++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/ContentItemsSource.cs
@@ -32,7 +32,7 @@ namespace Lucene.Net.Benchmarks.ByTask.Feeds
/// Base class for source of data for benchmarking.
/// </summary>
/// <remarks>
- /// Keeps track of various statistics, such as how many data items were
generated,
+ /// Keeps track of various statistics, such as how many data items were
generated,
/// size in bytes etc.
/// <para/>
/// Supports the following configuration parameters:
@@ -40,7 +40,7 @@ namespace Lucene.Net.Benchmarks.ByTask.Feeds
/// <item><term>content.source.forever</term><description>specifies
whether to generate items forever (<b>default=true</b>).</description></item>
/// <item><term>content.source.verbose</term><description>specifies
whether messages should be output by the content source
(<b>default=false</b>).</description></item>
/// <item><term>content.source.encoding</term><description>
- /// specifies which encoding to use when
+ /// specifies which encoding to use when
/// reading the files of that content source. Certain
implementations may define
/// a default value if this parameter is not specified.
(<b>default=null</b>).
/// </description></item>
@@ -199,7 +199,7 @@ namespace Lucene.Net.Benchmarks.ByTask.Feeds
}
else
{
- m_encoding = Encoding.GetEncoding(0); // Default system
encoding
+ m_encoding = Encoding.Default; // Default system encoding
}
}
diff --git a/src/Lucene.Net.Benchmark/ByTask/Tasks/CreateIndexTask.cs
b/src/Lucene.Net.Benchmark/ByTask/Tasks/CreateIndexTask.cs
index 6fd4cba20..1e8ed8336 100644
--- a/src/Lucene.Net.Benchmark/ByTask/Tasks/CreateIndexTask.cs
+++ b/src/Lucene.Net.Benchmark/ByTask/Tasks/CreateIndexTask.cs
@@ -215,7 +215,7 @@ namespace Lucene.Net.Benchmarks.ByTask.Tasks
else
{
FileInfo f = new FileInfo(infoStreamVal);
- iwc.SetInfoStream(new StreamWriter(new
FileStream(f.FullName, FileMode.Create, FileAccess.Write),
Encoding.GetEncoding(0)));
+ iwc.SetInfoStream(new StreamWriter(new
FileStream(f.FullName, FileMode.Create, FileAccess.Write), Encoding.Default));
}
}
IndexWriter writer = new IndexWriter(runData.Directory, iwc);
diff --git a/src/Lucene.Net.Benchmark/Quality/Trec/QueryDriver.cs
b/src/Lucene.Net.Benchmark/Quality/Trec/QueryDriver.cs
index d08dab75a..a223653dc 100644
--- a/src/Lucene.Net.Benchmark/Quality/Trec/QueryDriver.cs
+++ b/src/Lucene.Net.Benchmark/Quality/Trec/QueryDriver.cs
@@ -77,10 +77,13 @@ namespace Lucene.Net.Benchmarks.Quality.Trec
string fieldSpec = args.Length == 5 ? args[4] : "T"; // default to
Title-only if not specified.
IndexSearcher searcher = new IndexSearcher(reader);
- int maxResults = 1000;
- string docNameField = "docname";
+ const int maxResults = 1000;
+ const string docNameField = "docname";
- TextWriter logger = Console.Out; //new StreamWriter(Console,
Encoding.GetEncoding(0));
+ using TextWriter logger = new
StreamWriter(System.Console.OpenStandardOutput(), Encoding.Default)
+ {
+ AutoFlush = true,
+ };
// use trec utilities to read trec topics into quality queries
TrecTopicsReader qReader = new TrecTopicsReader();
diff --git
a/src/Lucene.Net.TestFramework/Support/ExceptionHandling/ExceptionExtensions.cs
b/src/Lucene.Net.TestFramework/Support/ExceptionHandling/ExceptionExtensions.cs
index f2d39de48..424869718 100644
---
a/src/Lucene.Net.TestFramework/Support/ExceptionHandling/ExceptionExtensions.cs
+++
b/src/Lucene.Net.TestFramework/Support/ExceptionHandling/ExceptionExtensions.cs
@@ -1,5 +1,6 @@
using System;
using System.Runtime.CompilerServices;
+using System.Text;
namespace Lucene.Net
{
@@ -53,12 +54,9 @@ namespace Lucene.Net
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool IsIllegalArgumentException(this Exception e)
{
- // If our exception implements IError and subclasses
ArgumentException, we will ignore it.
- if (e is null || e.IsError() || e.IsAlwaysIgnored()) return false;
-
- return e is ArgumentException &&
- e is not ArgumentNullException && // Corresponds to
NullPointerException, so we don't catch it here.
- e is not ArgumentOutOfRangeException; // Corresponds to
IndexOutOfBoundsException (and subclasses), so we don't catch it here.
+ return Lucene.ExceptionExtensions.IsIllegalArgumentException(e)
+ && e is not ArgumentNullException // Corresponds to
NullPointerException, so we don't catch it here.
+ and not ArgumentOutOfRangeException; // Corresponds to
IndexOutOfBoundsException (and subclasses), so we don't catch it here.
}
}
}
diff --git a/src/Lucene.Net.TestFramework/Util/LineFileDocs.cs
b/src/Lucene.Net.TestFramework/Util/LineFileDocs.cs
index 991cdb628..168d0e746 100644
--- a/src/Lucene.Net.TestFramework/Util/LineFileDocs.cs
+++ b/src/Lucene.Net.TestFramework/Util/LineFileDocs.cs
@@ -2,6 +2,7 @@
using J2N.Threading.Atomic;
using Lucene.Net.Documents;
using Lucene.Net.Support.IO;
+using Lucene.Net.Support.Text;
using Lucene.Net.Support.Threading;
using RandomizedTesting.Generators;
using System;
@@ -236,7 +237,8 @@ namespace Lucene.Net.Util
} while (b >= 0 && b != 13 && b != 10);
}
- reader = new StreamReader(@is, Encoding.UTF8,
detectEncodingFromByteOrderMarks: false, bufferSize: BUFFER_SIZE);
+ Encoding decoder =
Encoding.UTF8.WithDecoderExceptionFallback();
+ reader = new StreamReader(@is, decoder,
detectEncodingFromByteOrderMarks: false, bufferSize: BUFFER_SIZE);
if (seekTo > 0L)
{
@@ -399,4 +401,4 @@ namespace Lucene.Net.Util
return result;
}
}
-}
\ No newline at end of file
+}
diff --git
a/src/Lucene.Net.Tests.AllProjects/Support/ExceptionHandling/ExceptionScanningTestCase.cs
b/src/Lucene.Net.Tests.AllProjects/Support/ExceptionHandling/ExceptionScanningTestCase.cs
index 8fa941f42..8cd3942d6 100644
---
a/src/Lucene.Net.Tests.AllProjects/Support/ExceptionHandling/ExceptionScanningTestCase.cs
+++
b/src/Lucene.Net.Tests.AllProjects/Support/ExceptionHandling/ExceptionScanningTestCase.cs
@@ -13,6 +13,7 @@ using System.Linq;
using System.Reflection;
using System.Resources;
using System.Security;
+using System.Text;
using Assert = Lucene.Net.TestFramework.Assert;
namespace Lucene.Net.Support.ExceptionHandling
@@ -184,6 +185,8 @@ namespace Lucene.Net.Support.ExceptionHandling
typeof(UnauthorizedAccessException),
typeof(ObjectDisposedException),
typeof(Lucene.AlreadyClosedException),
+ typeof(EncoderFallbackException), // In Java,
CharacterCodingException subclasses IOException
+ typeof(DecoderFallbackException),
}.Union(AllIOExceptionTypes)
// .NET Framework only - Subclasses UnauthorizedAccessException
.Union(new[] { PrivilegeNotHeldExceptionType });
@@ -221,8 +224,6 @@ namespace Lucene.Net.Support.ExceptionHandling
// Subclasses
typeof(System.DuplicateWaitObjectException),
typeof(System.Globalization.CultureNotFoundException),
- typeof(System.Text.DecoderFallbackException),
- typeof(System.Text.EncoderFallbackException),
};
public static readonly IEnumerable<Type>
KnownIllegalArgumentExceptionTypes_TestEnvironment = new Type[] {
@@ -234,8 +235,6 @@ namespace Lucene.Net.Support.ExceptionHandling
// Subclasses
typeof(System.DuplicateWaitObjectException),
typeof(System.Globalization.CultureNotFoundException),
- typeof(System.Text.DecoderFallbackException),
- typeof(System.Text.EncoderFallbackException),
};
public static readonly IEnumerable<Type> KnownRuntimeExceptionTypes =
LoadKnownRuntimeExceptionTypes();
@@ -367,8 +366,6 @@ namespace Lucene.Net.Support.ExceptionHandling
typeof(System.Runtime.Serialization.SerializationException),
typeof(System.Security.Cryptography.CryptographicException),
typeof(System.Security.VerificationException),
- typeof(System.Text.DecoderFallbackException), // LUCENENET
TODO: Need to be sure about this one
- typeof(System.Text.EncoderFallbackException), // LUCENENET
TODO: Need to be sure about this one
typeof(System.Threading.AbandonedMutexException),
typeof(System.Threading.SemaphoreFullException),
typeof(System.Threading.SynchronizationLockException),
diff --git
a/src/Lucene.Net.Tests.AllProjects/Support/ExceptionHandling/TestExceptionExtensions.cs
b/src/Lucene.Net.Tests.AllProjects/Support/ExceptionHandling/TestExceptionExtensions.cs
index 8525684ac..9efc1c5b2 100644
---
a/src/Lucene.Net.Tests.AllProjects/Support/ExceptionHandling/TestExceptionExtensions.cs
+++
b/src/Lucene.Net.Tests.AllProjects/Support/ExceptionHandling/TestExceptionExtensions.cs
@@ -1,5 +1,4 @@
-using J2N.Text;
-using Lucene.Net.Attributes;
+using Lucene.Net.Attributes;
using NUnit.Framework;
using System;
using System.Collections.Generic;
@@ -195,7 +194,7 @@ namespace Lucene.Net.Support.ExceptionHandling
}
[Test]
- [TestCaseSource("ThrowableTypeExpressions")]
+ [TestCaseSource(nameof(ThrowableTypeExpressions))]
public void TestIsThrowable(Type exceptionType, bool expectedToThrow,
Action expression) // LUCENENET NOTE: exceptionType is only here to make NUnit
display them all
{
static bool extensionMethod(Exception e) => e.IsThrowable();
@@ -211,7 +210,7 @@ namespace Lucene.Net.Support.ExceptionHandling
}
[Test]
- [TestCaseSource("ErrorTypeExpressions")]
+ [TestCaseSource(nameof(ErrorTypeExpressions))]
public void TestIsError(Type exceptionType, bool expectedToThrow,
Action expression) // LUCENENET NOTE: exceptionType is only here to make NUnit
display them all
{
static bool extensionMethod(Exception e) => e.IsError();
@@ -229,7 +228,7 @@ namespace Lucene.Net.Support.ExceptionHandling
// This test ensures that all known Error types from Java are not
caught by
// our IsException() handler.
[Test]
- [TestCaseSource("ExceptionTypeExpressions")]
+ [TestCaseSource(nameof(ExceptionTypeExpressions))]
public void TestIsException(Type exceptionType, bool expectedToThrow,
Action expression) // LUCENENET NOTE: exceptionType is only here to make NUnit
display them all
{
static bool extensionMethod(Exception e) => e.IsException();
@@ -247,7 +246,7 @@ namespace Lucene.Net.Support.ExceptionHandling
// This test ensures that all known Error types from Java are not
caught by
// our IsRuntimeException() handler.
[Test]
- [TestCaseSource("RuntimeExceptionTypeExpressions")]
+ [TestCaseSource(nameof(RuntimeExceptionTypeExpressions))]
public void TestIsRuntimeException(Type exceptionType, bool
expectedToThrow, Action expression) // LUCENENET NOTE: exceptionType is only
here to make NUnit display them all
{
static bool extensionMethod(Exception e) => e.IsRuntimeException();
@@ -263,7 +262,7 @@ namespace Lucene.Net.Support.ExceptionHandling
}
[Test]
- [TestCaseSource("IOExceptionTypeExpressions")]
+ [TestCaseSource(nameof(IOExceptionTypeExpressions))]
public void TestIsIOException(Type exceptionType, bool
expectedToThrow, Action expression) // LUCENENET NOTE: exceptionType is only
here to make NUnit display them all
{
static bool extensionMethod(Exception e) => e.IsIOException();
@@ -282,7 +281,7 @@ namespace Lucene.Net.Support.ExceptionHandling
// NUnit's AssertionException and MultipleAssertException types are
all treated as if they were AssertionError
// in Java.
[Test]
- [TestCaseSource("AssertionErrorTypeExpressions")]
+ [TestCaseSource(nameof(AssertionErrorTypeExpressions))]
public void TestIsAssertionError(Type exceptionType, bool
expectedToThrow, Action expression) // LUCENENET NOTE: exceptionType is only
here to make NUnit display them all
{
static bool extensionMethod(Exception e) => e.IsAssertionError();
@@ -302,7 +301,7 @@ namespace Lucene.Net.Support.ExceptionHandling
// Java has 2 other types ArrayIndexOutOfBoundsException and
StringIndexOutOfBoundsException, whose alias
// exception types are also part of the test.
[Test]
- [TestCaseSource("IndexOutOfBoundsExceptionTypeExpressions")]
+ [TestCaseSource(nameof(IndexOutOfBoundsExceptionTypeExpressions))]
public void TestIsIndexOutOfBoundsException(Type exceptionType, bool
expectedToThrow, Action expression) // LUCENENET NOTE: exceptionType is only
here to make NUnit display them all
{
static bool extensionMethod(Exception e) =>
e.IsIndexOutOfBoundsException();
@@ -320,7 +319,7 @@ namespace Lucene.Net.Support.ExceptionHandling
// This test ensures that ArgumentNullException and
NullReferenceException are both caught by our
// NullPointerException handler, because they both correspond to
NullPointerException in Java
[Test]
- [TestCaseSource("NullPointerExceptionTypeExpressions")]
+ [TestCaseSource(nameof(NullPointerExceptionTypeExpressions))]
public void TestIsNullPointerException(Type exceptionType, bool
expectedToThrow, Action expression) // LUCENENET NOTE: exceptionType is only
here to make NUnit display them all
{
static bool extensionMethod(Exception e) =>
e.IsNullPointerException();
@@ -339,7 +338,7 @@ namespace Lucene.Net.Support.ExceptionHandling
// We do it this way in production to ensure that if we "upgrade" to a
.NET
// ArgumentNullException or ArgumentOutOfRangeException it won't break
the code.
[Test]
- [TestCaseSource("IllegalArgumentExceptionTypeExpressions")]
+ [TestCaseSource(nameof(IllegalArgumentExceptionTypeExpressions))]
public void TestIsIllegalArgumentException(Type exceptionType, bool
expectedToThrow, Action expression) // LUCENENET NOTE: exceptionType is only
here to make NUnit display them all
{
// Make sure we are testing the production code
@@ -360,7 +359,7 @@ namespace Lucene.Net.Support.ExceptionHandling
// in the test environment to ensure that if a test is specified wrong
it will fail and should be updated
// and commented to indicate we diverged from Lucene.
[Test]
-
[TestCaseSource("IllegalArgumentExceptionTypeExpressions_TestEnvironment")]
+
[TestCaseSource(nameof(IllegalArgumentExceptionTypeExpressions_TestEnvironment))]
public void TestIsIllegalArgumentException_TestEnvironment(Type
exceptionType, bool expectedToThrow, Action expression) // LUCENENET NOTE:
exceptionType is only here to make NUnit display them all
{
// Make sure we are testing the test environment code
@@ -376,7 +375,7 @@ namespace Lucene.Net.Support.ExceptionHandling
}
}
- private void AssertCatches(Action action, Func<Exception, bool>
extensionMethodExpression)
+ private static void AssertCatches(Action action, Func<Exception, bool>
extensionMethodExpression)
{
try
{
@@ -397,7 +396,7 @@ namespace Lucene.Net.Support.ExceptionHandling
}
}
- private void AssertDoesNotCatch(Action action, Func<Exception, bool>
extensionMethodExpression)
+ private static void AssertDoesNotCatch(Action action, Func<Exception,
bool> extensionMethodExpression)
{
try
{
diff --git a/src/Lucene.Net.Tests.Benchmark/ByTask/Tasks/CreateIndexTaskTest.cs
b/src/Lucene.Net.Tests.Benchmark/ByTask/Tasks/CreateIndexTaskTest.cs
index 00f85106e..169b78150 100644
--- a/src/Lucene.Net.Tests.Benchmark/ByTask/Tasks/CreateIndexTaskTest.cs
+++ b/src/Lucene.Net.Tests.Benchmark/ByTask/Tasks/CreateIndexTaskTest.cs
@@ -56,7 +56,7 @@ namespace Lucene.Net.Benchmarks.ByTask.Tasks
TextWriter curOut = Console.Out;
ByteArrayOutputStream baos = new ByteArrayOutputStream();
- Console.Out = new StreamWriter(baos, Encoding.GetEncoding(0));
+ Console.Out = new StreamWriter(baos, Encoding.Default);
try
{
PerfRunData runData = createPerfRunData("SystemOut");
@@ -72,7 +72,7 @@ namespace Lucene.Net.Benchmarks.ByTask.Tasks
TextWriter curErr = Console.Error;
baos = new ByteArrayOutputStream();
- Console.Error = new StreamWriter(baos, Encoding.GetEncoding(0));
+ Console.Error = new StreamWriter(baos, Encoding.Default);
try
{
PerfRunData runData = createPerfRunData("SystemErr");
diff --git a/src/Lucene.Net.Tests.Demo/TestDemo.cs
b/src/Lucene.Net.Tests.Demo/TestDemo.cs
index 16b2379a5..9957f7b71 100644
--- a/src/Lucene.Net.Tests.Demo/TestDemo.cs
+++ b/src/Lucene.Net.Tests.Demo/TestDemo.cs
@@ -33,8 +33,7 @@ namespace Lucene.Net.Demo
try
{
MemoryStream bytes = new MemoryStream();
- // .NET NOTE: GetEncoding(0) returns the current system's
default encoding
- var fakeSystemOut = new StreamWriter(bytes,
Encoding.GetEncoding(0));
+ var fakeSystemOut = new StreamWriter(bytes, Encoding.Default);
Console.SetOut(fakeSystemOut);
// LUCENENET specific: changed the arguments to act more like
the dotnet.exe commands.
// * only optional arguments start with -
@@ -44,8 +43,7 @@ namespace Lucene.Net.Demo
// it consistent with the lucene-cli utility.
SearchFiles.Main(new string[] { indexPath.FullName, "--query",
query });
fakeSystemOut.Flush();
- // .NET NOTE: GetEncoding(0) returns the current system's
default encoding
- string output =
Encoding.GetEncoding(0).GetString(bytes.ToArray()); // intentionally use
default encoding
+ string output = Encoding.Default.GetString(bytes.ToArray()); // intentionally use default encoding
assertTrue("output=" + output,
output.Contains(expectedHitCount + " total matching documents"));
}
finally
diff --git a/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCharBlockArray.cs b/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCharBlockArray.cs
index ecf2d317b..0c9dab11d 100644
--- a/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCharBlockArray.cs
+++ b/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCharBlockArray.cs
@@ -2,6 +2,7 @@
using J2N.IO;
using J2N.Text;
using Lucene.Net.Attributes;
+using Lucene.Net.Support;
using NUnit.Framework;
using System;
using System.IO;
@@ -40,24 +41,13 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache
byte[] buffer = new byte[50];
- // This is essentially the equivalent of
- // CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
- // .onUnmappableCharacter(CodingErrorAction.REPLACE)
- // .onMalformedInput(CodingErrorAction.REPLACE);
- //
- // Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage,
- // new EncoderReplacementFallback("?"),
- // new DecoderReplacementFallback("?"));
-
for (int i = 0; i < n; i++)
{
Random.NextBytes(buffer);
int size = 1 + Random.Next(50);
// This test is turning random bytes into a string,
// this is asking for trouble.
- Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage,
- new EncoderReplacementFallback("?"),
- new DecoderReplacementFallback("?"));
+ Encoding decoder = Encoding.UTF8; // LUCENENET specific: no need to set decoder fallback, because Encoding.UTF8 already replaces by default
string s = decoder.GetString(buffer, 0, size);
array.Append(s);
builder.Append(s);
@@ -69,9 +59,7 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache
int size = 1 + Random.Next(50);
// This test is turning random bytes into a string,
// this is asking for trouble.
- Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage,
- new EncoderReplacementFallback("?"),
- new DecoderReplacementFallback("?"));
+ Encoding decoder = Encoding.UTF8; // LUCENENET specific: no need to set decoder fallback, because Encoding.UTF8 already replaces by default
string s = decoder.GetString(buffer, 0, size);
array.Append(s);
builder.Append(s);
@@ -83,9 +71,7 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache
int size = 1 + Random.Next(50);
// This test is turning random bytes into a string,
// this is asking for trouble.
- Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage,
- new EncoderReplacementFallback("?"),
- new DecoderReplacementFallback("?"));
+ Encoding decoder = Encoding.UTF8; // LUCENENET specific: no need to set decoder fallback, because Encoding.UTF8 already replaces by default
string s = decoder.GetString(buffer, 0, size);
for (int j = 0; j < s.Length; j++)
{
diff --git a/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCompactLabelToOrdinal.cs b/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCompactLabelToOrdinal.cs
index b8d013a6d..d38b1fd45 100644
--- a/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCompactLabelToOrdinal.cs
+++ b/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCompactLabelToOrdinal.cs
@@ -1,5 +1,6 @@
// Lucene version compatibility level 4.8.1
using Lucene.Net.Attributes;
+using Lucene.Net.Support;
using NUnit.Framework;
using System;
using System.Collections.Generic;
@@ -43,15 +44,6 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache
string[] uniqueValues = new string[numUniqueValues];
byte[] buffer = new byte[50];
- // This is essentially the equivalent of
- // CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
- // .onUnmappableCharacter(CodingErrorAction.REPLACE)
- // .onMalformedInput(CodingErrorAction.REPLACE);
- //
- // Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage,
- // new EncoderReplacementFallback("?"),
- // new DecoderReplacementFallback("?"));
-
Random random = Random;
for (int i = 0; i < numUniqueValues;)
{
@@ -60,9 +52,7 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache
// This test is turning random bytes into a string,
// this is asking for trouble.
- Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage,
- new EncoderReplacementFallback("?"),
- new DecoderReplacementFallback("?"));
+ Encoding decoder = Encoding.UTF8; // LUCENENET specific: no need to set decoder fallback, because Encoding.UTF8 already replaces by default
uniqueValues[i] = decoder.GetString(buffer, 0, size);
// we cannot have empty path components, so eliminate all
prefix as well
// as middle consecutive delimiter chars.
@@ -292,6 +282,6 @@ namespace Lucene.Net.Facet.Taxonomy.WriterCache
}
return LabelToOrdinal.INVALID_ORDINAL;
}
- }
+ }
}
-}
\ No newline at end of file
+}
diff --git a/src/Lucene.Net.Tests/Index/TestTerm.cs b/src/Lucene.Net.Tests/Index/TestTerm.cs
index 425670dcd..577a3781e 100644
--- a/src/Lucene.Net.Tests/Index/TestTerm.cs
+++ b/src/Lucene.Net.Tests/Index/TestTerm.cs
@@ -1,3 +1,5 @@
+using Lucene.Net.Attributes;
+using Lucene.Net.Util;
using NUnit.Framework;
using Assert = Lucene.Net.TestFramework.Assert;
@@ -39,5 +41,61 @@ namespace Lucene.Net.Index
Assert.IsFalse(@base.Equals(differentText));
Assert.IsFalse(@base.Equals(differentType));
}
+
+ [Test, LuceneNetSpecific]
+ public void TestToString_ValidUtf8Data()
+ {
+ // Arrange
+ var validUtf8 = new byte[] { 0x48, 0x65, 0x6C, 0x6C, 0x6F }; // "Hello"
+ var bytesRef = new BytesRef(validUtf8, 0, validUtf8.Length);
+
+ // Act
+ string result = Term.ToString(bytesRef);
+
+ // Assert
+ Assert.AreEqual("Hello", result);
+ }
+
+ [Test, LuceneNetSpecific]
+ public void TestToString_InvalidUtf8Data()
+ {
+ // Arrange
+ var invalidUtf8 = new byte[] { 0xC3, 0x28 }; // Invalid UTF-8 sequence
+ var bytesRef = new BytesRef(invalidUtf8, 0, invalidUtf8.Length);
+
+ // Act
+ string result = Term.ToString(bytesRef);
+
+ // Assert
+ Assert.AreEqual("[c3 28]", result); // Should match BytesRef.ToString()
+ }
+
+ [Test, LuceneNetSpecific]
+ public void TestToString_Utf8WithBom()
+ {
+ // Arrange
+ var utf8WithBom = new byte[] { 0xEF, 0xBB, 0xBF, 0x48, 0x69 }; // BOM + "Hi"
+ var bytesRef = new BytesRef(utf8WithBom, 0, utf8WithBom.Length);
+
+ // Act
+ string result = Term.ToString(bytesRef);
+
+ // Assert
+ Assert.AreEqual("\uFEFFHi", result); // BOM is preserved in the string
+ }
+
+ [Test, LuceneNetSpecific]
+ public void TestToString_Utf8WithoutBom()
+ {
+ // Arrange
+ var utf8WithoutBom = new byte[] { 0x48, 0x69 }; // "Hi"
+ var bytesRef = new BytesRef(utf8WithoutBom, 0,
utf8WithoutBom.Length);
+
+ // Act
+ string result = Term.ToString(bytesRef);
+
+ // Assert
+ Assert.AreEqual("Hi", result);
+ }
}
-}
\ No newline at end of file
+}
diff --git a/src/Lucene.Net.Tests/Support/TestApiConsistency.cs b/src/Lucene.Net.Tests/Support/TestApiConsistency.cs
index f565676ac..04b96b91b 100644
--- a/src/Lucene.Net.Tests/Support/TestApiConsistency.cs
+++ b/src/Lucene.Net.Tests/Support/TestApiConsistency.cs
@@ -38,7 +38,7 @@ namespace Lucene.Net
[TestCase(typeof(Lucene.Net.Analysis.Analyzer))]
public override void TestPrivateFieldNames(Type typeFromTargetAssembly)
{
- base.TestPrivateFieldNames(typeFromTargetAssembly,
@"^Lucene\.Net\.Support\.(?:ConcurrentHashSet|PlatformHelper|DateTimeOffsetUtil|Arrays|IO\.FileSupport)|^Lucene\.ExceptionExtensions|^Lucene\.Net\.Util\.Constants\.MaxStackByteLimit|^Lucene\.Net\.Search\.TopDocs\.ShardByteSize|^Lucene\.Net\.Store\.BaseDirectory\.(?:True|False)");
+ base.TestPrivateFieldNames(typeFromTargetAssembly,
@"^Lucene\.Net\.Support\.(?:ConcurrentHashSet|PlatformHelper|DateTimeOffsetUtil|Arrays|IO\.FileSupport)|^Lucene\.ExceptionExtensions|^Lucene\.Net\.Util\.Constants\.MaxStackByteLimit|^Lucene\.Net\.Search\.TopDocs\.ShardByteSize|^Lucene\.Net\.Store\.BaseDirectory\.(?:True|False)|CharStackBufferSize$");
}
[Test, LuceneNetSpecific]
diff --git a/src/Lucene.Net.Tests/Index/TestTerm.cs b/src/Lucene.Net.Tests/Support/Text/TestEncodingExtensions.cs
similarity index 54%
copy from src/Lucene.Net.Tests/Index/TestTerm.cs
copy to src/Lucene.Net.Tests/Support/Text/TestEncodingExtensions.cs
index 425670dcd..55123917e 100644
--- a/src/Lucene.Net.Tests/Index/TestTerm.cs
+++ b/src/Lucene.Net.Tests/Support/Text/TestEncodingExtensions.cs
@@ -1,7 +1,9 @@
+using Lucene.Net.Attributes;
+using Lucene.Net.Util;
using NUnit.Framework;
-using Assert = Lucene.Net.TestFramework.Assert;
+using System.Text;
-namespace Lucene.Net.Index
+namespace Lucene.Net.Support.Text
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -20,24 +22,21 @@ namespace Lucene.Net.Index
* limitations under the License.
*/
- using LuceneTestCase = Lucene.Net.Util.LuceneTestCase;
-
[TestFixture]
- public class TestTerm : LuceneTestCase
+ public class TestEncodingExtensions : LuceneTestCase
{
- [Test]
- public virtual void TestEquals()
+ [Test, LuceneNetSpecific]
+ public void TestWithDecoderExceptionFallback()
{
- Term @base = new Term("same", "same");
- Term same = new Term("same", "same");
- Term differentField = new Term("different", "same");
- Term differentText = new Term("same", "different");
- const string differentType = "AString";
- Assert.AreEqual(@base, @base);
- Assert.AreEqual(@base, same);
- Assert.IsFalse(@base.Equals(differentField));
- Assert.IsFalse(@base.Equals(differentText));
- Assert.IsFalse(@base.Equals(differentType));
+ Encoding encoding = Encoding.UTF8;
+ Encoding newEncoding = encoding.WithDecoderExceptionFallback();
+ Assert.AreNotSame(encoding, newEncoding);
+ Assert.AreEqual(DecoderFallback.ExceptionFallback,
newEncoding.DecoderFallback);
+
+ Assert.Throws<DecoderFallbackException>(() =>
+ {
+ _ = newEncoding.GetString(new byte[] { 0xF0 });
+ });
}
}
-}
\ No newline at end of file
+}
diff --git a/src/Lucene.Net/Index/Term.cs b/src/Lucene.Net/Index/Term.cs
index 38eda37a3..6930fa543 100644
--- a/src/Lucene.Net/Index/Term.cs
+++ b/src/Lucene.Net/Index/Term.cs
@@ -1,6 +1,9 @@
using J2N.Text;
using Lucene.Net.Support;
+using Lucene.Net.Support.Buffers;
+using Lucene.Net.Support.Text;
using System;
+using System.Buffers;
using System.Text;
namespace Lucene.Net.Index
@@ -34,6 +37,8 @@ namespace Lucene.Net.Index
/// </summary>
public sealed class Term : IComparable<Term>, IEquatable<Term> //
LUCENENET specific - class implements IEquatable<T>
{
+ private const int CharStackBufferSize = 64;
+
/// <summary>
/// Constructs a <see cref="Term"/> with the given field and bytes.
/// <para/>Note that a null field or null bytes value results in
undefined
@@ -84,24 +89,65 @@ namespace Lucene.Net.Index
/// </summary>
public string Text => ToString(Bytes); // LUCENENET: Changed to a
property. While this calls a method internally, its expected usage is that it
will return a deterministic value.
+#nullable enable
/// <summary>
/// Returns human-readable form of the term text. If the term is not
unicode,
/// the raw bytes will be printed instead.
/// </summary>
public static string ToString(BytesRef termText)
{
+ if (termText is null)
+ throw new ArgumentNullException(nameof(termText)); // LUCENENET: Added guard clause
+#if FEATURE_UTF8_TOUTF16
+ // View the relevant portion of the byte array
+ ReadOnlySpan<byte> utf8Span = new
ReadOnlySpan<byte>(termText.Bytes, termText.Offset, termText.Length);
+
+ // Allocate a buffer for the maximum possible UTF-16 output
+ int maxChars = utf8Span.Length; // Worst case: 1 byte -> 1 char (ASCII)
+ char[]? arrayToReturnToPool = null;
+
+ Span<char> charBuffer = maxChars > CharStackBufferSize
+ ? (arrayToReturnToPool = ArrayPool<char>.Shared.Rent(maxChars))
+ : stackalloc char[CharStackBufferSize];
+ try
+ {
+ // Decode the UTF-8 bytes to UTF-16 chars
+ OperationStatus status = System.Text.Unicode.Utf8.ToUtf16(
+ utf8Span,
+ charBuffer,
+ out int bytesConsumed,
+ out int charsWritten,
+ replaceInvalidSequences: false); // Causes OperationStatus.InvalidData to occur rather than replace
+
+ // NOTE: We handle OperationStatus.InvalidData below in the fallback path.
+ if (status == OperationStatus.Done)
+ {
+ // Successfully decoded the UTF-8 input
+ return charBuffer.Slice(0, charsWritten).ToString();
+ }
+ }
+ finally
+ {
+ // Return the buffer to the pool
+ ArrayPool<char>.Shared.ReturnIfNotNull(arrayToReturnToPool);
+ }
+
+ // Fallback to the default string representation if decoding fails
+ return termText.ToString();
+#else
// the term might not be text, but usually is. so we make a best
effort
- // LUCENENET TODO: determine if we should use
DecoderFallback.ExceptionFallback here
- Encoding decoder = StandardCharsets.UTF_8;
+ Encoding decoder =
StandardCharsets.UTF_8.WithDecoderExceptionFallback();
try
{
return decoder.GetString(termText.Bytes, termText.Offset,
termText.Length);
}
- catch
+ catch (DecoderFallbackException)
{
return termText.ToString();
}
+#endif
}
+#nullable restore
/// <summary>
/// Returns the bytes of this term.
diff --git a/src/Lucene.Net/Support/Buffers/ArrayPoolExtensions.cs b/src/Lucene.Net/Support/Buffers/ArrayPoolExtensions.cs
new file mode 100644
index 000000000..baad58540
--- /dev/null
+++ b/src/Lucene.Net/Support/Buffers/ArrayPoolExtensions.cs
@@ -0,0 +1,43 @@
+using System.Buffers;
+using System.Runtime.CompilerServices;
+#nullable enable
+
+namespace Lucene.Net.Support.Buffers
+{
+ /// <summary>
+ /// Extensions to <see cref="ArrayPool{T}"/>
+ /// </summary>
+ internal static class ArrayPoolExtensions
+ {
+ /// <summary>
+ /// Returns to the pool an array that was previously obtained via <see
cref="ArrayPool{T}.Rent"/> on the same
+ /// <see cref="ArrayPool{T}"/> instance. This method is a no-op if
<paramref name="array"/> is <c>null</c>.
+ /// </summary>
+ /// <param name="pool">This <see cref="ArrayPool{T}"/>.</param>
+ /// <param name="array">
+ /// The buffer previously obtained from <see
cref="ArrayPool{T}.Rent"/> to return to the pool. If <c>null</c>,
+ /// no operation will take place.
+ /// </param>
+ /// <param name="clearArray">
+ /// If <c>true</c> and if the pool will store the buffer to enable
subsequent reuse, <see cref="ReturnIfNotNull"/>
+ /// will clear <paramref name="array"/> of its contents so that a
subsequent consumer via <see cref="ArrayPool{T}.Rent"/>
+ /// will not see the previous consumer's content. If <c>false</c> or
if the pool will release the buffer,
+ /// the array's contents are left unchanged.
+ /// </param>
+ /// <remarks>
+ /// Once a buffer has been returned to the pool, the caller gives up
all ownership of the buffer
+ /// and must not use it. The reference returned from a given call to
<see cref="ArrayPool{T}.Rent"/> must only be
+ /// returned via <see cref="ReturnIfNotNull"/> once. The default <see
cref="ArrayPool{T}"/>
+ /// may hold onto the returned buffer in order to rent it again, or it
may release the returned buffer
+ /// if it's determined that the pool already has enough buffers stored.
+ /// </remarks>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static void ReturnIfNotNull<T>(this ArrayPool<T> pool, T[]?
array, bool clearArray = false)
+ {
+ if (array != null)
+ {
+ pool.Return(array, clearArray);
+ }
+ }
+ }
+}
diff --git a/src/Lucene.Net/Support/ExceptionHandling/ExceptionExtensions.cs b/src/Lucene.Net/Support/ExceptionHandling/ExceptionExtensions.cs
index f7b9c1e51..09ba5bdf7 100644
--- a/src/Lucene.Net/Support/ExceptionHandling/ExceptionExtensions.cs
+++ b/src/Lucene.Net/Support/ExceptionHandling/ExceptionExtensions.cs
@@ -6,6 +6,7 @@ using System.Reflection;
using System.Resources;
using System.Runtime.CompilerServices;
using System.Security;
+using System.Text;
using System.Threading;
namespace Lucene
@@ -213,8 +214,11 @@ namespace Lucene
if (e is null || e.IsAlwaysIgnored()) return false;
return e is IOException ||
- e.IsAlreadyClosedException() || // In Lucene,
AlreadyClosedException subclass IOException instead of
InvalidOperationException, so we need a special case here
- e is UnauthorizedAccessException; // In Java,
java.nio.file.AccessDeniedException subclasses IOException
+ e.IsAlreadyClosedException() || // In Lucene,
AlreadyClosedException subclass IOException instead of
InvalidOperationException, so we need a special case here
+ e is
+ UnauthorizedAccessException // In Java,
java.nio.file.AccessDeniedException subclasses IOException
+ or DecoderFallbackException // In Java,
CharacterCodingException subclasses IOException
+ or EncoderFallbackException;
}
/// <summary>
@@ -368,9 +372,11 @@ namespace Lucene
// LUCENENET: In production, there is a chance that we will
upgrade to ArgumentNullExcpetion or ArgumentOutOfRangeException
// and it is still important that those are caught. However, we
have a copy of this method in the test environment
// where this is done more strictly to catch ArgumentException
without its known subclasses so we can be more explicit in tests.
- return e is ArgumentException;
- //!(e is ArgumentNullException) && // Corresponds to
NullPointerException, so we don't catch it here.
- //!(e is ArgumentOutOfRangeException); // Corresponds to
IndexOutOfBoundsException (and subclasses), so we don't catch it here.
+ return e is ArgumentException
+ and not DecoderFallbackException // In Java,
CharacterCodingException subclasses IOException, not ArgumentException
+ and not EncoderFallbackException;
+ //!(e is ArgumentNullException) && // Corresponds to
NullPointerException, so we don't catch it here.
+ //!(e is ArgumentOutOfRangeException); // Corresponds to
IndexOutOfBoundsException (and subclasses), so we don't catch it here.
}
/// <summary>
diff --git a/src/Lucene.Net/Support/Text/EncodingExtensions.cs b/src/Lucene.Net/Support/Text/EncodingExtensions.cs
new file mode 100644
index 000000000..5e1c3574c
--- /dev/null
+++ b/src/Lucene.Net/Support/Text/EncodingExtensions.cs
@@ -0,0 +1,58 @@
+using System.Collections.Concurrent;
+using System.Text;
+#nullable enable
+
+namespace Lucene.Net.Support.Text
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Extension methods for <see cref="Encoding"/>.
+ /// </summary>
+ internal static class EncodingExtensions
+ {
+ private static readonly ConcurrentDictionary<Encoding, Encoding>
decoderExceptionFallbackCache = new();
+
+ /// <summary>
+ /// Returns an <see cref="Encoding"/> instance with the <see
cref="DecoderFallback"/> set to throw
+ /// an exception when an invalid byte sequence is encountered.
+ /// <para />
+ /// This is equivalent to Java's <c>CodingErrorAction.REPORT</c> for
both <c>onMalformedInput</c> and
+ /// <c>onUnmappableCharacter</c> and will throw a <see
cref="DecoderFallbackException"/> when failing
+ /// to decode a string. This exception is equivalent to Java's
<c>CharacterCodingException</c>, which is
+ /// a base exception type for both <c>MalformedInputException</c> and
<c>UnmappableCharacterException</c>.
+ /// Thus, to translate Java code that catches any of those exceptions,
you can catch
+ /// <see cref="DecoderFallbackException"/>.
+ /// </summary>
+ /// <param name="encoding">The encoding to clone and set the fallback
on.</param>
+ /// <returns>An <see cref="Encoding"/> instance with the fallback set
to throw an exception.</returns>
+ /// <remarks>
+ /// Note that it is necessary to clone the <see cref="Encoding"/>
instance because
+ /// the <see cref="Encoding.DecoderFallback"/> property is read-only
without cloning.
+ /// </remarks>
+ public static Encoding WithDecoderExceptionFallback(this Encoding
encoding)
+ {
+ return decoderExceptionFallbackCache.GetOrAdd(encoding, static e =>
+ {
+ Encoding newEncoding = (Encoding)e.Clone();
+ newEncoding.DecoderFallback =
DecoderFallback.ExceptionFallback;
+ return newEncoding;
+ });
+ }
+ }
+}
diff --git a/src/Lucene.Net/Util/IOUtils.cs b/src/Lucene.Net/Util/IOUtils.cs
index c3141b00d..624336ecf 100644
--- a/src/Lucene.Net/Util/IOUtils.cs
+++ b/src/Lucene.Net/Util/IOUtils.cs
@@ -2,6 +2,7 @@
using Lucene.Net.Diagnostics;
using Lucene.Net.Support;
using Lucene.Net.Support.IO;
+using Lucene.Net.Support.Text;
using System;
using System.Collections.Generic;
using System.Diagnostics;
@@ -378,7 +379,8 @@ namespace Lucene.Net.Util
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static TextReader GetDecodingReader(Stream stream, Encoding
charSet)
{
- return new StreamReader(stream, charSet);
+ var charSetDecoder = charSet.WithDecoderExceptionFallback();
+ return new StreamReader(stream, charSetDecoder);
}
/// <summary>