This is an automated email from the ASF dual-hosted git repository.
nightowl888 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git
The following commit(s) were added to refs/heads/master by this push:
new 4c5b94c BUG: Fixes #573. Changed segment names to match Lucene and
Lucene.NET 3.x. This is a breaking change. Added a
SegmentInfos.UseLegacySegmentNames to read 4.8.0-beta00001 thru 4.8.0-beta00015
indexes, which is false by default.
4c5b94c is described below
commit 4c5b94c837dfad43f645b8900d51ecd000614c42
Author: Shad Storhaug <[email protected]>
AuthorDate: Mon Dec 13 17:10:29 2021 +0700
BUG: Fixes #573. Changed segment names to match Lucene and Lucene.NET 3.x.
This is a breaking change. Added a SegmentInfos.UseLegacySegmentNames to read
4.8.0-beta00001 thru 4.8.0-beta00015 indexes, which is false by default.
---
.../Codecs/Lucene3x/TestTermInfosReaderIndex.cs | 4 +-
src/Lucene.Net.Tests/Index/TestCodecs.cs | 5 +-
.../Index/TestConsistentFieldNumbers.cs | 46 +++++++
src/Lucene.Net/Index/CheckIndex.cs | 6 +-
src/Lucene.Net/Index/IndexFileNames.cs | 4 +-
src/Lucene.Net/Index/IndexWriter.cs | 2 +-
src/Lucene.Net/Index/ReadersAndUpdates.cs | 8 +-
src/Lucene.Net/Index/SegmentDocValues.cs | 8 +-
src/Lucene.Net/Index/SegmentInfos.cs | 148 +++++++++++++++++++++
src/Lucene.Net/Index/SegmentReader.cs | 8 +-
10 files changed, 226 insertions(+), 13 deletions(-)
diff --git a/src/Lucene.Net.Tests/Codecs/Lucene3x/TestTermInfosReaderIndex.cs
b/src/Lucene.Net.Tests/Codecs/Lucene3x/TestTermInfosReaderIndex.cs
index 2b5f104..68c56a2 100644
--- a/src/Lucene.Net.Tests/Codecs/Lucene3x/TestTermInfosReaderIndex.cs
+++ b/src/Lucene.Net.Tests/Codecs/Lucene3x/TestTermInfosReaderIndex.cs
@@ -7,6 +7,8 @@ using System.Collections.Generic;
using System.Globalization;
using JCG = J2N.Collections.Generic;
using Assert = Lucene.Net.TestFramework.Assert;
+using J2N;
+using RandomizedTesting.Generators;
namespace Lucene.Net.Codecs.Lucene3x
{
@@ -226,7 +228,7 @@ namespace Lucene.Net.Codecs.Lucene3x
writer.Dispose();
}
- private static string Text => Convert.ToString(Random.Next(),
CultureInfo.InvariantCulture);
+ private static string Text =>
Random.NextInt64().ToString(Character.MaxRadix);
}
#pragma warning restore 612, 618
}
\ No newline at end of file
diff --git a/src/Lucene.Net.Tests/Index/TestCodecs.cs
b/src/Lucene.Net.Tests/Index/TestCodecs.cs
index 25fb066..b06b0bf 100644
--- a/src/Lucene.Net.Tests/Index/TestCodecs.cs
+++ b/src/Lucene.Net.Tests/Index/TestCodecs.cs
@@ -1,4 +1,5 @@
-using J2N.Text;
+using J2N;
+using J2N.Text;
using J2N.Threading;
using Lucene.Net.Diagnostics;
using Lucene.Net.Documents;
@@ -340,7 +341,7 @@ namespace Lucene.Net.Index
for (int i = 0; i < NUM_TERMS; i++)
{
int[] docs = new int[] { i };
- string text = Convert.ToString(i);
+ string text = i.ToString(Character.MaxRadix);
terms[i] = new TermData(this, text, docs, null);
}
diff --git a/src/Lucene.Net.Tests/Index/TestConsistentFieldNumbers.cs
b/src/Lucene.Net.Tests/Index/TestConsistentFieldNumbers.cs
index 75251af..c881ee5 100644
--- a/src/Lucene.Net.Tests/Index/TestConsistentFieldNumbers.cs
+++ b/src/Lucene.Net.Tests/Index/TestConsistentFieldNumbers.cs
@@ -1,4 +1,6 @@
using System;
+using System.Globalization;
+using Lucene.Net.Attributes;
using Lucene.Net.Documents;
using Lucene.Net.Index.Extensions;
using Lucene.Net.Support;
@@ -419,5 +421,49 @@ namespace Lucene.Net.Index
return null;
}
}
+
+ [Test]
+ [LuceneNetSpecific]
+ public void TestSegmentNumberToStringGeneration()
+ {
+ // We cover the 100 literal values that we return plus an
additional 5 to ensure continuation
+ const long MaxSegment = 105;
+
+ bool temp = SegmentInfos.UseLegacySegmentNames;
+ try
+ {
+ // Normal usage
+ SegmentInfos.UseLegacySegmentNames = false;
+ for (long seg = 0; seg < MaxSegment; seg++)
+ {
+ string expected =
J2N.IntegralNumberExtensions.ToString(seg, J2N.Character.MaxRadix);
+ string actual = SegmentInfos.SegmentNumberToString(seg);
+ Assert.AreEqual(expected, actual);
+ }
+
+ // This is for places where we were generating the names
correctly. We don't want to flip
+ // to radix 10 when the feature is enabled here.
+ SegmentInfos.UseLegacySegmentNames = true;
+ for (long seg = 0; seg < MaxSegment; seg++)
+ {
+ string expected =
J2N.IntegralNumberExtensions.ToString(seg, J2N.Character.MaxRadix);
+ string actual = SegmentInfos.SegmentNumberToString(seg,
allowLegacyNames: false);
+ Assert.AreEqual(expected, actual);
+ }
+
+ // This is to generate names with radix 10 (to read indexes
from beta 1 thru 15 only)
+ SegmentInfos.UseLegacySegmentNames = true;
+ for (long seg = 0; seg < MaxSegment; seg++)
+ {
+ string expected =
seg.ToString(CultureInfo.InvariantCulture);
+ string actual = SegmentInfos.SegmentNumberToString(seg);
+ Assert.AreEqual(expected, actual);
+ }
+ }
+ finally
+ {
+ SegmentInfos.UseLegacySegmentNames = temp;
+ }
+ }
}
}
\ No newline at end of file
diff --git a/src/Lucene.Net/Index/CheckIndex.cs
b/src/Lucene.Net/Index/CheckIndex.cs
index 1e83760..b21bb1d 100644
--- a/src/Lucene.Net/Index/CheckIndex.cs
+++ b/src/Lucene.Net/Index/CheckIndex.cs
@@ -670,6 +670,10 @@ namespace Lucene.Net.Index
result.NewSegments.Clear();
result.MaxSegmentName = -1;
+ // LUCENENET: We created the segment names wrong in
4.8.0-beta00001 - 4.8.0-beta00015,
+ // so we added a switch to be able to read these indexes in later
versions.
+ int segmentRadix = SegmentInfos.useLegacySegmentNames ? 10 :
J2N.Character.MaxRadix;
+
for (int i = 0; i < numSegments; i++)
{
SegmentCommitInfo info = sis.Info(i);
@@ -677,7 +681,7 @@ namespace Lucene.Net.Index
try
{
// LUCENENET: Optimized to not allocate a substring during
the parse
- segmentName = Integer.Parse(info.Info.Name, 1,
info.Info.Name.Length - 1, radix: 10);
+ segmentName = Integer.Parse(info.Info.Name, 1,
info.Info.Name.Length - 1, radix: segmentRadix);
}
catch
{
diff --git a/src/Lucene.Net/Index/IndexFileNames.cs
b/src/Lucene.Net/Index/IndexFileNames.cs
index 9869681..bdc7295 100644
--- a/src/Lucene.Net/Index/IndexFileNames.cs
+++ b/src/Lucene.Net/Index/IndexFileNames.cs
@@ -1,4 +1,4 @@
-using J2N;
+using J2N;
using Lucene.Net.Diagnostics;
using System;
using System.Text;
@@ -111,7 +111,7 @@ namespace Lucene.Net.Index
// to the gen length as string (hopefully an upper limit so SB
won't
// expand in the middle.
StringBuilder res = (new StringBuilder(@base.Length + 6 +
ext.Length))
-
.Append(@base).Append('_').Append(gen.ToString(Character.MaxRadix));
+
.Append(@base).Append('_').Append(SegmentInfos.SegmentNumberToString(gen,
allowLegacyNames: false)); // LUCENENET specific - we had this right thru all
of the betas, so don't change if the legacy feature is enabled
if (ext.Length > 0)
{
res.Append('.').Append(ext);
diff --git a/src/Lucene.Net/Index/IndexWriter.cs
b/src/Lucene.Net/Index/IndexWriter.cs
index 1f3468e..fe1beb3 100644
--- a/src/Lucene.Net/Index/IndexWriter.cs
+++ b/src/Lucene.Net/Index/IndexWriter.cs
@@ -2131,7 +2131,7 @@ namespace Lucene.Net.Index
// problems at least with ConcurrentMergeScheduler.
changeCount++;
segmentInfos.Changed();
- return "_" +
(segmentInfos.Counter++).ToString(J2N.Character.MaxRadix);
+ return "_" +
SegmentInfos.SegmentNumberToString(segmentInfos.Counter++, allowLegacyNames:
false); // LUCENENET specific - we had this right thru all of the betas, so
don't change if the legacy feature is enabled
}
finally
{
diff --git a/src/Lucene.Net/Index/ReadersAndUpdates.cs
b/src/Lucene.Net/Index/ReadersAndUpdates.cs
index 8dcfde1..e31ed02 100644
--- a/src/Lucene.Net/Index/ReadersAndUpdates.cs
+++ b/src/Lucene.Net/Index/ReadersAndUpdates.cs
@@ -1,4 +1,5 @@
-using J2N.Threading.Atomic;
+using J2N;
+using J2N.Threading.Atomic;
using Lucene.Net.Diagnostics;
using Lucene.Net.Documents;
using Lucene.Net.Support.Threading;
@@ -579,7 +580,10 @@ namespace Lucene.Net.Index
fieldInfos = builder.Finish();
long nextFieldInfosGen = Info.NextFieldInfosGen;
- string segmentSuffix =
nextFieldInfosGen.ToString(CultureInfo.InvariantCulture);//Convert.ToString(nextFieldInfosGen,
Character.MAX_RADIX));
+ // LUCENENET specific: We created the segment names
wrong in 4.8.0-beta00001 - 4.8.0-beta00015,
+ // so we added a switch to be able to read these
indexes in later versions. This logic as well as an
+ // optimization on the first 100 segment values is
implemented in SegmentInfos.SegmentNumberToString().
+ string segmentSuffix =
SegmentInfos.SegmentNumberToString(nextFieldInfosGen);
SegmentWriteState state = new SegmentWriteState(null,
trackingDir, Info.Info, fieldInfos, writer.Config.TermIndexInterval, null,
IOContext.DEFAULT, segmentSuffix);
DocValuesFormat docValuesFormat =
codec.DocValuesFormat;
DocValuesConsumer fieldsConsumer =
docValuesFormat.FieldsConsumer(state);
diff --git a/src/Lucene.Net/Index/SegmentDocValues.cs
b/src/Lucene.Net/Index/SegmentDocValues.cs
index e87e0c7..4472ed3 100644
--- a/src/Lucene.Net/Index/SegmentDocValues.cs
+++ b/src/Lucene.Net/Index/SegmentDocValues.cs
@@ -1,4 +1,5 @@
-using J2N.Collections.Generic.Extensions;
+using J2N;
+using J2N.Collections.Generic.Extensions;
using Lucene.Net.Diagnostics;
using Lucene.Net.Support.Threading;
using Lucene.Net.Util;
@@ -46,7 +47,10 @@ namespace Lucene.Net.Index
if (gen != -1)
{
dvDir = si.Info.Dir; // gen'd files are written outside CFS,
so use SegInfo directory
- segmentSuffix =
gen.ToString(CultureInfo.InvariantCulture);//Convert.ToString((long)gen,
Character.MAX_RADIX);
+ // LUCENENET specific: We created the segment names wrong in
4.8.0-beta00001 - 4.8.0-beta00015,
+ // so we added a switch to be able to read these indexes in
later versions. This logic as well as an
+ // optimization on the first 100 segment values is implemented
in SegmentInfos.SegmentNumberToString().
+ segmentSuffix = SegmentInfos.SegmentNumberToString(gen);
}
// set SegmentReadState to list only the fields that are relevant
to that gen
diff --git a/src/Lucene.Net/Index/SegmentInfos.cs
b/src/Lucene.Net/Index/SegmentInfos.cs
index 2379ed6..2ea96b5 100644
--- a/src/Lucene.Net/Index/SegmentInfos.cs
+++ b/src/Lucene.Net/Index/SegmentInfos.cs
@@ -6,6 +6,7 @@ using Lucene.Net.Support.IO;
using System;
using System.Collections;
using System.Collections.Generic;
+using System.Globalization;
using System.IO;
using System.Runtime.CompilerServices;
using System.Runtime.ExceptionServices;
@@ -141,6 +142,153 @@ namespace Lucene.Net.Index
public static readonly int FORMAT_SEGMENTS_GEN_CURRENT =
FORMAT_SEGMENTS_GEN_CHECKSUM;
/// <summary>
+ /// Setting this to true will generate the same file names that were
used in 4.8.0-beta00001 through 4.8.0-beta00015.
+ /// When writing more than 10 segments, these segment names were
incompatible with prior versions of Lucene.NET and incompatible with Lucene
4.8.0.
+ /// <para/>
+ /// This is only for reading codecs from the affected 4.8.0 beta
versions; it is not recommended to use this setting for general use.
+ /// <para/>
+ /// This must be set prior to opening an index at application startup.
When setting it at other times the behavior is undefined.
+ /// <para/>
+ /// Note that this behavior can also be enabled by setting the
"useLegacySegmentNames" system property to "true" (for example, by setting the
environment variable "lucene:useLegacySegmentNames").
+ /// System properties can also be injected by supplying a <see
cref="Configuration.IConfigurationFactory"/> at application startup
+ /// through <see
cref="Configuration.ConfigurationSettings.SetConfigurationFactory(Configuration.IConfigurationFactory)"/>.
+ /// </summary>
+ public static bool UseLegacySegmentNames
+ {
+ get => useLegacySegmentNames;
+ set => useLegacySegmentNames = value;
+ }
+ internal static bool useLegacySegmentNames =
Util.SystemProperties.GetPropertyAsBoolean("useLegacySegmentNames",
defaultValue: false);
+
+ /// <summary>
+ /// Optimized version of <see
cref="J2N.IntegralNumberExtensions.ToString(long, int)"/> with a radix of 36,
that
+ /// simply does a switch case for the first 100 numbers, which takes
only 5% of the time that calculating it does.
+ /// We fall back to calling the method for segment numbers 100 and above.
+ /// <para/>
+ /// This also implements the switch for <see
cref="UseLegacySegmentNames"/> so it doesn't have to be dealt with externally.
+ /// </summary>
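+ /// <returns>The segment number as a string: radix 36 normally, or radix 10 when legacy names are allowed and enabled.</returns>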
+ internal static string SegmentNumberToString(long segment, bool
allowLegacyNames = true)
+ {
+ switch (segment)
+ {
+ case 0: return "0";
+ case 1: return "1";
+ case 2: return "2";
+ case 3: return "3";
+ case 4: return "4";
+ case 5: return "5";
+ case 6: return "6";
+ case 7: return "7";
+ case 8: return "8";
+ case 9: return "9";
+ }
+
+ if (!allowLegacyNames || !useLegacySegmentNames)
+ {
+ return segment switch
+ {
+ 10 => "a",
+ 11 => "b",
+ 12 => "c",
+ 13 => "d",
+ 14 => "e",
+ 15 => "f",
+ 16 => "g",
+ 17 => "h",
+ 18 => "i",
+ 19 => "j",
+ 20 => "k",
+ 21 => "l",
+ 22 => "m",
+ 23 => "n",
+ 24 => "o",
+ 25 => "p",
+ 26 => "q",
+ 27 => "r",
+ 28 => "s",
+ 29 => "t",
+ 30 => "u",
+ 31 => "v",
+ 32 => "w",
+ 33 => "x",
+ 34 => "y",
+ 35 => "z",
+ 36 => "10",
+ 37 => "11",
+ 38 => "12",
+ 39 => "13",
+ 40 => "14",
+ 41 => "15",
+ 42 => "16",
+ 43 => "17",
+ 44 => "18",
+ 45 => "19",
+ 46 => "1a",
+ 47 => "1b",
+ 48 => "1c",
+ 49 => "1d",
+ 50 => "1e",
+ 51 => "1f",
+ 52 => "1g",
+ 53 => "1h",
+ 54 => "1i",
+ 55 => "1j",
+ 56 => "1k",
+ 57 => "1l",
+ 58 => "1m",
+ 59 => "1n",
+ 60 => "1o",
+ 61 => "1p",
+ 62 => "1q",
+ 63 => "1r",
+ 64 => "1s",
+ 65 => "1t",
+ 66 => "1u",
+ 67 => "1v",
+ 68 => "1w",
+ 69 => "1x",
+ 70 => "1y",
+ 71 => "1z",
+ 72 => "20",
+ 73 => "21",
+ 74 => "22",
+ 75 => "23",
+ 76 => "24",
+ 77 => "25",
+ 78 => "26",
+ 79 => "27",
+ 80 => "28",
+ 81 => "29",
+ 82 => "2a",
+ 83 => "2b",
+ 84 => "2c",
+ 85 => "2d",
+ 86 => "2e",
+ 87 => "2f",
+ 88 => "2g",
+ 89 => "2h",
+ 90 => "2i",
+ 91 => "2j",
+ 92 => "2k",
+ 93 => "2l",
+ 94 => "2m",
+ 95 => "2n",
+ 96 => "2o",
+ 97 => "2p",
+ 98 => "2q",
+ 99 => "2r",
+ _ => segment.ToString(Character.MaxRadix),
+ };
+ }
+
+ // This is wrong! Unfortunately, this is how the segment names
were generated in
+ // beta 1 thru 15, so we end up here if the switch is enabled to
read them.
+ // We should actually be using a radix of 36 rather than 10 (which
was done correctly in Lucene.NET 3.0.3).
+ return segment.ToString(CultureInfo.InvariantCulture);
+ }
+
+ /// <summary>
/// Used to name new segments. </summary>
public int Counter { get; set; }
diff --git a/src/Lucene.Net/Index/SegmentReader.cs
b/src/Lucene.Net/Index/SegmentReader.cs
index 6700f02..c306867 100644
--- a/src/Lucene.Net/Index/SegmentReader.cs
+++ b/src/Lucene.Net/Index/SegmentReader.cs
@@ -1,4 +1,5 @@
-using J2N.Runtime.CompilerServices;
+using J2N;
+using J2N.Runtime.CompilerServices;
using Lucene.Net.Diagnostics;
using Lucene.Net.Util;
using System;
@@ -232,7 +233,10 @@ namespace Lucene.Net.Index
try
{
- string segmentSuffix = info.FieldInfosGen == -1 ? "" :
info.FieldInfosGen.ToString(CultureInfo.InvariantCulture);//Convert.ToString(info.FieldInfosGen,
Character.MAX_RADIX));
+ // LUCENENET specific: We created the segment names wrong in
4.8.0-beta00001 - 4.8.0-beta00015,
+ // so we added a switch to be able to read these indexes in
later versions. This logic as well as an
+ // optimization on the first 100 segment values is implemented
in SegmentInfos.SegmentNumberToString().
+ string segmentSuffix = info.FieldInfosGen == -1 ? string.Empty
: SegmentInfos.SegmentNumberToString(info.FieldInfosGen);
return
info.Info.Codec.FieldInfosFormat.FieldInfosReader.Read(dir, info.Info.Name,
segmentSuffix, IOContext.READ_ONCE);
}
finally