This is an automated email from the ASF dual-hosted git repository.

nightowl888 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git


The following commit(s) were added to refs/heads/master by this push:
     new 4c5b94c  BUG: Fixes #573. Changed segment names to match Lucene and 
Lucene.NET 3.x. This is a breaking change. Added a 
SegmentInfos.UseLegacySegmentNames to read 4.8.0-beta00001 thru 4.8.0-beta00015 
indexes, which is false by default.
4c5b94c is described below

commit 4c5b94c837dfad43f645b8900d51ecd000614c42
Author: Shad Storhaug <[email protected]>
AuthorDate: Mon Dec 13 17:10:29 2021 +0700

    BUG: Fixes #573. Changed segment names to match Lucene and Lucene.NET 3.x. 
This is a breaking change. Added a SegmentInfos.UseLegacySegmentNames to read 
4.8.0-beta00001 thru 4.8.0-beta00015 indexes, which is false by default.
---
 .../Codecs/Lucene3x/TestTermInfosReaderIndex.cs    |   4 +-
 src/Lucene.Net.Tests/Index/TestCodecs.cs           |   5 +-
 .../Index/TestConsistentFieldNumbers.cs            |  46 +++++++
 src/Lucene.Net/Index/CheckIndex.cs                 |   6 +-
 src/Lucene.Net/Index/IndexFileNames.cs             |   4 +-
 src/Lucene.Net/Index/IndexWriter.cs                |   2 +-
 src/Lucene.Net/Index/ReadersAndUpdates.cs          |   8 +-
 src/Lucene.Net/Index/SegmentDocValues.cs           |   8 +-
 src/Lucene.Net/Index/SegmentInfos.cs               | 148 +++++++++++++++++++++
 src/Lucene.Net/Index/SegmentReader.cs              |   8 +-
 10 files changed, 226 insertions(+), 13 deletions(-)

diff --git a/src/Lucene.Net.Tests/Codecs/Lucene3x/TestTermInfosReaderIndex.cs 
b/src/Lucene.Net.Tests/Codecs/Lucene3x/TestTermInfosReaderIndex.cs
index 2b5f104..68c56a2 100644
--- a/src/Lucene.Net.Tests/Codecs/Lucene3x/TestTermInfosReaderIndex.cs
+++ b/src/Lucene.Net.Tests/Codecs/Lucene3x/TestTermInfosReaderIndex.cs
@@ -7,6 +7,8 @@ using System.Collections.Generic;
 using System.Globalization;
 using JCG = J2N.Collections.Generic;
 using Assert = Lucene.Net.TestFramework.Assert;
+using J2N;
+using RandomizedTesting.Generators;
 
 namespace Lucene.Net.Codecs.Lucene3x
 {
@@ -226,7 +228,7 @@ namespace Lucene.Net.Codecs.Lucene3x
             writer.Dispose();
         }
 
-        private static string Text => Convert.ToString(Random.Next(), 
CultureInfo.InvariantCulture);
+        private static string Text => 
Random.NextInt64().ToString(Character.MaxRadix);
     }
 #pragma warning restore 612, 618
 }
\ No newline at end of file
diff --git a/src/Lucene.Net.Tests/Index/TestCodecs.cs 
b/src/Lucene.Net.Tests/Index/TestCodecs.cs
index 25fb066..b06b0bf 100644
--- a/src/Lucene.Net.Tests/Index/TestCodecs.cs
+++ b/src/Lucene.Net.Tests/Index/TestCodecs.cs
@@ -1,4 +1,5 @@
-using J2N.Text;
+using J2N;
+using J2N.Text;
 using J2N.Threading;
 using Lucene.Net.Diagnostics;
 using Lucene.Net.Documents;
@@ -340,7 +341,7 @@ namespace Lucene.Net.Index
             for (int i = 0; i < NUM_TERMS; i++)
             {
                 int[] docs = new int[] { i };
-                string text = Convert.ToString(i);
+                string text = i.ToString(Character.MaxRadix);
                 terms[i] = new TermData(this, text, docs, null);
             }
 
diff --git a/src/Lucene.Net.Tests/Index/TestConsistentFieldNumbers.cs 
b/src/Lucene.Net.Tests/Index/TestConsistentFieldNumbers.cs
index 75251af..c881ee5 100644
--- a/src/Lucene.Net.Tests/Index/TestConsistentFieldNumbers.cs
+++ b/src/Lucene.Net.Tests/Index/TestConsistentFieldNumbers.cs
@@ -1,4 +1,6 @@
 using System;
+using System.Globalization;
+using Lucene.Net.Attributes;
 using Lucene.Net.Documents;
 using Lucene.Net.Index.Extensions;
 using Lucene.Net.Support;
@@ -419,5 +421,49 @@ namespace Lucene.Net.Index
                     return null;
             }
         }
+
+        [Test]
+        [LuceneNetSpecific]
+        public void TestSegmentNumberToStringGeneration()
+        {
+            // We cover the 100 literal values that we return plus an 
additional 5 to ensure continuation
+            const long MaxSegment = 105;
+
+            bool temp = SegmentInfos.UseLegacySegmentNames;
+            try
+            {
+                // Normal usage
+                SegmentInfos.UseLegacySegmentNames = false;
+                for (long seg = 0; seg < MaxSegment; seg++)
+                {
+                    string expected = 
J2N.IntegralNumberExtensions.ToString(seg, J2N.Character.MaxRadix);
+                    string actual = SegmentInfos.SegmentNumberToString(seg);
+                    Assert.AreEqual(expected, actual);
+                }
+
+                // This is for places where we were generating the names 
correctly. We don't want to flip
+                // to radix 10 when the feature is enabled here.
+                SegmentInfos.UseLegacySegmentNames = true;
+                for (long seg = 0; seg < MaxSegment; seg++)
+                {
+                    string expected = 
J2N.IntegralNumberExtensions.ToString(seg, J2N.Character.MaxRadix);
+                    string actual = SegmentInfos.SegmentNumberToString(seg, 
allowLegacyNames: false);
+                    Assert.AreEqual(expected, actual);
+                }
+
+                // This is to generate names with radix 10 (to read indexes 
from beta 1 thru 15 only)
+                SegmentInfos.UseLegacySegmentNames = true;
+                for (long seg = 0; seg < MaxSegment; seg++)
+                {
+                    string expected = 
seg.ToString(CultureInfo.InvariantCulture);
+                    string actual = SegmentInfos.SegmentNumberToString(seg);
+                    Assert.AreEqual(expected, actual);
+                }
+            }
+            finally
+            {
+                SegmentInfos.UseLegacySegmentNames = temp;
+            }
+        }
     }
 }
\ No newline at end of file
diff --git a/src/Lucene.Net/Index/CheckIndex.cs 
b/src/Lucene.Net/Index/CheckIndex.cs
index 1e83760..b21bb1d 100644
--- a/src/Lucene.Net/Index/CheckIndex.cs
+++ b/src/Lucene.Net/Index/CheckIndex.cs
@@ -670,6 +670,10 @@ namespace Lucene.Net.Index
             result.NewSegments.Clear();
             result.MaxSegmentName = -1;
 
+            // LUCENENET: We created the segment names wrong in 
4.8.0-beta00001 - 4.8.0-beta00015,
+            // so we added a switch to be able to read these indexes in later 
versions.
+            int segmentRadix = SegmentInfos.useLegacySegmentNames ? 10 : 
J2N.Character.MaxRadix;
+
             for (int i = 0; i < numSegments; i++)
             {
                 SegmentCommitInfo info = sis.Info(i);
@@ -677,7 +681,7 @@ namespace Lucene.Net.Index
                 try
                 {
                     // LUCENENET: Optimized to not allocate a substring during 
the parse
-                    segmentName = Integer.Parse(info.Info.Name, 1, 
info.Info.Name.Length - 1, radix: 10);
+                    segmentName = Integer.Parse(info.Info.Name, 1, 
info.Info.Name.Length - 1, radix: segmentRadix);
                 }
                 catch
                 {
diff --git a/src/Lucene.Net/Index/IndexFileNames.cs 
b/src/Lucene.Net/Index/IndexFileNames.cs
index 9869681..bdc7295 100644
--- a/src/Lucene.Net/Index/IndexFileNames.cs
+++ b/src/Lucene.Net/Index/IndexFileNames.cs
@@ -1,4 +1,4 @@
-using J2N;
+using J2N;
 using Lucene.Net.Diagnostics;
 using System;
 using System.Text;
@@ -111,7 +111,7 @@ namespace Lucene.Net.Index
                 // to the gen length as string (hopefully an upper limit so SB 
won't
                 // expand in the middle.
                 StringBuilder res = (new StringBuilder(@base.Length + 6 + 
ext.Length))
-                    
.Append(@base).Append('_').Append(gen.ToString(Character.MaxRadix));
+                    
.Append(@base).Append('_').Append(SegmentInfos.SegmentNumberToString(gen, 
allowLegacyNames: false)); // LUCENENET specific - we had this right thru all 
of the betas, so don't change if the legacy feature is enabled
                 if (ext.Length > 0)
                 {
                     res.Append('.').Append(ext);
diff --git a/src/Lucene.Net/Index/IndexWriter.cs 
b/src/Lucene.Net/Index/IndexWriter.cs
index 1f3468e..fe1beb3 100644
--- a/src/Lucene.Net/Index/IndexWriter.cs
+++ b/src/Lucene.Net/Index/IndexWriter.cs
@@ -2131,7 +2131,7 @@ namespace Lucene.Net.Index
                 // problems at least with ConcurrentMergeScheduler.
                 changeCount++;
                 segmentInfos.Changed();
-                return "_" + 
(segmentInfos.Counter++).ToString(J2N.Character.MaxRadix);
+                return "_" + 
SegmentInfos.SegmentNumberToString(segmentInfos.Counter++, allowLegacyNames: 
false); // LUCENENET specific - we had this right thru all of the betas, so 
don't change if the legacy feature is enabled
             }
             finally
             {
diff --git a/src/Lucene.Net/Index/ReadersAndUpdates.cs 
b/src/Lucene.Net/Index/ReadersAndUpdates.cs
index 8dcfde1..e31ed02 100644
--- a/src/Lucene.Net/Index/ReadersAndUpdates.cs
+++ b/src/Lucene.Net/Index/ReadersAndUpdates.cs
@@ -1,4 +1,5 @@
-using J2N.Threading.Atomic;
+using J2N;
+using J2N.Threading.Atomic;
 using Lucene.Net.Diagnostics;
 using Lucene.Net.Documents;
 using Lucene.Net.Support.Threading;
@@ -579,7 +580,10 @@ namespace Lucene.Net.Index
 
                         fieldInfos = builder.Finish();
                         long nextFieldInfosGen = Info.NextFieldInfosGen;
-                        string segmentSuffix = 
nextFieldInfosGen.ToString(CultureInfo.InvariantCulture);//Convert.ToString(nextFieldInfosGen,
 Character.MAX_RADIX));
+                        // LUCENENET specific: We created the segment names 
wrong in 4.8.0-beta00001 - 4.8.0-beta00015,
+                        // so we added a switch to be able to read these 
indexes in later versions. This logic as well as an
+                        // optimization on the first 100 segment values is 
implemented in SegmentInfos.SegmentNumberToString().
+                        string segmentSuffix = 
SegmentInfos.SegmentNumberToString(nextFieldInfosGen);
                         SegmentWriteState state = new SegmentWriteState(null, 
trackingDir, Info.Info, fieldInfos, writer.Config.TermIndexInterval, null, 
IOContext.DEFAULT, segmentSuffix);
                         DocValuesFormat docValuesFormat = 
codec.DocValuesFormat;
                         DocValuesConsumer fieldsConsumer = 
docValuesFormat.FieldsConsumer(state);
diff --git a/src/Lucene.Net/Index/SegmentDocValues.cs 
b/src/Lucene.Net/Index/SegmentDocValues.cs
index e87e0c7..4472ed3 100644
--- a/src/Lucene.Net/Index/SegmentDocValues.cs
+++ b/src/Lucene.Net/Index/SegmentDocValues.cs
@@ -1,4 +1,5 @@
-using J2N.Collections.Generic.Extensions;
+using J2N;
+using J2N.Collections.Generic.Extensions;
 using Lucene.Net.Diagnostics;
 using Lucene.Net.Support.Threading;
 using Lucene.Net.Util;
@@ -46,7 +47,10 @@ namespace Lucene.Net.Index
             if (gen != -1)
             {
                 dvDir = si.Info.Dir; // gen'd files are written outside CFS, 
so use SegInfo directory
-                segmentSuffix = 
gen.ToString(CultureInfo.InvariantCulture);//Convert.ToString((long)gen, 
Character.MAX_RADIX);
+                // LUCENENET specific: We created the segment names wrong in 
4.8.0-beta00001 - 4.8.0-beta00015,
+                // so we added a switch to be able to read these indexes in 
later versions. This logic as well as an
+                // optimization on the first 100 segment values is implemented 
in SegmentInfos.SegmentNumberToString().
+                segmentSuffix = SegmentInfos.SegmentNumberToString(gen);
             }
 
             // set SegmentReadState to list only the fields that are relevant 
to that gen
diff --git a/src/Lucene.Net/Index/SegmentInfos.cs 
b/src/Lucene.Net/Index/SegmentInfos.cs
index 2379ed6..2ea96b5 100644
--- a/src/Lucene.Net/Index/SegmentInfos.cs
+++ b/src/Lucene.Net/Index/SegmentInfos.cs
@@ -6,6 +6,7 @@ using Lucene.Net.Support.IO;
 using System;
 using System.Collections;
 using System.Collections.Generic;
+using System.Globalization;
 using System.IO;
 using System.Runtime.CompilerServices;
 using System.Runtime.ExceptionServices;
@@ -141,6 +142,153 @@ namespace Lucene.Net.Index
         public static readonly int FORMAT_SEGMENTS_GEN_CURRENT = 
FORMAT_SEGMENTS_GEN_CHECKSUM;
 
         /// <summary>
+        /// Setting this to true will generate the same file names that were 
used in 4.8.0-beta00001 through 4.8.0-beta00015.
+        /// When writing more than 10 segments, these segment names were 
incompatible with prior versions of Lucene.NET and incompatible with Lucene 
4.8.0.
+        /// <para/>
+        /// This is only for reading codecs from the affected 4.8.0 beta 
versions; it is not recommended to use this setting for general use.
+        /// <para/>
+        /// This must be set prior to opening an index at application startup. 
When setting it at other times the behavior is undefined.
+        /// <para/>
+        /// Note that this property can also be enabled by setting the 
"useLegacySegmentNames" system property to "true" (such as by setting the 
environment variable "lucene:useLegacySegmentNames").
+        /// System properties can also be injected by supplying a <see 
cref="Configuration.IConfigurationFactory"/> at application startup
+        /// through <see 
cref="Configuration.ConfigurationSettings.SetConfigurationFactory(Configuration.IConfigurationFactory)"/>.
+        /// </summary>
+        public static bool UseLegacySegmentNames
+        {
+            get => useLegacySegmentNames;
+            set => useLegacySegmentNames = value;
+        }
+        internal static bool useLegacySegmentNames = 
Util.SystemProperties.GetPropertyAsBoolean("useLegacySegmentNames", 
defaultValue: false);
+
+        /// <summary>
+        /// Optimized version of <see 
cref="J2N.IntegralNumberExtensions.ToString(long, int)"/> with a radix of 36 
that
+        /// simply does a switch case for the first 100 numbers, which takes 
only 5% of the time that calculating it would.
+        /// We fall back to calling the method after 100 segments.
+        /// <para/>
+        /// This also implements the switch for <see 
cref="UseLegacySegmentNames"/> so it doesn't have to be dealt with externally.
+        /// </summary>
+        /// <returns>The string form of <paramref name="segment"/>.</returns>
+        internal static string SegmentNumberToString(long segment, bool 
allowLegacyNames = true)
+        {
+            switch (segment)
+            {
+                case 0: return "0";
+                case 1: return "1";
+                case 2: return "2";
+                case 3: return "3";
+                case 4: return "4";
+                case 5: return "5";
+                case 6: return "6";
+                case 7: return "7";
+                case 8: return "8";
+                case 9: return "9";
+            }
+
+            if (!allowLegacyNames || !useLegacySegmentNames)
+            {
+                return segment switch
+                {
+                    10 => "a",
+                    11 => "b",
+                    12 => "c",
+                    13 => "d",
+                    14 => "e",
+                    15 => "f",
+                    16 => "g",
+                    17 => "h",
+                    18 => "i",
+                    19 => "j",
+                    20 => "k",
+                    21 => "l",
+                    22 => "m",
+                    23 => "n",
+                    24 => "o",
+                    25 => "p",
+                    26 => "q",
+                    27 => "r",
+                    28 => "s",
+                    29 => "t",
+                    30 => "u",
+                    31 => "v",
+                    32 => "w",
+                    33 => "x",
+                    34 => "y",
+                    35 => "z",
+                    36 => "10",
+                    37 => "11",
+                    38 => "12",
+                    39 => "13",
+                    40 => "14",
+                    41 => "15",
+                    42 => "16",
+                    43 => "17",
+                    44 => "18",
+                    45 => "19",
+                    46 => "1a",
+                    47 => "1b",
+                    48 => "1c",
+                    49 => "1d",
+                    50 => "1e",
+                    51 => "1f",
+                    52 => "1g",
+                    53 => "1h",
+                    54 => "1i",
+                    55 => "1j",
+                    56 => "1k",
+                    57 => "1l",
+                    58 => "1m",
+                    59 => "1n",
+                    60 => "1o",
+                    61 => "1p",
+                    62 => "1q",
+                    63 => "1r",
+                    64 => "1s",
+                    65 => "1t",
+                    66 => "1u",
+                    67 => "1v",
+                    68 => "1w",
+                    69 => "1x",
+                    70 => "1y",
+                    71 => "1z",
+                    72 => "20",
+                    73 => "21",
+                    74 => "22",
+                    75 => "23",
+                    76 => "24",
+                    77 => "25",
+                    78 => "26",
+                    79 => "27",
+                    80 => "28",
+                    81 => "29",
+                    82 => "2a",
+                    83 => "2b",
+                    84 => "2c",
+                    85 => "2d",
+                    86 => "2e",
+                    87 => "2f",
+                    88 => "2g",
+                    89 => "2h",
+                    90 => "2i",
+                    91 => "2j",
+                    92 => "2k",
+                    93 => "2l",
+                    94 => "2m",
+                    95 => "2n",
+                    96 => "2o",
+                    97 => "2p",
+                    98 => "2q",
+                    99 => "2r",
+                    _ => segment.ToString(Character.MaxRadix),
+                };
+            }
+
+            // This is wrong! Unfortunately, this is how the segment names 
were generated in
+            // beta 1 thru 15, so we end up here if the switch is enabled to 
read them.
+            // We should actually be using a radix of 36 rather than 10 (which 
was done correctly in Lucene.NET 3.0.3).
+            return segment.ToString(CultureInfo.InvariantCulture);
+        }
+
+        /// <summary>
         /// Used to name new segments. </summary>
         public int Counter { get; set; }
 
diff --git a/src/Lucene.Net/Index/SegmentReader.cs 
b/src/Lucene.Net/Index/SegmentReader.cs
index 6700f02..c306867 100644
--- a/src/Lucene.Net/Index/SegmentReader.cs
+++ b/src/Lucene.Net/Index/SegmentReader.cs
@@ -1,4 +1,5 @@
-using J2N.Runtime.CompilerServices;
+using J2N;
+using J2N.Runtime.CompilerServices;
 using Lucene.Net.Diagnostics;
 using Lucene.Net.Util;
 using System;
@@ -232,7 +233,10 @@ namespace Lucene.Net.Index
 
             try
             {
-                string segmentSuffix = info.FieldInfosGen == -1 ? "" : 
info.FieldInfosGen.ToString(CultureInfo.InvariantCulture);//Convert.ToString(info.FieldInfosGen,
 Character.MAX_RADIX));
+                // LUCENENET specific: We created the segment names wrong in 
4.8.0-beta00001 - 4.8.0-beta00015,
+                // so we added a switch to be able to read these indexes in 
later versions. This logic as well as an
+                // optimization on the first 100 segment values is implemented 
in SegmentInfos.SegmentNumberToString().
+                string segmentSuffix = info.FieldInfosGen == -1 ? string.Empty 
: SegmentInfos.SegmentNumberToString(info.FieldInfosGen);
                 return 
info.Info.Codec.FieldInfosFormat.FieldInfosReader.Read(dir, info.Info.Name, 
segmentSuffix, IOContext.READ_ONCE);
             }
             finally

Reply via email to