NightOwl888 commented on issue #896:
URL: https://github.com/apache/lucenenet/issues/896#issuecomment-1902097205

   I was able to isolate this by adding code to the test to generate the 
sequence of method calls.
   
   <details>
     <summary>Expand code example</summary>
     
   ```c#
           /// <summary>
           /// Instrumented variant of the randomized test: while it builds the
           /// override map and runs the filter chain, it also writes the equivalent
           /// sequence of calls as C# statements to a text file, so a failing random
           /// run can be replayed as a fixed test.
           /// </summary>
           [Test]
           public virtual void TestRandomRealisticWhiteSpace()
           {
               // Escapes backslashes and double quotes so a value can be embedded in a
               // regular C# string literal in the generated file.
               static string Escape(string s) => s.Replace("\\", "\\\\").Replace("\"", "\\\"");

               IDictionary<string, string> map = new Dictionary<string, string>();
               int numTerms = AtLeast(50);
               for (int i = 0; i < numTerms; i++)
               {
                   string randomRealisticUnicodeString = TestUtil.RandomRealisticUnicodeString(Random);
                   char[] charArray = randomRealisticUnicodeString.ToCharArray();

                   // Copy the random string code point by code point, dropping whitespace.
                   StringBuilder sb = new StringBuilder();
                   for (int j = 0; j < charArray.Length;)
                   {
                       int cp = Character.CodePointAt(charArray, j, charArray.Length);
                       if (!Character.IsWhiteSpace(cp))
                       {
                           sb.AppendCodePoint(cp);
                       }
                       j += Character.CharCount(cp);
                   }

                   if (sb.Length > 0)
                   {
                       // An empty replacement is substituted with "a".
                       string value = TestUtil.RandomSimpleString(Random);
                       map[sb.ToString()] = value.Length == 0 ? "a" : value;
                   }
               }
               if (map.Count == 0)
               {
                   map["booked"] = "books";
               }

               // NOTE(review): machine-specific output path kept from the original
               // debugging session; adjust before running elsewhere.
               using var writer = new StreamWriter(@"F:\testgen.txt", append: false, Encoding.UTF8);

               bool ignoreCase = Random.nextBoolean();

               // Preamble of the generated repro.
               writer.WriteLine($"StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder({(ignoreCase ? "true" : "false")});");
               writer.WriteLine("StringBuilder input = new StringBuilder();");
               writer.WriteLine("IList<string> output = new JCG.List<string>();");
               writer.WriteLine();

               StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(ignoreCase);
               StringBuilder input = new StringBuilder();
               IList<string> output = new JCG.List<string>();
               foreach (KeyValuePair<string, string> entry in map)
               {
                   string key = Escape(entry.Key);
                   string value = Escape(entry.Value);

                   writer.WriteLine();
                   writer.WriteLine($"builder.Add(\"{key}\", \"{value}\");");
                   builder.Add(entry.Key, entry.Value);

                   // Randomly pick which entries are fed through the token stream,
                   // always taking one while the expected output is still empty.
                   if (Random.nextBoolean() || output.Count == 0)
                   {
                       writer.WriteLine($"input.Append(\"{key}\").Append(' ');");
                       writer.WriteLine($"output.Add(\"{value}\");");

                       input.Append(entry.Key).Append(' ');
                       output.Add(entry.Value);
                   }
               }

               // Tail of the generated repro: build the filter chain and assert.
               writer.WriteLine();
               writer.WriteLine("Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input.ToString()));");
               writer.WriteLine("TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.Build()));");
               writer.WriteLine("AssertTokenStreamContents(stream, output.ToArray());");

               Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input.ToString()));
               TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.Build()));
               AssertTokenStreamContents(stream, output.ToArray());
           }
   ```
   </details>
   
   Then I took the result and made a new test and whittled it down to a minimal 
example.
   
   <details>
     <summary>Expand code example</summary>
     
   ```c#
           /// <summary>
           /// Fixed-input reduction of the randomized test. The builder is created
           /// with <c>ignoreCase</c> set to <c>true</c>, and two of the keys are the
           /// upper- and lower-case Roman numeral three (U+2162 and U+2172), which
           /// differ only by case.
           /// </summary>
           [Test]
           public virtual void TestRandomRealisticWhiteSpace_Fixed()
           {
               StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(true);
               StringBuilder input = new StringBuilder();
               IList<string> output = new JCG.List<string>();

               // U+2162 ROMAN NUMERAL THREE, written as an escape so it survives
               // re-encoding of this text.
               builder.Add("\u2162", "ceiqskp");
               input.Append("\u2162").Append(' ');
               output.Add("ceiqskp");

               // NOTE(review): the keys below look garbled by the archive's character
               // encoding; their original code points cannot be recovered from this
               // text, so they are kept exactly as found.
               builder.Add("πΊπ‘‡π‘„πŸπ€π–πœπ‰π΅π‘Šπ‘‰π‘‹", "mrjxmndpat");
               builder.Add("πŸ„†πŸ‡žπŸ…ΈπŸ„‡πŸ…žπŸ‡²πŸ…—πŸ‡πŸ‡―πŸ„™πŸ†³πŸ‡―πŸ†˜πŸ…΅πŸ„„πŸ‡€πŸ…„", "a");
               builder.Add("Ν¨Μ‘Μš", "dvaihyu");
               builder.Add("οΉ οΉ›οΉ‘οΉ•οΉͺοΉ‘", "erczyhyhi");
               builder.Add("βΈ΅βΉ»βΉ”βΉ³βΉ›βΈŸβΈ€βΉŸβΉ“βΉΉβΈ…βΈ·βΈ‘βΈŽ", "a");
               builder.Add("πŠœπŠ€πŠ‹πŠ›πŠšπŠ“πŠπŠπŠ•πŠ†πŠŽπŠ‹πŠ™πŠπŠ›πŠŒπŠˆπŠŽπŠπŠ‡", "a");

               // U+2172 SMALL ROMAN NUMERAL THREE: the lower-case form of the first
               // key, so with ignoreCase enabled the two entries presumably map to the
               // same override key.
               builder.Add("\u2172", "etmdu");
               input.Append("\u2172").Append(' ');
               output.Add("etmdu");

               Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input.ToString()));
               TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.Build()));
               AssertTokenStreamContents(stream, output.ToArray());
           }
   ```
   
   </details>
   
   I ported this to Java and it fails there, too. But it became obvious what 
the issue is - `ignoreCase` is set to `true` and with stemming applied and 
casing ignored, we end up with `iii` in both cases.
   
   I checked the main branch on Lucene and this has been fixed already, which I 
traced to the following commit:
   
   
https://github.com/apache/lucene/commit/bce10efeb40c11271cb398c37b859408818b8a00
   
   So, there is no problem at all with the production code, just a poorly 
thought-out test that occasionally generates colliding keys and then fails. I 
will port the changes from the above commit and submit a PR with a patch so the 
test doesn't randomly fail.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: dev-unsubscr...@lucenenet.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to