This is an automated email from the ASF dual-hosted git repository.
paulirwin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git
The following commit(s) were added to refs/heads/master by this push:
new ca3bdacd5 ShingleFilter produces invalid queries (#946)
ca3bdacd5 is described below
commit ca3bdacd56c105174e9700d6eceec6c7c53b243c
Author: tohidemyname <[email protected]>
AuthorDate: Mon Nov 4 23:48:25 2024 +0800
ShingleFilter produces invalid queries (#946)
* ShingleFilter produces invalid queries
https://github.com/apache/lucenenet/issues/943
* Add LUCENENET-specific backport comment, fix test name, fix test position
and code style
---------
Co-authored-by: tohidemyname <tohidemyname>
Co-authored-by: Paul Irwin <[email protected]>
---
.../Analysis/Shingle/ShingleFilter.cs | 11 ++-
.../Analysis/Shingle/ShingleFilterTest.cs | 96 +++++++++++++++++++++-
2 files changed, 104 insertions(+), 3 deletions(-)
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Shingle/ShingleFilter.cs
b/src/Lucene.Net.Analysis.Common/Analysis/Shingle/ShingleFilter.cs
index 9cadafd2d..f36f9b8a7 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Shingle/ShingleFilter.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Shingle/ShingleFilter.cs
@@ -366,7 +366,16 @@ namespace Lucene.Net.Analysis.Shingle
noShingleOutput = false;
}
offsetAtt.SetOffset(offsetAtt.StartOffset,
nextToken.offsetAtt.EndOffset);
- posLenAtt.PositionLength = builtGramSize;
+ // LUCENENET-specific: backported fix from Lucene 6.5.0
(LUCENE-7708)
+ if (outputUnigrams)
+ {
+ posLenAtt.PositionLength = builtGramSize;
+ }
+ else
+ {
+                        // position length for this token is the number of
positions created by shingles of smaller size.
+ posLenAtt.PositionLength = Math.Max(1, (builtGramSize
- minShingleSize) + 1);
+ }
isOutputHere = true;
gramSize.Advance();
tokenAvailable = true;
diff --git
a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Shingle/ShingleFilterTest.cs
b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Shingle/ShingleFilterTest.cs
index 1cf5ad687..8b3fc9ee9 100644
--- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Shingle/ShingleFilterTest.cs
+++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Shingle/ShingleFilterTest.cs
@@ -1,4 +1,4 @@
-// Lucene version compatibility level 4.8.1
+// Lucene version compatibility level 4.8.1
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.TokenAttributes;
using NUnit.Framework;
@@ -617,5 +617,97 @@ namespace Lucene.Net.Analysis.Shingle
AssertTokenStreamContents(filter, new string[] { "purple",
"purplewizard", "purplewizard", "wizard", "wizard", "wizard" }, new int[] { 0,
0, 0, 7, 7, 7 }, new int[] { 6, 13, 20, 13, 20, 20 }, new int[] { 1, 0, 0, 1,
0, 0 }, 20);
}
+
+ // LUCENENET-specific: backported fix from Lucene 6.5.0 (LUCENE-7708)
+ [Test]
+ public void TestPositionLength()
+ {
+ Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName,
reader) =>
+ {
+ MockBytesAttributeFactory factory = new
MockBytesAttributeFactory();
+ Tokenizer tokenizer = new MockTokenizer(factory, reader,
MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
+ ShingleFilter filter = new ShingleFilter(tokenizer, 4, 4);
+ filter.SetOutputUnigrams(false);
+ return new TokenStreamComponents(tokenizer, filter);
+ });
+
+ AssertTokenStreamContents(a.GetTokenStream("", "to be or not to
be"),
+ new string[] {"to be or not", "be or not to", "or not to be"},
+ new int[] {0, 3, 6},
+ new int[] { 12, 15, 18 },
+ null,
+ new int[] { 1, 1, 1 },
+ new int[] { 1, 1, 1 },
+ 18,
+ // offsets are correct but assertTokenStreamContents does not
handle multiple terms with different offsets
+ // finishing at the same position
+ false);
+
+ a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+ {
+ MockBytesAttributeFactory factory = new
MockBytesAttributeFactory();
+ Tokenizer tokenizer = new MockTokenizer(factory, reader,
MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
+ ShingleFilter filter = new ShingleFilter(tokenizer, 2, 4);
+ filter.SetOutputUnigrams(false);
+ return new TokenStreamComponents(tokenizer, filter);
+ });
+
+ AssertTokenStreamContents(a.GetTokenStream("", "to be or not to
be"),
+ new string[] {"to be", "to be or", "to be or not", "be or",
"be or not", "be or not to", "or not", "or not to",
+ "or not to be", "not to", "not to be", "to be"},
+ new int[] { 0, 0, 0, 3, 3, 3, 6, 6, 6, 9, 9, 13 },
+ new int[] { 5, 8, 12, 8, 12, 15, 12, 15, 18, 15, 18, 18 },
+ null,
+ new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1 },
+ new int[] { 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 1 },
+ 18,
+ // offsets are correct but assertTokenStreamContents does not
handle multiple terms with different offsets
+ // finishing at the same position
+ false);
+
+ a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+ {
+ MockBytesAttributeFactory factory = new
MockBytesAttributeFactory();
+ Tokenizer tokenizer = new MockTokenizer(factory, reader,
MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
+ ShingleFilter filter = new ShingleFilter(tokenizer, 3, 4);
+ filter.SetOutputUnigrams(false);
+ return new TokenStreamComponents(tokenizer, filter);
+ });
+
+ AssertTokenStreamContents(a.GetTokenStream("", "to be or not to
be"),
+ new string[] {"to be or", "to be or not", "be or not", "be or
not to", "or not to",
+ "or not to be", "not to be"},
+ new int[] { 0, 0, 3, 3, 6, 6, 9 },
+ new int[] { 8, 12, 12, 15, 15, 18, 18 },
+ null,
+ new int[] { 1, 0, 1, 0, 1, 0, 1, 0 },
+ new int[] { 1, 2, 1, 2, 1, 2, 1, 2 },
+ 18,
+ // offsets are correct but assertTokenStreamContents does not
handle multiple terms with different offsets
+ // finishing at the same position
+ false);
+
+ a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+ {
+ MockBytesAttributeFactory factory = new
MockBytesAttributeFactory();
+ Tokenizer tokenizer = new MockTokenizer(factory, reader,
MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
+ ShingleFilter filter = new ShingleFilter(tokenizer, 3, 5);
+ filter.SetOutputUnigrams(false);
+ return new TokenStreamComponents(tokenizer, filter);
+ });
+
+ AssertTokenStreamContents(a.GetTokenStream("", "to be or not to
be"),
+ new string[] {"to be or", "to be or not", "to be or not to",
"be or not", "be or not to",
+ "be or not to be", "or not to", "or not to be", "not
to be"},
+ new int[] { 0, 0, 0, 3, 3, 3, 6, 6, 9, 9 },
+ new int[] { 8, 12, 15, 12, 15, 18, 15, 18, 18 },
+ null,
+ new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 1, 0 },
+ new int[] { 1, 2, 3, 1, 2, 3, 1, 2, 1 },
+ 18,
+ // offsets are correct but assertTokenStreamContents does not
handle multiple terms with different offsets
+ // finishing at the same position
+ false);
+ }
}
-}
\ No newline at end of file
+}