Pawel Rog created LUCENE-8695:
---------------------------------
Summary: Word delimiter graph or span queries bug
Key: LUCENE-8695
URL: https://issues.apache.org/jira/browse/LUCENE-8695
Project: Lucene - Core
Issue Type: Bug
Affects Versions: 7.7
Reporter: Pawel Rog
I have a simple phrase query and a token stream which uses word delimiter
graph, and the search fails. I tried different configurations of word delimiter
graph but could not find a good solution for this. I don't actually know if the
problem is on the word delimiter side or the query side.
Query which is generated:
{code:java}
spanNear([field:added, spanOr([field:foobarbaz, spanNear([field:foo,
field:bar, field:baz], 0, true)]), field:entry], 0, true)
{code}
Code of test where I isolated the problem is attached below:
{code:java}
/**
 * Isolated reproduction: a phrase query over a field analyzed with
 * WordDelimiterGraphFilter produces a span query that fails to match the
 * indexed document. Index-time and query-time chains differ deliberately
 * (index side adds GENERATE_NUMBER_PARTS and PRESERVE_ORIGINAL).
 */
public class TestPhrase extends LuceneTestCase {
  private static IndexSearcher searcher;
  private static IndexReader reader;
  private Query query;
  private static Directory directory;

  /** Query-time analyzer: whitespace tokenize -> word-delimiter graph -> lowercase. */
  private static Analyzer searchAnalyzer = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      // Note: no GENERATE_NUMBER_PARTS and no PRESERVE_ORIGINAL on the search side.
      int flags = WordDelimiterGraphFilter.GENERATE_WORD_PARTS
          | WordDelimiterGraphFilter.CATENATE_WORDS
          | WordDelimiterGraphFilter.CATENATE_NUMBERS
          | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE;
      TokenFilter delimited = new WordDelimiterGraphFilter(source,
          WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, CharArraySet.EMPTY_SET);
      TokenFilter lowercased = new LowerCaseFilter(delimited);
      return new TokenStreamComponents(source, lowercased);
    }
  };

  /** Index-time analyzer: same chain but with number parts and original token preserved. */
  private static Analyzer indexAnalyzer = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      int flags = WordDelimiterGraphFilter.GENERATE_WORD_PARTS
          | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
          | WordDelimiterGraphFilter.CATENATE_WORDS
          | WordDelimiterGraphFilter.CATENATE_NUMBERS
          | WordDelimiterGraphFilter.PRESERVE_ORIGINAL
          | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE;
      TokenFilter delimited = new WordDelimiterGraphFilter(source,
          WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, CharArraySet.EMPTY_SET);
      TokenFilter lowercased = new LowerCaseFilter(delimited);
      return new TokenStreamComponents(source, lowercased);
    }

    @Override
    public int getPositionIncrementGap(String fieldName) {
      return 100;
    }
  };

  /** Index a single document whose field value triggers the word-delimiter split. */
  @BeforeClass
  public static void beforeClass() throws Exception {
    directory = newDirectory();
    RandomIndexWriter iw = new RandomIndexWriter(random(), directory, indexAnalyzer);
    Document document = new Document();
    document.add(newTextField("field", "Added FooBarBaz entry", Field.Store.YES));
    iw.addDocument(document);
    reader = iw.getReader();
    iw.close();
    searcher = new IndexSearcher(reader);
  }

  @Override
  public void setUp() throws Exception {
    super.setUp();
  }

  /** Release all static fixtures opened in {@link #beforeClass()}. */
  @AfterClass
  public static void afterClass() throws Exception {
    searcher = null;
    reader.close();
    reader = null;
    directory.close();
    directory = null;
  }

  /** Expects one hit for the phrase that was indexed verbatim; currently fails. */
  public void testSearch() throws Exception {
    QueryParser qp = new QueryParser("field", searchAnalyzer);
    query = qp.parse("\"Added FooBarBaz entry\"");
    System.out.println(query);
    ScoreDoc[] scoreDocs = searcher.search(query, 1000).scoreDocs;
    assertEquals(1, scoreDocs.length);
  }
}
{code}
NOTE: I tested it on Lucene 7.1.0, 7.4.0 and 7.7.0
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]