Hello, I recently started using Lucene and have been checking its built-in synonym processing facilities. My main question so far is: what is the benefit of using /SynonymFilter/ over explicitly adding synonyms as document fields? The former has an obvious drawback: it doesn't support transitive relations. Consider the simple example below - registering the pairs (/"first"/, /"second"/) and (/"first"/, /"third"/) as synonyms, indexing /"second"/, and then searching for /"third"/ yields no match:
package com.my.social.search.lucene;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.TokenStream;import org.apache.lucene.analysis.Tokenizer;import org.apache.lucene.analysis.core.LowerCaseFilter;import org.apache.lucene.analysis.core.StopFilter;import org.apache.lucene.analysis.en.EnglishAnalyzer;import org.apache.lucene.analysis.en.EnglishPossessiveFilter;import org.apache.lucene.analysis.en.PorterStemFilter;import org.apache.lucene.analysis.miscellaneous.LengthFilter;import org.apache.lucene.analysis.standard.StandardFilter;import org.apache.lucene.analysis.standard.StandardTokenizer;import org.apache.lucene.analysis.synonym.SynonymFilter;import org.apache.lucene.analysis.synonym.SynonymMap;import org.apache.lucene.analysis.util.StopwordAnalyzerBase;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.document.TextField;import org.apache.lucene.index.DirectoryReader;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.index.IndexWriterConfig;import org.apache.lucene.queryparser.classic.ParseException;import org.apache.lucene.queryparser.classic.QueryParser;import org.apache.lucene.search.*;import org.apache.lucene.store.RAMDirectory;import org.apache.lucene.util.CharsRef;import org.apache.lucene.util.Version;import org.jetbrains.annotations.NotNull;import java.io.IOException;import java.io.Reader;/** * @author Denis Zhdanov * @since 9/5/13 12:10 AM */public class LuceneTest { public static void main(String[] args) throws IOException, ParseException { RAMDirectory dir = new RAMDirectory(); SynonymMap.Builder builder = new SynonymMap.Builder(true); builder.add(new CharsRef("first"), new CharsRef("second"), true); builder.add(new CharsRef("first"), new CharsRef("third"), true); MyAnalyzer analyzer = new MyAnalyzer(builder.build()); try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_44, analyzer))) { Document document = new Document(); 
document.add(new TextField("tag", "second", Field.Store.YES)); writer.addDocument(document); } IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(dir)); QueryParser queryParser = new QueryParser(Version.LUCENE_44, "tag", analyzer); Query query = queryParser.parse("third"); TopDocs hits = searcher.search(query, null, 10); for (ScoreDoc scoreDoc : hits.scoreDocs) { Document doc = searcher.doc(scoreDoc.doc); System.out.println(doc.get("tag")); //Explanation explain = searcher.explain(query, scoreDoc.doc); //System.out.println(explain); } } private static class MyAnalyzer extends StopwordAnalyzerBase { private final SynonymMap synonyms; MyAnalyzer(@NotNull SynonymMap synonyms) { super(Version.LUCENE_44); this.synonyms = synonyms; } @Override protected Analyzer.TokenStreamComponents createComponents(String fieldName, Reader reader) { final Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); result = new EnglishPossessiveFilter(matchVersion, result); result = new LowerCaseFilter(matchVersion, result); result = new SynonymFilter(result, synonyms, true); result = new PorterStemFilter(result); return new Analyzer.TokenStreamComponents(source, result); } }} That means that I need to explicitly register all possible pairs from a set of synonyms to get the SynonymFilter-based approach to work (I have a large set of English synonyms, built from the Gutenberg dictionary, where every synonym group contains more than two words). So far, the only possible benefit of using /SynonymFilter/ that I can see is phrase search, where synonym position matters: package com.my.social.search.lucene;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.TokenStream;import org.apache.lucene.analysis.Tokenizer;import org.apache.lucene.analysis.core.LowerCaseFilter;import org.apache.lucene.analysis.core.StopFilter;import org.apache.lucene.analysis.en.EnglishAnalyzer;import 
org.apache.lucene.analysis.en.EnglishPossessiveFilter;import org.apache.lucene.analysis.en.PorterStemFilter;import org.apache.lucene.analysis.miscellaneous.LengthFilter;import org.apache.lucene.analysis.standard.StandardFilter;import org.apache.lucene.analysis.standard.StandardTokenizer;import org.apache.lucene.analysis.synonym.SynonymFilter;import org.apache.lucene.analysis.synonym.SynonymMap;import org.apache.lucene.analysis.util.StopwordAnalyzerBase;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.document.TextField;import org.apache.lucene.index.DirectoryReader;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.index.IndexWriterConfig;import org.apache.lucene.queryparser.classic.ParseException;import org.apache.lucene.queryparser.classic.QueryParser;import org.apache.lucene.search.*;import org.apache.lucene.store.RAMDirectory;import org.apache.lucene.util.CharsRef;import org.apache.lucene.util.Version;import org.jetbrains.annotations.NotNull;import java.io.IOException;import java.io.Reader;/** * @author Denis Zhdanov * @since 9/5/13 12:10 AM */public class LuceneTest { public static void main(String[] args) throws IOException, ParseException { RAMDirectory dir = new RAMDirectory(); SynonymMap.Builder builder = new SynonymMap.Builder(true); builder.add(new CharsRef("first"), new CharsRef("second"), true); MyAnalyzer analyzer = new MyAnalyzer(builder.build()); try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_44, analyzer))) { Document document = new Document(); document.add(new TextField("tag", "second point or number", Field.Store.YES)); writer.addDocument(document); document = new Document(); document.add(new TextField("tag", "first number dummy", Field.Store.YES)); writer.addDocument(document); } IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(dir)); QueryParser queryParser = new QueryParser(Version.LUCENE_44, "tag", analyzer); Query 
query = queryParser.parse("\"second number\""); TopDocs hits = searcher.search(query, null, 10); for (ScoreDoc scoreDoc : hits.scoreDocs) { Document doc = searcher.doc(scoreDoc.doc); System.out.println(doc.get("tag")); //Explanation explain = searcher.explain(query, scoreDoc.doc); //System.out.println(explain); } } private static class MyAnalyzer extends StopwordAnalyzerBase { private final SynonymMap synonyms; MyAnalyzer(@NotNull SynonymMap synonyms) { super(Version.LUCENE_44); this.synonyms = synonyms; } @Override protected Analyzer.TokenStreamComponents createComponents(String fieldName, Reader reader) { final Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); result = new EnglishPossessiveFilter(matchVersion, result); result = new LowerCaseFilter(matchVersion, result); result = new SynonymFilter(result, synonyms, true); result = new PorterStemFilter(result); return new Analyzer.TokenStreamComponents(source, result); } }} I googled to find out whether my understanding is correct, but unfortunately the search didn't turn up any results. That's why I decided to ask the community before digging into the Lucene sources. -- View this message in context: http://lucene.472066.n3.nabble.com/SynonymFilter-benefit-over-explicit-field-composition-tp4088819.html Sent from the Lucene - Java Users mailing list archive at Nabble.com.