I'm trying to configure per-field similarity to disregard term
frequency (omitTf) in a 'title' field. I'm trying to follow the
example docs without success: my custom similarity doesn't seem to
have any effect on 'tf'. Is the NoTfSimilarity class below
written correctly? Any advice is much appreciated.
my schema.xml:
<field name="title" type="text_custom_sim" indexed="true"
stored="true" omitNorms="true" termVectors="true" />
<similarity class="solr.SchemaSimilarityFactory"/>
<fieldType name="text_custom_sim" class="solr.TextField"
positionIncrementGap="100" autoGeneratePhraseQueries="true">
<!-- Per-field similarity must be a direct child of <fieldType>. A <similarity>
     element nested inside <analyzer> is not read by Solr, so it is declared
     once here instead of once per analyzer. -->
<similarity class="com.ssww.NoTfSimilarityFactory" />
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StandardFilterFactory"/>
.....
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StandardFilterFactory"/>
.....
</analyzer>
NoTfSimilarityFactory.java:
package com.ssww;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.solr.schema.SimilarityFactory;
/**
 * Solr similarity factory that hands out {@link NoTfSimilarity} instances.
 *
 * <p>Referenced from schema.xml by its fully qualified class name.
 */
public class NoTfSimilarityFactory extends SimilarityFactory {

  /**
   * Creates the similarity used for fields configured with this factory.
   *
   * @return a newly constructed {@link NoTfSimilarity}
   */
  @Override
  public Similarity getSimilarity() {
    final Similarity noTfSimilarity = new NoTfSimilarity();
    return noTfSimilarity;
  }
}
NoTfSimilarity.java:
package com.ssww;
import org.apache.lucene.search.similarities.DefaultSimilarity;
public final class NoTfSimilarity extends DefaultSimilarity {
public float tf(int i) {
return 1;
}
}
These two files are in a jar in the lib directory of this core.
Here are the results of a search for "paint" with custom and default
similarity:
Indexed with per-field NoTfSimilarity:
284.5441 = (MATCH) boost(+(title:paint^8.0 | search_keywords:paint |
shingle_text:paint^2.0 | description:paint^0.5 | nosyn:paint^5.0 |
bullets:paint^0.5) () () () () () (),scale(int(page_views),1.0,3.0)),
product of:
280.5598 = (MATCH) sum of:
280.5598 = (MATCH) max of:
280.5598 = (MATCH) weight(title:paint^8.0 in 48) [], result of:
280.5598 = score(doc=48,freq=2.0 = termFreq=2.0
), product of:
39.83825 = queryWeight, product of:
8.0 = boost
4.979781 = idf(docFreq=187, maxDocs=10059)
1.0 = queryNorm
7.042474 = fieldWeight in 48, product of:
1.4142135 = tf(freq=2.0), with freq of:
2.0 = termFreq=2.0
4.979781 = idf(docFreq=187, maxDocs=10059)
1.0 = fieldNorm(doc=48)
18.217428 = (MATCH) weight(search_keywords:paint in 48) [],
result of:
18.217428 = score(doc=48,freq=1.0 = termFreq=1.0
), product of:
4.268188 = queryWeight, product of:
4.268188 = idf(docFreq=382, maxDocs=10059)
1.0 = queryNorm
4.268188 = fieldWeight in 48, product of:
1.0 = tf(freq=1.0), with freq of:
1.0 = termFreq=1.0
4.268188 = idf(docFreq=382, maxDocs=10059)
1.0 = fieldNorm(doc=48)
7.725952 = (MATCH) weight(description:paint^0.5 in 48) [],
result of:
7.725952 = score(doc=48,freq=2.0 = termFreq=2.0
), product of:
1.6527361 = queryWeight, product of:
0.5 = boost
3.3054721 = idf(docFreq=1002, maxDocs=10059)
1.0 = queryNorm
4.6746435 = fieldWeight in 48, product of:
1.4142135 = tf(freq=2.0), with freq of:
2.0 = termFreq=2.0
3.3054721 = idf(docFreq=1002, maxDocs=10059)
1.0 = fieldNorm(doc=48)
106.50396 = (MATCH) weight(nosyn:paint^5.0 in 48) [], result of:
106.50396 = score(doc=48,freq=4.0 = termFreq=4.0
), product of:
16.317472 = queryWeight, product of:
5.0 = boost
3.2634945 = idf(docFreq=1045, maxDocs=10059)
1.0 = queryNorm
6.526989 = fieldWeight in 48, product of:
2.0 = tf(freq=4.0), with freq of:
4.0 = termFreq=4.0
3.2634945 = idf(docFreq=1045, maxDocs=10059)
1.0 = fieldNorm(doc=48)
1.0142012 =
scale(int(page_views)=18,toMin=1.0,toMax=3.0,fromMin=0.0,fromMax=2535.0)
Indexed with DefaultSimilarity:
7.630908 = (MATCH) boost(+(title:paint^8.0 | search_keywords:paint |
shingle_text:paint^2.0 | description:paint^0.5 | nosyn:paint^5.0 |
bullets:paint^0.5) () () () () () (),scale(int(page_views),1.0,3.0)),
product of:
7.524058 = (MATCH) sum of:
7.524058 = (MATCH) max of:
7.524058 = (MATCH) weight(title:paint^8.0 in 3504)
[DefaultSimilarity], result of:
7.524058 = fieldWeight in 3504, product of:
1.4142135 = tf(freq=2.0), with freq of:
2.0 = termFreq=2.0
5.3203125 = idf(docFreq=197, maxDocs=14892)
1.0 = fieldNorm(doc=3504)
0.5091842 = (MATCH) weight(search_keywords:paint in 3504)
[DefaultSimilarity], result of:
0.5091842 = score(doc=3504,freq=1.0 = termFreq=1.0
), product of:
0.10937647 = queryWeight, product of:
4.655336 = idf(docFreq=384, maxDocs=14892)
0.02349486 = queryNorm
4.655336 = fieldWeight in 3504, product of:
1.0 = tf(freq=1.0), with freq of:
1.0 = termFreq=1.0
4.655336 = idf(docFreq=384, maxDocs=14892)
1.0 = fieldNorm(doc=3504)
0.20965372 = (MATCH) weight(description:paint^0.5 in 3504)
[DefaultSimilarity], result of:
0.20965372 = score(doc=3504,freq=2.0 = termFreq=2.0
), product of:
0.04173162 = queryWeight, product of:
0.5 = boost
3.5524042 = idf(docFreq=1159, maxDocs=14892)
0.02349486 = queryNorm
5.023858 = fieldWeight in 3504, product of:
1.4142135 = tf(freq=2.0), with freq of:
2.0 = termFreq=2.0
3.5524042 = idf(docFreq=1159, maxDocs=14892)
1.0 = fieldNorm(doc=3504)
2.8990223 = (MATCH) weight(nosyn:paint^5.0 in 3504)
[DefaultSimilarity], result of:
2.8990223 = score(doc=3504,freq=4.0 = termFreq=4.0
), product of:
0.41265035 = queryWeight, product of:
5.0 = boost
3.5126863 = idf(docFreq=1206, maxDocs=14892)
0.02349486 = queryNorm
7.0253725 = fieldWeight in 3504, product of:
2.0 = tf(freq=4.0), with freq of:
4.0 = termFreq=4.0
3.5126863 = idf(docFreq=1206, maxDocs=14892)
1.0 = fieldNorm(doc=3504)
1.0142012 =
scale(int(page_views)=18,toMin=1.0,toMax=3.0,fromMin=0.0,fromMax=2535.0)