CommonGramsFilter

Nawab Zada Asad Iqbal Thu, 04 Jan 2018 18:09:00 -0800

Hi,

I am looking at this documentation and wondering if it would be better to
optionally skip indexing of original stopwords.


https://lucene.apache.org/solr/guide/6_6/filter-descriptions.html#FilterDescriptions-CommonGramsFilter

http://localhost:8983/solr/filesearch/select?q=not%20to%20or%20be&debugQuery=true


   - parsedquery: "+(-DisjunctionMaxQuery((commongram_field2:to)~0.01)
   DisjunctionMaxQuery((commongram_field2:be)~0.01))~1",



Other parameters are:


   - params: {
      - mm: " 1<-0% ",
      - q.alt: "*:*",
      - ps: "100",
      - echoParams: "all",
      - sort: "score desc",
      - rows: "35",
      - version: "2.2",
      - q: "not to or be",
      - tie: "0.01",
      - defType: "edismax",
      - qf: "commongram_field2",
      - sow: "false",
      - wt: "json",
      - debugQuery: "true"
      }


And it doesn't match my document, which has the following fields:


   - id: "9191",
   - commongram_field2: "not to or be",



Commongram is defined as:

    <!-- Field under test: indexed and stored, with positions kept
         (omitPositions="false"). Its type name is spelled "commongaram",
         which matches the fieldType name declared in this schema snippet. -->
    <field name="commongram_field2" type="commongaram" indexed="true"
stored="true" omitPositions="false"/>

    <!-- Text field type with separate index-time and query-time analyzers.
         Both chains share the same char filter, tokenizer, word-delimiter,
         punctuation-stripping, lowercase, ASCII-folding, common-grams and
         length filters; the index chain additionally applies
         FlattenGraphFilterFactory, RemoveDuplicatesTokenFilterFactory and
         LimitTokenCountFilterFactory. -->
    <fieldType name="commongaram" class="solr.TextField"
positionIncrementGap="100">
      <!-- Index-time chain. -->
      <analyzer type="index">
        <charFilter
class="org.apache.lucene.analysis.icu.ICUNormalizer2CharFilterFactory"
name="nfkc" mode="compose"/>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.WordDelimiterGraphFilterFactory"
generateWordParts="1" generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0" preserveOriginal="0"
splitOnCaseChange="1" splitOnNumerics="1" stemEnglishPossessive="0"/>
        <!-- Present only in the index chain, directly after the graph-producing
             word-delimiter filter. -->
        <filter class="solr.FlattenGraphFilterFactory"/>
        <!-- Replaces each token with capture group 2, i.e. the token with its
             leading and trailing punctuation runs removed. -->
        <filter class="solr.PatternReplaceFilterFactory"
pattern="^(\p{Punct}*)(.*?)(\p{Punct}*)$" replacement="$2"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.ASCIIFoldingFilterFactory"/>
        <!-- Common-grams step driven by stopwords.txt, case-insensitively. -->
        <filter class="solr.CommonGramsFilterFactory" words="stopwords.txt"
ignoreCase="true"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
        <filter class="solr.LimitTokenCountFilterFactory"
maxTokenCount="10000" consumeAllTokens="false"/>
        <filter class="solr.LengthFilterFactory" min="1" max="255"/>
      </analyzer>
      <!-- Query-time chain. It has no FlattenGraph, RemoveDuplicates or
           LimitTokenCount step.
           NOTE(review): this chain uses CommonGramsFilterFactory, the same
           class as the index chain. Solr also provides
           CommonGramsQueryFilterFactory for query analyzers; confirm which
           one is intended here. -->
      <analyzer type="query">
        <charFilter
class="org.apache.lucene.analysis.icu.ICUNormalizer2CharFilterFactory"
name="nfkc" mode="compose"/>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.WordDelimiterGraphFilterFactory"
generateWordParts="1" generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0" preserveOriginal="0"
splitOnCaseChange="1" splitOnNumerics="1" stemEnglishPossessive="0"/>
        <!-- Same punctuation-stripping pattern as the index chain. -->
        <filter class="solr.PatternReplaceFilterFactory"
pattern="^(\p{Punct}*)(.*?)(\p{Punct}*)$" replacement="$2"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.ASCIIFoldingFilterFactory"/>
        <filter class="solr.CommonGramsFilterFactory" words="stopwords.txt"
ignoreCase="true"/>
        <filter class="solr.LengthFilterFactory" min="1" max="255"/>
      </analyzer>
    </fieldType>


I am not sure what I am missing. I have also set sow=false so that the
whole query string is sent to the field's analysis chain instead of sending
word by word. But that didn't seem to help.

Thanks
Nawab

CommonGramsFilter

Reply via email to