SolrQueryParser.java

yonik Sat, 29 Nov 2008 08:38:31 -0800

Author: yonik
Date: Sat Nov 29 08:38:08 2008
New Revision: 721687

URL: http://svn.apache.org/viewvc?rev=721687&view=rev
Log:
SOLR-879: enable position increments in QP, fix example schemas


Modified:
    lucene/solr/trunk/CHANGES.txt
    lucene/solr/trunk/contrib/javascript/example/testsolr/solr/conf/schema.xml
    lucene/solr/trunk/contrib/velocity/src/main/solr/conf/schema.xml
    lucene/solr/trunk/example/solr/conf/schema.xml
    lucene/solr/trunk/src/java/org/apache/solr/search/SolrQueryParser.java

Modified: lucene/solr/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=721687&r1=721686&r2=721687&view=diff
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Sat Nov 29 08:38:08 2008
@@ -135,6 +135,11 @@
 
 11. SOLR-872: Better error message for incorrect copyField destination (Noble 
Paul via shalin)
 
+12. SOLR-879: Enable position increments in the query parser and fix the
+    example schema to enable position increments for the stop filter in
+    both the index and query analyzers to fix the bug with phrase queries
+    with stopwords. (yonik)
+
 
 Other Changes
 ----------------------

Modified: 
lucene/solr/trunk/contrib/javascript/example/testsolr/solr/conf/schema.xml
URL: 
http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/javascript/example/testsolr/solr/conf/schema.xml?rev=721687&r1=721686&r2=721687&view=diff
==============================================================================
--- lucene/solr/trunk/contrib/javascript/example/testsolr/solr/conf/schema.xml 
(original)
+++ lucene/solr/trunk/contrib/javascript/example/testsolr/solr/conf/schema.xml 
Sat Nov 29 08:38:08 2008
@@ -1,4 +1,42 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!--  
+ This is the Solr schema file. This file should be named "schema.xml" and
+ should be in the conf directory under the solr home
+ (i.e. ./solr/conf/schema.xml by default) 
+ or located where the classloader for the Solr webapp can find it.
+
+ This example schema is the recommended starting point for users.
+ It should be kept correct and concise, usable out-of-the-box.
+
+ For more information on how to customize this file, please see
+ http://wiki.apache.org/solr/SchemaXml
+-->
+
 <schema name="example" version="1.1">
+  <!-- attribute "name" is the name of this schema and is only used for 
display purposes.
+       Applications should change this to reflect the nature of the search 
collection.
+       version="1.1" is Solr's version number for the schema syntax and 
semantics.  It should
+       not normally be changed by applications.
+       1.0: multiValued attribute did not exist, all fields are multiValued by 
nature
+       1.1: multiValued attribute introduced, false by default -->
+
   <types>
     <!-- field type definitions. The "name" attribute is
        just a label to be used by field definitions.  The "class"
@@ -127,8 +165,8 @@
         <filter class="solr.SynonymFilterFactory" 
synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
         -->
         <!-- Case insensitive stop word removal.
-             enablePositionIncrements=true ensures that a 'gap' is left to
-             allow for accurate phrase queries.
+          add enablePositionIncrements=true in both the index and query
+          analyzers to leave a 'gap' for more accurate phrase queries.
         -->
         <filter class="solr.StopFilterFactory"
                 ignoreCase="true"
@@ -143,7 +181,11 @@
       <analyzer type="query">
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
         <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" 
ignoreCase="true" expand="true"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" 
words="stopwords.txt"/>
+        <filter class="solr.StopFilterFactory"
+                ignoreCase="true"
+                words="stopwords.txt"
+                enablePositionIncrements="true"
+                />
         <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" 
generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" 
splitOnCaseChange="1"/>
         <filter class="solr.LowerCaseFilterFactory"/>
         <filter class="solr.EnglishPorterFilterFactory" 
protected="protwords.txt"/>
@@ -166,7 +208,6 @@
       </analyzer>
     </fieldType>
 
-
     <!--
      Setup simple analysis for spell checking
      -->
@@ -178,6 +219,16 @@
       </analyzer>
     </fieldType>
 
+    <!-- charFilter + "CharStream aware" WhitespaceTokenizer  -->
+    <!--
+    <fieldType name="textCharNorm" class="solr.TextField" 
positionIncrementGap="100" >
+      <analyzer>
+        <charFilter class="solr.MappingCharFilterFactory" 
mapping="mapping-ISOLatin1Accent.txt"/>
+        <tokenizer class="solr.CharStreamAwareWhitespaceTokenizerFactory"/>
+      </analyzer>
+    </fieldType>
+    -->
+
     <!-- This is an example of using the KeywordTokenizer along
          With various TokenFilterFactories to produce a sortable field
          that does not include some properties of the source text
@@ -210,6 +261,14 @@
         />
       </analyzer>
     </fieldType>
+    
+    <fieldtype name="phonetic" stored="false" indexed="true" 
class="solr.TextField" >
+      <analyzer>
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
+      </analyzer>
+    </fieldtype> 
+    
 
     <!-- since fields of this type are by default not stored or indexed, any 
data added to 
          them will be ignored outright 
@@ -220,32 +279,134 @@
 
 
  <fields>
+   <!-- Valid attributes for fields:
+     name: mandatory - the name for the field
+     type: mandatory - the name of a previously defined type from the <types> 
section
+     indexed: true if this field should be indexed (searchable or sortable)
+     stored: true if this field should be retrievable
+     compressed: [false] if this field should be stored using gzip compression
+       (this will only apply if the field type is compressible; among
+       the standard field types, only TextField and StrField are)
+     multiValued: true if this field may contain multiple values per document
+     omitNorms: (expert) set to true to omit the norms associated with
+       this field (this disables length normalization and index-time
+       boosting for the field, and saves some memory).  Only full-text
+       fields or fields that need an index-time boost need norms.
+     termVectors: [false] set to true to store the term vector for a given 
field.
+       When using MoreLikeThis, fields used for similarity should be stored 
for 
+       best performance.
+     termPositions: Store position information with the term vector.  This 
will increase storage costs.
+     termOffsets: Store offset information with the term vector. This will 
increase storage costs.
+   -->
+
    <field name="id" type="string" indexed="true" stored="true" required="true" 
/> 
-   <field name="title" type="text" indexed="true" stored="true"/>
-   <field name="text" type="text" indexed="true" stored="true"/>
-   <field name="date" type="date" indexed="true" stored="true"/>
-   <field name="dateline" type="text" indexed="true" stored="true"/>
-   <field name="places" type="string" indexed="true" stored="true" 
multiValued="true" omitNorms="true" termVectors="true" />
-   <field name="countryCodes" type="string" indexed="true" stored="true" 
multiValued="true" omitNorms="true" termVectors="true" />
-   <field name="topics" type="string" indexed="true" stored="true" 
multiValued="true" omitNorms="true" termVectors="true" />
-   <field name="organisations" type="string" indexed="true" stored="true" 
multiValued="true" omitNorms="true" termVectors="true" />
-   <field name="exchanges" type="string" indexed="true" stored="true" 
multiValued="true" omitNorms="true" termVectors="true" />
-   <field name="companies" type="string" indexed="true" stored="true" 
multiValued="true" omitNorms="true" termVectors="true" />
-   <field name="allText" type="text" indexed="true" stored="true" 
multiValued="true" omitNorms="true" termVectors="true" />
- </fields>
+   <field name="sku" type="textTight" indexed="true" stored="true" 
omitNorms="true"/>
+   <field name="name" type="text" indexed="true" stored="true"/>
+   <field name="nameSort" type="string" indexed="true" stored="false"/>
+   <field name="alphaNameSort" type="alphaOnlySort" indexed="true" 
stored="false"/>
+   <field name="manu" type="text" indexed="true" stored="true" 
omitNorms="true"/>
+   <field name="cat" type="text_ws" indexed="true" stored="true" 
multiValued="true" omitNorms="true" termVectors="true" />
+   <field name="features" type="text" indexed="true" stored="true" 
multiValued="true" termVectors="true" termPositions="true" termOffsets="true"/>
+   <field name="includes" type="text" indexed="true" stored="true"/>
+
+   <field name="weight" type="sfloat" indexed="true" stored="true"/>
+   <field name="price"  type="sfloat" indexed="true" stored="true"/>
+   <!-- "default" values can be specified for fields, indicating which
+        value should be used if no value is specified when adding a document.
+     -->
+   <field name="popularity" type="sint" indexed="true" stored="true" 
default="0"/>
+   <field name="inStock" type="boolean" indexed="true" stored="true"/>
 
-       <copyField source="title" dest="allText"/>
-       <copyField source="text" dest="allText"/>
-       <copyField source="places" dest="allText"/>
-       <copyField source="topics" dest="allText"/>
-       <copyField source="companies" dest="allText"/>
-       <copyField source="exchanges" dest="allText"/>
+   <!-- Some sample docs exist solely to demonstrate the spellchecker
+        functionality; this is the only field they contain.
+        Typically you might build the spellchecker from a "catchall" type field
+        containing all of the text in each document
+     -->
+   <field name="word" type="string" indexed="true" stored="true"/>
+
+   
+   <!-- catchall field, containing all other searchable text fields 
(implemented
+        via copyField further on in this schema)  -->
+   <field name="text" type="text" indexed="true" stored="false" 
multiValued="true"/>
+
+   <!-- non-tokenized version of manufacturer to make it easier to sort or 
group
+        results by manufacturer.  copied from "manu" via copyField -->
+   <field name="manu_exact" type="string" indexed="true" stored="false"/>
+
+   <!-- Here, default is used to create a "timestamp" field indicating
+        When each document was indexed.
+     -->
+   <field name="timestamp" type="date" indexed="true" stored="true" 
default="NOW" multiValued="false"/>
+   
+   <field name="spell" type="textSpell" indexed="true" stored="true" 
multiValued="true"/>
+   <!-- Dynamic field definitions.  If a field name is not found, dynamicFields
+        will be used if the name matches any of the patterns.
+        RESTRICTION: the glob-like pattern in the name attribute must have
+        a "*" only at the start or the end.
+        EXAMPLE:  name="*_i" will match any field ending in _i (like myid_i, 
z_i)
+        Longer patterns will be matched first.  if equal size patterns
+        both match, the first appearing in the schema will be used.  -->
+   <dynamicField name="*_i"  type="sint"    indexed="true"  stored="true"/>
+   <dynamicField name="*_s"  type="string"  indexed="true"  stored="true"/>
+   <dynamicField name="*_l"  type="slong"   indexed="true"  stored="true"/>
+   <dynamicField name="*_t"  type="text"    indexed="true"  stored="true"/>
+   <dynamicField name="*_b"  type="boolean" indexed="true"  stored="true"/>
+   <dynamicField name="*_f"  type="sfloat"  indexed="true"  stored="true"/>
+   <dynamicField name="*_d"  type="sdouble" indexed="true"  stored="true"/>
+   <dynamicField name="*_dt" type="date"    indexed="true"  stored="true"/>
+
+   <dynamicField name="random*" type="random" />
+
+   <!-- uncomment the following to ignore any fields that don't already match 
an existing 
+        field name or dynamic field, rather than reporting them as an error. 
+        alternately, change the type="ignored" to some other type e.g. "text" 
if you want 
+        unknown fields indexed and/or stored by default --> 
+   <!--dynamicField name="*" type="ignored" /-->
+   
+ </fields>
 
+ <!-- Field to use to determine and enforce document uniqueness. 
+      Unless this field is marked with required="false", it will be a required 
field
+   -->
  <uniqueKey>id</uniqueKey>
 
+ <!-- field for the QueryParser to use when an explicit fieldname is absent -->
  <defaultSearchField>text</defaultSearchField>
 
+ <!-- SolrQueryParser configuration: defaultOperator="AND|OR" -->
  <solrQueryParser defaultOperator="OR"/>
 
+  <!-- copyField commands copy one field to another at the time a document
+        is added to the index.  It's used either to index the same field 
differently,
+        or to add multiple fields to the same field for easier/faster 
searching.  -->
+   <copyField source="id" dest="sku"/>
+
+   <copyField source="incubationdate_dt" dest="incubationdate_s"/>
+   <copyField source="cat" dest="text"/>
+   <copyField source="name" dest="text"/>
+   <copyField source="name" dest="nameSort"/>
+   <copyField source="name" dest="alphaNameSort"/>
+   <copyField source="manu" dest="text"/>
+   <copyField source="features" dest="text"/>
+   <copyField source="includes" dest="text"/>
+
+   <copyField source="manu" dest="manu_exact"/>
+
+  <copyField source="name" dest="spell"/>
+
+ <!-- Similarity is the scoring routine for each document vs. a query.
+      A custom similarity may be specified here, but the default is fine
+      for most applications.  -->
+ <!-- <similarity class="org.apache.lucene.search.DefaultSimilarity"/> -->
+ <!-- ... OR ...
+      Specify a SimilarityFactory class name implementation
+      allowing parameters to be used.
+ -->
+ <!--
+ <similarity class="com.example.solr.CustomSimilarityFactory">
+   <str name="paramkey">param value</str>
+ </similarity>
+ -->
+
 
 </schema>

Modified: lucene/solr/trunk/contrib/velocity/src/main/solr/conf/schema.xml
URL: 
http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/velocity/src/main/solr/conf/schema.xml?rev=721687&r1=721686&r2=721687&view=diff
==============================================================================
--- lucene/solr/trunk/contrib/velocity/src/main/solr/conf/schema.xml (original)
+++ lucene/solr/trunk/contrib/velocity/src/main/solr/conf/schema.xml Sat Nov 29 
08:38:08 2008
@@ -165,8 +165,8 @@
         <filter class="solr.SynonymFilterFactory" 
synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
         -->
         <!-- Case insensitive stop word removal.
-             enablePositionIncrements=true ensures that a 'gap' is left to
-             allow for accurate phrase queries.
+          add enablePositionIncrements=true in both the index and query
+          analyzers to leave a 'gap' for more accurate phrase queries.
         -->
         <filter class="solr.StopFilterFactory"
                 ignoreCase="true"
@@ -181,7 +181,11 @@
       <analyzer type="query">
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
         <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" 
ignoreCase="true" expand="true"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" 
words="stopwords.txt"/>
+        <filter class="solr.StopFilterFactory"
+                ignoreCase="true"
+                words="stopwords.txt"
+                enablePositionIncrements="true"
+                />
         <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" 
generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" 
splitOnCaseChange="1"/>
         <filter class="solr.LowerCaseFilterFactory"/>
         <filter class="solr.EnglishPorterFilterFactory" 
protected="protwords.txt"/>
@@ -215,6 +219,16 @@
       </analyzer>
     </fieldType>
 
+    <!-- charFilter + "CharStream aware" WhitespaceTokenizer  -->
+    <!--
+    <fieldType name="textCharNorm" class="solr.TextField" 
positionIncrementGap="100" >
+      <analyzer>
+        <charFilter class="solr.MappingCharFilterFactory" 
mapping="mapping-ISOLatin1Accent.txt"/>
+        <tokenizer class="solr.CharStreamAwareWhitespaceTokenizerFactory"/>
+      </analyzer>
+    </fieldType>
+    -->
+
     <!-- This is an example of using the KeywordTokenizer along
          With various TokenFilterFactories to produce a sortable field
          that does not include some properties of the source text

Modified: lucene/solr/trunk/example/solr/conf/schema.xml
URL: 
http://svn.apache.org/viewvc/lucene/solr/trunk/example/solr/conf/schema.xml?rev=721687&r1=721686&r2=721687&view=diff
==============================================================================
--- lucene/solr/trunk/example/solr/conf/schema.xml (original)
+++ lucene/solr/trunk/example/solr/conf/schema.xml Sat Nov 29 08:38:08 2008
@@ -165,8 +165,8 @@
         <filter class="solr.SynonymFilterFactory" 
synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
         -->
         <!-- Case insensitive stop word removal.
-             enablePositionIncrements=true ensures that a 'gap' is left to
-             allow for accurate phrase queries.
+          add enablePositionIncrements=true in both the index and query
+          analyzers to leave a 'gap' for more accurate phrase queries.
         -->
         <filter class="solr.StopFilterFactory"
                 ignoreCase="true"
@@ -181,7 +181,11 @@
       <analyzer type="query">
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
         <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" 
ignoreCase="true" expand="true"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" 
words="stopwords.txt"/>
+        <filter class="solr.StopFilterFactory"
+                ignoreCase="true"
+                words="stopwords.txt"
+                enablePositionIncrements="true"
+                />
         <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" 
generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" 
splitOnCaseChange="1"/>
         <filter class="solr.LowerCaseFilterFactory"/>
         <filter class="solr.EnglishPorterFilterFactory" 
protected="protwords.txt"/>

Modified: lucene/solr/trunk/src/java/org/apache/solr/search/SolrQueryParser.java
URL: 
http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/search/SolrQueryParser.java?rev=721687&r1=721686&r2=721687&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/search/SolrQueryParser.java 
(original)
+++ lucene/solr/trunk/src/java/org/apache/solr/search/SolrQueryParser.java Sat 
Nov 29 08:38:08 2008
@@ -73,6 +73,7 @@
     this.parser  = null;
     this.defaultField = defaultField;
     setLowercaseExpandedTerms(false);
+    setEnablePositionIncrements(true);    
   }
 
   public SolrQueryParser(QParser parser, String defaultField) {
@@ -85,6 +86,7 @@
     this.parser = parser;
     this.defaultField = defaultField;
     setLowercaseExpandedTerms(false);
+    setEnablePositionIncrements(true);
   }
 
   private void checkNullField(String field) throws SolrException {

svn commit: r721687 - in /lucene/solr/trunk: CHANGES.txt contrib/javascript/example/testsolr/solr/conf/schema.xml contrib/velocity/src/main/solr/conf/schema.xml example/solr/conf/schema.xml src/java/org/apache/solr/search/SolrQueryParser.java

Reply via email to