Author: gsingers
Date: Wed Apr  1 20:07:44 2009
New Revision: 761036

URL: http://svn.apache.org/viewvc?rev=761036&view=rev
Log:
SOLR-1095: fix stopword and keep filter performance issue

Added:
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestKeepFilterFactory.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestStopFilterFactory.java
    lucene/solr/trunk/src/test/test-files/solr/conf/keep-1.txt
    lucene/solr/trunk/src/test/test-files/solr/conf/keep-2.txt
    lucene/solr/trunk/src/test/test-files/solr/conf/schema-stop-keep.xml
    lucene/solr/trunk/src/test/test-files/solr/conf/stop-1.txt
    lucene/solr/trunk/src/test/test-files/solr/conf/stop-2.txt
Modified:
    lucene/solr/trunk/CHANGES.txt
    lucene/solr/trunk/src/java/org/apache/solr/analysis/KeepWordFilter.java
    lucene/solr/trunk/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java
    lucene/solr/trunk/src/java/org/apache/solr/analysis/StopFilterFactory.java
    lucene/solr/trunk/src/test/org/apache/solr/util/TestUtils.java

Modified: lucene/solr/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=761036&r1=761035&r2=761036&view=diff
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Wed Apr  1 20:07:44 2009
@@ -191,6 +191,8 @@
 
 33. SOLR-939: ValueSourceRangeFilter/Query - filter based on values in a FieldCache entry or on any arbitrary function of field values. (yonik)
 
+34. SOLR-1095: Fixed performance problem in the StopFilterFactory and simplified code.  Added tests as well.  (gsingers)
+
 
 Optimizations
 ----------------------

Modified: 
lucene/solr/trunk/src/java/org/apache/solr/analysis/KeepWordFilter.java
URL: 
http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/KeepWordFilter.java?rev=761036&r1=761035&r2=761036&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/KeepWordFilter.java 
(original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/KeepWordFilter.java Wed 
Apr  1 20:07:44 2009
@@ -43,7 +43,7 @@
 
   @Override
   public final Token next(Token in) throws IOException {
-    for (Token token=input.next(in); token!=null; token=input.next()) {
+    for (Token token=input.next(in); token!=null; token=input.next(token)) {
       if( words.contains( token.termBuffer(), 0, token.termLength() ) ) {
         return token;
       }

Modified: 
lucene/solr/trunk/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java
URL: 
http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java?rev=761036&r1=761035&r2=761036&view=diff
==============================================================================
--- 
lucene/solr/trunk/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java 
(original)
+++ 
lucene/solr/trunk/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java 
Wed Apr  1 20:07:44 2009
@@ -22,6 +22,7 @@
 import org.apache.solr.util.plugin.ResourceLoaderAware;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.CharArraySet;
 
 import java.util.HashSet;
 import java.util.List;
@@ -36,31 +37,25 @@
  */
 public class KeepWordFilterFactory extends BaseTokenFilterFactory implements 
ResourceLoaderAware {
 
-  private Set<String> words;
+  private CharArraySet words;
   private boolean ignoreCase;
 
   @SuppressWarnings("unchecked")
   public void inform(ResourceLoader loader) {
     String wordFiles = args.get("words");
-    ignoreCase = getBoolean("ignoreCase",false);
-
+    ignoreCase = getBoolean("ignoreCase", false);
     if (wordFiles != null) {
-      if (words == null)
-        words = new HashSet<String>();
       try {
-        java.io.File keepWordsFile = new File(wordFiles);
-        if (keepWordsFile.exists()) {
-          List<String> wlist = loader.getLines(wordFiles);
-          words = StopFilter.makeStopSet(
-              (String[])wlist.toArray(new String[0]), ignoreCase);
-        } else  {
-          List<String> files = StrUtils.splitFileNames(wordFiles);
-          for (String file : files) {
-            List<String> wlist = loader.getLines(file.trim());
-            words.addAll(StopFilter.makeStopSet((String[])wlist.toArray(new 
String[0]), ignoreCase));
-          }
+        List<String> files = StrUtils.splitFileNames(wordFiles);
+        if (words == null && files.size() > 0){
+          words = new CharArraySet(files.size() * 10, ignoreCase);
+        }
+        for (String file : files) {
+          List<String> wlist = loader.getLines(file.trim());
+          //TODO: once StopFilter.makeStopSet(List) method is available, switch to using that so we can avoid a toArray() call
+          words.addAll(StopFilter.makeStopSet((String[]) wlist.toArray(new String[0]), ignoreCase));
         }
-      } 
+      }
       catch (IOException e) {
         throw new RuntimeException(e);
       }
@@ -72,15 +67,22 @@
    * NOTE: if ignoreCase==true, the words are expected to be lowercase
    */
   public void setWords(Set<String> words) {
-    this.words = words;
+    this.words = new CharArraySet(words, ignoreCase);
   }
 
   public void setIgnoreCase(boolean ignoreCase) {
     this.ignoreCase = ignoreCase;
   }
-  
+
   public KeepWordFilter create(TokenStream input) {
-    return new KeepWordFilter(input,words,ignoreCase);
+    return new KeepWordFilter(input, words, ignoreCase);
   }
 
+  public CharArraySet getWords() {
+    return words;
+  }
+
+  public boolean isIgnoreCase() {
+    return ignoreCase;
+  }
 }

Modified: 
lucene/solr/trunk/src/java/org/apache/solr/analysis/StopFilterFactory.java
URL: 
http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/StopFilterFactory.java?rev=761036&r1=761035&r2=761036&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/StopFilterFactory.java 
(original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/StopFilterFactory.java 
Wed Apr  1 20:07:44 2009
@@ -23,6 +23,7 @@
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.StopAnalyzer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.CharArraySet;
 
 import java.util.HashSet;
 import java.util.List;
@@ -42,29 +43,26 @@
     enablePositionIncrements = getBoolean("enablePositionIncrements",false);
 
     if (stopWordFiles != null) {
-      if (stopWords == null)
-        stopWords = new HashSet<String>();
       try {
-        java.io.File keepWordsFile = new File(stopWordFiles);
-        if (keepWordsFile.exists()) {
-          List<String> wlist = loader.getLines(stopWordFiles);
-          stopWords = StopFilter.makeStopSet((String[])wlist.toArray(new 
String[0]), ignoreCase);
-        } else  {
-          List<String> files = StrUtils.splitFileNames(stopWordFiles);
+        List<String> files = StrUtils.splitFileNames(stopWordFiles);
+          if (stopWords == null && files.size() > 0){
+            //default stopwords list has 35 or so words, but maybe don't make it that big to start
+            stopWords = new CharArraySet(files.size() * 10, ignoreCase);
+          }
           for (String file : files) {
             List<String> wlist = loader.getLines(file.trim());
+            //TODO: once StopFilter.makeStopSet(List) method is available, switch to using that so we can avoid a toArray() call
             
stopWords.addAll(StopFilter.makeStopSet((String[])wlist.toArray(new String[0]), 
ignoreCase));
           }
-        }
       } catch (IOException e) {
         throw new RuntimeException(e);
       }
     } else {
-      stopWords = StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS, 
ignoreCase);
+      stopWords = (CharArraySet) StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS, ignoreCase);
     }
   }
-
-  private Set stopWords;
+  //Force the use of a char array set, as it is the most performant, although this may break things if Lucene ever goes away from it.  See SOLR-1095
+  private CharArraySet stopWords;
   private boolean ignoreCase;
   private boolean enablePositionIncrements;
 

Added: 
lucene/solr/trunk/src/test/org/apache/solr/analysis/TestKeepFilterFactory.java
URL: 
http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestKeepFilterFactory.java?rev=761036&view=auto
==============================================================================
--- 
lucene/solr/trunk/src/test/org/apache/solr/analysis/TestKeepFilterFactory.java 
(added)
+++ 
lucene/solr/trunk/src/test/org/apache/solr/analysis/TestKeepFilterFactory.java 
Wed Apr  1 20:07:44 2009
@@ -0,0 +1,65 @@
+package org.apache.solr.analysis;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.solr.util.AbstractSolrTestCase;
+import org.apache.solr.common.ResourceLoader;
+
+import java.util.Set;
+import java.util.Map;
+import java.util.HashMap;
+
+
+/**
+ *
+ *
+ **/
+public class TestKeepFilterFactory extends AbstractSolrTestCase{
+  public String getSchemaFile() {
+    return "schema-stop-keep.xml";
+  }
+
+  public String getSolrConfigFile() {
+    return "solrconfig.xml";
+  }
+
+  public void testInform() throws Exception {
+    ResourceLoader loader = solrConfig.getResourceLoader();
+    assertTrue("loader is null and it shouldn't be", loader != null);
+    KeepWordFilterFactory factory = new KeepWordFilterFactory();
+    Map<String, String> args = new HashMap<String, String>();
+    args.put("words", "keep-1.txt");
+    args.put("ignoreCase", "true");
+    factory.init(args);
+    factory.inform(loader);
+    Set words = factory.getWords();
+    assertTrue("words is null and it shouldn't be", words != null);
+    assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);
+
+
+    factory = new KeepWordFilterFactory();
+    args.put("words", "keep-1.txt, keep-2.txt");
+    factory.init(args);
+    factory.inform(loader);
+    words = factory.getWords();
+    assertTrue("words is null and it shouldn't be", words != null);
+    assertTrue("words Size: " + words.size() + " is not: " + 4, words.size() == 4);
+
+
+
+  }
+}
\ No newline at end of file

Added: 
lucene/solr/trunk/src/test/org/apache/solr/analysis/TestStopFilterFactory.java
URL: 
http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestStopFilterFactory.java?rev=761036&view=auto
==============================================================================
--- 
lucene/solr/trunk/src/test/org/apache/solr/analysis/TestStopFilterFactory.java 
(added)
+++ 
lucene/solr/trunk/src/test/org/apache/solr/analysis/TestStopFilterFactory.java 
Wed Apr  1 20:07:44 2009
@@ -0,0 +1,66 @@
+package org.apache.solr.analysis;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.solr.util.AbstractSolrTestCase;
+import org.apache.solr.common.ResourceLoader;
+
+import java.util.Set;
+import java.util.Map;
+import java.util.HashMap;
+
+
+/**
+ *
+ *
+ **/
+public class TestStopFilterFactory extends AbstractSolrTestCase{
+  public String getSchemaFile() {
+    return "schema-stop-keep.xml";
+  }
+
+  public String getSolrConfigFile() {
+    return "solrconfig.xml";
+  }
+
+  public void testInform() throws Exception {
+    ResourceLoader loader = solrConfig.getResourceLoader();
+    assertTrue("loader is null and it shouldn't be", loader != null);
+    StopFilterFactory factory = new StopFilterFactory();
+    Map<String, String> args = new HashMap<String, String>();
+    args.put("words", "stop-1.txt");
+    args.put("ignoreCase", "true");
+    factory.init(args);
+    factory.inform(loader);
+    Set words = factory.getStopWords();
+    assertTrue("words is null and it shouldn't be", words != null);
+    assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);
+    assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true);
+
+    factory = new StopFilterFactory();
+    args.put("words", "stop-1.txt, stop-2.txt");
+    factory.init(args);
+    factory.inform(loader);
+    words = factory.getStopWords();
+    assertTrue("words is null and it shouldn't be", words != null);
+    assertTrue("words Size: " + words.size() + " is not: " + 4, words.size() == 4);
+    assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true);
+
+
+  }
+}

Modified: lucene/solr/trunk/src/test/org/apache/solr/util/TestUtils.java
URL: 
http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/util/TestUtils.java?rev=761036&r1=761035&r2=761036&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/util/TestUtils.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/util/TestUtils.java Wed Apr  1 
20:07:44 2009
@@ -67,6 +67,10 @@
     assertEquals(2,arr.size());
     assertEquals("/h/s",arr.get(0));
     assertEquals("/h/,s",arr.get(1));
+
+    arr = StrUtils.splitFileNames("/h/s");
+    assertEquals(1,arr.size());
+    assertEquals("/h/s",arr.get(0));
   }
 
   public void testNamedLists()

Added: lucene/solr/trunk/src/test/test-files/solr/conf/keep-1.txt
URL: 
http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/test-files/solr/conf/keep-1.txt?rev=761036&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/test-files/solr/conf/keep-1.txt (added)
+++ lucene/solr/trunk/src/test/test-files/solr/conf/keep-1.txt Wed Apr  1 
20:07:44 2009
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+foo
+bar
\ No newline at end of file

Added: lucene/solr/trunk/src/test/test-files/solr/conf/keep-2.txt
URL: 
http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/test-files/solr/conf/keep-2.txt?rev=761036&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/test-files/solr/conf/keep-2.txt (added)
+++ lucene/solr/trunk/src/test/test-files/solr/conf/keep-2.txt Wed Apr  1 
20:07:44 2009
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+junk
+more
\ No newline at end of file

Added: lucene/solr/trunk/src/test/test-files/solr/conf/schema-stop-keep.xml
URL: 
http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/test-files/solr/conf/schema-stop-keep.xml?rev=761036&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/test-files/solr/conf/schema-stop-keep.xml (added)
+++ lucene/solr/trunk/src/test/test-files/solr/conf/schema-stop-keep.xml Wed 
Apr  1 20:07:44 2009
@@ -0,0 +1,67 @@
+<?xml version="1.0" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!--
+
+  For testing stopword configuration and keep word configuration
+
+     $Id: schema.xml 382610 2006-03-03 01:43:03Z yonik $
+     $Source: 
/cvs/main/searching/solr-configs/test/WEB-INF/classes/schema.xml,v $
+     $Name:  $
+  -->
+
+<schema name="test" version="1.0">
+  <types>
+
+
+    <fieldtype name="integer" class="solr.IntField" />
+
+    <fieldtype name="string" class="solr.StrField" sortMissingLast="true"/>
+    <fieldtype name="stop-one" class="solr.TextField">
+      <analyzer>
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true"
+                words="stop-1.txt"/>
+
+      </analyzer>
+    </fieldtype>
+    <fieldtype name="stop-two" class="solr.TextField">
+      <analyzer>
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true"
+                words="stop-1.txt,stop-2.txt"/>
+
+      </analyzer>
+    </fieldtype>
+ </types>
+
+
+ <fields>
+   <field name="id" type="integer" indexed="true" stored="true" 
multiValued="false" required="false"/>
+   <field name="one" type="stop-one" indexed="true" stored="false"/>
+   <field name="two" type="stop-two" indexed="true" stored="false"/>
+
+ </fields>
+
+ <defaultSearchField>one</defaultSearchField>
+ <uniqueKey>id</uniqueKey>
+
+
+</schema>

Added: lucene/solr/trunk/src/test/test-files/solr/conf/stop-1.txt
URL: 
http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/test-files/solr/conf/stop-1.txt?rev=761036&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/test-files/solr/conf/stop-1.txt (added)
+++ lucene/solr/trunk/src/test/test-files/solr/conf/stop-1.txt Wed Apr  1 
20:07:44 2009
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+foo
+bar
\ No newline at end of file

Added: lucene/solr/trunk/src/test/test-files/solr/conf/stop-2.txt
URL: 
http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/test-files/solr/conf/stop-2.txt?rev=761036&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/test-files/solr/conf/stop-2.txt (added)
+++ lucene/solr/trunk/src/test/test-files/solr/conf/stop-2.txt Wed Apr  1 
20:07:44 2009
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+junk
+more
\ No newline at end of file


Reply via email to