Author: gsingers
Date: Wed Apr 1 20:07:44 2009
New Revision: 761036
URL: http://svn.apache.org/viewvc?rev=761036&view=rev
Log:
SOLR-1095: fix stopword and keep filter performance issue
Added:
lucene/solr/trunk/src/test/org/apache/solr/analysis/TestKeepFilterFactory.java
lucene/solr/trunk/src/test/org/apache/solr/analysis/TestStopFilterFactory.java
lucene/solr/trunk/src/test/test-files/solr/conf/keep-1.txt
lucene/solr/trunk/src/test/test-files/solr/conf/keep-2.txt
lucene/solr/trunk/src/test/test-files/solr/conf/schema-stop-keep.xml
lucene/solr/trunk/src/test/test-files/solr/conf/stop-1.txt
lucene/solr/trunk/src/test/test-files/solr/conf/stop-2.txt
Modified:
lucene/solr/trunk/CHANGES.txt
lucene/solr/trunk/src/java/org/apache/solr/analysis/KeepWordFilter.java
lucene/solr/trunk/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java
lucene/solr/trunk/src/java/org/apache/solr/analysis/StopFilterFactory.java
lucene/solr/trunk/src/test/org/apache/solr/util/TestUtils.java
Modified: lucene/solr/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=761036&r1=761035&r2=761036&view=diff
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Wed Apr 1 20:07:44 2009
@@ -191,6 +191,8 @@
33. SOLR-939: ValueSourceRangeFilter/Query - filter based on values in a
FieldCache entry or on any arbitrary function of field values. (yonik)
+34. SOLR-1095: Fixed performance problem in the StopFilterFactory and simplified code. Added tests as well. (gsingers)
+
Optimizations
----------------------
Modified:
lucene/solr/trunk/src/java/org/apache/solr/analysis/KeepWordFilter.java
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/KeepWordFilter.java?rev=761036&r1=761035&r2=761036&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/KeepWordFilter.java
(original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/KeepWordFilter.java Wed
Apr 1 20:07:44 2009
@@ -43,7 +43,7 @@
@Override
public final Token next(Token in) throws IOException {
- for (Token token=input.next(in); token!=null; token=input.next()) {
+ for (Token token=input.next(in); token!=null; token=input.next(token)) {
if( words.contains( token.termBuffer(), 0, token.termLength() ) ) {
return token;
}
Modified:
lucene/solr/trunk/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java?rev=761036&r1=761035&r2=761036&view=diff
==============================================================================
---
lucene/solr/trunk/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java
(original)
+++
lucene/solr/trunk/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java
Wed Apr 1 20:07:44 2009
@@ -22,6 +22,7 @@
import org.apache.solr.util.plugin.ResourceLoaderAware;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.CharArraySet;
import java.util.HashSet;
import java.util.List;
@@ -36,31 +37,25 @@
*/
public class KeepWordFilterFactory extends BaseTokenFilterFactory implements
ResourceLoaderAware {
- private Set<String> words;
+ private CharArraySet words;
private boolean ignoreCase;
@SuppressWarnings("unchecked")
public void inform(ResourceLoader loader) {
String wordFiles = args.get("words");
- ignoreCase = getBoolean("ignoreCase",false);
-
+ ignoreCase = getBoolean("ignoreCase", false);
if (wordFiles != null) {
- if (words == null)
- words = new HashSet<String>();
try {
- java.io.File keepWordsFile = new File(wordFiles);
- if (keepWordsFile.exists()) {
- List<String> wlist = loader.getLines(wordFiles);
- words = StopFilter.makeStopSet(
- (String[])wlist.toArray(new String[0]), ignoreCase);
- } else {
- List<String> files = StrUtils.splitFileNames(wordFiles);
- for (String file : files) {
- List<String> wlist = loader.getLines(file.trim());
- words.addAll(StopFilter.makeStopSet((String[])wlist.toArray(new
String[0]), ignoreCase));
- }
+ List<String> files = StrUtils.splitFileNames(wordFiles);
+ if (words == null && files.size() > 0){
+ words = new CharArraySet(files.size() * 10, ignoreCase);
+ }
+ for (String file : files) {
+ List<String> wlist = loader.getLines(file.trim());
+ //TODO: once StopFilter.makeStopSet(List) method is available, switch to using that so we can avoid a toArray() call
+ words.addAll(StopFilter.makeStopSet((String[]) wlist.toArray(new
String[0]), ignoreCase));
}
- }
+ }
catch (IOException e) {
throw new RuntimeException(e);
}
@@ -72,15 +67,22 @@
* NOTE: if ignoreCase==true, the words are expected to be lowercase
*/
public void setWords(Set<String> words) {
- this.words = words;
+ this.words = new CharArraySet(words, ignoreCase);
}
public void setIgnoreCase(boolean ignoreCase) {
this.ignoreCase = ignoreCase;
}
-
+
public KeepWordFilter create(TokenStream input) {
- return new KeepWordFilter(input,words,ignoreCase);
+ return new KeepWordFilter(input, words, ignoreCase);
}
+ public CharArraySet getWords() {
+ return words;
+ }
+
+ public boolean isIgnoreCase() {
+ return ignoreCase;
+ }
}
Modified:
lucene/solr/trunk/src/java/org/apache/solr/analysis/StopFilterFactory.java
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/StopFilterFactory.java?rev=761036&r1=761035&r2=761036&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/StopFilterFactory.java
(original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/StopFilterFactory.java
Wed Apr 1 20:07:44 2009
@@ -23,6 +23,7 @@
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.CharArraySet;
import java.util.HashSet;
import java.util.List;
@@ -42,29 +43,26 @@
enablePositionIncrements = getBoolean("enablePositionIncrements",false);
if (stopWordFiles != null) {
- if (stopWords == null)
- stopWords = new HashSet<String>();
try {
- java.io.File keepWordsFile = new File(stopWordFiles);
- if (keepWordsFile.exists()) {
- List<String> wlist = loader.getLines(stopWordFiles);
- stopWords = StopFilter.makeStopSet((String[])wlist.toArray(new
String[0]), ignoreCase);
- } else {
- List<String> files = StrUtils.splitFileNames(stopWordFiles);
+ List<String> files = StrUtils.splitFileNames(stopWordFiles);
+ if (stopWords == null && files.size() > 0){
+ //default stopwords list has 35 or so words, but maybe don't make it that big to start
+ stopWords = new CharArraySet(files.size() * 10, ignoreCase);
+ }
for (String file : files) {
List<String> wlist = loader.getLines(file.trim());
+ //TODO: once StopFilter.makeStopSet(List) method is available, switch to using that so we can avoid a toArray() call
stopWords.addAll(StopFilter.makeStopSet((String[])wlist.toArray(new String[0]),
ignoreCase));
}
- }
} catch (IOException e) {
throw new RuntimeException(e);
}
} else {
- stopWords = StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS,
ignoreCase);
+ stopWords = (CharArraySet)
StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS, ignoreCase);
}
}
-
- private Set stopWords;
+ //Force the use of a char array set, as it is the most performant, although this may break things if Lucene ever goes away from it. See SOLR-1095
+ private CharArraySet stopWords;
private boolean ignoreCase;
private boolean enablePositionIncrements;
Added:
lucene/solr/trunk/src/test/org/apache/solr/analysis/TestKeepFilterFactory.java
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestKeepFilterFactory.java?rev=761036&view=auto
==============================================================================
---
lucene/solr/trunk/src/test/org/apache/solr/analysis/TestKeepFilterFactory.java
(added)
+++
lucene/solr/trunk/src/test/org/apache/solr/analysis/TestKeepFilterFactory.java
Wed Apr 1 20:07:44 2009
@@ -0,0 +1,65 @@
+package org.apache.solr.analysis;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.solr.util.AbstractSolrTestCase;
+import org.apache.solr.common.ResourceLoader;
+
+import java.util.Set;
+import java.util.Map;
+import java.util.HashMap;
+
+
+/**
+ *
+ *
+ **/
+public class TestKeepFilterFactory extends AbstractSolrTestCase{
+ public String getSchemaFile() {
+ return "schema-stop-keep.xml";
+ }
+
+ public String getSolrConfigFile() {
+ return "solrconfig.xml";
+ }
+
+ public void testInform() throws Exception {
+ ResourceLoader loader = solrConfig.getResourceLoader();
+ assertTrue("loader is null and it shouldn't be", loader != null);
+ KeepWordFilterFactory factory = new KeepWordFilterFactory();
+ Map<String, String> args = new HashMap<String, String>();
+ args.put("words", "keep-1.txt");
+ args.put("ignoreCase", "true");
+ factory.init(args);
+ factory.inform(loader);
+ Set words = factory.getWords();
+ assertTrue("words is null and it shouldn't be", words != null);
+ assertTrue("words Size: " + words.size() + " is not: " + 2, words.size()
== 2);
+
+
+ factory = new KeepWordFilterFactory();
+ args.put("words", "keep-1.txt, keep-2.txt");
+ factory.init(args);
+ factory.inform(loader);
+ words = factory.getWords();
+ assertTrue("words is null and it shouldn't be", words != null);
+ assertTrue("words Size: " + words.size() + " is not: " + 4, words.size()
== 4);
+
+
+
+ }
+}
\ No newline at end of file
Added:
lucene/solr/trunk/src/test/org/apache/solr/analysis/TestStopFilterFactory.java
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestStopFilterFactory.java?rev=761036&view=auto
==============================================================================
---
lucene/solr/trunk/src/test/org/apache/solr/analysis/TestStopFilterFactory.java
(added)
+++
lucene/solr/trunk/src/test/org/apache/solr/analysis/TestStopFilterFactory.java
Wed Apr 1 20:07:44 2009
@@ -0,0 +1,66 @@
+package org.apache.solr.analysis;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.solr.util.AbstractSolrTestCase;
+import org.apache.solr.common.ResourceLoader;
+
+import java.util.Set;
+import java.util.Map;
+import java.util.HashMap;
+
+
+/**
+ *
+ *
+ **/
+public class TestStopFilterFactory extends AbstractSolrTestCase{
+ public String getSchemaFile() {
+ return "schema-stop-keep.xml";
+ }
+
+ public String getSolrConfigFile() {
+ return "solrconfig.xml";
+ }
+
+ public void testInform() throws Exception {
+ ResourceLoader loader = solrConfig.getResourceLoader();
+ assertTrue("loader is null and it shouldn't be", loader != null);
+ StopFilterFactory factory = new StopFilterFactory();
+ Map<String, String> args = new HashMap<String, String>();
+ args.put("words", "stop-1.txt");
+ args.put("ignoreCase", "true");
+ factory.init(args);
+ factory.inform(loader);
+ Set words = factory.getStopWords();
+ assertTrue("words is null and it shouldn't be", words != null);
+ assertTrue("words Size: " + words.size() + " is not: " + 2, words.size()
== 2);
+ assertTrue(factory.isIgnoreCase() + " does not equal: " + true,
factory.isIgnoreCase() == true);
+
+ factory = new StopFilterFactory();
+ args.put("words", "stop-1.txt, stop-2.txt");
+ factory.init(args);
+ factory.inform(loader);
+ words = factory.getStopWords();
+ assertTrue("words is null and it shouldn't be", words != null);
+ assertTrue("words Size: " + words.size() + " is not: " + 4, words.size()
== 4);
+ assertTrue(factory.isIgnoreCase() + " does not equal: " + true,
factory.isIgnoreCase() == true);
+
+
+ }
+}
Modified: lucene/solr/trunk/src/test/org/apache/solr/util/TestUtils.java
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/util/TestUtils.java?rev=761036&r1=761035&r2=761036&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/util/TestUtils.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/util/TestUtils.java Wed Apr 1
20:07:44 2009
@@ -67,6 +67,10 @@
assertEquals(2,arr.size());
assertEquals("/h/s",arr.get(0));
assertEquals("/h/,s",arr.get(1));
+
+ arr = StrUtils.splitFileNames("/h/s");
+ assertEquals(1,arr.size());
+ assertEquals("/h/s",arr.get(0));
}
public void testNamedLists()
Added: lucene/solr/trunk/src/test/test-files/solr/conf/keep-1.txt
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/test-files/solr/conf/keep-1.txt?rev=761036&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/test-files/solr/conf/keep-1.txt (added)
+++ lucene/solr/trunk/src/test/test-files/solr/conf/keep-1.txt Wed Apr 1
20:07:44 2009
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+foo
+bar
\ No newline at end of file
Added: lucene/solr/trunk/src/test/test-files/solr/conf/keep-2.txt
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/test-files/solr/conf/keep-2.txt?rev=761036&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/test-files/solr/conf/keep-2.txt (added)
+++ lucene/solr/trunk/src/test/test-files/solr/conf/keep-2.txt Wed Apr 1
20:07:44 2009
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+junk
+more
\ No newline at end of file
Added: lucene/solr/trunk/src/test/test-files/solr/conf/schema-stop-keep.xml
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/test-files/solr/conf/schema-stop-keep.xml?rev=761036&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/test-files/solr/conf/schema-stop-keep.xml (added)
+++ lucene/solr/trunk/src/test/test-files/solr/conf/schema-stop-keep.xml Wed
Apr 1 20:07:44 2009
@@ -0,0 +1,67 @@
+<?xml version="1.0" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!--
+
+ For testing stopword configuration and keep word configuration
+
+ $Id: schema.xml 382610 2006-03-03 01:43:03Z yonik $
+ $Source:
/cvs/main/searching/solr-configs/test/WEB-INF/classes/schema.xml,v $
+ $Name: $
+ -->
+
+<schema name="test" version="1.0">
+ <types>
+
+
+ <fieldtype name="integer" class="solr.IntField" />
+
+ <fieldtype name="string" class="solr.StrField" sortMissingLast="true"/>
+ <fieldtype name="stop-one" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true"
+ words="stop-1.txt"/>
+
+ </analyzer>
+ </fieldtype>
+ <fieldtype name="stop-two" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true"
+ words="stop-1.txt,stop-2.txt"/>
+
+ </analyzer>
+ </fieldtype>
+ </types>
+
+
+ <fields>
+ <field name="id" type="integer" indexed="true" stored="true"
multiValued="false" required="false"/>
+ <field name="one" type="stop-one" indexed="true" stored="false"/>
+ <field name="two" type="stop-two" indexed="true" stored="false"/>
+
+ </fields>
+
+ <defaultSearchField>one</defaultSearchField>
+ <uniqueKey>id</uniqueKey>
+
+
+</schema>
Added: lucene/solr/trunk/src/test/test-files/solr/conf/stop-1.txt
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/test-files/solr/conf/stop-1.txt?rev=761036&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/test-files/solr/conf/stop-1.txt (added)
+++ lucene/solr/trunk/src/test/test-files/solr/conf/stop-1.txt Wed Apr 1
20:07:44 2009
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+foo
+bar
\ No newline at end of file
Added: lucene/solr/trunk/src/test/test-files/solr/conf/stop-2.txt
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/test-files/solr/conf/stop-2.txt?rev=761036&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/test-files/solr/conf/stop-2.txt (added)
+++ lucene/solr/trunk/src/test/test-files/solr/conf/stop-2.txt Wed Apr 1
20:07:44 2009
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+junk
+more
\ No newline at end of file