http://www.mediawiki.org/wiki/Special:Code/MediaWiki/82929
Revision: 82929
Author: nikerabbit
Date: 2011-02-28 10:11:44 +0000 (Mon, 28 Feb 2011)
Log Message:
-----------
Cleanups to spelling, comments, imports and code duplication
Modified Paths:
--------------
trunk/lucene-search-2/src/org/apache/commons/lang/WordUtils.java
trunk/lucene-search-2/src/org/apache/lucene/analysis/KStemFilter.java
trunk/lucene-search-2/src/org/apache/lucene/analysis/KStemmer.java
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/AcronymFilter.java
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/AliasFilter.java
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/Alttitles.java
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/Analyzers.java
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/CJKFilter.java
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/CategoryAnalyzer.java
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/EnglishKStemSingular.java
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/EnglishSingular.java
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/EnglishSingularFilter.java
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/EsperantoStemFilter.java
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/FieldBuilder.java
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/HyphenFilter.java
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/KeywordsAnalyzer.java
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/LowercaseAnalyzer.java
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/PhraseFilter.java
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/StopWords.java
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/VietnameseFilter.java
trunk/lucene-search-2/src/org/wikimedia/lsearch/config/GlobalConfiguration.java
trunk/lucene-search-2/src/org/wikimedia/lsearch/config/IndexId.java
trunk/lucene-search-2/src/org/wikimedia/lsearch/frontend/HttpHandler.java
trunk/lucene-search-2/src/org/wikimedia/lsearch/frontend/HttpMonitor.java
trunk/lucene-search-2/src/org/wikimedia/lsearch/frontend/SearchDaemon.java
trunk/lucene-search-2/src/org/wikimedia/lsearch/frontend/SearchServer.java
trunk/lucene-search-2/src/org/wikimedia/lsearch/search/SearchEngine.java
trunk/lucene-search-2/test/org/wikimedia/lsearch/util/LocalizationTest.java
Modified: trunk/lucene-search-2/src/org/apache/commons/lang/WordUtils.java
===================================================================
--- trunk/lucene-search-2/src/org/apache/commons/lang/WordUtils.java
2011-02-28 03:39:05 UTC (rev 82928)
+++ trunk/lucene-search-2/src/org/apache/commons/lang/WordUtils.java
2011-02-28 10:11:44 UTC (rev 82929)
@@ -21,7 +21,7 @@
*
* <p>This class tries to handle <code>null</code> input gracefully.
* An exception will not be thrown for a <code>null</code> input.
- * Each method documents its behaviour in more detail.</p>
+ * Each method documents its behavior in more detail.</p>
*
* @author Apache Jakarta Velocity
* @author Stephen Colebourne
Modified: trunk/lucene-search-2/src/org/apache/lucene/analysis/KStemFilter.java
===================================================================
--- trunk/lucene-search-2/src/org/apache/lucene/analysis/KStemFilter.java
2011-02-28 03:39:05 UTC (rev 82928)
+++ trunk/lucene-search-2/src/org/apache/lucene/analysis/KStemFilter.java
2011-02-28 10:11:44 UTC (rev 82929)
@@ -45,56 +45,66 @@
import java.io.IOException;
-/** Transforms the token stream according to the KStem stemming algorithm.
- * For more information about KStem see <a
href="http://ciir.cs.umass.edu/pubfiles/ir-35.pdf">
- "Viewing Morphology as an Inference Process"</a>
- (Krovetz, R., Proceedings of the Sixteenth Annual International ACM SIGIR
- Conference on Research and Development in Information Retrieval, 191-203,
1993).
-
- Note: the input to the stemming filter must already be in lower case,
- so you will need to use LowerCaseFilter or LowerCaseTokenizer farther
- down the Tokenizer chain in order for this to work properly!
- <P>
- To use this filter with other analyzers, you'll want to write an
- Analyzer class that sets up the TokenStream chain as you want it.
- To use this with LowerCaseTokenizer, for example, you'd write an
- analyzer like this:
- <P>
- <PRE>
- class MyAnalyzer extends Analyzer {
- public final TokenStream tokenStream(String fieldName, Reader reader) {
- return new KStemStemFilter(new LowerCaseTokenizer(reader));
- }
- }
- </PRE>
-
+/**
+ * Transforms the token stream according to the KStem stemming algorithm. For
+ * more information about KStem see <a
+ * href="http://ciir.cs.umass.edu/pubfiles/ir-35.pdf">
+ * "Viewing Morphology as an Inference Process"</a> (Krovetz, R., Proceedings
of
+ * the Sixteenth Annual International ACM SIGIR Conference on Research and
+ * Development in Information Retrieval, 191-203, 1993).
+ *
+ * Note: the input to the stemming filter must already be in lower case, so you
+ * will need to use LowerCaseFilter or LowerCaseTokenizer farther down the
+ * Tokenizer chain in order for this to work properly!
+ * <P>
+ * To use this filter with other analyzers, you'll want to write an Analyzer
+ * class that sets up the TokenStream chain as you want it. To use this with
+ * LowerCaseTokenizer, for example, you'd write an analyzer like this:
+ * <P>
+ *
+ * <PRE>
+ * class MyAnalyzer extends Analyzer {
+ * public final TokenStream tokenStream(String fieldName, Reader reader) {
+ * return new KStemStemFilter(new LowerCaseTokenizer(reader));
+ * }
+ * }
+ * </PRE>
*/
public final class KStemFilter extends TokenFilter {
private KStemmer stemmer;
- /** Create a KStemmer with the given cache size.
- * @param in The TokenStream whose output will be the input to
KStemFilter.
- * @param cacheSize Maximum number of entries to store in the
- * Stemmer's cache (stems stored in this cache do not need to be
- * recomputed, speeding up the stemming process).
+ /**
+ * Create a KStemmer with the given cache size.
+ *
+ * @param in
+ * The TokenStream whose output will be the input to
KStemFilter.
+ * @param cacheSize
+ * Maximum number of entries to store in the Stemmer's cache
+ * (stems stored in this cache do not need to be recomputed,
+ * speeding up the stemming process).
*/
public KStemFilter(TokenStream in, int cacheSize) {
super(in);
stemmer = new KStemmer(cacheSize);
}
- /** Create a KStemmer with the default cache size of 20 000 entries.
- * @param in The TokenStream whose output will be the input to
KStemFilter.
+ /**
+ * Create a KStemmer with the default cache size of 20 000 entries.
+ *
+ * @param in
+ * The TokenStream whose output will be the input to
KStemFilter.
*/
public KStemFilter(TokenStream in) {
super(in);
stemmer = new KStemmer();
}
- /** Returns the next, stemmed, input Token.
- * @return The stemed form of a token.
- * @throws IOException
+ /**
+ * Returns the next, stemmed, input Token.
+ *
+ * @return The stemmed form of a token.
+ * @throws IOException
*/
public final Token next() throws IOException {
Token token = input.next();
@@ -103,7 +113,8 @@
else {
String s = stemmer.stem(token.termText());
if (!s.equals(token.termText()))
- return new Token(s, token.startOffset,
token.endOffset, token.type);
+ return new Token(s, token.startOffset,
token.endOffset,
+ token.type);
return token;
}
}
Modified: trunk/lucene-search-2/src/org/apache/lucene/analysis/KStemmer.java
===================================================================
--- trunk/lucene-search-2/src/org/apache/lucene/analysis/KStemmer.java
2011-02-28 03:39:05 UTC (rev 82928)
+++ trunk/lucene-search-2/src/org/apache/lucene/analysis/KStemmer.java
2011-02-28 10:11:44 UTC (rev 82929)
@@ -44,13 +44,14 @@
import java.io.*;
/**
- This class implements the Kstem algorithm
+ * This class implements the Kstem algorithm
*/
public class KStemmer {
- /** Default size of the cache that stores <code>(word,stem)</code>
pairs.
- <p>This speeds up processing since Kstem works by
- sucessive "transformations" to the input word until a
- suitable stem is found.
+ /**
+ * Default size of the cache that stores <code>(word,stem)</code> pairs.
+ *
+ * This speeds up processing since Kstem works by successive
+ * "transformations" to the input word until a suitable stem is found.
*/
static public int DEFAULT_CACHE_SIZE = 20000;
static private final int MaxWordLen = 100;
@@ -203,9 +204,9 @@
}
}
- private static Hashtable dict_ht = null;
+ private static Hashtable<String, DictEntry> dict_ht = null;
private int MaxCacheSize;
- private Hashtable stem_ht = null;
+ private Hashtable<String, String> stem_ht = null;
private StringBuffer word;
private int j; /* index of final letter in stem (within word) */
private int k; /* INDEX of final letter in word.
@@ -214,7 +215,7 @@
wordLength, which returns (k+1). */
private void initializeStemHash() {
- stem_ht = new Hashtable();
+ stem_ht = new Hashtable<String, String>();
}
private char finalChar() {
@@ -249,7 +250,7 @@
if (dict_ht != null)
return;
- dict_ht = new Hashtable();
+ dict_ht = new Hashtable<String, DictEntry>();
for (int i=0;i<exceptionWords.length;i++) {
if (!dict_ht.containsKey(exceptionWords[i])) {
entry = new DictEntry(exceptionWords[i],true);
@@ -282,110 +283,28 @@
}
defaultEntry = new DictEntry(null,false);
-
- String[] array;
- array = KStemData1.data;
-
- for (int i=0;i<array.length;i++) {
- if (!dict_ht.containsKey(array[i])) {
- dict_ht.put(array[i],defaultEntry);
+
+ appendStems( dict_ht, defaultEntry, KStemData1.data, "4" );
+ appendStems( dict_ht, defaultEntry, KStemData2.data, "4" );
+ appendStems( dict_ht, defaultEntry, KStemData3.data, "4" );
+ appendStems( dict_ht, defaultEntry, KStemData4.data, "4" );
+ appendStems( dict_ht, defaultEntry, KStemData5.data, "4" );
+ appendStems( dict_ht, defaultEntry, KStemData6.data, "4" );
+ appendStems( dict_ht, defaultEntry, KStemData7.data, "4" );
+ appendStems( dict_ht, defaultEntry, KStemData8.data, "4" );
+ appendStems( dict_ht, defaultEntry, supplementDict, "5" );
+ appendStems( dict_ht, defaultEntry, properNouns, "6" );
+ }
+
+ private static void appendStems( Hashtable<String, DictEntry> stems,
DictEntry defaultEntry, String[] array, String dict ) {
+ for (int i=0; i < array.length; i++) {
+ if (!stems.containsKey(array[i])) {
+ stems.put(array[i],defaultEntry);
} else {
System.out.println("Warning: Entry ["+array[i]+
- "] already in dictionary 4");
+ "] already in dictionary " + dict);
}
}
-
-
- array = KStemData2.data;
- for (int i=0;i<array.length;i++) {
- if (!dict_ht.containsKey(array[i])) {
- dict_ht.put(array[i],defaultEntry);
- } else {
- System.out.println("Warning: Entry ["+array[i]+
- "] already in dictionary 4");
- }
- }
-
- array = KStemData3.data;
- for (int i=0;i<array.length;i++) {
- if (!dict_ht.containsKey(array[i])) {
- dict_ht.put(array[i],defaultEntry);
- } else {
- System.out.println("Warning: Entry ["+array[i]+
- "] already in dictionary 4");
- }
- }
-
- array = KStemData4.data;
- for (int i=0;i<array.length;i++) {
- if (!dict_ht.containsKey(array[i])) {
- dict_ht.put(array[i],defaultEntry);
- } else {
- System.out.println("Warning: Entry ["+array[i]+
- "] already in dictionary 4");
- }
- }
-
-
- array = KStemData5.data;
- for (int i=0;i<array.length;i++) {
- if (!dict_ht.containsKey(array[i])) {
- dict_ht.put(array[i],defaultEntry);
- } else {
- System.out.println("Warning: Entry ["+array[i]+
- "] already in dictionary 4");
- }
- }
-
-
- array = KStemData6.data;
- for (int i=0;i<array.length;i++) {
- if (!dict_ht.containsKey(array[i])) {
- dict_ht.put(array[i],defaultEntry);
- } else {
- System.out.println("Warning: Entry ["+array[i]+
- "] already in dictionary 4");
- }
- }
-
- array = KStemData7.data;
- for (int i=0;i<array.length;i++) {
- if (!dict_ht.containsKey(array[i])) {
- dict_ht.put(array[i],defaultEntry);
- } else {
- System.out.println("Warning: Entry ["+array[i]+
- "] already in dictionary 4");
- }
- }
-
- for (int i=0;i<KStemData8.data.length;i++) {
- if (!dict_ht.containsKey(KStemData8.data[i])) {
- dict_ht.put(KStemData8.data[i],defaultEntry);
- } else {
- System.out.println("Warning: Entry
["+KStemData8.data[i]+
- "] already in dictionary 4");
- }
- }
-
- for (int i=0;i<supplementDict.length;i++) {
- if (!dict_ht.containsKey(supplementDict[i])) {
- dict_ht.put(supplementDict[i],defaultEntry);
- } else {
- System.out.println("Warning: Entry ["+
- supplementDict[i]+
- "] already in dictionary 5");
- }
- }
-
- for (int i=0;i<properNouns.length;i++) {
- if (!dict_ht.containsKey(properNouns[i])) {
- dict_ht.put(properNouns[i],defaultEntry);
- } else {
- System.out.println("Warning: Entry ["+
- properNouns[i]+
- "] already in dictionary 6");
- }
- }
}
private boolean isAlpha(char ch) {
Modified:
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/AcronymFilter.java
===================================================================
---
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/AcronymFilter.java
2011-02-28 03:39:05 UTC (rev 82928)
+++
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/AcronymFilter.java
2011-02-28 10:11:44 UTC (rev 82929)
@@ -6,6 +6,9 @@
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+/**
+ * Removes dots from acronyms?
+ */
public class AcronymFilter extends TokenFilter {
Token buffered = null;
Modified:
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/AliasFilter.java
===================================================================
--- trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/AliasFilter.java
2011-02-28 03:39:05 UTC (rev 82928)
+++ trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/AliasFilter.java
2011-02-28 10:11:44 UTC (rev 82929)
@@ -1,12 +1,9 @@
package org.wikimedia.lsearch.analyzers;
import java.io.IOException;
-import java.lang.reflect.Constructor;
-import java.lang.reflect.InvocationTargetException;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
/**
@@ -30,7 +27,6 @@
* 2) stemmers should never change tokens, if the text needs to be
* changed, return a new Token object
*
- * @param language
*/
public AliasFilter(FilterFactory filters, TokenStream input,
TokenStream duplicate){
this.input = input;
Modified:
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/Alttitles.java
===================================================================
--- trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/Alttitles.java
2011-02-28 03:39:05 UTC (rev 82928)
+++ trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/Alttitles.java
2011-02-28 10:11:44 UTC (rev 82929)
@@ -58,7 +58,7 @@
}
/**
- * Serialize alttitle for highlighting, serializies titles, redirects,
sections.
+ * Serialize alttitle for highlighting, serializes titles, redirects,
sections.
* Writes original names + highlight tokens.
*
* @param article
Modified:
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/Analyzers.java
===================================================================
--- trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/Analyzers.java
2011-02-28 03:39:05 UTC (rev 82928)
+++ trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/Analyzers.java
2011-02-28 10:11:44 UTC (rev 82929)
@@ -1,28 +1,12 @@
package org.wikimedia.lsearch.analyzers;
-import java.util.ArrayList;
-import java.util.HashMap;
import java.util.HashSet;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;
-import org.apache.lucene.analysis.PorterStemFilter;
-import org.apache.lucene.analysis.SimpleAnalyzer;
-import org.apache.lucene.analysis.de.GermanStemFilter;
-import org.apache.lucene.analysis.fr.FrenchStemFilter;
-import org.apache.lucene.analysis.nl.DutchStemFilter;
-import org.apache.lucene.analysis.ru.RussianStemFilter;
-import org.apache.lucene.analysis.th.ThaiWordFilter;
-import org.apache.lucene.search.FieldSortedHitQueue;
-import org.wikimedia.lsearch.analyzers.FieldBuilder.BuilderSet;
-import org.wikimedia.lsearch.beans.Article;
-import org.wikimedia.lsearch.beans.Title;
import org.wikimedia.lsearch.config.GlobalConfiguration;
import org.wikimedia.lsearch.config.IndexId;
-import org.wikimedia.lsearch.index.WikiIndexModifier;
-import org.wikimedia.lsearch.ranks.Links;
-import org.wikimedia.lsearch.related.RelatedTitle;
/**
* Global functions related to creation/usage of analyzers.
Modified:
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/CJKFilter.java
===================================================================
--- trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/CJKFilter.java
2011-02-28 03:39:05 UTC (rev 82928)
+++ trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/CJKFilter.java
2011-02-28 10:11:44 UTC (rev 82929)
@@ -72,8 +72,8 @@
(c >= 0x3300 && c <= 0x337f) ||
(c >= 0x3400 && c <= 0x3d2d) ||
(c >= 0x4e00 && c <= 0x9fff) ||
- (c >= 0xf900 && c <= 0xfaff) ||
- (c >= 0xac00 && c <= 0xd7af);
+ (c >= 0xf900 && c <= 0xfaff) ||
+ (c >= 0xac00 && c <= 0xd7af);
}
}
\ No newline at end of file
Modified:
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/CategoryAnalyzer.java
===================================================================
---
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/CategoryAnalyzer.java
2011-02-28 03:39:05 UTC (rev 82928)
+++
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/CategoryAnalyzer.java
2011-02-28 10:11:44 UTC (rev 82929)
@@ -6,12 +6,11 @@
import java.util.Iterator;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
/** Produces a token stream for category field in the lucene index.
- * Each token is a single category (category names themself are
+ * Each token is a single category (category names themselves are
* not tokenized) */
public class CategoryAnalyzer extends Analyzer {
public class ArrayTokenStream extends TokenStream {
Modified:
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/EnglishKStemSingular.java
===================================================================
---
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/EnglishKStemSingular.java
2011-02-28 03:39:05 UTC (rev 82928)
+++
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/EnglishKStemSingular.java
2011-02-28 10:11:44 UTC (rev 82929)
@@ -3,7 +3,7 @@
import org.apache.lucene.analysis.KStemmer;
/**
- * KStem-based singular-finding class for english
+ * KStem-based singular-finding class for English
*
* @author rainman
*
@@ -15,7 +15,7 @@
if(!word.equals(ret))
return ret;
else{
- // strip possesive
+ // strip possessive suffix
if(word.endsWith("'s"))
return word.substring(0,word.length()-2);
return null;
Modified:
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/EnglishSingular.java
===================================================================
---
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/EnglishSingular.java
2011-02-28 03:39:05 UTC (rev 82928)
+++
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/EnglishSingular.java
2011-02-28 10:11:44 UTC (rev 82929)
@@ -1,9 +1,11 @@
package org.wikimedia.lsearch.analyzers;
+import java.util.Arrays;
import java.util.HashMap;
+import java.util.HashSet;
/**
- * Porter-based singular filter for english
+ * Porter-based singular filter for English
*
* @author rainman
*
@@ -18,10 +20,11 @@
if(w.length() <= 3 || w.charAt(w.length()-1) != 's')
return null;
// exceptions (from porter2)
- if("news".equals(w) || "atlas".equals(w) || "cosmos".equals(w)
- || "bias".equals(w) || "andes".equals(w) ||
"aries".equals(w))
+ String[] exceptions = { "news", "atlas", "cosmos", "bias",
"andes", "aries" };
+ HashSet<String> set = new
HashSet<String>(Arrays.asList(exceptions));
+ if( set.contains(w) )
return null;
- // don't strip posssesive form
+ // don't strip possessive form
if(w.endsWith("'s")){
//if(w.length() > 2)
// return w.substring(0,w.length()-2);
Modified:
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/EnglishSingularFilter.java
===================================================================
---
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/EnglishSingularFilter.java
2011-02-28 03:39:05 UTC (rev 82928)
+++
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/EnglishSingularFilter.java
2011-02-28 10:11:44 UTC (rev 82929)
@@ -8,7 +8,7 @@
import org.apache.lucene.analysis.TokenStream;
/**
- * Add english singular forms of words as aliases of
+ * Add English singular forms of words as aliases of
* type "singular"
*
* @author rainman
@@ -17,7 +17,7 @@
public class EnglishSingularFilter extends TokenFilter{
Singular singular = new EnglishKStemSingular();
- Token next = null, next2=null;
+ Token next = null, next2= null;
public EnglishSingularFilter(TokenStream input) {
super(input);
}
@@ -53,7 +53,7 @@
return t;
}
- /** Return token with sigular form of the noun, or null if none found */
+ /** Return token with singular form of the noun, or null if none found
*/
protected final Token singular(Token t){
String w = singular.getSingular(t.termText());
if(w != null){
Modified:
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/EsperantoStemFilter.java
===================================================================
---
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/EsperantoStemFilter.java
2011-02-28 03:39:05 UTC (rev 82928)
+++
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/EsperantoStemFilter.java
2011-02-28 10:11:44 UTC (rev 82929)
@@ -32,7 +32,7 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenFilter;
-/** Stem filter for esperanto */
+/** Stem filter for Esperanto */
public class EsperantoStemFilter extends TokenFilter {
public EsperantoStemFilter(TokenStream tokenizer) {
super(tokenizer);
Modified:
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java
===================================================================
---
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java
2011-02-28 03:39:05 UTC (rev 82928)
+++
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java
2011-02-28 10:11:44 UTC (rev 82929)
@@ -18,10 +18,10 @@
/**
* Wiki Tokenizer. Tokens are words and numbers. All letters are
- * lowercased and diacritics deleted using unicode compatibility
+ * lowercased and diacritics deleted using Unicode compatibility
* decomposition (i.e. č -> c). Parses some basic wiki syntax,
* template names are skipped, from images captions are extracted,
- * categories and interwiki links are extracted ...
+ * categories and interwiki links are extracted...
*
* Tokenizer will not take a Reader as input, but a String (for
* optimal performance)
@@ -172,7 +172,7 @@
* This function is called at word boundaries, it is used to
* make a new token and add it to token stream
*
- * Does unicode decomposition, and will make alias token with
+ * Does Unicode decomposition, and will make alias token with
* alternative transliterations (e.g. ö -> oe)
*/
private final void addToken(){
@@ -203,7 +203,7 @@
boolean addDecomposed = false;
boolean allUpperCase = true;
boolean titleCase = true;
- boolean split = false; // if more tokens shold be
produced, e.g. joe's -> joe + s
+ boolean split = false; // if more tokens should be
produced, e.g. joe's -> joe + s
for(int i=0;i<length;i++){
if(decomposer.isCombiningChar(buffer[i])){
addDecomposed = true;
@@ -328,7 +328,7 @@
else if(titleCase)
exact.setType("titlecase");
}
- // detect hyphenation (takes presedence over case
detection)
+ // detect hyphenation (takes precedence over case
detection)
if(cur+1<textLength && text[cur]=='-' &&
(Character.isLetterOrDigit(text[cur+1]) ||
decomposer.isCombiningChar(text[cur+1])))
exact.setType("with_hyphen");
@@ -347,14 +347,14 @@
if(decompLength!=0 && addDecomposed){
Token t = makeToken(new
String(decompBuffer, 0, decompLength), start, start + length, false);
t.setPositionIncrement(0);
- t.setType(exact.type());
+ t.setType(exact.type() + "-decomposed");
addToTokens(t);
}
// add alias (if any) token to stream
if(aliasLength>0){
Token t = makeToken(new
String(aliasBuffer, 0, aliasLength), start, start + length, false);
t.setPositionIncrement(0);
- t.setType(exact.type());
+ t.setType(exact.type() + "-aliased");
addToTokens(t);
}
}
@@ -796,7 +796,7 @@
if(lc == '\n' || lc =='\r')
break;
}
- int start=0, end=0; // number of ='s at begining and
end of line
+ int start=0, end=0; // number of ='s at beginning and
end of line
// find first sequence of =
for(lookup = cur ; lookup < textLength && lookup <
endOfLine ; lookup++ ){
if(text[lookup] == '=')
@@ -804,7 +804,7 @@
else
break;
}
- // find the last squence of =
+ // find the last sequence of =
for(lookup = endOfLine-1 ; lookup > cur ; lookup-- ){
if(text[lookup] == '=')
end++;
@@ -843,6 +843,7 @@
}
return true;
}
+
/** Check if it's a reference tag starting at cur */
protected boolean checkRefStart(){
if(matchesString("<ref")){
@@ -894,7 +895,7 @@
return tokens;
}
- // strip comments so we don't neded to complicate syntax
parsing even more
+ // strip comments so we don't need to complicate syntax parsing
even more
stripComments();
// start parsing
@@ -974,7 +975,7 @@
}
}
} else if(cur > 0 &&
text[cur-1]=='\n' && text[cur+1] == '-'){
- // explicitely put '-'
into the glue buffer
+ // Explicitly put '-'
into the glue buffer
if(options.highlightParsing){
if(glueLength
== 0)
glueStart = cur+1;
@@ -1276,7 +1277,7 @@
continue;
case LINK_FETCH:
if(length == 0 && c ==' ')
- continue; // ignore leading whitespaces
+ continue; // ignore leading whitespace
if(c == ']'){
state = ParserState.LINK_END;
continue;
@@ -1333,7 +1334,7 @@
cur =
fetchStart;
state =
ParserState.CATEGORY_WORDS;
} else
-
System.err.print("ERROR: Inconsistent parser state, attepmted category
backtrace for uninitalized fetchStart.");
+
System.err.print("ERROR: Inconsistent parser state, attempted category
backtrace for uninitialized fetchStart.");
fetchStart = -1;
continue;
case INTERWIKI:
@@ -1375,7 +1376,7 @@
continue;
case TABLE_BEGIN:
tableLevel++;
- // ignore everything up to the newspace, since
they are table display params
+ // ignore everything up to the newline, since
they are table display params
while(cur < textLength && (text[cur]!='\r' &&
text[cur]!='\n'))
cur++;
state = ParserState.WORD;
@@ -1422,7 +1423,7 @@
flushGlue();
if(nonContentTokens.size() != 0){
boolean first = true;
- // flush any remaning tokens from initial templates,
etc..
+ // flush any remaining tokens from initial templates,
etc..
for(Token tt : nonContentTokens){
if(first){
tt.setPositionIncrement(FIRST_SECTION_GAP);
@@ -1595,7 +1596,11 @@
return new String(buf,0,len).trim();
}
- /** Delete all vowels from a word or phrase */
+ /**
+ * Delete all vowels from a word or phrase
+ *
+ * Unused (except test)?
+ */
public static String deleteVowels(String title){
char[] buf = new char[256];
Modified:
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/FieldBuilder.java
===================================================================
--- trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/FieldBuilder.java
2011-02-28 03:39:05 UTC (rev 82928)
+++ trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/FieldBuilder.java
2011-02-28 10:11:44 UTC (rev 82929)
@@ -3,18 +3,18 @@
import org.wikimedia.lsearch.config.IndexId;
/**
- * Agregate class for FilterFactory and FieldNameFactory. This class
- * contains methods used to build various fields of the index,
- * it contains field names to be used, filter that are to be applied...
+ * Aggregate class for FilterFactory and FieldNameFactory. This class contains
+ * methods used to build various fields of the index, it contains field names
to
+ * be used, filter that are to be applied...
*
* @author rainman
- *
+ *
*/
public class FieldBuilder {
public class BuilderSet{
FilterFactory filters;
FieldNameFactory fields;
- boolean addKeywords; // wether to add keywords from beginning
of article
+ boolean addKeywords; // whether to add keywords from beginning
of article
public BuilderSet(FilterFactory filters, FieldNameFactory
fields) {
this.filters = filters;
Modified:
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/HyphenFilter.java
===================================================================
--- trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/HyphenFilter.java
2011-02-28 03:39:05 UTC (rev 82928)
+++ trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/HyphenFilter.java
2011-02-28 10:11:44 UTC (rev 82929)
@@ -19,7 +19,7 @@
@Override
public Token next() throws IOException {
- // return buferred
+ // return buffered
if(inx < buffer.size())
return buffer.get(inx++);
Modified:
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/KeywordsAnalyzer.java
===================================================================
---
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/KeywordsAnalyzer.java
2011-02-28 03:39:05 UTC (rev 82928)
+++
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/KeywordsAnalyzer.java
2011-02-28 10:11:44 UTC (rev 82929)
@@ -67,7 +67,7 @@
for(int i=0;i<levels;i++)
keywordsBySize.add(new ArrayList<String>());
TokenizerOptions options = new TokenizerOptions(exactCase);
- // arange keywords into a list by token number
+ // arrange keywords into a list by token number
for(String k : keywords){
ArrayList<Token> parsed = new
FastWikiTokenizerEngine(k,iid,options).parse();
if(parsed.size() == 0)
Modified:
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/LowercaseAnalyzer.java
===================================================================
---
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/LowercaseAnalyzer.java
2011-02-28 03:39:05 UTC (rev 82928)
+++
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/LowercaseAnalyzer.java
2011-02-28 10:11:44 UTC (rev 82929)
@@ -7,7 +7,7 @@
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
/**
- * Analyzer that just lowecases the text, doesn't split up anything, etc..
+ * Analyzer that just lowercases the text, doesn't split up anything, etc..
*
* @author rainman
*
Modified:
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/PhraseFilter.java
===================================================================
--- trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/PhraseFilter.java
2011-02-28 03:39:05 UTC (rev 82928)
+++ trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/PhraseFilter.java
2011-02-28 10:11:44 UTC (rev 82929)
@@ -38,7 +38,7 @@
protected Token phrase1 = null, phrase2 = null;
protected boolean phraseReady = false;
protected String gap = "_";
- /** pairs of words, two adjecent words */
+ /** pairs of words, two adjacent words */
protected Token pair1 = null, pair2 = null;
protected boolean pairReady = false;
protected Token nextToken = null;
Modified:
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/StopWords.java
===================================================================
--- trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/StopWords.java
2011-02-28 03:39:05 UTC (rev 82928)
+++ trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/StopWords.java
2011-02-28 10:11:44 UTC (rev 82929)
@@ -90,7 +90,7 @@
return ret;
}
- /** Get a brand new hash set of predifined stop words (i.e. not those
generated from lucene indexes) */
+ /** Get a brand new hash set of predefined stop words (i.e. not those
generated from lucene indexes) */
public static HashSet<String> getPredefinedSet(String langCode){
loadPredefined();
HashSet<String> ret = new HashSet<String>();
Modified:
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/VietnameseFilter.java
===================================================================
---
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/VietnameseFilter.java
2011-02-28 03:39:05 UTC (rev 82928)
+++
trunk/lucene-search-2/src/org/wikimedia/lsearch/analyzers/VietnameseFilter.java
2011-02-28 10:11:44 UTC (rev 82929)
@@ -7,8 +7,8 @@
import org.apache.lucene.analysis.TokenStream;
/**
- * Vietnamese standard transliterations to ascii. Most of the stuff is done by
unicode decomposed,
- * we just additionaly convert Đ/đ -> D/d
+ * Vietnamese standard transliterations to ascii. Most of the stuff is done by
Unicode decomposition.
+ * Additional conversions here are: Đ/đ -> D/d
*
* @author rainman
*
Modified:
trunk/lucene-search-2/src/org/wikimedia/lsearch/config/GlobalConfiguration.java
===================================================================
---
trunk/lucene-search-2/src/org/wikimedia/lsearch/config/GlobalConfiguration.java
2011-02-28 03:39:05 UTC (rev 82928)
+++
trunk/lucene-search-2/src/org/wikimedia/lsearch/config/GlobalConfiguration.java
2011-02-28 10:11:44 UTC (rev 82929)
@@ -119,13 +119,13 @@
protected static GlobalConfiguration instance = null;
- /** All the lang codes we encountered, used for "smart interwiki" */
+ /** All the language codes we encountered, used for "smart interwiki" */
protected HashSet<String> smartInterwikiCodes = new HashSet<String>();
protected boolean useSmartInterwiki = false;
protected int maxSearchLimit = 1000;
protected int maxSearchOffset = 1000000;
- /** Wether to report warnings and info */
+ /** Whether to report warnings and info */
protected static boolean verbose = true;
/** Sections in lsearch-config.conf */
@@ -145,14 +145,12 @@
}
protected GlobalConfiguration(){
- // try to determin this hosts IP address
+ // try to determine this host's IP address
determineInetAddress();
}
/**
* Get singleton instance of this class
- *
- * @return
*/
synchronized public static GlobalConfiguration getInstance() {
if (instance == null)
@@ -382,7 +380,7 @@
}
/**
- * Reads a config file from a bufferedreader, will
+ * Reads a config file from a BufferedReader, will
* close the reader when done.
*
* @param in opened reader
@@ -423,7 +421,7 @@
prop.append("\n");
}
globalProperties.load(new
ByteArrayInputStream(prop.toString().getBytes("utf-8")));
- // get some predifined global properties
+ // get some predefined global properties
this.databaseSuffixes =
getArrayProperty("Database.suffix");
this.keywordScoringSuffixes =
getArrayProperty("KeywordScoring.suffix");
this.exactCaseSuffix =
getArrayProperty("ExactCase.suffix");
Modified: trunk/lucene-search-2/src/org/wikimedia/lsearch/config/IndexId.java
===================================================================
--- trunk/lucene-search-2/src/org/wikimedia/lsearch/config/IndexId.java
2011-02-28 03:39:05 UTC (rev 82928)
+++ trunk/lucene-search-2/src/org/wikimedia/lsearch/config/IndexId.java
2011-02-28 10:11:44 UTC (rev 82929)
@@ -83,7 +83,7 @@
/** Type of index, enumeration */
protected IndexType type;
- /** Part number in split repestnation, e.g. 1..N */
+ /** Part number in split representation, e.g. 1..N */
protected int partNum;
/** Namespace -> part (for nssplit indexes) */
@@ -137,10 +137,10 @@
/** Namespaces that are searched by default */
protected NamespaceFilter defaultNs = null;
- /** filter set to true for namespaces with subpages */
+ /** Filter set to true for namespaces with subpages */
protected NamespaceFilter nsWithSubpages = null;
- /** namespaces with content (from initialise settings) */
+ /** Namespaces with content (from initialise settings) */
protected NamespaceFilter contentNamespaces = null;
/** If we should be using additional global rank for scores */
@@ -683,7 +683,6 @@
/**
* Get all indexes parts for this iid except for logical names.
* I.e. for db of kind mainsplit, it will return db.mainpart,
db.restpart
- * @return
*/
public HashSet<String> getPhysicalIndexes() {
HashSet<String> ret = new HashSet<String>();
@@ -712,8 +711,6 @@
/**
* Wrapper for getPhysicalIndexes to get iid objects
- *
- * @return
*/
public ArrayList<IndexId> getPhysicalIndexIds(){
HashSet<String> physical = getPhysicalIndexes();
Modified:
trunk/lucene-search-2/src/org/wikimedia/lsearch/frontend/HttpHandler.java
===================================================================
--- trunk/lucene-search-2/src/org/wikimedia/lsearch/frontend/HttpHandler.java
2011-02-28 03:39:05 UTC (rev 82928)
+++ trunk/lucene-search-2/src/org/wikimedia/lsearch/frontend/HttpHandler.java
2011-02-28 10:11:44 UTC (rev 82929)
@@ -18,7 +18,7 @@
/**
* Simple HTTP 1.1 handler, used for Index and Search daemons
- * for more info on protocole see handle() method
+ * for more info about the protocol see handle() method
*
* @author Brion Vibber
*
@@ -136,7 +136,7 @@
* URL path format: /operation/database/searchterm
* The path should be URL-encoded UTF-8 (standard IRI).
*
- * Additional paramters may be specified in a query string:
+ * Additional parameters may be specified in a query string:
* namespaces: comma-separated list of namespace numeric keys to
subset results
* limit: maximum number of results to return
* offset: number of matches to skip before returning results
@@ -271,7 +271,7 @@
return null;
}
- /** This method is to be used for header reads only (which is utf-8
free!) */
+ /** This method is to be used for header reads only (which is UTF-8
free!) */
@SuppressWarnings("deprecation")
protected String readInputLine() {
String sin="";
Modified:
trunk/lucene-search-2/src/org/wikimedia/lsearch/frontend/HttpMonitor.java
===================================================================
--- trunk/lucene-search-2/src/org/wikimedia/lsearch/frontend/HttpMonitor.java
2011-02-28 03:39:05 UTC (rev 82928)
+++ trunk/lucene-search-2/src/org/wikimedia/lsearch/frontend/HttpMonitor.java
2011-02-28 10:11:44 UTC (rev 82929)
@@ -4,18 +4,17 @@
import java.util.Collections;
import java.util.Comparator;
import java.util.Hashtable;
-import java.util.List;
import java.util.Map.Entry;
import org.apache.log4j.Logger;
public class HttpMonitor extends Thread {
static Logger log = Logger.getLogger(HttpMonitor.class);
- protected static HttpMonitor instance=null;
+ protected static HttpMonitor instance;
/** times when http request have been started */
protected Hashtable<HttpHandler,Long> startTimes = new
Hashtable<HttpHandler,Long>();
- /** threshold for reporting 10s */
+ /** threshold in milliseconds for reporting */
protected long threshold = 10000;
private HttpMonitor(){}
Modified:
trunk/lucene-search-2/src/org/wikimedia/lsearch/frontend/SearchDaemon.java
===================================================================
--- trunk/lucene-search-2/src/org/wikimedia/lsearch/frontend/SearchDaemon.java
2011-02-28 03:39:05 UTC (rev 82928)
+++ trunk/lucene-search-2/src/org/wikimedia/lsearch/frontend/SearchDaemon.java
2011-02-28 10:11:44 UTC (rev 82929)
@@ -436,7 +436,7 @@
log.error("Error sending result line ("+score + " " +
namespace + " " + title +"): "+e.getMessage(),e);
}
}
-
+ /** Unused? */
private void sendResultLine(String namespace, String title) {
try{
sendOutputLine(namespace + " " +
encodeTitle(title));
Modified:
trunk/lucene-search-2/src/org/wikimedia/lsearch/frontend/SearchServer.java
===================================================================
--- trunk/lucene-search-2/src/org/wikimedia/lsearch/frontend/SearchServer.java
2011-02-28 03:39:05 UTC (rev 82928)
+++ trunk/lucene-search-2/src/org/wikimedia/lsearch/frontend/SearchServer.java
2011-02-28 10:11:44 UTC (rev 82929)
@@ -72,7 +72,7 @@
if (max != null)
maxThreads = Integer.parseInt(max);
- // Initialise statistics
+ // Initialize statistics
stats = new Statistics(1000, statsPeriod);
if (config.getBoolean("Ganglia", "report")) {
log.info("Starting ganglia statistics thread...");
Modified:
trunk/lucene-search-2/src/org/wikimedia/lsearch/search/SearchEngine.java
===================================================================
--- trunk/lucene-search-2/src/org/wikimedia/lsearch/search/SearchEngine.java
2011-02-28 03:39:05 UTC (rev 82928)
+++ trunk/lucene-search-2/src/org/wikimedia/lsearch/search/SearchEngine.java
2011-02-28 10:11:44 UTC (rev 82929)
@@ -69,8 +69,19 @@
public class SearchEngine {
static org.apache.log4j.Logger log =
Logger.getLogger(SearchEngine.class);
+ /**
+ * Maximum number of search results at once.
+ */
protected static int maxlimit = 1000;
+
+ /**
+ * Largest search result offset.
+ */
protected static int maxoffset = 100000;
+
+ /**
+ * Maximum number of search results for prefix query.
+ */
protected final int MAXPREFIX = 50;
protected static GlobalConfiguration global = null;
protected static Configuration config = null;
@@ -518,7 +529,7 @@
return res;
}
- /** Strip key using PrefixIndexBuilder stip function */
+ /** Strip key using PrefixIndexBuilder strip function */
private String stripKey(String key){
return PrefixIndexBuilder.stripKey(key);
}
Modified:
trunk/lucene-search-2/test/org/wikimedia/lsearch/util/LocalizationTest.java
===================================================================
--- trunk/lucene-search-2/test/org/wikimedia/lsearch/util/LocalizationTest.java
2011-02-28 03:39:05 UTC (rev 82928)
+++ trunk/lucene-search-2/test/org/wikimedia/lsearch/util/LocalizationTest.java
2011-02-28 10:11:44 UTC (rev 82929)
@@ -1,16 +1,11 @@
package org.wikimedia.lsearch.util;
-import java.net.URL;
-
import org.wikimedia.lsearch.config.Configuration;
import org.wikimedia.lsearch.config.IndexId;
import org.wikimedia.lsearch.util.Localization;
public class LocalizationTest {
- /**
- * @param args
- */
public static void main(String[] args) {
Configuration.open();
String text = "#redirect [[mw]]";
_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs