// http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Analyzers/BR/BrazilianStemmer.cs
// diff --git a/src/contrib/Analyzers/BR/BrazilianStemmer.cs b/src/contrib/Analyzers/BR/BrazilianStemmer.cs
// (deleted file mode 100644, index 9ec12ec..0000000, hunk @@ -1,1264 +0,0 @@)
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

namespace Lucene.Net.Analysis.BR
{
    /// <summary>
    /// A stemmer for Brazilian words.
    /// </summary>
    /// <remarks>
    /// The working state of a single <see cref="Stem"/> call (changed term and
    /// its regions) is kept in instance fields, so one instance must not be
    /// used for overlapping calls.
    /// </remarks>
    public class BrazilianStemmer
    {
        /// <summary>Region of the changed term that a step-1 rule must match in.</summary>
        private enum WordRegion
        {
            R1,
            R2,
            RV
        }

        /// <summary>
        /// One "standard suffix" rule of step 1: the suffix, the region that
        /// must also end with it, the text that replaces it (empty = delete)
        /// and, optionally, the text that has to precede the suffix.
        /// </summary>
        private sealed class Step1Rule
        {
            public readonly string Suffix;
            public readonly WordRegion Region;
            public readonly string Replacement;
            public readonly string Preceding;

            public Step1Rule(string suffix, WordRegion region, string replacement, string preceding)
            {
                Suffix = suffix;
                Region = region;
                Replacement = replacement;
                Preceding = preceding;
            }
        }

        /// <summary>Characters stripped once from the start and once from the end of a term.</summary>
        private const string StrippablePunctuation = "\"'-,;.?!";

        // Step-1 rules. ORDER MATTERS: the first matching rule wins, which is
        // how "search for the longest suffix" is realised (longer suffixes are
        // listed before the shorter suffixes they end with).
        private static readonly Step1Rule[] Step1Rules =
        {
            Replace("uciones", "u"),

            Remove("imentos"), Remove("amentos"), Remove("adores"), Remove("adoras"),
            Replace("logias", "log"),
            Replace("encias", "ente"),
            new Step1Rule("amente", WordRegion.R1, "", null),
            Remove("idades"),

            Remove("acoes"), Remove("imento"), Remove("amento"), Remove("adora"),
            Remove("ismos"), Remove("istas"),
            Replace("logia", "log"),
            Replace("ucion", "u"),
            Replace("encia", "ente"),
            Remove("mente"), Remove("idade"),

            Remove("acao"), Remove("ezas"), Remove("icos"), Remove("icas"),
            Remove("ismo"), Remove("avel"), Remove("ivel"), Remove("ista"),
            Remove("osos"), Remove("osas"), Remove("ador"), Remove("ivas"),
            Remove("ivos"),
            new Step1Rule("iras", WordRegion.RV, "ir", "e"),

            Remove("eza"), Remove("ico"), Remove("ica"), Remove("oso"),
            Remove("osa"), Remove("iva"), Remove("ivo"),
            new Step1Rule("ira", WordRegion.RV, "ir", "e")
        };

        // Step-2 verb suffixes, longest first; the first one found at the end
        // of RV is deleted from the changed term.
        private static readonly string[] VerbSuffixes =
        {
            // length 7
            "issemos", "essemos", "assemos", "ariamos", "eriamos", "iriamos",
            // length 6
            "iremos", "eremos", "aremos", "avamos", "iramos", "eramos", "aramos",
            "asseis", "esseis", "isseis", "arieis", "erieis", "irieis",
            // length 5
            "irmos", "iamos", "armos", "ermos", "areis", "ereis", "ireis",
            "asses", "esses", "isses", "astes", "assem", "essem", "issem",
            "ardes", "erdes", "irdes", "ariam", "eriam", "iriam", "arias",
            "erias", "irias", "estes", "istes", "aveis",
            // length 4
            "aria", "eria", "iria", "asse", "esse", "isse", "aste", "este",
            "iste", "arei", "erei", "irei", "aram", "eram", "iram", "avam",
            "arem", "erem", "irem", "ando", "endo", "indo", "arao", "erao",
            "irao", "adas", "idas", "aras", "eras", "iras", "avas", "ares",
            "eres", "ires", "ados", "idos", "amos", "emos", "imos", "ieis",
            // length 3 ("ava" used to be tested as "ira" by mistake)
            "ada", "ida", "ara", "era", "ava", "iam", "ado", "ido", "ias",
            "ais", "eis", "ira", "ear",
            // length 2
            "ia", "ei", "am", "em", "ar", "er", "ir", "as", "es", "is",
            "eu", "iu", "ou"
        };

        /// <summary>The term as passed to <see cref="Stem"/>, followed by ";" and the changed term (for <see cref="Log"/>).</summary>
        private string TERM;

        /// <summary>Changed term: lower-cased, accent-folded, punctuation-trimmed; the steps edit it in place.</summary>
        private string CT;

        private string R1;
        private string R2;
        private string RV;

        public BrazilianStemmer()
        {
        }

        /// <summary>
        /// Stems the given term to a unique <c>discriminator</c>.
        /// </summary>
        /// <param name="term">The term that should be stemmed.</param>
        /// <returns>
        /// The discriminator for <paramref name="term"/>; <c>null</c> when the
        /// term is <c>null</c> or its changed form is not between 3 and 29
        /// characters long; the changed term itself when it contains a
        /// non-letter character.
        /// </returns>
        public string Stem(string term)
        {
            createCT(term);

            if (CT == null || !isIndexable(CT))
            {
                return null;
            }
            if (!isStemmable(CT))
            {
                return CT;
            }

            // The regions are computed once, from the unmodified changed term;
            // the steps below keep testing against them while editing CT.
            R1 = getR1(CT);
            R2 = getR1(R1);
            RV = getRV(CT);
            TERM = term + ";" + CT;

            bool altered = step1() || step2();

            if (altered)
            {
                step3();
            }
            else
            {
                step4();
            }

            step5();

            return CT;
        }

        /// <summary>
        /// For log and debug purposes.
        /// </summary>
        /// <returns>TERM, CT, RV, R1 and R2 of the last stemmed term.</returns>
        public string Log()
        {
            return " (TERM = " + TERM + ")" +
                   " (CT = " + CT + ")" +
                   " (RV = " + RV + ")" +
                   " (R1 = " + R1 + ")" +
                   " (R2 = " + R2 + ")";
        }

        private static Step1Rule Remove(string suffix)
        {
            return new Step1Rule(suffix, WordRegion.R2, "", null);
        }

        private static Step1Rule Replace(string suffix, string replacement)
        {
            return new Step1Rule(suffix, WordRegion.R2, replacement, null);
        }

        /// <summary>Returns the current text of the requested region (may be null).</summary>
        private string regionText(WordRegion region)
        {
            switch (region)
            {
                case WordRegion.R1:
                    return R1;
                case WordRegion.R2:
                    return R2;
                default:
                    return RV;
            }
        }

        /// <summary>
        /// Checks whether a term can be processed correctly.
        /// </summary>
        /// <returns>true if, and only if, the given term consists of letters only.</returns>
        private bool isStemmable(string term)
        {
            foreach (char c in term)
            {
                if (!char.IsLetter(c))
                {
                    return false;
                }
            }
            return true;
        }

        /// <summary>
        /// Checks whether a term can be indexed.
        /// </summary>
        /// <returns>true if the term is longer than 2 and shorter than 30 characters.</returns>
        private bool isIndexable(string term)
        {
            return (term.Length < 30) && (term.Length > 2);
        }

        /// <summary>
        /// Tells whether a character is one of 'a', 'e', 'i', 'o', 'u'.
        /// </summary>
        private bool isVowel(char value)
        {
            return value == 'a' || value == 'e' || value == 'i' || value == 'o' || value == 'u';
        }

        /// <summary>
        /// Gets R1: the region after the first non-vowel following a vowel.
        /// </summary>
        /// <returns>
        /// null when <paramref name="value"/> is null or no such non-vowel is
        /// found before the last character; otherwise the text after it.
        /// </returns>
        private string getR1(string value)
        {
            if (value == null)
            {
                return null;
            }

            // The last character is never inspected: a match there would
            // leave an empty region.
            int last = value.Length - 1;
            int pos = 0;

            // find the first vowel
            while (pos < last && !isVowel(value[pos]))
            {
                pos++;
            }
            if (pos >= last)
            {
                return null;
            }

            // find the first non-vowel after it
            while (pos < last && isVowel(value[pos]))
            {
                pos++;
            }
            if (pos >= last)
            {
                return null;
            }

            return value.Substring(pos + 1);
        }

        /// <summary>
        /// Gets RV.
        /// If the second letter is a consonant, RV is the region after the
        /// next following vowel; or if the first two letters are vowels, RV is
        /// the region after the next consonant; otherwise RV is the region
        /// after the third letter.
        /// </summary>
        /// <returns>null when none of these positions can be found, otherwise RV.</returns>
        private string getRV(string value)
        {
            if (value == null)
            {
                return null;
            }

            int last = value.Length - 1;
            int pos;

            // second letter is a consonant: region after the next vowel
            if (last > 0 && !isVowel(value[1]))
            {
                for (pos = 2; pos < last; pos++)
                {
                    if (isVowel(value[pos]))
                    {
                        break;
                    }
                }
                if (pos < last)
                {
                    return value.Substring(pos + 1);
                }
            }

            // first two letters are vowels: region after the next consonant
            if (last > 1 && isVowel(value[0]) && isVowel(value[1]))
            {
                for (pos = 2; pos < last; pos++)
                {
                    if (!isVowel(value[pos]))
                    {
                        break;
                    }
                }
                if (pos < last)
                {
                    return value.Substring(pos + 1);
                }
            }

            // otherwise: region after the third letter
            if (last > 2)
            {
                return value.Substring(3);
            }

            return null;
        }

        /// <summary>
        /// Lower-cases the term and folds accented letters to their base
        /// letter (á â ã -> a, é ê -> e, í -> i, ó ô õ -> o, ú ü -> u,
        /// ç -> c, ñ -> n).
        /// </summary>
        /// <returns>null when <paramref name="value"/> is null, otherwise the transformed string.</returns>
        private string changeTerm(string value)
        {
            if (value == null)
            {
                return null;
            }

            // Invariant lower-casing keeps stems independent of the thread's
            // current culture.
            string lowered = value.ToLowerInvariant();
            System.Text.StringBuilder folded = new System.Text.StringBuilder(lowered.Length);
            foreach (char c in lowered)
            {
                folded.Append(foldAccent(c));
            }
            return folded.ToString();
        }

        /// <summary>Maps one lower-case accented letter to its base letter; other characters pass through.</summary>
        private static char foldAccent(char c)
        {
            switch (c)
            {
                case 'á':
                case 'â':
                case 'ã':
                    return 'a';
                case 'é':
                case 'ê':
                    return 'e';
                case 'í':
                    return 'i';
                case 'ó':
                case 'ô':
                case 'õ':
                    return 'o';
                case 'ú':
                case 'ü':
                    return 'u';
                case 'ç':
                    return 'c';
                case 'ñ':
                    return 'n';
                default:
                    return c;
            }
        }

        /// <summary>
        /// Checks whether a string ends with a suffix (ordinal comparison).
        /// </summary>
        /// <returns>true if the string ends with the specified suffix; false if either argument is null.</returns>
        private bool suffix(string value, string suffix)
        {
            if (value == null || suffix == null)
            {
                return false;
            }
            return value.EndsWith(suffix, System.StringComparison.Ordinal);
        }

        /// <summary>
        /// Replaces a string suffix by another.
        /// </summary>
        /// <returns>the replaced string, or <paramref name="value"/> itself when nothing was removed.</returns>
        private string replaceSuffix(string value, string toReplace, string changeTo)
        {
            if (value == null || toReplace == null || changeTo == null)
            {
                return value;
            }

            string trimmed = removeSuffix(value, toReplace);
            return value.Equals(trimmed) ? value : trimmed + changeTo;
        }

        /// <summary>
        /// Removes a string suffix.
        /// </summary>
        /// <returns>the string without the suffix, or unchanged when it does not end with it.</returns>
        private string removeSuffix(string value, string toRemove)
        {
            if (value == null || toRemove == null || !suffix(value, toRemove))
            {
                return value;
            }
            return value.Substring(0, value.Length - toRemove.Length);
        }

        /// <summary>
        /// Tells whether a suffix is present and directly preceded by a string.
        /// </summary>
        private bool suffixPreceded(string value, string _suffix, string preceded)
        {
            if (value == null || _suffix == null || preceded == null || !suffix(value, _suffix))
            {
                return false;
            }
            return suffix(removeSuffix(value, _suffix), preceded);
        }

        /// <summary>
        /// Creates CT (the changed term): the accent-folded, lower-cased term
        /// with at most one punctuation character stripped from each end.
        /// CT is left null when <paramref name="term"/> is null.
        /// </summary>
        private void createCT(string term)
        {
            CT = changeTerm(term);

            if (CT == null || CT.Length < 2)
            {
                return;
            }

            if (StrippablePunctuation.IndexOf(CT[0]) >= 0)
            {
                CT = CT.Substring(1);
            }

            if (CT.Length < 2)
            {
                return;
            }

            if (StrippablePunctuation.IndexOf(CT[CT.Length - 1]) >= 0)
            {
                CT = CT.Substring(0, CT.Length - 1);
            }
        }

        /// <summary>
        /// Standard suffix removal: applies the first rule of
        /// <c>Step1Rules</c> whose suffix ends both CT and the rule's region
        /// (and, where required, is preceded by the rule's text).
        /// </summary>
        /// <returns>false if no ending was removed.</returns>
        private bool step1()
        {
            if (CT == null)
            {
                return false;
            }

            foreach (Step1Rule rule in Step1Rules)
            {
                if (!suffix(CT, rule.Suffix) || !suffix(regionText(rule.Region), rule.Suffix))
                {
                    continue;
                }
                if (rule.Preceding != null && !suffixPreceded(CT, rule.Suffix, rule.Preceding))
                {
                    continue;
                }

                CT = replaceSuffix(CT, rule.Suffix, rule.Replacement);
                return true;
            }

            return false;
        }

        /// <summary>
        /// Verb suffixes: searches for the longest of <c>VerbSuffixes</c> at
        /// the end of RV and, if found, deletes it from CT.
        /// </summary>
        /// <returns>false if no ending was removed.</returns>
        private bool step2()
        {
            if (RV == null)
            {
                return false;
            }

            foreach (string verbSuffix in VerbSuffixes)
            {
                if (suffix(RV, verbSuffix))
                {
                    CT = removeSuffix(CT, verbSuffix);
                    return true;
                }
            }

            return false;
        }

        /// <summary>
        /// Deletes suffix 'i' if in RV and preceded by 'c'.
        /// </summary>
        private void step3()
        {
            if (RV == null)
            {
                return;
            }

            if (suffix(RV, "i") && suffixPreceded(RV, "i", "c"))
            {
                CT = removeSuffix(CT, "i");
            }
        }

        /// <summary>
        /// Residual suffix: if RV ends with one of "os", "a", "i", "o"
        /// (tested in that order), deletes it from CT.
        /// </summary>
        private void step4()
        {
            if (RV == null)
            {
                return;
            }

            string[] residualSuffixes = { "os", "a", "i", "o" };
            foreach (string residual in residualSuffixes)
            {
                if (suffix(RV, residual))
                {
                    CT = removeSuffix(CT, residual);
                    return;
                }
            }
        }

        /// <summary>
        /// If RV ends with 'e', deletes it from CT; and if that 'e' is
        /// preceded in RV by "gu" (or "ci"), also deletes the 'u' (or 'i').
        /// </summary>
        private void step5()
        {
            if (RV == null || !suffix(RV, "e"))
            {
                return;
            }

            CT = removeSuffix(CT, "e");

            if (suffixPreceded(RV, "e", "gu"))
            {
                CT = removeSuffix(CT, "u");
            }
            else if (suffixPreceded(RV, "e", "ci"))
            {
                CT = removeSuffix(CT, "i");
            }
        }
    }
}
// http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Analyzers/CJK/CJKAnalyzer.cs
// diff --git a/src/contrib/Analyzers/CJK/CJKAnalyzer.cs b/src/contrib/Analyzers/CJK/CJKAnalyzer.cs
// (deleted file mode 100644, index fb21358..0000000, hunk @@ -1,154 +0,0 @@)
/*
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
*/

using System;
using System.Collections.Generic;
using System.IO;
using System.Collections;
using Lucene.Net.Analysis;
using Version = Lucene.Net.Util.Version;

namespace Lucene.Net.Analysis.CJK
{
    /// <summary>
    /// Analyzer that runs a <see cref="CJKTokenizer"/> and passes its tokens
    /// through a <see cref="StopFilter"/>.
    ///
    /// <author>Che, Dong</author>
    /// </summary>
    public class CJKAnalyzer : Analyzer
    {
        /// <summary>
        /// Common English words that are not usually useful for searching,
        /// plus a few extra entries (the empty string and "www").
        /// </summary>
        // TODO make this final in 3.1
        // this might be revised and merged with StopFilter stop words too
        [Obsolete("use GetDefaultStopSet() instead")]
        public static String[] STOP_WORDS =
            {
                "a", "and", "are", "as", "at", "be",
                "but", "by", "for", "if", "in",
                "into", "is", "it", "no", "not",
                "of", "on", "or", "s", "such", "t",
                "that", "the", "their", "then",
                "there", "these", "they", "this",
                "to", "was", "will", "with", "",
                "www"
            };

        /// <summary>
        /// Holder whose only job is to build the default stop set from
        /// <see cref="STOP_WORDS"/> the first time it is needed.
        /// </summary>
        private static class DefaultSetHolder
        {
            internal static ISet<string> DEFAULT_STOP_SET =
                CharArraySet.UnmodifiableSet(new CharArraySet((IEnumerable<string>)STOP_WORDS, false));
        }

        /// <summary>
        /// The stop words this analyzer removes.
        /// </summary>
        private ISet<string> stopTable;

        private readonly Version matchVersion;

        /// <summary>
        /// Gives access to the default stop-words set, wrapped as unmodifiable.
        /// </summary>
        /// <returns>The unmodifiable default stop-words set.</returns>
        public static ISet<string> GetDefaultStopSet()
        {
            return DefaultSetHolder.DEFAULT_STOP_SET;
        }

        /// <summary>
        /// Builds an analyzer that uses the default stop-words set.
        /// </summary>
        public CJKAnalyzer(Version matchVersion)
            : this(matchVersion, GetDefaultStopSet())
        {
        }

        /// <summary>
        /// Builds an analyzer that removes the words of the given set; the
        /// set is copied and the copy wrapped as unmodifiable.
        /// </summary>
        public CJKAnalyzer(Version matchVersion, ISet<string> stopWords)
        {
            this.stopTable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopWords));
            this.matchVersion = matchVersion;
        }

        /// <summary>
        /// Builds an analyzer which removes words in the provided array.
        /// </summary>
        /// <param name="stopWords">stop word array</param>
        public CJKAnalyzer(Version matchVersion, params string[] stopWords)
        {
            this.stopTable = StopFilter.MakeStopSet(stopWords);
            this.matchVersion = matchVersion;
        }

        /// <summary>
        /// Creates a fresh token stream for the input.
        /// </summary>
        /// <param name="fieldName">lucene field name</param>
        /// <param name="reader">input reader</param>
        /// <returns>A <see cref="CJKTokenizer"/> over the reader, filtered with a <see cref="StopFilter"/>.</returns>
        public sealed override TokenStream TokenStream(String fieldName, TextReader reader)
        {
            bool enableIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
            return new StopFilter(enableIncrements, new CJKTokenizer(reader), stopTable);
        }

        /// <summary>
        /// Tokenizer/filter pair stored in <c>PreviousTokenStream</c> so that
        /// <see cref="ReusableTokenStream"/> can hand it out again.
        /// </summary>
        private class SavedStreams
        {
            protected internal Tokenizer source;
            protected internal TokenStream result;
        }

        /// <summary>
        /// Returns a (possibly reused) token stream which tokenizes all the
        /// text in the provided reader.
        /// </summary>
        /// <param name="fieldName">lucene field name</param>
        /// <param name="reader">input reader</param>
        /// <returns>A token stream built from <see cref="CJKTokenizer"/>, filtered with <see cref="StopFilter"/>.</returns>
        public sealed override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
        {
            /* tokenStream() is final, no back compat issue */
            SavedStreams cached = (SavedStreams) PreviousTokenStream;

            // A chain was stored earlier: point its tokenizer at the new reader.
            if (cached != null)
            {
                cached.source.Reset(reader);
                return cached.result;
            }

            // First use: build the tokenizer/filter chain and remember it.
            cached = new SavedStreams();
            cached.source = new CJKTokenizer(reader);
            bool enableIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
            cached.result = new StopFilter(enableIncrements, cached.source, stopTable);
            PreviousTokenStream = cached;
            return cached.result;
        }
    }
}
// http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Analyzers/CJK/CJKTokenizer.cs
// diff --git a/src/contrib/Analyzers/CJK/CJKTokenizer.cs
b/src/contrib/Analyzers/CJK/CJKTokenizer.cs deleted file mode 100644 index 6be5a6e..0000000 --- a/src/contrib/Analyzers/CJK/CJKTokenizer.cs +++ /dev/null @@ -1,399 +0,0 @@ -/* - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - * -*/ - -using System; -using System.Globalization; -using System.IO; -using System.Text; -using System.Text.RegularExpressions; -using Lucene.Net.Analysis; -using Lucene.Net.Analysis.Tokenattributes; -using Lucene.Net.Util; - -namespace Lucene.Net.Analysis.CJK -{ - /// <summary> - /// <p> - /// CJKTokenizer was modified from StopTokenizer which does a decent job for - /// most European languages. 
and it perferm other token method for double-byte - /// chars: the token will return at each two charactors with overlap match.<br/> - /// Example: "java C1C2C3C4" will be segment to: "java" "C1C2" "C2C3" "C3C4" it - /// also need filter filter zero length token ""<br/> - /// for Digit: digit, '+', '#' will token as letter<br/> - /// for more info on Asia language(Chinese Japanese Korean) text segmentation: - /// please search <a - /// href="http://www.google.com/search?q=word+chinese+segment">google</a> - /// </p> - /// - /// @author Che, Dong - /// @version $Id: CJKTokenizer.java,v 1.3 2003/01/22 20:54:47 otis Exp $ - /// </summary> - public sealed class CJKTokenizer : Tokenizer - { - //~ Static fields/initializers --------------------------------------------- - /// <summary> - /// Word token type - /// </summary> - internal static readonly int WORD_TYPE = 0; - - /// <summary> - /// Single byte token type - /// </summary> - internal static readonly int SINGLE_TOKEN_TYPE = 1; - - /// <summary> - /// Double byte token type - /// </summary> - internal static readonly int DOUBLE_TOKEN_TYPE = 2; - - /// <summary> - /// Names for token types - /// </summary> - internal static readonly String[] TOKEN_TYPE_NAMES = { "word", "single", "double" }; - - /// <summary> - /// Max word length - /// </summary> - internal static readonly int MAX_WORD_LEN = 255; - - /// <summary> - /// buffer size - /// </summary> - internal static readonly int IO_BUFFER_SIZE = 256; - - //~ Instance fields -------------------------------------------------------- - - /// <summary> - /// word offset, used to imply which character(in ) is parsed - /// </summary> - private int offset = 0; - - /// <summary> - /// the index used only for ioBuffer - /// </summary> - private int bufferIndex = 0; - - /// <summary> - /// data length - /// </summary> - private int dataLen = 0; - - /// <summary> - /// character buffer, store the characters which are used to compose <br/> - /// the returned Token - /// 
</summary> - private char[] buffer = new char[MAX_WORD_LEN]; - - /// <summary> - /// I/O buffer, used to store the content of the input(one of the <br/> - /// members of Tokenizer) - /// </summary> - private char[] ioBuffer = new char[IO_BUFFER_SIZE]; - - /// <summary> - /// word type: single=>ASCII double=>non-ASCII word=>default - /// </summary> - private int tokenType = WORD_TYPE; - - /// <summary> - /// tag: previous character is a cached double-byte character "C1C2C3C4" - /// ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened) - /// C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4" - /// </summary> - private bool preIsTokened = false; - - private ITermAttribute termAtt; - private IOffsetAttribute offsetAtt; - private ITypeAttribute typeAtt; - - //~ Constructors ----------------------------------------------------------- - - /// <summary> - /// Construct a token stream processing the given input. - /// </summary> - /// <param name="_in">I/O reader</param> - public CJKTokenizer(TextReader _in) - : base(_in) - { - Init(); - } - - public CJKTokenizer(AttributeSource source, TextReader _in) - : base(source, _in) - { - Init(); - } - - public CJKTokenizer(AttributeFactory factory, TextReader _in) - : base(factory, _in) - { - Init(); - } - - private void Init() - { - termAtt = AddAttribute<ITermAttribute>(); - offsetAtt = AddAttribute<IOffsetAttribute>(); - typeAtt = AddAttribute<ITypeAttribute>(); - } - - //~ Methods ---------------------------------------------------------------- - - /* - * Returns true for the next token in the stream, or false at EOS. - * See http://java.sun.com/j2se/1.3/docs/api/java/lang/char.UnicodeBlock.html - * for detail. 
- * - * @return false for end of stream, true otherwise - * - * @throws java.io.IOException - throw IOException when read error <br> - * happened in the InputStream - * - */ - - Regex isBasicLatin = new Regex(@"\p{IsBasicLatin}", RegexOptions.Compiled); - Regex isHalfWidthAndFullWidthForms = new Regex(@"\p{IsHalfwidthandFullwidthForms}", RegexOptions.Compiled); - - public override bool IncrementToken() - { - ClearAttributes(); - /* how many character(s) has been stored in buffer */ - - while (true) - { - // loop until we find a non-empty token - - int length = 0; - - /* the position used to create Token */ - int start = offset; - - while (true) - { - // loop until we've found a full token - /* current character */ - char c; - - offset++; - - if (bufferIndex >= dataLen) - { - dataLen = input.Read(ioBuffer, 0, ioBuffer.Length); - bufferIndex = 0; - } - - if (dataLen == 0) // input.Read returns 0 when its empty, not -1, as in java - { - if (length > 0) - { - if (preIsTokened == true) - { - length = 0; - preIsTokened = false; - } - else - { - offset--; - } - - break; - } - else - { - offset--; - return false; - } - } - else - { - //get current character - c = ioBuffer[bufferIndex++]; - } - - //TODO: Using a Regex to determine the UnicodeCategory is probably slower than - // If we just created a small class that would look it up for us, which - // would likely be trivial, however time-consuming. I can't imagine a Regex - // being fast for this, considering we have to pull a char from the buffer, - // and convert it to a string before we run a regex on it. 
- cc - bool isHalfFullForm = isHalfWidthAndFullWidthForms.Match(c.ToString()).Success; - //if the current character is ASCII or Extend ASCII - if ((isBasicLatin.Match(c.ToString()).Success) || (isHalfFullForm)) - { - if (isHalfFullForm) - { - int i = (int) c; - if (i >= 65281 && i <= 65374) - { - // convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN - i = i - 65248; - c = (char) i; - } - } - - // if the current character is a letter or "_" "+" "#" - if (char.IsLetterOrDigit(c) - || ((c == '_') || (c == '+') || (c == '#')) - ) - { - if (length == 0) - { - // "javaC1C2C3C4linux" <br> - // ^--: the current character begin to token the ASCII - // letter - start = offset - 1; - } - else if (tokenType == DOUBLE_TOKEN_TYPE) - { - // "javaC1C2C3C4linux" <br> - // ^--: the previous non-ASCII - // : the current character - offset--; - bufferIndex--; - - if (preIsTokened == true) - { - // there is only one non-ASCII has been stored - length = 0; - preIsTokened = false; - break; - } - else - { - break; - } - } - - // store the LowerCase(c) in the buffer - buffer[length++] = char.ToLower(c); // TODO: is java invariant? If so, this should be ToLowerInvariant() - tokenType = SINGLE_TOKEN_TYPE; - - // break the procedure if buffer overflowed! 
- if (length == MAX_WORD_LEN) - { - break; - } - } - else if (length > 0) - { - if (preIsTokened) - { - length = 0; - preIsTokened = false; - } - else - { - break; - } - } - } - else - { - // non-ASCII letter, e.g."C1C2C3C4" - if (char.IsLetter(c)) - { - if (length == 0) - { - start = offset - 1; - buffer[length++] = c; - tokenType = DOUBLE_TOKEN_TYPE; - } - else - { - if (tokenType == SINGLE_TOKEN_TYPE) - { - offset--; - bufferIndex--; - - //return the previous ASCII characters - break; - } - else - { - buffer[length++] = c; - tokenType = DOUBLE_TOKEN_TYPE; - - if (length == 2) - { - offset--; - bufferIndex--; - preIsTokened = true; - - break; - } - } - } - } - else if (length > 0) - { - if (preIsTokened == true) - { - // empty the buffer - length = 0; - preIsTokened = false; - } - else - { - break; - } - } - } - } - - if (length > 0) - { - termAtt.SetTermBuffer(buffer, 0, length); - offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length)); - typeAtt.Type = TOKEN_TYPE_NAMES[tokenType]; - return true; - } - else if (dataLen == 0) - { - offset--; - return false; - } - - // Cycle back and try for the next token (don't - // return an empty string) - } - } - - public override void End() - { - // set final offset - int finalOffset = CorrectOffset(offset); - this.offsetAtt.SetOffset(finalOffset, finalOffset); - } - - public override void Reset() - { - base.Reset(); - offset = bufferIndex = dataLen = 0; - preIsTokened = false; - tokenType = WORD_TYPE; - } - - public override void Reset(TextReader reader) - { - base.Reset(reader); - Reset(); - } - } -} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Analyzers/Cn/ChineseAnalyzer.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Analyzers/Cn/ChineseAnalyzer.cs b/src/contrib/Analyzers/Cn/ChineseAnalyzer.cs deleted file mode 100644 index 1ec050a..0000000 --- a/src/contrib/Analyzers/Cn/ChineseAnalyzer.cs +++ /dev/null @@ -1,85 +0,0 @@ 
-/* - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - * -*/ - -using System; -using System.IO; -using System.Text; -using System.Collections; - -using Lucene.Net.Analysis; - -namespace Lucene.Net.Analysis.Cn -{ - /// <summary> - /// An <see cref="Analyzer"/> that tokenizes text with <see cref="ChineseTokenizer"/> and - /// filters with <see cref="ChineseFilter"/> - /// </summary> - public class ChineseAnalyzer : Analyzer - { - - public ChineseAnalyzer() - { - } - - /// <summary> - /// Creates a TokenStream which tokenizes all the text in the provided Reader. - /// </summary> - /// <returns>A TokenStream build from a ChineseTokenizer filtered with ChineseFilter.</returns> - public override sealed TokenStream TokenStream(String fieldName, TextReader reader) - { - TokenStream result = new ChineseTokenizer(reader); - result = new ChineseFilter(result); - return result; - } - - private class SavedStreams - { - protected internal Tokenizer source; - protected internal TokenStream result; - }; - - /// <summary> - /// Returns a (possibly reused) <see cref="TokenStream"/> which tokenizes all the text in the - /// provided <see cref="TextReader"/>. 
- /// </summary> - /// <returns> - /// A <see cref="TokenStream"/> built from a <see cref="ChineseTokenizer"/> - /// filtered with <see cref="ChineseFilter"/>. - /// </returns> - public override TokenStream ReusableTokenStream(String fieldName, TextReader reader) - { - /* tokenStream() is final, no back compat issue */ - SavedStreams streams = (SavedStreams) PreviousTokenStream; - if (streams == null) - { - streams = new SavedStreams(); - streams.source = new ChineseTokenizer(reader); - streams.result = new ChineseFilter(streams.source); - PreviousTokenStream = streams; - } - else - { - streams.source.Reset(reader); - } - return streams.result; - } - } -} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Analyzers/Cn/ChineseFilter.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Analyzers/Cn/ChineseFilter.cs b/src/contrib/Analyzers/Cn/ChineseFilter.cs deleted file mode 100644 index e5c83a5..0000000 --- a/src/contrib/Analyzers/Cn/ChineseFilter.cs +++ /dev/null @@ -1,100 +0,0 @@ -/* - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- * -*/ - -using System; -using System.Collections.Generic; -using System.IO; -using System.Collections; -using System.Globalization; - -using Lucene.Net.Analysis; -using Lucene.Net.Analysis.Tokenattributes; - -namespace Lucene.Net.Analysis.Cn -{ - // TODO: convert this XML code to valid .NET - /// <summary> - /// A {@link TokenFilter} with a stop word table. - /// <ul> - /// <li>Numeric tokens are removed.</li> - /// <li>English tokens must be larger than 1 char.</li> - /// <li>One Chinese char as one Chinese word.</li> - /// </ul> - /// TO DO: - /// <ol> - /// <li>Add Chinese stop words, such as \ue400</li> - /// <li>Dictionary based Chinese word extraction</li> - /// <li>Intelligent Chinese word extraction</li> - /// </ol> - /// </summary> - public sealed class ChineseFilter : TokenFilter - { - // Only English now, Chinese to be added later. - public static String[] STOP_WORDS = - { - "and", "are", "as", "at", "be", "but", "by", - "for", "if", "in", "into", "is", "it", - "no", "not", "of", "on", "or", "such", - "that", "the", "their", "then", "there", "these", - "they", "this", "to", "was", "will", "with" - }; - - private CharArraySet stopTable; - private ITermAttribute termAtt; - - public ChineseFilter(TokenStream _in) - : base(_in) - { - stopTable = new CharArraySet((IEnumerable<string>)STOP_WORDS, false); - termAtt = AddAttribute<ITermAttribute>(); - } - - public override bool IncrementToken() - { - while (input.IncrementToken()) - { - char[] text = termAtt.TermBuffer(); - int termLength = termAtt.TermLength(); - - // why not key off token type here assuming ChineseTokenizer comes first? - if (!stopTable.Contains(text, 0, termLength)) - { - switch (char.GetUnicodeCategory(text[0])) - { - case UnicodeCategory.LowercaseLetter: - case UnicodeCategory.UppercaseLetter: - // English word/token should larger than 1 char. - if (termLength > 1) - { - return true; - } - break; - case UnicodeCategory.OtherLetter: - // One Chinese char as one Chinese word. 
- // Chinese word extraction to be added later here. - return true; - } - } - } - return false; - } - } -} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Analyzers/Cn/ChineseTokenizer.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Analyzers/Cn/ChineseTokenizer.cs b/src/contrib/Analyzers/Cn/ChineseTokenizer.cs deleted file mode 100644 index 69947aa..0000000 --- a/src/contrib/Analyzers/Cn/ChineseTokenizer.cs +++ /dev/null @@ -1,191 +0,0 @@ -/* - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - * -*/ - -using System; -using System.IO; -using System.Text; -using System.Collections; -using System.Globalization; - -using Lucene.Net.Analysis; -using Lucene.Net.Analysis.Tokenattributes; -using Lucene.Net.Util; - -namespace Lucene.Net.Analysis.Cn -{ - /// <summary> - /// Tokenize Chinese text as individual chinese chars. - /// <p> - /// The difference between ChineseTokenizer and - /// CJKTokenizer is that they have different - /// token parsing logic. 
- /// </p> - /// <p> - /// For example, if the Chinese text - /// "C1C2C3C4" is to be indexed: - /// <ul> - /// <li>The tokens returned from ChineseTokenizer are C1, C2, C3, C4</li> - /// <li>The tokens returned from the CJKTokenizer are C1C2, C2C3, C3C4.</li> - /// </ul> - /// </p> - /// <p> - /// Therefore the index created by CJKTokenizer is much larger. - /// </p> - /// <p> - /// The problem is that when searching for C1, C1C2, C1C3, - /// C4C2, C1C2C3 ... the ChineseTokenizer works, but the - /// CJKTokenizer will not work. - /// </p> - /// </summary> - public sealed class ChineseTokenizer : Tokenizer - { - public ChineseTokenizer(TextReader _in) - : base(_in) - { - Init(); - } - - public ChineseTokenizer(AttributeSource source, TextReader _in) - : base(source, _in) - { - Init(); - } - - public ChineseTokenizer(AttributeFactory factory, TextReader _in) - : base(factory, _in) - { - Init(); - } - - private void Init() - { - termAtt = AddAttribute<ITermAttribute>(); - offsetAtt = AddAttribute<IOffsetAttribute>(); - } - - private int offset = 0, bufferIndex = 0, dataLen = 0; - private static readonly int MAX_WORD_LEN = 255; - private static readonly int IO_BUFFER_SIZE = 1024; - private readonly char[] buffer = new char[MAX_WORD_LEN]; - private readonly char[] ioBuffer = new char[IO_BUFFER_SIZE]; - - private int length; - private int start; - - private ITermAttribute termAtt; - private IOffsetAttribute offsetAtt; - - private void Push(char c) - { - if (length == 0) start = offset - 1; // start of token - buffer[length++] = Char.ToLower(c); // buffer it - } - - private bool Flush() - { - - if (length > 0) - { - termAtt.SetTermBuffer(buffer, 0, length); - offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length)); - return true; - } - else - return false; - } - - - public override bool IncrementToken() - { - ClearAttributes(); - - length = 0; - start = offset; - - - while (true) - { - - char c; - offset++; - - if (bufferIndex >= dataLen) - { - dataLen = 
input.Read(ioBuffer, 0, ioBuffer.Length); - bufferIndex = 0; - } - - if (dataLen == 0) - { - offset--; - return Flush(); - } - else - c = ioBuffer[bufferIndex++]; - - - switch (char.GetUnicodeCategory(c)) - { - - case UnicodeCategory.DecimalDigitNumber: - case UnicodeCategory.LowercaseLetter: - case UnicodeCategory.UppercaseLetter: - Push(c); - if (length == MAX_WORD_LEN) return Flush(); - break; - - case UnicodeCategory.OtherLetter: - if (length > 0) - { - bufferIndex--; - offset--; - return Flush(); - } - Push(c); - return Flush(); - - default: - if (length > 0) return Flush(); - break; - } - } - } - - public override sealed void End() - { - // set final offset - int finalOffset = CorrectOffset(offset); - this.offsetAtt.SetOffset(finalOffset, finalOffset); - } - - public override void Reset() - { - base.Reset(); - offset = bufferIndex = dataLen = 0; - } - - public override void Reset(TextReader input) - { - base.Reset(input); - Reset(); - } - } -} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Analyzers/Compound/CompoundWordTokenFilterBase.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Analyzers/Compound/CompoundWordTokenFilterBase.cs b/src/contrib/Analyzers/Compound/CompoundWordTokenFilterBase.cs deleted file mode 100644 index af3f702..0000000 --- a/src/contrib/Analyzers/Compound/CompoundWordTokenFilterBase.cs +++ /dev/null @@ -1,230 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -using System; -using System.Collections.Generic; -using Lucene.Net.Analysis.Tokenattributes; - -namespace Lucene.Net.Analysis.Compound -{ - - /* - * Base class for decomposition token filters. - */ - public abstract class CompoundWordTokenFilterBase : TokenFilter - { - /* - * The default for minimal word length that gets decomposed - */ - public static readonly int DEFAULT_MIN_WORD_SIZE = 5; - - /* - * The default for minimal length of subwords that get propagated to the output of this filter - */ - public static readonly int DEFAULT_MIN_SUBWORD_SIZE = 2; - - /* - * The default for maximal length of subwords that get propagated to the output of this filter - */ - public static readonly int DEFAULT_MAX_SUBWORD_SIZE = 15; - - protected readonly CharArraySet dictionary; - protected readonly LinkedList<Token> tokens; - protected readonly int minWordSize; - protected readonly int minSubwordSize; - protected readonly int maxSubwordSize; - protected readonly bool onlyLongestMatch; - - private ITermAttribute termAtt; - private IOffsetAttribute offsetAtt; - private IFlagsAttribute flagsAtt; - private IPositionIncrementAttribute posIncAtt; - private ITypeAttribute typeAtt; - private IPayloadAttribute payloadAtt; - - private readonly Token wrapper = new Token(); - - protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch) - : this(input, MakeDictionary(dictionary), minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch) - { - - } - - protected 
CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, bool onlyLongestMatch) - : this(input, MakeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch) - { - - } - - protected CompoundWordTokenFilterBase(TokenStream input, ISet<string> dictionary, bool onlyLongestMatch) - : this(input, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch) - { - - } - - protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary) - : this(input, MakeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false) - { - - } - - protected CompoundWordTokenFilterBase(TokenStream input, ISet<string> dictionary) - : this(input, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false) - { - - } - - protected CompoundWordTokenFilterBase(TokenStream input, ISet<string> dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch) - : base(input) - { - this.tokens = new LinkedList<Token>(); - this.minWordSize = minWordSize; - this.minSubwordSize = minSubwordSize; - this.maxSubwordSize = maxSubwordSize; - this.onlyLongestMatch = onlyLongestMatch; - - if (dictionary is CharArraySet) - { - this.dictionary = (CharArraySet)dictionary; - } - else - { - this.dictionary = new CharArraySet(dictionary.Count, false); - AddAllLowerCase(this.dictionary, dictionary); - } - - termAtt = AddAttribute<ITermAttribute>(); - offsetAtt = AddAttribute<IOffsetAttribute>(); - flagsAtt = AddAttribute<IFlagsAttribute>(); - posIncAtt = AddAttribute<IPositionIncrementAttribute>(); - typeAtt = AddAttribute<ITypeAttribute>(); - payloadAtt = AddAttribute<IPayloadAttribute>(); - } - - /* - * Create a set of words from an array - * The resulting Set does case insensitive matching - * TODO We should look for a faster dictionary lookup approach. 
- * @param dictionary - * @return {@link Set} of lowercased terms - */ - public static ISet<string> MakeDictionary(String[] dictionary) - { - // is the below really case insensitive? - CharArraySet dict = new CharArraySet(dictionary.Length, false); - AddAllLowerCase(dict, dictionary); - return dict; - } - - private void setToken(Token token) - { - ClearAttributes(); - termAtt.SetTermBuffer(token.TermBuffer(), 0, token.TermLength()); - flagsAtt.Flags = token.Flags; - typeAtt.Type = token.Type; - offsetAtt.SetOffset(token.StartOffset, token.EndOffset); - posIncAtt.PositionIncrement = token.PositionIncrement; - payloadAtt.Payload = token.Payload; - } - - public sealed override bool IncrementToken() - { - if (tokens.Count > 0) - { - setToken((Token)tokens.First.Value); - tokens.RemoveFirst(); - return true; - } - - if (input.IncrementToken() == false) - return false; - - wrapper.SetTermBuffer(termAtt.TermBuffer(), 0, termAtt.TermLength()); - wrapper.StartOffset = offsetAtt.StartOffset; - wrapper.EndOffset = offsetAtt.EndOffset; - wrapper.Flags = flagsAtt.Flags; - wrapper.Type = typeAtt.Type; - wrapper.PositionIncrement = posIncAtt.PositionIncrement; - wrapper.Payload = payloadAtt.Payload; - - Decompose(wrapper); - - if (tokens.Count > 0) - { - setToken(tokens.First.Value); - tokens.RemoveFirst(); - return true; - } - else - { - return false; - } - } - - protected static void AddAllLowerCase(ISet<string> target, ICollection<string> col) - { - foreach (var str in col) - { - target.Add(str.ToLower(System.Globalization.CultureInfo.GetCultureInfo("en-US"))); - } - } - - protected static char[] MakeLowerCaseCopy(char[] buffer) - { - char[] result = new char[buffer.Length]; - Array.Copy(buffer, 0, result, 0, buffer.Length); - - for (int i = 0; i < buffer.Length; ++i) - { - result[i] = char.ToLower(buffer[i]); // Is java invariant? 
- } - - return result; - } - - protected Token CreateToken(int offset, int length, - Token prototype) - { - int newStart = prototype.StartOffset + offset; - Token t = prototype.Clone(prototype.TermBuffer(), offset, length, newStart, newStart + length); - t.PositionIncrement = 0; - return t; - } - - protected void Decompose(Token token) - { - // In any case we give the original token back - tokens.AddLast((Token)token.Clone()); - - // Only words longer than minWordSize get processed - if (token.TermLength() < this.minWordSize) - { - return; - } - - DecomposeInternal(token); - } - - protected abstract void DecomposeInternal(Token token); - - public override void Reset() - { - base.Reset(); - tokens.Clear(); - } - } -} \ No newline at end of file
