http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_dutch.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_dutch.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_dutch.txt new file mode 100644 index 0000000..2a69a96 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_dutch.txt @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// CONSONANTS +"ssj" "" "" "S" +"sj" "" "" "S" +"ch" "" "" "x" +"c" "" "[eiy]" "ts" +"ck" "" "" "k" // German +"pf" "" "" "(pf|p|f)" // German +"ph" "" "" "(ph|f)" +"qu" "" "" "kv" +"th" "^" "" "t" // German +"th" "" "[äöüaeiou]" "(t|th)" // German +"th" "" "" "t" // German +"ss" "" "" "s" +"h" "[aeiouy]" "" "" + +// VOWELS +"aue" "" "" "aue" +"ou" "" "" "au" +"ie" "" "" "(Q|i)" +"uu" "" "" "(Q|u)" +"ee" "" "" "e" +"eu" "" "" "(Y|Yj)" // Dutch Y +"aa" "" "" "a" +"oo" "" "" "o" +"oe" "" "" "u" +"ij" "" "" "ej" +"ui" "" "" "(Y|uj)" +"ei" "" "" "(ej|aj)" // Dutch ej + +"i" "" "[aou]" "j" +"y" "" "[aeou]" "j" +"i" "[aou]" "" "j" +"y" "[aeou]" "" "j" + +// LATIN ALPHABET +"a" "" "" "a" +"b" "" "" "b" +"c" "" "" "k" +"d" "" "" "d" +"e" "" "" "e" +"f" "" "" "f" +"g" "" "" "(g|x)" +"h" "" "" "h" +"i" "" "" "(i|Q)" +"j" "" "" "j" +"k" "" "" "k" +"l" "" "" "l" +"m" "" "" "m" +"n" "" "" "n" +"o" "" "" "o" +"p" "" "" "p" +"q" "" "" "k" +"r" "" "" "r" +"s" "" "" "s" +"t" "" "" "t" +"u" "" "" "(u|Q)" +"v" "" "" "v" +"w" "" "" "(w|v)" +"x" "" "" "ks" +"y" "" "" "i" +"z" "" "" "z"
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_english.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_english.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_english.txt new file mode 100644 index 0000000..db9ccec --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_english.txt @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// GENERAL + +// CONSONANTS +"’" "" "" "" // O’Neill +"'" "" "" "" // O’Neill +"mc" "^" "" "mak" // McDonald +"tz" "" "" "ts" // Fitzgerald +"tch" "" "" "tS" +"ch" "" "" "(tS|x)" +"ck" "" "" "k" +"cc" "" "[iey]" "ks" // success, accent +"c" "" "c" "" +"c" "" "[iey]" "s" // circle + +"gh" "^" "" "g" // ghost +"gh" "" "" "(g|f|w)" // burgh | tough | bough +"gn" "" "" "(gn|n)" +"g" "" "[iey]" "(g|dZ)" // get, gem, giant, gigabyte +// "th" "" "" "(6|8|t)" +"th" "" "" "t" +"kh" "" "" "x" +"ph" "" "" "f" +"sch" "" "" "(S|sk)" +"sh" "" "" "S" +"who" "^" "" "hu" +"wh" "^" "" "w" + +"h" "" "$" "" // hard to find an example that isn't in a name +"h" "" "[^aeiou]" "" // hard to find an example that isn't in a name +"h" "^" "" "H" + +"kn" "^" "" "n" // knight +"mb" "" "$" "m" +"ng" "" "$" "(N|ng)" +"pn" "^" "" "(pn|n)" +"ps" "^" "" "(ps|s)" +"qu" "" "" "kw" +"tia" "" "" "(So|Sa)" +"tio" "" "" "So" +"wr" "^" "" "r" +"x" "^" "" "z" + +// VOWELS +"y" "^" "" "j" +"y" "^" "[aeiouy]" "j" +"yi" "^" "" "i" +"aue" "" "" "aue" +"oue" "" "" "(aue|oue)" +"ai" "" "" "(aj|ej|e)" // rain | said +"ay" "" "" "(aj|ej)" +"a" "" "[^aeiou]e" "ej" // plane +"ei" "" "" "(ej|aj|i)" // weigh | receive +"ey" "" "" "(ej|aj|i)" // hey | barley +"ear" "" "" "ia" // tear +"ea" "" "" "(i|e)" // reason | treasure +"ee" "" "" "i" // between +"e" "" "[^aeiou]e" "i" // meter +"e" "" "$" "(|E)" // blame, badge +"ie" "" "" "i" // believe +"i" "" "[^aeiou]e" "aj" // five +"oa" "" "" "ou" // toad +"oi" "" "" "oj" // join +"oo" "" "" "u" // food +"ou" "" "" "(u|ou)" // through | tough | could +"oy" "" "" "oj" // boy +"o" "" "[^aeiou]e" "ou" // rode +"u" "" "[^aeiou]e" "(ju|u)" // cute | flute +"u" "" "r" "(e|u)" // turn -- Morse disagrees, feels it should go to E + +// LATIN ALPHABET +"a" "" "" "(e|o|a)" // hat | call | part +"b" "" "" "b" +"c" "" "" "k" // candy +"d" "" "" "d" +"e" "" "" "E" // bed +"f" "" "" "f" +"g" "" "" "g" +"h" "" "" "h" +"i" "" "" "I" +"j" "" "" "dZ" +"k" "" "" "k" +"l" "" "" "l" +"m"
"" "" "m" +"n" "" "" "n" +"o" "" "" "(o|a)" // hot +"p" "" "" "p" +"q" "" "" "k" +"r" "" "" "r" +"s" "" "" "s" +"t" "" "" "t" +"u" "" "" "(u|a)" // put +"v" "" "" "v" +"w" "" "" "(w|v)" // the variant "v" is for spellings coming from German/Polish +"x" "" "" "ks" +"y" "" "" "i" +"z" "" "" "z" http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_french.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_french.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_french.txt new file mode 100644 index 0000000..e67a0ec --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_french.txt @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// GENERAL + +// CONSONANTS +"lt" "u" "$" "(lt|)" // Renault +"c" "n" "$" "(k|)" // Tronc +//"f" "" "" "(f|)" // Clef +"d" "" "$" "(t|)" // Durand +"g" "n" "$" "(k|)" // Gang +"p" "" "$" "(p|)" // Trop, Champ +"r" "e" "$" "(r|)" // Barbier +"t" "" "$" "(t|)" // Murat, Constant +"z" "" "$" "(s|)" + +"ds" "" "$" "(ds|)" +"ps" "" "$" "(ps|)" // Champs +"rs" "e" "$" "(rs|)" +"ts" "" "$" "(ts|)" +"s" "" "$" "(s|)" // Denis + +"x" "u" "$" "(ks|)" // Arnoux + +"s" "[aeéèêiou]" "[^aeéèêiou]" "(s|)" // Deschamps, Malesherbes, Groslot +"t" "[aeéèêiou]" "[^aeéèêiou]" "(t|)" // Petitjean + +"kh" "" "" "x" // foreign +"ph" "" "" "f" + +"ç" "" "" "s" +"x" "" "" "ks" +"ch" "" "" "S" +"c" "" "[eiyéèê]" "s" + +"gn" "" "" "(n|gn)" +"g" "" "[eiy]" "Z" +"gue" "" "$" "k" +"gu" "" "[eiy]" "g" +"aill" "" "e" "aj" // non Jewish +"ll" "" "e" "(l|j)" // non Jewish +"que" "" "$" "k" +"qu" "" "" "k" +"s" "[aeiouyéèê]" "[aeiouyéèê]" "z" +"h" "[bdgt]" "" "" // translit from Arabic + +"m" "[aeiouy]" "[aeiouy]" "m" +"m" "[aeiouy]" "" "(m|n)" // nasal + +"ou" "" "[aeio]" "v" +"u" "" "[aeio]" "v" + +// VOWELS +"aue" "" "" "aue" +"eau" "" "" "o" +"au" "" "" "(o|au)" // non Jewish +"ai" "" "" "(e|aj)" // [e] is non Jewish +"ay" "" "" "(e|aj)" // [e] is non Jewish +"é" "" "" "e" +"ê" "" "" "e" +"è" "" "" "e" +"à" "" "" "a" +"â" "" "" "a" +"où" "" "" "u" +"ou" "" "" "u" +"oi" "" "" "(oj|va)" // [va] (actually "ua") is non Jewish +"ei" "" "" "(aj|ej|e)" // [e] is non Jewish +"ey" "" "" "(aj|ej|e)" // [e] non Jewish +"eu" "" "" "(ej|Y)" // non Jewish +"y" "[ou]" "" "j" +"e" "" "$" "(e|)" +"i" "" "[aou]" "j" +"y" "" "[aoeu]" "j" + +// LATIN ALPHABET +"a" "" "" "a" +"b" "" "" "b" +"c" "" "" "k" +"d" "" "" "d" +"e" "" "" "e" +"f" "" "" "f" +"g" "" "" "g" +"h" "" "" "h" +"i" "" "" "i" +"j" "" "" "Z" +"k" "" "" "k" +"l" "" "" "l" +"m" "" "" "m" +"n" "" "" "n" +"o" "" "" "o" +"p" "" "" "p" +"q" "" "" "k" +"r" "" "" "r" +"s" "" "" "s" +"t" "" "" "t" +"u" "" "" "(u|Q)" +"v" "" "" "v" +"w" "" "" "v" +"y"
"" "" "i" +"z" "" "" "z" http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_german.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_german.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_german.txt new file mode 100644 index 0000000..1e79c35 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_german.txt @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// GENERIC + +// CONSONANTS +"ewitsch" "" "$" "evitS" +"owitsch" "" "$" "ovitS" +"evitsch" "" "$" "evitS" +"ovitsch" "" "$" "ovitS" +"witsch" "" "$" "vitS" +"vitsch" "" "$" "vitS" +"ssch" "" "" "S" +"chsch" "" "" "xS" +"sch" "" "" "S" + +"ziu" "" "" "tsu" +"zia" "" "" "tsa" +"zio" "" "" "tso" + +"chs" "" "" "ks" +"ch" "" "" "x" +"ck" "" "" "k" +"c" "" "[eiy]" "ts" + +"sp" "^" "" "Sp" +"st" "^" "" "St" +"ssp" "" "" "(Sp|sp)" +"sp" "" "" "(Sp|sp)" +"sst" "" "" "(St|st)" +"st" "" "" "(St|st)" +"pf" "" "" "(pf|p|f)" +"ph" "" "" "(ph|f)" +"qu" "" "" "kv" + +"ewitz" "" "$" "(evits|evitS)" +"ewiz" "" "$" "(evits|evitS)" +"evitz" "" "$" "(evits|evitS)" +"eviz" "" "$" "(evits|evitS)" +"owitz" "" "$" "(ovits|ovitS)" +"owiz" "" "$" "(ovits|ovitS)" +"ovitz" "" "$" "(ovits|ovitS)" +"oviz" "" "$" "(ovits|ovitS)" +"witz" "" "$" "(vits|vitS)" +"wiz" "" "$" "(vits|vitS)" +"vitz" "" "$" "(vits|vitS)" +"viz" "" "$" "(vits|vitS)" +"tz" "" "" "ts" + +"thal" "" "$" "tal" +"th" "^" "" "t" +"th" "" "[äöüaeiou]" "(t|th)" +"th" "" "" "t" +"rh" "^" "" "r" +"h" "[aeiouyäöü]" "" "" +"h" "^" "" "H" + +"ss" "" "" "s" +"s" "" "[äöüaeiouy]" "(z|s)" +"s" "[aeiouyäöüj]" "[aeiouyäöü]" "z" +"ß" "" "" "s" + + +// VOWELS +"ij" "" "$" "i" +"aue" "" "" "aue" +"ue" "" "" "Q" +"ae" "" "" "Y" +"oe" "" "" "Y" +"ü" "" "" "Q" +"ä" "" "" "Y" +"ö" "" "" "Y" +"ei" "" "" "(aj|ej)" +"ey" "" "" "(aj|ej)" +"eu" "" "" "(Yj|ej|aj|oj)" +"i" "[aou]" "" "j" +"y" "[aou]" "" "j" +"ie" "" "" "I" +"i" "" "[aou]" "j" +"y" "" "[aoeu]" "j" + +// FOREIGN LETTERs +"ñ" "" "" "n" +"ã" "" "" "a" +"ő" "" "" "o" +"ű" "" "" "u" +"ç" "" "" "s" + +// LATIN ALPHABET +"a" "" "" "A" +"b" "" "" "b" +"c" "" "" "k" +"d" "" "" "d" +"e" "" "" "E" +"f" "" "" "f" +"g" "" "" "g" +"h" "" "" "h" +"i" "" "" "I" +"j" "" "" "j" +"k" "" "" "k" +"l" "" "" "l" +"m" "" "" "m" +"n" "" "" "n" +"o" "" "" "O" +"p" "" "" "p" +"q" "" "" "k" +"r" "" "" "r" +"s" "" "" "s" +"t" "" "" "t" +"u" "" "" "U" +"v" "" "" "(f|v)" +"w" "" "" "v" +"x" "" "" "ks" +"y" ""
"" "i" +"z" "" "" "ts" http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_greek.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_greek.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_greek.txt new file mode 100644 index 0000000..f396a65 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_greek.txt @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +"Î±Ï " "" "$" "af" // "av" before vowels and voiced consonants, "af" elsewhere +"Î±Ï " "" "(κ|Ï|Ï|Ï|Ï|θ|Ï|Ï)" "af" +"Î±Ï " "" "" "av" +"ÎµÏ " "" "$" "ef" // "ev" before vowels and voiced consonants, "ef" elsewhere +"ÎµÏ " "" "(κ|Ï|Ï|Ï|Ï|θ|Ï|Ï)" "ef" +"ÎµÏ " "" "" "ev" +"Î·Ï " "" "$" "if" // "iv" before vowels and voiced consonants, "if" elsewhere +"Î·Ï " "" "(κ|Ï|Ï|Ï|Ï|θ|Ï|Ï)" "if" +"Î·Ï " "" "" "iv" +"Î¿Ï " "" "" "u" // [u:] + +"αι" "" "" "aj" // modern [e] +"ει" "" "" "ej" // modern [i] +"οι" "" "" "oj" // modern [i] +"Ïι" "" "" "oj" +"ηι" "" "" "ej" +"Ï Î¹" "" "" "i" // modern Greek "i" + +"γγ" "(ε|ι|η|α|ο|Ï|Ï )" "(ε|ι|η)" "(nj|j)" +"γγ" "" "(ε|ι|η)" "j" +"γγ" "(ε|ι|η|α|ο|Ï|Ï )" "" "(ng|g)" +"γγ" "" "" "g" +"γκ" "^" "" "g" +"γκ" "(ε|ι|η|α|ο|Ï|Ï )" "(ε|ι|η)" "(nj|j)" +"γκ" "" "(ε|ι|η)" "j" +"γκ" "(ε|ι|η|α|ο|Ï|Ï )" "" "(ng|g)" +"γκ" "" "" "g" +"γι" "" "(α|ο|Ï|Ï )" "j" +"γι" "" "" "(gi|i)" +"γε" "" "(α|ο|Ï|Ï )" "j" +"γε" "" "" "(ge|je)" + +"κζ" "" "" "gz" +"Ïζ" "" "" "dz" +"Ï" "" "(β|γ|δ|μ|ν|Ï)" "z" + +"μβ" "" "" "(mb|b)" +"μÏ" "^" "" "b" +"μÏ" "(ε|ι|η|α|ο|Ï|Ï )" "" "mb" +"μÏ" "" "" "b" // after any consonant +"νÏ" "^" "" "d" +"νÏ" "(ε|ι|η|α|ο|Ï|Ï )" "" "(nd|nt)" // Greek is "nd" +"νÏ" "" "" "(nt|d)" // Greek is "d" after any consonant + +"ά" "" "" "a" +"Î" "" "" "e" +"ή" "" "" "(i|e)" +"ί" "" "" "i" +"Ï" "" "" "o" +"Ï" "" "" "(Q|i|u)" +"Ï" "" "" "o" +"ΰ" "" "" "(Q|i|u)" +"Ï" "" "" "(Q|i|u)" +"Ï" "" "" "j" + +"α" "" "" "a" +"β" "" "" "(v|b)" // modern "v", old "b" +"γ" "" "" "g" +"δ" "" "" "d" // modern like "th" in English "them", old "d" +"ε" "" "" "e" +"ζ" "" "" "z" +"η" "" "" "(i|e)" // modern "i", old "e:" +"ι" "" "" "i" +"κ" "" "" "k" +"λ" "" "" "l" +"μ" "" "" "m" +"ν" "" "" "n" +"ξ" "" "" "ks" +"ο" "" "" "o" +"Ï" "" "" "p" +"Ï" "" "" "r" +"Ï" "" "" "s" +"Ï" "" "" "s" +"Ï" "" "" "t" +"Ï " "" "" "(Q|i|u)" // modern "i", old like German "ü" +"Ï" "" "" "f" +"θ" "" "" "t" // old greek like "th" in English "theme" +"Ï" "" "" "x" +"Ï" "" "" "ps" +"Ï" "" "" 
"o" http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_greeklatin.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_greeklatin.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_greeklatin.txt new file mode 100644 index 0000000..43ec3f5 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_greeklatin.txt @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +"au" "" "$" "af" +"au" "" "[kpstfh]" "af" +"au" "" "" "av" +"eu" "" "$" "ef" +"eu" "" "[kpstfh]" "ef" +"eu" "" "" "ev" +"ou" "" "" "u" + +"gge" "[aeiouy]" "" "(nje|je)" // aggelopoulos +"ggi" "[aeiouy]" "[aou]" "(nj|j)" +"ggi" "[aeiouy]" "" "(ni|i)" +"gge" "" "" "je" +"ggi" "" "" "i" +"gg" "[aeiouy]" "" "(ng|g)" +"gg" "" "" "g" +"gk" "^" "" "g" +"gke" "[aeiouy]" "" "(nje|je)" +"gki" "[aeiouy]" "" "(ni|i)" +"gke" "" "" "je" +"gki" "" "" "i" +"gk" "[aeiouy]" "" "(ng|g)" +"gk" "" "" "g" +"nghi" "" "[aouy]" "Nj" +"nghi" "" "" "(Ngi|Ni)" +"nghe" "" "[aouy]" "Nj" +"nghe" "" "" "(Nje|Nge)" +"ghi" "" "[aouy]" "j" +"ghi" "" "" "(gi|i)" +"ghe" "" "[aouy]" "j" +"ghe" "" "" "(je|ge)" +"ngh" "" "" "Ng" +"gh" "" "" "g" +"ngi" "" "[aouy]" "Nj" +"ngi" "" "" "(Ngi|Ni)" +"nge" "" "[aouy]" "Nj" +"nge" "" "" "(Nje|Nge)" +"gi" "" "[aouy]" "j" +"gi" "" "" "(gi|i)" // what about Pantazis = Pantagis ??? +"ge" "" "[aouy]" "j" +"ge" "" "" "(je|ge)" +"ng" "" "" "Ng" // fragakis = fraggakis = frangakis; angel = agel = aggel + +"i" "" "[aeou]" "j" +"i" "[aeou]" "" "j" +"y" "" "[aeou]" "j" +"y" "[aeou]" "" "j" +"yi" "" "[aeou]" "j" +"yi" "" "" "i" + +"ch" "" "" "x" +"kh" "" "" "x" +"dh" "" "" "d" // actually as "th" in English "that" +"dj" "" "" "dZ" // Turkish words +"ph" "" "" "f" +"th" "" "" "t" +"kz" "" "" "gz" +"tz" "" "" "dz" +"s" "" "[bgdmnr]" "z" + +"mb" "" "" "(mb|b)" // Liberis = Limperis = Limberis +"mp" "^" "" "b" +"mp" "[aeiouy]" "" "mp" +"mp" "" "" "b" +"nt" "^" "" "d" +"nt" "[aeiouy]" "" "(nd|nt)" // Greek "nd" +"nt" "" "" "(nt|d)" // Greek "d" after any consonant + +"á" "" "" "a" +"é" "" "" "e" +"í" "" "" "i" +"ó" "" "" "o" +"óu" "" "" "u" +"ú" "" "" "u" +"ý" "" "" "(i|Q|u)" // [ü] + +"a" "" "" "a" +"b" "" "" "(b|v)" // beta: modern "v", old "b" +"c" "" "" "k" +"d" "" "" "d" // modern like "th" in English "them", old "d" +"e" "" "" "e" +"f" "" "" "f" +"g" "" "" "g" +"h" "" "" "x" +"i" "" "" "i" +"j" "" "" "(j|Z)" // Panajotti = Panaiotti; Louijos = Louizos; Pantajis =
Pantazis = Pantagis +"k" "" "" "k" +"l" "" "" "l" +"m" "" "" "m" +"n" "" "" "n" +"ο" "" "" "o" +"p" "" "" "p" +"q" "" "" "k" // foreign +"r" "" "" "r" +"s" "" "" "s" +"t" "" "" "t" +"u" "" "" "u" +"v" "" "" "v" +"w" "" "" "v" // foreign +"x" "" "" "ks" +"y" "" "" "(i|Q|u)" // [ü] +"z" "" "" "z" http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_hebrew.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_hebrew.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_hebrew.txt new file mode 100644 index 0000000..7e039d5 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_hebrew.txt @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +// General = Ashkenazic + +"××" "" "" "i" +"×¢×" "" "" "i" +"×¢×" "" "" "VV" +"××" "" "" "VV" + +"×׳" "" "" "Z" +"×׳" "" "" "dZ" + +"×" "" "" "L" +"×" "" "" "b" +"×" "" "" "g" +"×" "" "" "d" + +"×" "^" "" "1" +"×" "" "$" "1" +"×" "" "" "" + +"××" "" "" "V" +"××" "" "" "WW" +"×" "" "" "W" +"×" "" "" "z" +"×" "" "" "X" +"×" "" "" "T" +"××" "" "" "i" +"×" "" "" "i" +"×" "" "" "X" +"×" "^" "" "K" +"×" "" "" "k" +"×" "" "" "l" +"×" "" "" "m" +"×" "" "" "m" +"×" "" "" "n" +"× " "" "" "n" +"ס" "" "" "s" +"×¢" "" "" "L" +"×£" "" "" "f" +"פ" "" "" "f" +"×¥" "" "" "C" +"צ" "" "" "C" +"×§" "" "" "K" +"ר" "" "" "r" +"ש" "" "" "s" +"ת" "" "" "TB" // only Ashkenazic http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_hungarian.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_hungarian.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_hungarian.txt new file mode 100644 index 0000000..615d26a --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_hungarian.txt @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// GENERAL + +// CONSONANTS +"sz" "" "" "s" +"zs" "" "" "Z" +"cs" "" "" "tS" + +"ay" "" "" "(oj|aj)" +"ai" "" "" "(oj|aj)" +"aj" "" "" "(oj|aj)" + +"ei" "" "" "(aj|ej)" // German element +"ey" "" "" "(aj|ej)" // German element + +"y" "[áo]" "" "j" +"i" "[áo]" "" "j" +"ee" "" "" "(ej|e)" +"ely" "" "" "(ej|eli)" +"ly" "" "" "(j|li)" +"gy" "" "[aeouáéóúüöőű]" "dj" +"gy" "" "" "(d|gi)" +"ny" "" "[aeouáéóúüöőű]" "nj" +"ny" "" "" "(n|ni)" +"ty" "" "[aeouáéóúüöőű]" "tj" +"ty" "" "" "(t|ti)" +"qu" "" "" "(ku|kv)" +"h" "" "$" "" + +// SPECIAL VOWELS +"á" "" "" "a" +"é" "" "" "e" +"í" "" "" "i" +"ó" "" "" "o" +"ú" "" "" "u" +"ö" "" "" "Y" +"ő" "" "" "Y" +"ü" "" "" "Q" +"ű" "" "" "Q" + +// LATIN ALPHABET +"a" "" "" "a" +"b" "" "" "b" +"c" "" "" "ts" +"d" "" "" "d" +"e" "" "" "E" +"f" "" "" "f" +"g" "" "" "g" +"h" "" "" "h" +"i" "" "" "I" +"j" "" "" "j" +"k" "" "" "k" +"l" "" "" "l" +"m" "" "" "m" +"n" "" "" "n" +"o" "" "" "o" +"p" "" "" "p" +"q" "" "" "k" +"r" "" "" "r" +"s" "" "" "(S|s)" +"t" "" "" "t" +"u" "" "" "u" +"v" "" "" "v" +"w" "" "" "v" +"x" "" "" "ks" +"y" "" "" "i" +"z" "" "" "z" http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_italian.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_italian.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_italian.txt new file mode 100644 index 0000000..8775edd --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_italian.txt @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +"kh" "" "" "x" // foreign + +"gli" "" "" "(l|gli)" +"gn" "" "[aeou]" "(n|nj|gn)" +"gni" "" "" "(ni|gni)" + +"gi" "" "[aeou]" "dZ" +"gg" "" "[ei]" "dZ" +"g" "" "[ei]" "dZ" +"h" "[bdgt]" "" "g" // gh is It; others from Arabic translit +"h" "" "$" "" // foreign + +"ci" "" "[aeou]" "tS" +"ch" "" "[ei]" "k" +"sc" "" "[ei]" "S" +"cc" "" "[ei]" "tS" +"c" "" "[ei]" "tS" +"s" "[aeiou]" "[aeiou]" "z" + +"i" "[aeou]" "" "j" +"i" "" "[aeou]" "j" +"y" "[aeou]" "" "j" // foreign +"y" "" "[aeou]" "j" // foreign + +"qu" "" "" "k" +"uo" "" "" "(vo|o)" +"u" "" "[aei]" "v" + +"�" "" "" "e" +"�" "" "" "e" +"�" "" "" "o" +"�" "" "" "o" + +// LATIN ALPHABET +"a" "" "" "a" +"b" "" "" "b" +"c" "" "" "k" +"d" "" "" "d" +"e" "" "" "e" +"f" "" "" "f" +"g" "" "" "g" +"h" "" "" "h" +"i" "" "" "i" +"j" "" "" "(Z|dZ|j)" // foreign +"k" "" "" "k" +"l" "" "" "l" +"m" "" "" "m" +"n" "" "" "n" +"o" "" "" "o" +"p" "" "" "p" +"q" "" "" "k" +"r" "" "" "r" +"s" "" "" "s" +"t" "" "" "t" +"u" "" "" "u" +"v" "" "" "v" +"w" "" "" "v" // foreign +"x" "" "" "ks" // foreign +"y" "" "" "i" // foreign +"z" "" "" "(ts|dz)" http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_polish.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_polish.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_polish.txt new file mode 100644 index 0000000..dd72f6a --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_polish.txt 
@@ -0,0 +1,185 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// GENERIC + +// CONVERTING FEMININE TO MASCULINE +"ska" "" "$" "ski" +"cka" "" "$" "tski" +"lowa" "" "$" "(lova|lof|l|el)" +"kowa" "" "$" "(kova|kof|k|ek)" +"owa" "" "$" "(ova|of|)" +"lowna" "" "$" "(lovna|levna|l|el)" +"kowna" "" "$" "(kovna|k|ek)" +"owna" "" "$" "(ovna|)" +"lówna" "" "$" "(l|el)" +"kówna" "" "$" "(k|ek)" +"ówna" "" "$" "" +"a" "" "$" "(a|i)" + +// CONSONANTS +"czy" "" "" "tSi" +"cze" "" "[bcdgkpstwzż]" "(tSe|tSF)" +"ciewicz" "" "" "(tsevitS|tSevitS)" +"siewicz" "" "" "(sevitS|SevitS)" +"ziewicz" "" "" "(zevitS|ZevitS)" +"riewicz" "" "" "rjevitS" +"diewicz" "" "" "djevitS" +"tiewicz" "" "" "tjevitS" +"iewicz" "" "" "evitS" +"ewicz" "" "" "evitS" +"owicz" "" "" "ovitS" +"icz" "" "" "itS" +"cz" "" "" "tS" +"ch" "" "" "x" + +"cia" "" "[bcdgkpstwzż]" "(tSB|tsB)" +"cia" "" "" "(tSa|tsa)" +"ciÄ " "" "[bp]" "(tSom|tsom)" +"ciÄ " "" "" "(tSon|tson)" +"ciÄ" "" "[bp]" "(tSem|tsem)" +"ciÄ" "" "" "(tSen|tsen)" +"cie" "" "[bcdgkpstwzż]" "(tSF|tsF)" +"cie" "" "" "(tSe|tse)" +"cio" "" "" "(tSo|tso)" +"ciu" "" "" "(tSu|tsu)" +"ci" "" "" "(tSi|tsI)" +"Ä" "" "" "(tS|ts)" + +"ssz" "" "" "S" +"sz" "" "" "S" +"sia" "" "[bcdgkpstwzż]" "(SB|sB|sja)" 
+"sia" "" "" "(Sa|sja)" +"siÄ " "" "[bp]" "(Som|som)" +"siÄ " "" "" "(Son|son)" +"siÄ" "" "[bp]" "(Sem|sem)" +"siÄ" "" "" "(Sen|sen)" +"sie" "" "[bcdgkpstwzż]" "(SF|sF|se)" +"sie" "" "" "(Se|se)" +"sio" "" "" "(So|so)" +"siu" "" "" "(Su|sju)" +"si" "" "" "(Si|sI)" +"Å" "" "" "(S|s)" + +"zia" "" "[bcdgkpstwzż]" "(ZB|zB|zja)" +"zia" "" "" "(Za|zja)" +"ziÄ " "" "[bp]" "(Zom|zom)" +"ziÄ " "" "" "(Zon|zon)" +"ziÄ" "" "[bp]" "(Zem|zem)" +"ziÄ" "" "" "(Zen|zen)" +"zie" "" "[bcdgkpstwzż]" "(ZF|zF)" +"zie" "" "" "(Ze|ze)" +"zio" "" "" "(Zo|zo)" +"ziu" "" "" "(Zu|zju)" +"zi" "" "" "(Zi|zI)" + +"że" "" "[bcdgkpstwzż]" "(Ze|ZF)" +"że" "" "[bcdgkpstwzż]" "(Ze|ZF|ze|zF)" +"że" "" "" "Ze" +"źe" "" "" "(Ze|ze)" +"ży" "" "" "Zi" +"źi" "" "" "(Zi|zi)" +"ż" "" "" "Z" +"ź" "" "" "(Z|z)" + +"rze" "t" "" "(Se|re)" +"rze" "" "" "(Ze|re|rZe)" +"rzy" "t" "" "(Si|ri)" +"rzy" "" "" "(Zi|ri|rZi)" +"rz" "t" "" "(S|r)" +"rz" "" "" "(Z|r|rZ)" + +"lio" "" "" "(lo|le)" +"Å" "" "" "l" +"Å" "" "" "n" +"qu" "" "" "k" +"s" "" "s" "" + +// VOWELS +"ó" "" "" "(u|o)" +"Ä " "" "[bp]" "om" +"Ä" "" "[bp]" "em" +"Ä " "" "" "on" +"Ä" "" "" "en" + +"ije" "" "" "je" +"yje" "" "" "je" +"iie" "" "" "je" +"yie" "" "" "je" +"iye" "" "" "je" +"yye" "" "" "je" + +"ij" "" "[aou]" "j" +"yj" "" "[aou]" "j" +"ii" "" "[aou]" "j" +"yi" "" "[aou]" "j" +"iy" "" "[aou]" "j" +"yy" "" "[aou]" "j" + +"rie" "" "" "rje" +"die" "" "" "dje" +"tie" "" "" "tje" +"ie" "" "[bcdgkpstwzż]" "F" +"ie" "" "" "e" + +"aue" "" "" "aue" +"au" "" "" "au" + +"ei" "" "" "aj" +"ey" "" "" "aj" +"ej" "" "" "aj" + +"ai" "" "" "aj" +"ay" "" "" "aj" +"aj" "" "" "aj" + +"i" "[aeou]" "" "j" +"y" "[aeou]" "" "j" +"i" "" "[aou]" "j" +"y" "" "[aeou]" "j" + +"a" "" "[bcdgkpstwzż]" "B" +"e" "" "[bcdgkpstwzż]" "(E|F)" +"o" "" "[bcÄdgklÅmnÅrsÅtwzźż]" "P" + +// LATIN ALPHABET +"a" "" "" "a" +"b" "" "" "b" +"c" "" "" "ts" +"d" "" "" "d" +"e" "" "" "E" +"f" "" "" "f" +"g" "" "" "g" +"h" "" "" "(h|x)" +"i" "" "" "I" +"j" "" "" "j" +"k" "" "" "k" +"l" "" "" "l" +"m" 
"" "" "m" +"n" "" "" "n" +"o" "" "" "o" +"p" "" "" "p" +"q" "" "" "k" +"r" "" "" "r" +"s" "" "" "s" +"t" "" "" "t" +"u" "" "" "u" +"v" "" "" "v" +"w" "" "" "v" +"x" "" "" "ks" +"y" "" "" "I" +"z" "" "" "z" http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_portuguese.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_portuguese.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_portuguese.txt new file mode 100644 index 0000000..74de1d7 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_portuguese.txt @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +"kh" "" "" "x" // foreign +"ch" "" "" "S" +"ss" "" "" "s" +"sc" "" "[ei]" "s" +"sç" "" "[aou]" "s" +"ç" "" "" "s" +"c" "" "[ei]" "s" +// "c" "" "[aou]" "(k|C)" + +"s" "^" "" "s" +"s" "[aáuiíoóeéêy]" "[aáuiíoóeéêy]" "z" +"s" "" "[dglmnrv]" "(Z|S)" // Z is Brazil + +"z" "" "$" "(Z|s|S)" // s and S in Brazil +"z" "" "[bdgv]" "(Z|z)" // Z in Brazil +"z" "" "[ptckf]" "(s|S|z)" // s and S in Brazil + +"gu" "" "[eiu]" "g" +"gu" "" "[ao]" "gv" +"g" "" "[ei]" "Z" +"qu" "" "[eiu]" "k" +"qu" "" "[ao]" "kv" + +"uo" "" "" "(vo|o|u)" +"u" "" "[aei]" "v" + +"lh" "" "" "l" +"nh" "" "" "nj" +"h" "[bdgt]" "" "" // translit. from Arabic +"h" "" "$" "" // foreign + +"ex" "" "[aáuiíoóeéêy]" "(ez|eS|eks)" // ez in Brazil +"ex" "" "[cs]" "e" + +"y" "[aáuiíoóeéê]" "" "j" +"y" "" "[aeiíou]" "j" +"m" "" "[bcdfglnprstv]" "(m|n)" // maybe to add a rule for m/n before a consonant that disappears [preceding vowel becomes nasalized] +"m" "" "$" "(m|n)" // maybe to add a rule for final m/n that disappears [preceding vowel becomes nasalized] + +"ão" "" "" "(au|an|on)" +"ãe" "" "" "(aj|an)" +"ãi" "" "" "(aj|an)" +"õe" "" "" "(oj|on)" +"i" "[aáuoóeéê]" "" "j" +"i" "" "[aeou]" "j" + +"â" "" "" "a" +"à" "" "" "a" +"á" "" "" "a" +"ã" "" "" "(a|an|on)" +"é" "" "" "e" +"ê" "" "" "e" +"í" "" "" "i" +"ô" "" "" "o" +"ó" "" "" "o" +"õ" "" "" "(o|on)" +"ú" "" "" "u" +"ü" "" "" "u" + +"aue" "" "" "aue" + +// LATIN ALPHABET +"a" "" "" "a" +"b" "" "" "b" +"c" "" "" "k" +"d" "" "" "d" +"e" "" "" "(e|i)" +"f" "" "" "f" +"g" "" "" "g" +"h" "" "" "h" +"i" "" "" "i" +"j" "" "" "Z" +"k" "" "" "k" +"l" "" "" "l" +"m" "" "" "m" +"n" "" "" "n" +"o" "" "" "(o|u)" +"p" "" "" "p" +"q" "" "" "k" +"r" "" "" "r" +"s" "" "" "S" +"t" "" "" "t" +"u" "" "" "u" +"v" "" "" "v" +"w" "" "" "v" +"x" "" "" "(S|ks)" +"y" "" "" "i" +"z" "" "" "z" http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_romanian.txt
---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_romanian.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_romanian.txt new file mode 100644 index 0000000..a6d0aac --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_romanian.txt @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +"ce" "" "" "tSe" +"ci" "" "" "(tSi|tS)" +"ch" "" "[ei]" "k" +"ch" "" "" "x" // foreign + +"gi" "" "" "(dZi|dZ)" +"g" "" "[ei]" "dZ" +"gh" "" "" "g" + +"i" "[aeou]" "" "j" +"i" "" "[aeou]" "j" +"ţ" "" "" "ts" +"ş" "" "" "S" +"qu" "" "" "k" + +"î" "" "" "i" +"ea" "" "" "ja" +"ă" "" "" "(e|a)" +"aue" "" "" "aue" + +// LATIN ALPHABET +"a" "" "" "a" +"b" "" "" "b" +"c" "" "" "k" +"d" "" "" "d" +"e" "" "" "E" +"f" "" "" "f" +"g" "" "" "g" +"h" "" "" "(x|h)" +"i" "" "" "I" +"j" "" "" "Z" +"k" "" "" "k" +"l" "" "" "l" +"m" "" "" "m" +"n" "" "" "n" +"o" "" "" "o" +"p" "" "" "p" +"q" "" "" "k" +"r" "" "" "r" +"s" "" "" "s" +"t" "" "" "t" +"u" "" "" "u" +"v" "" "" "v" +"w" "" "" "v" +"x" "" "" "ks" +"y" "" "" "i" +"z" "" "" "z" http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_russian.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_russian.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_russian.txt new file mode 100644 index 0000000..310be84 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_russian.txt @@ -0,0 +1,142 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//GENERAL// CONVERTING FEMININE TO MASCULINE +"yna" "" "$" "(in|ina)" +"ina" "" "$" "(in|ina)" +"liova" "" "$" "(lof|lef)" +"lova" "" "$" "(lof|lef|lova)" +"ova" "" "$" "(of|ova)" +"eva" "" "$" "(ef|ova)" +"aia" "" "$" "(aja|i)" +"aja" "" "$" "(aja|i)" +"aya" "" "$" "(aja|i)" + +//SPECIAL CONSONANTS +"tsya" "" "" "tsa" +"tsyu" "" "" "tsu" +"tsia" "" "" "tsa" +"tsie" "" "" "tse" +"tsio" "" "" "tso" +"tsye" "" "" "tse" +"tsyo" "" "" "tso" +"tsiu" "" "" "tsu" +"sie" "" "" "se" +"sio" "" "" "so" +"zie" "" "" "ze" +"zio" "" "" "zo" +"sye" "" "" "se" +"syo" "" "" "so" +"zye" "" "" "ze" +"zyo" "" "" "zo" + +"ger" "" "$" "ger" +"gen" "" "$" "gen" +"gin" "" "$" "gin" +"gg" "" "" "g" +"g" "[jaeoiuy]" "[aeoiu]" "g" +"g" "" "[aeoiu]" "(g|h)" + +"kh" "" "" "x" +"ch" "" "" "(tS|x)" +"sch" "" "" "(StS|S)" +"ssh" "" "" "S" +"sh" "" "" "S" +"zh" "" "" "Z" +"tz" "" "$" "ts" +"tz" "" "" "(ts|tz)" +"c" "" "[iey]" "s" +"qu" "" "" "(kv|k)" +"s" "" "s" "" + +//SPECIAL VOWELS +"lya" "" "" "la" +"lyu" "" "" "lu" +"lia" "" "" "la" // not in DJSRE +"liu" "" "" "lu" // not in DJSRE +"lja" "" "" "la" // not in DJSRE +"lju" "" "" "lu" // not in DJSRE +"le" "" "" "(lo|lE)" //not in DJSRE +"lyo" "" "" "(lo|le)" //not in DJSRE +"lio" "" "" "(lo|le)" + +"ije" "" "" "je" +"ie" "" "" "je" +"iye" "" "" "je" +"iie" "" "" "je" +"yje" "" "" "je" +"ye" "" "" "je" +"yye" "" "" "je" +"yie" "" "" "je" + +"ij" "" "[aou]" "j" +"iy" "" "[aou]" "j" +"ii" "" "[aou]" "j" +"yj" "" "[aou]" "j" +"yy" "" "[aou]" "j" +"yi" "" "[aou]" "j" + +"io" "" "" "(jo|e)" +"i" "" "[au]" "j" +"i" "[aeou]" "" "j" +"yo" "" "" "(jo|e)" +"y" "" "[au]" "j" +"y" "[aeiou]" "" "j" + +"ii" "" "$" "i" +"iy" "" "$" "i" +"yy" "" "$" "i" +"yi" "" "$" "i" +"yj" "" "$" "i" +"ij" "" "$" "i" + +"e" "^" "" "(je|E)" +"ee" "" "" "(aje|i)" +"e" "[aou]" "" "je" +"oo" "" "" "(oo|u)" +"'" "" "" "" +"\"" "" "" "" + +"aue" "" "" "aue" + +// 
LATIN ALPHABET +"a" "" "" "a" +"b" "" "" "b" +"c" "" "" "k" +"d" "" "" "d" +"e" "" "" "E" +"f" "" "" "f" +"g" "" "" "g" +"h" "" "" "h" +"i" "" "" "I" +"j" "" "" "j" +"k" "" "" "k" +"l" "" "" "l" +"m" "" "" "m" +"n" "" "" "n" +"o" "" "" "o" +"p" "" "" "p" +"q" "" "" "k" +"r" "" "" "r" +"s" "" "" "s" +"t" "" "" "t" +"u" "" "" "u" +"v" "" "" "v" +"w" "" "" "v" +"x" "" "" "ks" +"y" "" "" "I" +"z" "" "" "z" http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_spanish.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_spanish.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_spanish.txt new file mode 100644 index 0000000..3ba2695 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_spanish.txt @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// GENERAL + +// Includes both Spanish (Castilian) & Catalan + +// CONSONANTS +"ñ" "" "" "(n|nj)" +"ny" "" "" "nj" // Catalan +"ç" "" "" "s" // Catalan + +"ig" "[aeiou]" "" "(tS|ig)" // tS is Catalan +"ix" "[aeiou]" "" "S" // Catalan +"tx" "" "" "tS" // Catalan +"tj" "" "$" "tS" // Catalan +"tj" "" "" "dZ" // Catalan +"tg" "" "" "(tg|dZ)" // dZ is Catalan +"ch" "" "" "(tS|dZ)" // dZ is typical for Argentina +"bh" "" "" "b" // translit. from Arabic +"h" "[dgt]" "" "" // translit. from Arabic +"h" "" "$" "" // foreign +//"ll" "" "" "(l|Z)" // Z is typical for Argentina, only Ashkenazic +"m" "" "[bpvf]" "(m|n)" +"c" "" "[ei]" "s" +// "c" "" "[aou]" "(k|C)" +"gu" "" "[ei]" "(g|gv)" // "gv" because "u" can actually be "ü" +"g" "" "[ei]" "(x|g|dZ)" // "g" only for foreign words; dZ is Catalan +"qu" "" "" "k" + +"uo" "" "" "(vo|o)" +"u" "" "[aei]" "v" + +// SPECIAL VOWELS +"ü" "" "" "v" +"á" "" "" "a" +"é" "" "" "e" +"í" "" "" "i" +"ó" "" "" "o" +"ú" "" "" "u" +"à" "" "" "a" // Catalan +"è" "" "" "e" // Catalan +"ò" "" "" "o" // Catalan + +// LATIN ALPHABET +"a" "" "" "a" +"b" "" "" "B" +"c" "" "" "k" +"d" "" "" "d" +"e" "" "" "e" +"f" "" "" "f" +"g" "" "" "g" +"h" "" "" "h" +"i" "" "" "i" +"j" "" "" "(x|Z)" // Z is Catalan +"k" "" "" "k" +"l" "" "" "l" +"m" "" "" "m" +"n" "" "" "n" +"o" "" "" "o" +"p" "" "" "p" +"q" "" "" "k" +"r" "" "" "r" +"s" "" "" "s" +"t" "" "" "t" +"u" "" "" "u" +"v" "" "" "V" +"w" "" "" "v" // foreign words +"x" "" "" "(ks|gz|S)" // ks is Spanish, all are Catalan +"y" "" "" "(i|j)" +"z" "" "" "(z|s)" // as "c" before "e" or "i", in Spain it is like unvoiced English "th" http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_turkish.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_turkish.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_turkish.txt new file mode 100644
index 0000000..c639a13 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/gen_rules_turkish.txt @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +"ç" "" "" "tS" +"ğ" "" "" "" // to show that previous vowel is long +"ş" "" "" "S" +"ü" "" "" "Q" +"ö" "" "" "Y" +"ı" "" "" "(e|i|)" // as "e" in English "label" + +"a" "" "" "a" +"b" "" "" "b" +"c" "" "" "dZ" +"d" "" "" "d" +"e" "" "" "e" +"f" "" "" "f" +"g" "" "" "g" +"h" "" "" "h" +"i" "" "" "i" +"j" "" "" "Z" +"k" "" "" "k" +"l" "" "" "l" +"m" "" "" "m" +"n" "" "" "n" +"o" "" "" "o" +"p" "" "" "p" +"q" "" "" "k" // foreign words +"r" "" "" "r" +"s" "" "" "s" +"t" "" "" "t" +"u" "" "" "u" +"v" "" "" "v" +"w" "" "" "v" // foreign words +"x" "" "" "ks" // foreign words +"y" "" "" "j" +"z" "" "" "z" http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/lang.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/lang.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/lang.txt new file mode 100644 index 0000000..99742b1 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/lang.txt @@ -0,0 +1,293 @@ +/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// 1. following are rules to accept the language +// 1.1 Special letter combinations +^oâ english true +^o' english true +^mc english true +^fitz english true +ceau french+romanian true +eau$ french true // mp: I've added this +eaux$ french true // mp: I've added this +ault$ french true +oult$ french true +eux$ french true +eix$ french true +glou$ greeklatin true +uu dutch true +tx spanish true +witz german true +tz$ german+russian+english true +^tz russian+english true +poulos$ greeklatin true +pulos$ greeklatin true +iou greeklatin true +sj$ dutch true +^sj dutch true +güe spanish true +güi spanish true +ghe romanian+greeklatin true +ghi romanian+greeklatin true +escu$ romanian true +esco$ romanian true +vici$ romanian true +schi$ romanian true +ii$ russian true +iy$ russian true +yy$ russian true +yi$ russian true +^rz polish true +rz$ polish+german true +[bcdfgklmnpstwz]rz polish true +rz[bcdfghklmnpstw] polish true +etti$ italian true +eti$ italian true +ati$ italian true +ato$ italian true +[aoei]no$ italian true +[aoei]ni$ italian true +esi$ italian true +oli$ italian true +field$ english true +cki$ polish true +ska$ polish true +cka$ polish true +ae 
german+russian+english true +oe german+french+russian+english+dutch true +th$ german+english true +^th german+english+greeklatin true +mann german true +cz polish true +cy polish+greeklatin true +niew polish true +stein german true +heim$ german true +heimer$ german true +thal german true +zweig german true +[aeou]h german true +äh german true +öh german true +üh german true +[ln]h[ao]$ portuguese true +[ln]h[aou] portuguese+french+german+dutch+czech+spanish+turkish true +chsch german true +tsch german true +sch$ german+russian true +^sch german+russian true +ck$ german+english true +c$ polish+romanian+hungarian+czech+turkish true +sz polish+hungarian true +cs$ hungarian true +^cs hungarian true +dzs hungarian true +zs$ hungarian true +^zs hungarian true +^wl polish true +^wr polish+english+german+dutch true + +gy$ hungarian true +gy[aeou] hungarian true +gy hungarian+russian+french+greeklatin true +guy french true +gu[ei] spanish+french+portuguese true +gu[ao] spanish+portuguese true +gi[aou] italian+greeklatin true + +ly hungarian+russian+polish+greeklatin true +ny hungarian+russian+polish+spanish+greeklatin true +ty hungarian+russian+polish+greeklatin true + +// 1.2 special characters +Ä polish true +ç french+spanish+portuguese+turkish true +Ä czech true +Ä czech true +Ä turkish true +Å polish true +Å polish true +ñ spanish true +Å czech true +Å czech true +Å polish true +Å romanian+turkish true +Å¡ czech true +Å£ romanian true +Å¥ czech true +ź polish true +ż polish true + +à german true + +ä german true +á hungarian+spanish+portuguese+czech+greeklatin true +â romanian+french+portuguese true +Ä romanian true +Ä polish true +à portuguese true +ã portuguese true +Ä polish true +é french+hungarian+czech+greeklatin true +è french+spanish+italian true +ê french true +Ä czech true +ê french+portuguese true +à hungarian+spanish+portuguese+czech+greeklatin true +î romanian+french true +ı turkish true +ó polish+hungarian+spanish+italian+portuguese+czech+greeklatin true 
+ö german+hungarian+turkish true +ô french+portuguese true +õ portuguese+hungarian true +ò italian+spanish true +ű hungarian true +ú hungarian+spanish+portuguese+czech+greeklatin true +ü german+hungarian+spanish+portuguese+turkish true +ù french true +ů czech true +ý czech+greeklatin true + +// Every Cyrillic word has at least one Cyrillic vowel (аÑеоиÑÑÑÑÑ) +а cyrillic true +Ñ cyrillic true +о cyrillic true +е cyrillic true +и cyrillic true +Ñ cyrillic true +Ñ cyrillic true +Ñ cyrillic true +Ñ cyrillic true +Ñ cyrillic true + +// Every Greek word has at least one Greek vowel +α greek true +ε greek true +η greek true +ι greek true +ο greek true +Ï greek true +Ï greek true + +// Arabic (only initial) +ا arabic true // alif (isol + init) +ب arabic true // ba' +ت arabic true // ta' +Ø« arabic true // tha' +ج arabic true // jim +Ø arabic true // h.a' +Ø®' arabic true // kha' +د arabic true // dal (isol + init) +ذ arabic true // dhal (isol + init) +ر arabic true // ra' (isol + init) +ز arabic true // za' (isol + init) +س arabic true // sin +Ø´ arabic true // shin +ص arabic true // s.ad +ض arabic true // d.ad +Ø· arabic true // t.a' +ظ arabic true // z.a' +ع arabic true // 'ayn +غ arabic true // ghayn +Ù arabic true // fa' +Ù arabic true // qaf +Ù arabic true // kaf +Ù arabic true // lam +Ù arabic true // mim +Ù arabic true // nun +Ù arabic true // ha' +Ù arabic true // waw (isol + init) +Ù arabic true // ya' + +Ø¢ arabic true // alif madda +Ø¥ arabic true // alif + diacritic +Ø£ arabic true // alif + hamza +ؤ arabic true // waw + hamza +ئ arabic true // ya' + hamza + + +// Hebrew +× hebrew true +× hebrew true +× hebrew true +× hebrew true +× hebrew true +× hebrew true +× hebrew true +× hebrew true +× hebrew true +× hebrew true +× hebrew true +× hebrew true +× hebrew true +× hebrew true +ס hebrew true +×¢ hebrew true +פ hebrew true +צ hebrew true +×§ hebrew true +ר hebrew true +ש hebrew true +ת hebrew true + +// 2. 
following are rules to reject the language + +// Every Latin character word has at least one Latin vowel +a cyrillic+hebrew+greek+arabic false +o cyrillic+hebrew+greek+arabic false +e cyrillic+hebrew+greek+arabic false +i cyrillic+hebrew+greek+arabic false +y cyrillic+hebrew+greek+arabic+romanian+dutch false +u cyrillic+hebrew+greek+arabic false + +j italian false +j[^aoeiuy] french+spanish+portuguese+greeklatin false +g czech false +k romanian+spanish+portuguese+french+italian false +q hungarian+polish+russian+romanian+czech+dutch+turkish+greeklatin false +v polish false +w french+romanian+spanish+hungarian+russian+czech+turkish+greeklatin false +x czech+hungarian+dutch+turkish false // polish excluded from the list + +dj spanish+turkish false +v[^aoeiu] german false // in german, "v" can be found before a vowel only +y[^aoeiu] german false // in german, "y" usually appears only in the last position; sometimes before a vowel +c[^aohk] german false +dzi german+english+french+turkish false +ou german false +a[eiou] turkish false // no diphthongs in Turkish +ö[eaio] turkish false +ü[eaio] turkish false +e[aiou] turkish false +i[aeou] turkish false +o[aieu] turkish false +u[aieo] turkish false +aj german+english+french+dutch false +ej german+english+french+dutch false +oj german+english+french+dutch false +uj german+english+french+dutch false +eu russian+polish false +ky polish false +kie french+spanish+greeklatin false +gie portuguese+romanian+spanish+greeklatin false +ch[aou] italian false +ch turkish false +son$ german false +sc[ei] french false +sch hungarian+polish+french+spanish false +^h russian false +etti$ greeklatin false http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_approx_any.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_approx_any.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_approx_any.txt 
new file mode 100644 index 0000000..390419e --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_approx_any.txt @@ -0,0 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// SEPHARDIC + +"E" "" "" "" \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_approx_common.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_approx_common.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_approx_common.txt new file mode 100644 index 0000000..e744d32 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_approx_common.txt @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include sep_exact_approx_common + +"bens" "^" "" "(binz|s)" +"benS" "^" "" "(binz|s)" +"ben" "^" "" "(bin|)" + +"abens" "^" "" "(abinz|binz|s)" +"abenS" "^" "" "(abinz|binz|s)" +"aben" "^" "" "(abin|bin|)" + +"els" "^" "" "(ilz|alz|s)" +"elS" "^" "" "(ilz|alz|s)" +"el" "^" "" "(il|al|)" +"als" "^" "" "(alz|s)" +"alS" "^" "" "(alz|s)" +"al" "^" "" "(al|)" + +//"dels" "^" "" "(dilz|s)" +//"delS" "^" "" "(dilz|s)" +"del" "^" "" "(dil|)" +"dela" "^" "" "(dila|)" +//"delo" "^" "" "(dila|)" +"da" "^" "" "(da|)" +"de" "^" "" "(di|)" +//"des" "^" "" "(dis|dAs|)" +//"di" "^" "" "(di|)" +//"dos" "^" "" "(das|dus|)" + +"oa" "" "" "(va|a|D)" +"oe" "" "" "(vi|D)" +"ae" "" "" "D" + +/// "s" "" "$" "(s|)" // Attia(s) +/// "C" "" "" "s" // "c" could actually be "�" + +"n" "" "[bp]" "m" + +"h" "" "" "(|h|f)" // sound "h" (absent) can be expressed via /x/, Cojab in Spanish = Kohab ; Hakim = Fakim +"x" "" "" "h" + +// DIPHTHONGS ARE APPROXIMATELY equivalent +"aja" "^" "" "(Da|ia)" +"aje" "^" "" "(Di|Da|i|ia)" +"aji" "^" "" "(Di|i)" +"ajo" "^" "" "(Du|Da|iu|ia)" +"aju" "^" "" "(Du|iu)" + +"aj" "" "" "D" +"ej" "" "" "D" +"oj" "" "" "D" +"uj" "" "" "D" +"au" "" "" "D" +"eu" "" "" "D" +"ou" "" "" "D" + +"a" "^" "" "(a|)" // Arabic + +"ja" "^" "" "ia" +"je" "^" "" "i" +"jo" "^" "" "(iu|ia)" +"ju" "^" "" "iu" + +"ja" "" "" "a" +"je" "" "" "i" +"ji" "" "" "i" +"jo" "" "" "u" +"ju" "" "" "u" + +"j" "" "" "i" + +// CONSONANTS {z & Z & dZ; s & S} are approximately interchangeable +"s" "" "[rmnl]" "z" +"S" "" "[rmnl]" "z" +"s" "[rmnl]" "" "z" +"S" "[rmnl]" "" "z" + +"dS" 
"" "$" "S" +"dZ" "" "$" "S" +"Z" "" "$" "S" +"S" "" "$" "(S|s)" +"z" "" "$" "(S|s)" + +"S" "" "" "s" +"dZ" "" "" "z" +"Z" "" "" "z" + +"i" "" "$" "(i|)" // often in Arabic +"e" "" "" "i" + +"o" "" "$" "(a|u)" +"o" "" "" "u" + +// special character to deal correctly in Hebrew match +"B" "" "" "b" +"V" "" "" "v" + +// Arabic +"p" "^" "" "b" http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_approx_french.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_approx_french.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_approx_french.txt new file mode 100644 index 0000000..0990004 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_approx_french.txt @@ -0,0 +1,18 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// empty \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_approx_hebrew.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_approx_hebrew.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_approx_hebrew.txt new file mode 100644 index 0000000..0990004 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_approx_hebrew.txt @@ -0,0 +1,18 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// empty \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_approx_italian.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_approx_italian.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_approx_italian.txt new file mode 100644 index 0000000..58fe459 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_approx_italian.txt @@ -0,0 +1,18 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include sep_approx_french \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_approx_portuguese.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_approx_portuguese.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_approx_portuguese.txt new file mode 100644 index 0000000..4bca846 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_approx_portuguese.txt @@ -0,0 +1,18 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include sep_approx_french http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_approx_spanish.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_approx_spanish.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_approx_spanish.txt new file mode 100644 index 0000000..4bca846 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_approx_spanish.txt @@ -0,0 +1,18 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include sep_approx_french http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_exact_any.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_exact_any.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_exact_any.txt new file mode 100644 index 0000000..d4bf51e --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_exact_any.txt @@ -0,0 +1,18 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +"E" "" "" "e" \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_exact_approx_common.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_exact_approx_common.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_exact_approx_common.txt new file mode 100644 index 0000000..1f4e864 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_exact_approx_common.txt @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Sephardic + +"h" "" "$" "" + +// VOICED - UNVOICED CONSONANTS +"b" "" "[fktSs]" "p" +"b" "" "p" "" +"b" "" "$" "p" +"p" "" "[vgdZz]" "b" +"p" "" "b" "" + +"v" "" "[pktSs]" "f" +"v" "" "f" "" +"v" "" "$" "f" +"f" "" "[vbgdZz]" "v" +"f" "" "v" "" + +"g" "" "[pftSs]" "k" +"g" "" "k" "" +"g" "" "$" "k" +"k" "" "[vbdZz]" "g" +"k" "" "g" "" + +"d" "" "[pfkSs]" "t" +"d" "" "t" "" +"d" "" "$" "t" +"t" "" "[vbgZz]" "d" +"t" "" "d" "" + +"s" "" "dZ" "" +"s" "" "tS" "" + +"z" "" "[pfkSt]" "s" +"z" "" "[sSzZ]" "" +"s" "" "[sSzZ]" "" +"Z" "" "[sSzZ]" "" +"S" "" "[sSzZ]" "" + +// SIMPLIFICATION OF CONSONANT CLUSTERS +"nm" "" "" "m" + +// DOUBLE --> SINGLE +"ji" "^" "" "i" + +"a" "" "a" "" +"b" "" "b" "" +"d" "" "d" "" +"e" "" "e" "" +"f" "" "f" "" +"g" "" "g" "" +"i" "" "i" "" +"k" "" "k" "" +"l" "" "l" "" +"m" "" "m" "" +"n" "" "n" "" +"o" "" "o" "" +"p" "" "p" "" +"r" "" "r" "" +"t" "" "t" "" +"u" "" "u" "" +"v" "" "v" "" +"z" "" "z" "" http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_exact_common.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_exact_common.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_exact_common.txt new file mode 100644 index 0000000..b97c589 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_exact_common.txt @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include sep_exact_approx_common + +"h" "" "" "" +//"C" "" "" "k" // c that can actually be ç + +// VOICED - UNVOICED CONSONANTS +"s" "[^t]" "[bgZd]" "z" +"Z" "" "[pfkst]" "S" +"Z" "" "$" "S" +"S" "" "[bgzd]" "Z" +"z" "" "$" "s" + +//special character to deal correctly in Hebrew match +"B" "" "" "b" +"V" "" "" "v" http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_exact_french.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_exact_french.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_exact_french.txt new file mode 100644 index 0000000..ea75dc4 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_exact_french.txt @@ -0,0 +1,18 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Sephardic \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_exact_hebrew.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_exact_hebrew.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_exact_hebrew.txt new file mode 100644 index 0000000..0990004 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_exact_hebrew.txt @@ -0,0 +1,18 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// empty \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_exact_italian.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_exact_italian.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_exact_italian.txt new file mode 100644 index 0000000..0990004 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_exact_italian.txt @@ -0,0 +1,18 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// empty \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_exact_portuguese.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_exact_portuguese.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_exact_portuguese.txt new file mode 100644 index 0000000..0990004 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_exact_portuguese.txt @@ -0,0 +1,18 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// empty \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_exact_spanish.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_exact_spanish.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_exact_spanish.txt new file mode 100644 index 0000000..0990004 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_exact_spanish.txt @@ -0,0 +1,18 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// empty \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_hebrew_common.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_hebrew_common.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_hebrew_common.txt new file mode 100644 index 0000000..00357f9 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_hebrew_common.txt @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include sep_exact_approx_common + +"E" "" "" "" // final French "e": only in Sephardic + +"ts" "" "" "C" // for not confusion Gutes [=guts] and Guts [=guc] +"tS" "" "" "C" // same reason +"S" "" "" "s" +"p" "" "" "f" +"b" "^" "" "b" +"b" "" "" "(b|v)" + +"ja" "" "" "i" +"je" "" "" "i" +"aj" "" "" "i" +"j" "" "" "i" + +"a" "^" "" "1" +"e" "^" "" "1" +"a" "" "$" "1" +"e" "" "$" "1" + +"a" "" "" "" +"e" "" "" "" + +"oj" "^" "" "(u|vi)" +"uj" "^" "" "(u|vi)" + +"oj" "" "" "u" +"uj" "" "" "u" + +"ou" "^" "" "(u|v|1)" +"o" "^" "" "(u|v|1)" +"u" "^" "" "(u|v|1)" + +"o" "" "$" "(u|1)" +"u" "" "$" "(u|1)" + +"ou" "" "" "u" +"o" "" "" "u" + +"VV" "" "" "u" // alef/ayin + vov from ruleshebrew +"L" "^" "" "1" // alef/ayin from ruleshebrew +"L" "" "$" "1" // alef/ayin from ruleshebrew +"L" "" "" " " // alef/ayin from ruleshebrew +"WW" "^" "" "(vi|u)" // vav-yod from ruleshebrew +"WW" "" "" "u" // vav-yod from ruleshebrew +"W" "^" "" "(u|v)" // vav from ruleshebrew +"W" "" "" "u" // vav from ruleshebrew + +// "g" "" "" "(g|Z)" +// "z" "" "" "(z|Z)" +// "d" "" "" "(d|dZ)" + +"T" "" "" "t" // tet from ruleshebrew + +// "k" "" "" "(k|x)" +// "x" "" "" "(k|x)" +"K" "" "" "k" // kof and initial kaf from ruleshebrew +"X" "" "" "x" // khet and final kaf from ruleshebrew + +// special for Spanish initial B/V +"B" "" "" "v" +"V" "" "" "b" + +"H" "^" "" "(x|1)" +"H" "" "$" "(x|1)" +"H" "" "" "(x|)" +"h" "^" "" "1" +"h" "" "" "" http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_languages.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_languages.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_languages.txt new file mode 100644 index 0000000..9a1935a --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_languages.txt @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * 
contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +any +french +hebrew +italian +portuguese +spanish http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_any.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_any.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_any.txt new file mode 100644 index 0000000..fc08b5a --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_any.txt @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// SEPHARDIC: INCORPORATES Portuguese + Italian + Spanish(+Catalan) + French + +// CONSONANTS +"ph" "" "" "f" // foreign +"sh" "" "" "S" // foreign +"kh" "" "" "x" // foreign + +"gli" "" "" "(gli|l[italian])" +"gni" "" "" "(gni|ni[italian+french])" +"gn" "" "[aeou]" "(n[italian+french]|nj[italian+french]|gn)" +"gh" "" "" "g" // It + translit. from Arabic +"dh" "" "" "d" // translit. from Arabic +"bh" "" "" "b" // translit. from Arabic +"th" "" "" "t" // translit. from Arabic +"lh" "" "" "l" // Port +"nh" "" "" "nj" // Port + +"ig" "[aeiou]" "" "(ig|tS[spanish])" +"ix" "[aeiou]" "" "S" // Sp +"tx" "" "" "tS" // Sp +"tj" "" "$" "tS" // Sp +"tj" "" "" "dZ" // Sp +"tg" "" "" "(tg|dZ[spanish])" + +"gi" "" "[aeou]" "dZ" // italian +"g" "" "y" "Z" // french +"gg" "" "[ei]" "(gZ[portuguese+french]|dZ[italian+spanish]|x[spanish])" +"g" "" "[ei]" "(Z[portuguese+french]|dZ[italian+spanish]|x[spanish])" + +"guy" "" "" "gi" +"gue" "" "$" "(k[french]|ge)" +"gu" "" "[ei]" "(g|gv)" // not It +"gu" "" "[ao]" "gv" // not It + +"ñ" "" "" "(n|nj)" +"ny" "" "" "nj" + +"sc" "" "[ei]" "(s|S[italian])" +"sç" "" "[aeiou]" "s" // not It +"ss" "" "" "s" +"ç" "" "" "s" // not It + +"ch" "" "[ei]" "(k[italian]|S[portuguese+french]|tS[spanish]|dZ[spanish])" +"ch" "" "" "(S|tS[spanish]|dZ[spanish])" + +"ci" "" "[aeou]" "(tS[italian]|si)" +"cc" "" "[eiyéèê]" "(tS[italian]|ks[portuguese+french+spanish])" +"c" "" "[eiyéèê]" "(tS[italian]|s[portuguese+french+spanish])" +//"c" "" "[aou]" "(k|C[portuguese+spanish])" // "C" means that the actual letter could be "ç" (cedille omitted) + +"s" "^" "" "s" +"s" "[aáuiíoóeéêy]" "[aáuiíoóeéêy]" "(s[spanish]|z[portuguese+french+italian])" +"s" "" "[dglmnrv]" "(z|Z[portuguese])" + +"z" "" "$" "(s|ts[italian]|S[portuguese])" // ts It, s/S/Z Port, s in Sp, z Fr +"z" "" "[bdgv]" "(z|dz[italian]|Z[portuguese])" // dz It, Z/z Port, z Sp & Fr +"z" "" 
"[ptckf]" "(s|ts[italian]|S[portuguese])" // ts It, s/S/z Port, z/s Sp +"z" "" "" "(z|dz[italian]|ts[italian]|s[spanish])" // ts/dz It, z Port & Fr, z/s Sp + +"que" "" "$" "(k[french]|ke)" +"qu" "" "[eiu]" "k" +"qu" "" "[ao]" "(kv|k)" // k is It + +"ex" "" "[aáuiíoóeéêy]" "(ez[portuguese]|eS[portuguese]|eks|egz)" +"ex" "" "[cs]" "(e[portuguese]|ek)" + +"m" "" "[cdglnrst]" "(m|n[portuguese])" +"m" "" "[bfpv]" "(m|n[portuguese+spanish])" +"m" "" "$" "(m|n[portuguese])" + +"b" "^" "" "(b|V[spanish])" +"v" "^" "" "(v|B[spanish])" + +// VOWELS +"eau" "" "" "o" // Fr + +"ouh" "" "[aioe]" "(v[french]|uh)" +"uh" "" "[aioe]" "(v|uh)" +"ou" "" "[aioe]" "v" // french +"uo" "" "" "(vo|o)" +"u" "" "[aie]" "v" + +"i" "[aáuoóeéê]" "" "j" +"i" "" "[aeou]" "j" +"y" "[aáuiíoóeéê]" "" "j" +"y" "" "[aeiíou]" "j" +"e" "" "$" "(e|E[french])" + +"ão" "" "" "(au|an)" // Port +"ãe" "" "" "(aj|an)" // Port +"ãi" "" "" "(aj|an)" // Port +"õe" "" "" "(oj|on)" // Port +"où" "" "" "u" // Fr +"ou" "" "" "(ou|u[french])" + +"â" "" "" "a" // Port & Fr +"à" "" "" "a" // Port +"á" "" "" "a" // Port & Sp +"ã" "" "" "(a|an)" // Port +"é" "" "" "e" +"ê" "" "" "e" // Port & Fr +"è" "" "" "e" // Sp & Fr & It +"í" "" "" "i" // Port & Sp +"î" "" "" "i" // Fr +"ô" "" "" "o" // Port & Fr +"ó" "" "" "o" // Port & Sp & It +"õ" "" "" "(o|on)" // Port +"ò" "" "" "o" // Sp & It +"ú" "" "" "u" // Port & Sp +"ü" "" "" "u" // Port & Sp + +// LATIN ALPHABET +"a" "" "" "a" +"b" "" "" "(b|v[spanish])" +"c" "" "" "k" +"d" "" "" "d" +"e" "" "" "e" +"f" "" "" "f" +"g" "" "" "g" +"h" "" "" "h" +"i" "" "" "i" +"j" "" "" "(x[spanish]|Z)" // not It +"k" "" "" "k" +"l" "" "" "l" +"m" "" "" "m" +"n" "" "" "n" +"o" "" "" "o" +"p" "" "" "p" +"q" "" "" "k" +"r" "" "" "r" +"s" "" "" "(s|S[portuguese])" +"t" "" "" "t" +"u" "" "" "u" +"v" "" "" "(v|b[spanish])" +"w" "" "" "v" // foreign +"x" "" "" "(ks|gz|S[portuguese+spanish])" // S/ks Port & Sp, gz Sp, It only ks +"y" "" "" "i" +"z" "" "" "z"
