Revision: 6880
http://languagetool.svn.sourceforge.net/languagetool/?rev=6880&view=rev
Author: dominikoeo
Date: 2012-05-12 18:36:11 +0000 (Sat, 12 May 2012)
Log Message:
-----------
[br] changed the Java rule BR_TOPO so that it can
report multiple suggestions. This is quite similar
to what is done in the Romanian file
rules/ro/SimpleReplaceRule.java.
Modified Paths:
--------------
trunk/JLanguageTool/src/java/org/languagetool/rules/br/TopoReplaceRule.java
trunk/JLanguageTool/src/rules/br/topo.txt
Modified:
trunk/JLanguageTool/src/java/org/languagetool/rules/br/TopoReplaceRule.java
===================================================================
--- trunk/JLanguageTool/src/java/org/languagetool/rules/br/TopoReplaceRule.java 2012-05-12 15:03:32 UTC (rev 6879)
+++ trunk/JLanguageTool/src/java/org/languagetool/rules/br/TopoReplaceRule.java 2012-05-12 18:36:11 UTC (rev 6880)
@@ -18,62 +18,245 @@
*/
package org.languagetool.rules.br;
+import java.io.BufferedReader;
import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
import java.util.Locale;
+import java.util.Map;
+import java.util.Queue;
import java.util.ResourceBundle;
+import java.util.concurrent.ArrayBlockingQueue;
+import org.languagetool.AnalyzedSentence;
+import org.languagetool.AnalyzedTokenReadings;
+import org.languagetool.JLanguageTool;
+import org.languagetool.Language;
import org.languagetool.rules.AbstractSimpleReplaceRule;
+import org.languagetool.rules.Category;
+import org.languagetool.rules.Rule;
+import org.languagetool.rules.RuleMatch;
+import org.languagetool.tokenizers.Tokenizer;
+import org.languagetool.tools.StringTools;
/**
* A rule that matches place names in French which should be
* translated in Breton.
*
* Loads the list of words from <code>rules/br/topo.txt</code>.
+ * This class is mostly copied from ro/SimpleReplaceRule.java.
*
* @author Dominique Pellé
*/
-public class TopoReplaceRule extends AbstractSimpleReplaceRule {
+public class TopoReplaceRule extends Rule {
public static final String BRETON_TOPO = "BR_TOPO";
private static final String FILE_NAME = "/br/topo.txt";
+ private static final String FILE_ENCODING = "utf-8";
// locale used on case-conversion
private static final Locale BR_LOCALE = new Locale("br");
- @Override
+ // list of maps containing error-corrections pairs.
+ // the n-th map contains key strings of (n+1) words
+ private final List<Map<String, String>> wrongWords;
+
public final String getFileName() {
return FILE_NAME;
}
public TopoReplaceRule(final ResourceBundle messages) throws IOException {
super(messages);
+ if (messages != null) {
+ super.setCategory(new Category(messages.getString("category_misc")));
+ }
+ wrongWords =
loadWords(JLanguageTool.getDataBroker().getFromRulesDirAsStream(getFileName()));
}
- @Override
public final String getId() {
return BRETON_TOPO;
}
- @Override
public String getDescription() {
return "anvioù-lec’h e brezhoneg";
}
- @Override
public String getShort() {
return "anvioù lec’h";
}
- @Override
public String getSuggestion() {
- return " a vez ul lec’h-anv gallek. E brezhoneg e vez graet ";
+ return " zo un anv lec’h gallek. Ha fellout a rae deoc’h skrivañ ";
}
/**
+ * @return the word used to separate multiple suggestions; used only before the last suggestion, the rest are comma-separated.
+ */
+ public String getSuggestionsSeparator() {
+ return " pe ";
+ }
+
+ /**
+ * Use case-sensitive matching.
+ */
+ public boolean isCaseSensitive() {
+ return true;
+ }
+
+ /**
* locale used on case-conversion
*/
- @Override
public Locale getLocale() {
return BR_LOCALE;
}
+
+ public String getEncoding() {
+ return FILE_ENCODING;
+ }
+
+ /**
+ * @return the word tokenizer used for tokenization on loading words.
+ */
+ protected Tokenizer getWordTokenizer() {
+ return Language.BRETON.getWordTokenizer();
+ }
+
+ /**
+ * @return the list of wrong words for which this rule can suggest corrections. The list cannot be modified.
+ */
+ public List<Map<String, String>> getWrongWords() {
+ return wrongWords;
+ }
+
+ /**
+ * Load the list of words. <br/>
+ * Same as {@link AbstractSimpleReplaceRule#loadWords} but allows multiple words.
+ * @param file the file to load.
+ * @return the list of maps containing the error-corrections pairs. <br/>The n-th map contains key strings of (n+1) words.
+ * @throws IOException when the file contains errors.
+ * @see #getWordTokenizer
+ */
+ private List<Map<String, String>> loadWords(final InputStream file)
+ throws IOException {
+ final List<Map<String, String>> list = new ArrayList<Map<String,
String>>();
+ InputStreamReader isr = null;
+ BufferedReader br = null;
+ try {
+ isr = new InputStreamReader(file, getEncoding());
+ br = new BufferedReader(isr);
+ String line;
+
+ while ((line = br.readLine()) != null) {
+ line = line.trim();
+ if (line.length() < 1 || line.charAt(0) == '#') { // ignore comments
+ continue;
+ }
+ final String[] parts = line.split("=");
+ if (parts.length != 2) {
+ throw new IOException("Format error in file "
+ +
JLanguageTool.getDataBroker().getFromRulesDirAsUrl(getFileName())
+ + ", line: " + line);
+ }
+ final String[] wrongForms = parts[0].split("\\|"); // multiple
incorrect forms
+ for (String wrongForm : wrongForms) {
+ int wordCount = 0;
+ final List<String> tokens = getWordTokenizer().tokenize(wrongForm);
+ for (String token : tokens) {
+ if (!StringTools.isWhitespace(token)) {
+ wordCount++;
+ }
+ }
+ // grow if necessary
+ for (int i = list.size(); i < wordCount; i++) {
+ list.add(new HashMap<String, String>());
+ }
+ list.get(wordCount - 1).put(wrongForm, parts[1]);
+ }
+ }
+
+ } finally {
+ if (br != null) {
+ br.close();
+ }
+ if (isr != null) {
+ isr.close();
+ }
+ }
+ // seal the result (prevent modification from outside this class)
+ final List<Map<String,String>> result = new ArrayList<Map<String,
String>>();
+ for (Map<String, String> map : list) {
+ result.add(Collections.unmodifiableMap(map));
+ }
+ return Collections.unmodifiableList(result);
+ }
+
+ private void addToQueue(AnalyzedTokenReadings token,
+ Queue<AnalyzedTokenReadings> prevTokens) {
+ final boolean inserted = prevTokens.offer(token);
+ if (!inserted) {
+ prevTokens.poll();
+ prevTokens.offer(token);
+ }
+ }
+
+ @Override
+ public RuleMatch[] match(final AnalyzedSentence text) {
+ final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
+ final AnalyzedTokenReadings[] tokens = text
+ .getTokensWithoutWhitespace();
+
+ final Queue<AnalyzedTokenReadings> prevTokens = new
ArrayBlockingQueue<AnalyzedTokenReadings>(wrongWords.size());
+
+ for (int i = 1; i < tokens.length; i++) {
+ addToQueue(tokens[i], prevTokens);
+ final StringBuilder sb = new StringBuilder();
+ final ArrayList<String> variants = new ArrayList<String>();
+ final List<AnalyzedTokenReadings> prevTokensList =
Arrays.asList(prevTokens.toArray(new AnalyzedTokenReadings[] {}));
+ for (int j = prevTokensList.size() - 1; j >= 0; j--) {
+ if (j != prevTokensList.size() - 1 && prevTokensList.get(j +
1).isWhitespaceBefore())
+ sb.insert(0, " ");
+ sb.insert(0, prevTokensList.get(j).getToken());
+ variants.add(0, sb.toString());
+ }
+ final int len = variants.size(); // prevTokensList and variants have now
the same length
+ for (int j = 0; j < len; j++) { // longest words first
+ final String crt = variants.get(j);
+ final int crtWordCount = len - j;
+ final String crtMatch = isCaseSensitive() ?
wrongWords.get(crtWordCount - 1).get(crt) : wrongWords.get(crtWordCount-
1).get(crt.toLowerCase(getLocale()));
+ if (crtMatch != null) {
+ final List<String> replacements =
Arrays.asList(crtMatch.split("\\|"));
+ String msg = crt + getSuggestion();
+ for (int k = 0; k < replacements.size(); k++) {
+ if (k > 0) {
+ msg = msg + (k == replacements.size() - 1 ?
getSuggestionsSeparator(): ", ");
+ }
+ msg += "<suggestion>" + replacements.get(k) + "</suggestion>?";
+ }
+ final int startPos = prevTokensList.get(len -
crtWordCount).getStartPos();
+ final int endPos = prevTokensList.get(len - 1).getStartPos() +
prevTokensList.get(len - 1).getToken().length();
+ final RuleMatch potentialRuleMatch = new RuleMatch(this, startPos,
endPos, msg, getShort());
+
+ if (!isCaseSensitive() && StringTools.startsWithUppercase(crt)) {
+ for (int k = 0; k < replacements.size(); k++) {
+ replacements.set(k,
StringTools.uppercaseFirstChar(replacements.get(k)));
+ }
+ }
+ potentialRuleMatch.setSuggestedReplacements(replacements);
+ ruleMatches.add(potentialRuleMatch);
+ break;
+ }
+ }
+ }
+ return toRuleMatchArray(ruleMatches);
+ }
+
+ @Override
+ public void reset() {
+ }
+
}
Modified: trunk/JLanguageTool/src/rules/br/topo.txt
===================================================================
--- trunk/JLanguageTool/src/rules/br/topo.txt 2012-05-12 15:03:32 UTC (rev 6879)
+++ trunk/JLanguageTool/src/rules/br/topo.txt 2012-05-12 18:36:11 UTC (rev 6880)
@@ -16,8 +16,7 @@
Alger=Aljer
Algérie=Aljeria
Allaire=Alaer
-Allemagne=Alamagn
-Allemagne=bro-Alamagn
+Allemagne=Alamagn|bro-Alamagn
Allineuc=Alineg
Alsace=Elzas
Amanlis=Amanliz
@@ -44,8 +43,7 @@
Arménie=Armenia
Arradon=Aradon
Arthon-en-Retz=Arzhon-Raez
-Arz=Aer
-Arz=An Arzh
+Arz=Aer|An Arzh
Arzal=Arzhal
Arzano=An Arzhanaou
Arzon=Arzhon-Rewiz
@@ -56,15 +54,12 @@
Aucaleuc=Oskaleg
Audierne=Gwaien
Augan=Algam
-Aulne=Aon
-Aulne=Stêr-Aon
+Aulne=Aon|Stêr-Aon
Auray=An Alre
Australie=Aostralia
Autriche-Hongrie=Aostria-Hungaria
Autriche=Aostria
Availles-sur-Seiche=Avallod-ar-Sec’h
-Aven=Stêr Pont-Aven
-Aven=Stêr-Aven
Avessac=Avezeg
Azerbaïdjan=Azerbaidjan
Baguer-Morvan=Bagar-Morvan
@@ -80,9 +75,7 @@
Barbechat=Bargazh
Barcelone=Barcelona
Bas-Leon=Goueled-Leon
-Basse-Bretagne=Breizh-Izel
-Basse-Bretagne=Breizh-Izel
-Basse-Bretagne=Goueled-Breizh
+Basse-Bretagne=Breizh-Izel|Goueled-Breizh
Basse-Cornouaille=Kerne-Izel
Basse-Goulaine=Goueled-Goulen
Basse-Loire=Goueled-Liger
@@ -164,10 +157,7 @@
Brasparts=Brasparzh
Brech=Brec’h
Brennilis=Brenniliz
-Bretagne=Breizh
-Bretagne=Breizh
-Bretagne=Breizh-Vihan
-Bretagne=bzh
+Bretagne=Breizh|Breizh-Vihan
Breteil=Brezhiel
Brie=Brev
Briec=Brieg
@@ -341,11 +331,7 @@
Cork=Korkig
Corlay=Korle
Cornillé=Kornilieg
-Cornouaille=Kerne
-Cornouaille=Kerne
-Cornouaille=Kernev
-Cornouaille=bro-Gerne
-Cornouaille=bro-Gernev
+Cornouaille=Kerne|Kernev|bro-Gerne|bro-Gernev
Cornouailles=Kernev-Veur
Corps-Nuds=Kornuz
Corse=Korsika
@@ -380,7 +366,6 @@
Côtes-du-Nord=Aodoù-an-Hanternoz
Danemark=Danmark
Daoulas=Daoulaz
-Daoulas=Daoulaz
Derval=Derwal
Dinard=Dinarzh
Dingé=Dingad
@@ -395,8 +380,7 @@
Domploup=Domloup
Donges=Donez
Doré=Dore
-Dourcane=Dourkamm
-Dourcane=Richer Dourkamm
+Dourcane=Dourkamm|Richer Dourkamm
Dourdain=Dourdan
Drefféac=Devrieg
Drennec=An Dreneg
@@ -418,8 +402,7 @@
Ergué-Gabéric=An Erge-Vras
Erquy=Erge-ar-Mor
Eréac=Erieg
-Espagne=Spagn
-Espagne=bro-Spagn
+Espagne=Spagn|bro-Spagn
Esquibien=An Eskevien
Essé=Ezieg
Estonie=Estonia
@@ -429,8 +412,7 @@
Evran=Evrann
Evriguet=Evriged
Faou=Ar Faou
-Faouët=Ar Faoued
-Faouët=Ar Faoued-Pontrev
+Faouët=Ar Faoued|Ar Faoued-Pontrev
Fay-de-Bretagne=Faouell
Feins=Finioù
Fercé=Ferreg
@@ -449,8 +431,7 @@
Fouesnant=Fouenant
Fougerêts=Felgerieg-al-Lann
Fougères=Felger
-France=Frañs
-France=bro-C’hall
+France=Frañs|bro-C’hall
Fresnais=An Onneg
Fresnaye-en-Retz=Onnod-Raez
Fresne-sur-Loire=Runonn
@@ -502,8 +483,7 @@
Groix=Groe
Grâce-Uzel=Gras-Uzel
Grâces=Gras-Gwengamp
-Grèce=Gres
-Grèce=bro-C’hres
+Grèce=Gres|bro-C’hres
Grée-Saint-Laurent=Ar C’hrav-Sant-Laorañs
Guadeloupe=Gwadeloup
Gueltas=Gweltaz
@@ -515,7 +495,6 @@
Guerlesquin=Gwerliskin
Guern=Gwern
Guernesey=Gwernenez
-Guernesey=Gwernenez
Guerno=Ar Gwernoù
Guichen=Gwizien
Guiclan=Gwiglann
@@ -530,8 +509,6 @@
Guimaëc=Gwimaeg
Guimiliau=Gwimilio
Guingamp=Gwengamp
-Guingamp=Gwengamp
-Guingamp=Gwengamp
Guinée=Ginea
Guipavas=Gwipavaz
Guipel=Gwipedel
@@ -558,9 +535,7 @@
Haut-Corlay=Ar Gozh-Korle
Haut-Karabagh=Nagorno-Karabac’h
Haut-Leon=Gorre-Leon
-Haute-Bretagne=Breizh-Uhel
-Haute-Bretagne=Breizh-Uhel
-Haute-Bretagne=Gorre-Breizh
+Haute-Bretagne=Breizh-Uhel|Gorre-Breizh
Haute-Cornouaille=Kerne-Uhel
Haute-Goulaine=Gorre-Goulen
Haye-Fouassière=An Hae-Foazer
@@ -579,8 +554,7 @@
Hongrie=Hungaria
Houat=Houad
Huelgoat=An Uhelgoad
-Hyères=Iêr
-Hyères=Stêr-Iêr
+Hyères=Iêr|Stêr-Iêr
Hâvre=Havr
Hédé=Hazhoù
Hémonstoir=Henvoustoer
@@ -594,7 +568,6 @@
Iffs=An Ivineg
Ilet=Iled
Ille=Il
-Ille=Il
Illet=Ilan
Illifaut=Ilifav
Inam=Stêr Laer
@@ -604,8 +577,7 @@
Indre=Antr
Inguiniel=An Ignel
Inzinzac-Lochrist=Zinzag-Lokrist
-Irlande=Iwerzhon
-Irlande=bro-Iwerzhon
+Irlande=Iwerzhon|bro-Iwerzhon
Irodouër=Irodouer
Iroise=Hirwazh
Irvillac=Irvilhag
@@ -718,12 +690,9 @@
Laurenan=Lanreunan
Lauzach=Laozag
Lavau-sur-Loire=Gwal-Liger
-Laïta=Laeta
-Laïta=Stêr Gemperle
+Laïta=Laeta|Stêr Gemperle
Leff=Leñv
Legé=Levieg
-Leon=Bro-Leon
-Leon=bro-Leon
Lescouët-Gouarec=Leskoed-Gwareg
Leslay=Al Leslae
Lettonie=Latvia
@@ -770,8 +739,6 @@
Lohéac=Lohieg
Loire-Inférieure=Liger-Izelañ
Loire=Liger
-Loire=Liger
-Loire=Liger
Londres=Londrez
Longaulnay=Hirwerneg
Loperhet=Loperc’hed
@@ -830,8 +797,7 @@
Massérac=Merzhereg
Maumusson=Malvegon
Maure-de-Bretagne=Anast
-Maurice=Maoris
-Maurice=Moris
+Maurice=Maoris|Moris
Mauritanie=Maouritania
Mauron=Maoron
Mauves-sur-Loire=Malvid
@@ -904,8 +870,7 @@
Morlaix=Montroulez
Moréac=Mourieg
Motreff=Motrev
-Motte=Ar Voudenn
-Motte=moudenn
+Motte=Ar Voudenn|moudenn
Mouais=Lanvoe
Mouazé=Moezeg
Moulins=Melined
@@ -935,7 +900,6 @@
Naizin=Neizin
Namibie=Namibia
Nantes=Naoned
-Nantes=Naoned
Nançon=Nanton
Neulliac=Neulieg
Newport=Casnewydd
@@ -963,8 +927,7 @@
Népal=Nepal
Névez=Nevez
Occitanie=Okitania
-Odet=Oded
-Odet=Stêr Oded
+Odet=Oded|Stêr Oded
Orgères=An Heizeg
Orvault=Orvez
Ossé=Oc’heg
@@ -1072,8 +1035,7 @@
Ploudaniel=Plouzeniel
Ploudiry=Plouziri
Plouescat=Ploueskad
-Plouezec=Ploueg
-Plouezec=Ploueg-ar-Mor
+Plouezec=Ploueg|Ploueg-ar-Mor
Plougar=Gwikar
Plougasnou=Plouganoù
Plougastel-Daoulas=Plougastell-Daoulaz
@@ -1090,8 +1052,7 @@
Plouguernével=Plougernevel
Plouguiel=Priel
Plouguin=Plougin
-Plouhinec=Pleheneg
-Plouhinec=Ploeneg
+Plouhinec=Pleheneg|Ploeneg
Plouigneau=Plouigno
Plouisy=Plouizi
Ploumagoar=Plouvagor
@@ -1160,7 +1121,6 @@
Plévin=Plevin
Pocé-les-Bois=Pozieg
Poher=Poc’hêr
-Poher=Poc’hêr
Poilley=Polieg
Poligné=Polinieg
Pologne=Polonia
@@ -1180,7 +1140,6 @@
Porcaro=Porzh-Karozh
Pordic=Porzhig
Porhoët=Porc’hoed
-Porhoët=Porc’hoed
Pornic=Pornizh
Pornichet=Pornizhan
Port-Launay=Meilh-ar-Wern
@@ -1219,7 +1178,6 @@
Quilly=Killig
Quily=Killi
Quimper=Kemper
-Quimper=Kemper
Quimperlé=Kemperle
Quintenic=Kistenid
Quintin=Kintin
@@ -1241,8 +1199,6 @@
Remungol=Remengol
Renac=Ranneg
Rennes=Roazhon
-Rennes=Roazhon
-Rennes=Roazhon
Retiers=Rester
Rezé=Reudied
Rheu=Reuz
@@ -1297,10 +1253,8 @@
Saint-Alban=Sant-Alvan
Saint-Allouestre=Sant-Aleustr
Saint-André-Treize-Voies=Sant-Andrev-Trizek-Hent
-Saint-André-des-Eaux=Sant-Andrev-an-Dour
-Saint-André-des-Eaux=Sant-Andrev-an-Doureier
-Saint-Armel=Sant-Armael
-Saint-Armel=Sant-Armael-ar-Gilli
+Saint-André-des-Eaux=Sant-Andrev-an-Dour|Sant-Andrev-an-Doureier
+Saint-Armel=Sant-Armael|Sant-Armael-ar-Gilli
Saint-Aubin-des-Châteaux=Sant-Albin-ar-C’hestell
Saint-Aubin-des-Landes=Sant-Albin-al-Lann
Saint-Aubin-du-Cormier=Sant-Albin-an-Hiliber
@@ -1317,7 +1271,6 @@
Saint-Brieuc-de-Mauron=Sant-Brieg-Maoron
Saint-Brieuc-des-Iffs=Sant-Brieg-an-Ivineg
Saint-Brieuc=Sant-Brieg
-Saint-Brieuc=Sant-Brieg
Saint-Broladre=Sant-Brewalaer
Saint-Brévin-les-Pins=Sant-Brewenn
Saint-Caradec-Tréglomel=Sant-Karadeg-Tregonvael
@@ -1430,7 +1383,6 @@
Saint-Malo-de-Phily=Sant-Maloù-Fili
Saint-Malo-des-Troix-Fontaines=Sant-Maloù-an-Teir-Feunteun
Saint-Malo=Sant-Maloù
-Saint-Malo=Sant-Maloù
Saint-Malon-sur-Mel=Sant-Malon
Saint-Marc-le-Blanc=Sant-Mezar-Elvinieg
Saint-Marc-sur-Couesnon=Sant-Marzh-ar-C’houenon
@@ -1442,8 +1394,7 @@
Saint-Mars-la-Jaille=Sant-Marzh-an-Olivenn
Saint-Martin-des-Champs=Sant-Martin-war-ar-Maez
Saint-Martin-des-Prés=Sant-Varzhin-Korle
-Saint-Martin=Sant-Martin
-Saint-Martin=Sant-Varzhin-an-Oud
+Saint-Martin=Sant-Martin|Sant-Varzhin-an-Oud
Saint-Maudan=Sant-Maodan
Saint-Maudez=Sant-Maodez
Saint-Maugan=Sant-Malgant
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Live Security Virtual Conference
Exclusive live event will cover all the ways today's security and
threat landscape has changed and how IT managers can respond. Discussions
will include endpoint security, mobile security and the latest in malware
threats. http://www.accelacomm.com/jaw/sfrnl04242012/114/50122263/
_______________________________________________
Languagetool-cvs mailing list
Languagetool-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/languagetool-cvs