Revision: 6880
          
http://languagetool.svn.sourceforge.net/languagetool/?rev=6880&view=rev
Author:   dominikoeo
Date:     2012-05-12 18:36:11 +0000 (Sat, 12 May 2012)
Log Message:
-----------
[br] changed the Java rule BR_TOPO so that it can
     report multiple suggestions. This is quite similar
     to what is done in the Romanian file
     rules/ro/SimpleReplaceRule.java.

Modified Paths:
--------------
    trunk/JLanguageTool/src/java/org/languagetool/rules/br/TopoReplaceRule.java
    trunk/JLanguageTool/src/rules/br/topo.txt

Modified: 
trunk/JLanguageTool/src/java/org/languagetool/rules/br/TopoReplaceRule.java
===================================================================
--- trunk/JLanguageTool/src/java/org/languagetool/rules/br/TopoReplaceRule.java 
2012-05-12 15:03:32 UTC (rev 6879)
+++ trunk/JLanguageTool/src/java/org/languagetool/rules/br/TopoReplaceRule.java 
2012-05-12 18:36:11 UTC (rev 6880)
@@ -18,62 +18,245 @@
  */
 package org.languagetool.rules.br;
 
+import java.io.BufferedReader;
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
 import java.util.Locale;
+import java.util.Map;
+import java.util.Queue;
 import java.util.ResourceBundle;
+import java.util.concurrent.ArrayBlockingQueue;
 
+import org.languagetool.AnalyzedSentence;
+import org.languagetool.AnalyzedTokenReadings;
+import org.languagetool.JLanguageTool;
+import org.languagetool.Language;
 import org.languagetool.rules.AbstractSimpleReplaceRule;
+import org.languagetool.rules.Category;
+import org.languagetool.rules.Rule;
+import org.languagetool.rules.RuleMatch;
+import org.languagetool.tokenizers.Tokenizer;
+import org.languagetool.tools.StringTools;
 
 /**
  * A rule that matches place names in French which should be
  * translated in Breton.
  *
  * Loads the list of words from <code>rules/br/topo.txt</code>.
+ * This class is mostly copied from ro/SimpleReplaceRule.java.
  *
  * @author Dominique Pellé
  */
-public class TopoReplaceRule extends AbstractSimpleReplaceRule {
+public class TopoReplaceRule extends Rule {
 
   public static final String BRETON_TOPO = "BR_TOPO";
 
   private static final String FILE_NAME = "/br/topo.txt";
+  private static final String FILE_ENCODING = "utf-8";
   // locale used on case-conversion
   private static final Locale BR_LOCALE = new Locale("br");
 
-  @Override
+  // list of maps containing error-corrections pairs.
+  // the n-th map contains key strings of (n+1) words 
+  private final List<Map<String, String>> wrongWords;
+
   public final String getFileName() {
     return FILE_NAME;
   }
 
   public TopoReplaceRule(final ResourceBundle messages) throws IOException {
     super(messages);
+    if (messages != null) {
+      super.setCategory(new Category(messages.getString("category_misc")));
+    }
+    wrongWords = 
loadWords(JLanguageTool.getDataBroker().getFromRulesDirAsStream(getFileName()));
   }
 
-  @Override
   public final String getId() {
     return BRETON_TOPO;
   }
 
-  @Override
   public String getDescription() {
     return "anvioù-lec’h e brezhoneg";
   }
 
-  @Override
   public String getShort() {
     return "anvioù lec’h";
   }
 
-  @Override
   public String getSuggestion() {
-    return " a vez ul lec’h-anv gallek. E brezhoneg e vez graet ";
+    return " zo un anv lec’h gallek. Ha fellout a rae deoc’h skrivañ ";
   }
 
   /**
+   * @return the word used to separate multiple suggestions; used only before the last suggestion, the rest are comma-separated.
+   */
+  public String getSuggestionsSeparator() {
+    return " pe ";
+  }
+
+  /**
+   * Use case-sensitive matching.
+   */
+  public boolean isCaseSensitive() {
+    return true;
+  }
+
+  /**
    * locale used on case-conversion
    */
-  @Override
   public Locale getLocale() {
     return BR_LOCALE;
   }
+
+  public String getEncoding() {
+    return FILE_ENCODING;
+  }
+
+  /**
+   * @return the word tokenizer used for tokenization on loading words.
+   */
+  protected Tokenizer getWordTokenizer() {
+    return Language.BRETON.getWordTokenizer();
+  }
+
+  /**
+   * @return the list of wrong words for which this rule can suggest a correction. The list cannot be modified.
+   */
+  public List<Map<String, String>> getWrongWords() {
+    return wrongWords;
+  }
+
+  /**
+   * Load the list of words. <br/>
+   * Same as {@link AbstractSimpleReplaceRule#loadWords} but allows multiple words.
+   * @param file the file to load.
+   * @return the list of maps containing the error-corrections pairs. <br/>The n-th map contains key strings of (n+1) words.
+   * @throws IOException when the file contains errors.
+   * @see #getWordTokenizer
+   */
+  private List<Map<String, String>> loadWords(final InputStream file)
+          throws IOException {
+    final List<Map<String, String>> list = new ArrayList<Map<String, 
String>>();
+    InputStreamReader isr = null;
+    BufferedReader br = null;
+    try {
+      isr = new InputStreamReader(file, getEncoding());
+      br = new BufferedReader(isr);
+      String line;
+
+      while ((line = br.readLine()) != null) {
+        line = line.trim();
+        if (line.length() < 1 || line.charAt(0) == '#') { // ignore comments
+          continue;
+        }
+        final String[] parts = line.split("=");
+        if (parts.length != 2) {
+          throw new IOException("Format error in file "
+                  + 
JLanguageTool.getDataBroker().getFromRulesDirAsUrl(getFileName())
+                  + ", line: " + line);
+        }
+        final String[] wrongForms = parts[0].split("\\|"); // multiple 
incorrect forms
+        for (String wrongForm : wrongForms) {
+          int wordCount = 0;
+          final List<String> tokens = getWordTokenizer().tokenize(wrongForm);
+          for (String token : tokens) {
+            if (!StringTools.isWhitespace(token)) {
+              wordCount++;
+            }
+          }
+          // grow if necessary
+          for (int i = list.size(); i < wordCount; i++) {
+            list.add(new HashMap<String, String>());
+          }
+          list.get(wordCount - 1).put(wrongForm, parts[1]);
+        }
+      }
+
+    } finally {
+      if (br != null) {
+        br.close();
+      }
+      if (isr != null) {
+        isr.close();
+      }
+    }
+    // seal the result (prevent modification from outside this class)
+    final List<Map<String,String>> result = new ArrayList<Map<String, 
String>>();
+    for (Map<String, String> map : list) {
+      result.add(Collections.unmodifiableMap(map));
+    }
+    return Collections.unmodifiableList(result);
+  }
+
+  private void addToQueue(AnalyzedTokenReadings token,
+                          Queue<AnalyzedTokenReadings> prevTokens) {
+    final boolean inserted = prevTokens.offer(token);
+    if (!inserted) {
+      prevTokens.poll();
+      prevTokens.offer(token);
+    }
+  }
+
+  @Override
+  public RuleMatch[] match(final AnalyzedSentence text) {
+    final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
+    final AnalyzedTokenReadings[] tokens = text
+            .getTokensWithoutWhitespace();
+
+    final Queue<AnalyzedTokenReadings> prevTokens = new 
ArrayBlockingQueue<AnalyzedTokenReadings>(wrongWords.size());
+
+    for (int i = 1; i < tokens.length; i++) {
+      addToQueue(tokens[i], prevTokens);
+      final StringBuilder sb = new StringBuilder();
+      final ArrayList<String> variants = new ArrayList<String>();
+      final List<AnalyzedTokenReadings> prevTokensList = 
Arrays.asList(prevTokens.toArray(new AnalyzedTokenReadings[] {}));
+      for (int j = prevTokensList.size() - 1; j >= 0; j--) {
+        if (j != prevTokensList.size() - 1 && prevTokensList.get(j + 
1).isWhitespaceBefore())
+          sb.insert(0, " ");
+        sb.insert(0, prevTokensList.get(j).getToken());
+        variants.add(0, sb.toString());
+      }
+      final int len = variants.size(); // prevTokensList and variants have now 
the same length
+      for (int j = 0; j < len; j++) {  // longest words first
+        final String crt = variants.get(j);
+        final int crtWordCount = len - j;
+        final String crtMatch = isCaseSensitive() ? 
wrongWords.get(crtWordCount - 1).get(crt) : wrongWords.get(crtWordCount- 
1).get(crt.toLowerCase(getLocale()));
+        if (crtMatch != null) {
+          final List<String> replacements = 
Arrays.asList(crtMatch.split("\\|"));
+          String msg = crt + getSuggestion();
+          for (int k = 0; k < replacements.size(); k++) {
+            if (k > 0) {
+              msg = msg + (k == replacements.size() - 1 ? 
getSuggestionsSeparator(): ", ");
+            }
+            msg += "<suggestion>" + replacements.get(k) + "</suggestion>?";
+          }
+          final int startPos = prevTokensList.get(len - 
crtWordCount).getStartPos();
+          final int endPos = prevTokensList.get(len - 1).getStartPos() + 
prevTokensList.get(len - 1).getToken().length();
+          final RuleMatch potentialRuleMatch = new RuleMatch(this, startPos, 
endPos, msg, getShort());
+
+          if (!isCaseSensitive() && StringTools.startsWithUppercase(crt)) {
+            for (int k = 0; k < replacements.size(); k++) {
+              replacements.set(k, 
StringTools.uppercaseFirstChar(replacements.get(k)));
+            }
+          }
+          potentialRuleMatch.setSuggestedReplacements(replacements);
+          ruleMatches.add(potentialRuleMatch);
+          break;
+        }
+      }
+    }
+    return toRuleMatchArray(ruleMatches);
+  }
+
+  @Override
+  public void reset() {
+  }
+
 }

Modified: trunk/JLanguageTool/src/rules/br/topo.txt
===================================================================
--- trunk/JLanguageTool/src/rules/br/topo.txt   2012-05-12 15:03:32 UTC (rev 
6879)
+++ trunk/JLanguageTool/src/rules/br/topo.txt   2012-05-12 18:36:11 UTC (rev 
6880)
@@ -16,8 +16,7 @@
 Alger=Aljer
 Algérie=Aljeria
 Allaire=Alaer
-Allemagne=Alamagn
-Allemagne=bro-Alamagn
+Allemagne=Alamagn|bro-Alamagn
 Allineuc=Alineg
 Alsace=Elzas
 Amanlis=Amanliz
@@ -44,8 +43,7 @@
 Arménie=Armenia
 Arradon=Aradon
 Arthon-en-Retz=Arzhon-Raez
-Arz=Aer
-Arz=An Arzh
+Arz=Aer|An Arzh
 Arzal=Arzhal
 Arzano=An Arzhanaou
 Arzon=Arzhon-Rewiz
@@ -56,15 +54,12 @@
 Aucaleuc=Oskaleg
 Audierne=Gwaien
 Augan=Algam
-Aulne=Aon
-Aulne=Stêr-Aon
+Aulne=Aon|Stêr-Aon
 Auray=An Alre
 Australie=Aostralia
 Autriche-Hongrie=Aostria-Hungaria
 Autriche=Aostria
 Availles-sur-Seiche=Avallod-ar-Sec’h
-Aven=Stêr Pont-Aven
-Aven=Stêr-Aven
 Avessac=Avezeg
 Azerbaïdjan=Azerbaidjan
 Baguer-Morvan=Bagar-Morvan
@@ -80,9 +75,7 @@
 Barbechat=Bargazh
 Barcelone=Barcelona
 Bas-Leon=Goueled-Leon
-Basse-Bretagne=Breizh-Izel
-Basse-Bretagne=Breizh-Izel
-Basse-Bretagne=Goueled-Breizh
+Basse-Bretagne=Breizh-Izel|Goueled-Breizh
 Basse-Cornouaille=Kerne-Izel
 Basse-Goulaine=Goueled-Goulen
 Basse-Loire=Goueled-Liger
@@ -164,10 +157,7 @@
 Brasparts=Brasparzh
 Brech=Brec’h
 Brennilis=Brenniliz
-Bretagne=Breizh
-Bretagne=Breizh
-Bretagne=Breizh-Vihan
-Bretagne=bzh
+Bretagne=Breizh|Breizh-Vihan
 Breteil=Brezhiel
 Brie=Brev
 Briec=Brieg
@@ -341,11 +331,7 @@
 Cork=Korkig
 Corlay=Korle
 Cornillé=Kornilieg
-Cornouaille=Kerne
-Cornouaille=Kerne
-Cornouaille=Kernev
-Cornouaille=bro-Gerne
-Cornouaille=bro-Gernev
+Cornouaille=Kerne|Kernev|bro-Gerne|bro-Gernev
 Cornouailles=Kernev-Veur
 Corps-Nuds=Kornuz
 Corse=Korsika
@@ -380,7 +366,6 @@
 Côtes-du-Nord=Aodoù-an-Hanternoz
 Danemark=Danmark
 Daoulas=Daoulaz
-Daoulas=Daoulaz
 Derval=Derwal
 Dinard=Dinarzh
 Dingé=Dingad
@@ -395,8 +380,7 @@
 Domploup=Domloup
 Donges=Donez
 Doré=Dore
-Dourcane=Dourkamm
-Dourcane=Richer Dourkamm
+Dourcane=Dourkamm|Richer Dourkamm
 Dourdain=Dourdan
 Drefféac=Devrieg
 Drennec=An Dreneg
@@ -418,8 +402,7 @@
 Ergué-Gabéric=An Erge-Vras
 Erquy=Erge-ar-Mor
 Eréac=Erieg
-Espagne=Spagn
-Espagne=bro-Spagn
+Espagne=Spagn|bro-Spagn
 Esquibien=An Eskevien
 Essé=Ezieg
 Estonie=Estonia
@@ -429,8 +412,7 @@
 Evran=Evrann
 Evriguet=Evriged
 Faou=Ar Faou
-Faouët=Ar Faoued
-Faouët=Ar Faoued-Pontrev
+Faouët=Ar Faoued|Ar Faoued-Pontrev
 Fay-de-Bretagne=Faouell
 Feins=Finioù
 Fercé=Ferreg
@@ -449,8 +431,7 @@
 Fouesnant=Fouenant
 Fougerêts=Felgerieg-al-Lann
 Fougères=Felger
-France=Frañs
-France=bro-C’hall
+France=Frañs|bro-C’hall
 Fresnais=An Onneg
 Fresnaye-en-Retz=Onnod-Raez
 Fresne-sur-Loire=Runonn
@@ -502,8 +483,7 @@
 Groix=Groe
 Grâce-Uzel=Gras-Uzel
 Grâces=Gras-Gwengamp
-Grèce=Gres
-Grèce=bro-C’hres
+Grèce=Gres|bro-C’hres
 Grée-Saint-Laurent=Ar C’hrav-Sant-Laorañs
 Guadeloupe=Gwadeloup
 Gueltas=Gweltaz
@@ -515,7 +495,6 @@
 Guerlesquin=Gwerliskin
 Guern=Gwern
 Guernesey=Gwernenez
-Guernesey=Gwernenez
 Guerno=Ar Gwernoù
 Guichen=Gwizien
 Guiclan=Gwiglann
@@ -530,8 +509,6 @@
 Guimaëc=Gwimaeg
 Guimiliau=Gwimilio
 Guingamp=Gwengamp
-Guingamp=Gwengamp
-Guingamp=Gwengamp
 Guinée=Ginea
 Guipavas=Gwipavaz
 Guipel=Gwipedel
@@ -558,9 +535,7 @@
 Haut-Corlay=Ar Gozh-Korle
 Haut-Karabagh=Nagorno-Karabac’h
 Haut-Leon=Gorre-Leon
-Haute-Bretagne=Breizh-Uhel
-Haute-Bretagne=Breizh-Uhel
-Haute-Bretagne=Gorre-Breizh
+Haute-Bretagne=Breizh-Uhel|Gorre-Breizh
 Haute-Cornouaille=Kerne-Uhel
 Haute-Goulaine=Gorre-Goulen
 Haye-Fouassière=An Hae-Foazer
@@ -579,8 +554,7 @@
 Hongrie=Hungaria
 Houat=Houad
 Huelgoat=An Uhelgoad
-Hyères=Iêr
-Hyères=Stêr-Iêr
+Hyères=Iêr|Stêr-Iêr
 Hâvre=Havr
 Hédé=Hazhoù
 Hémonstoir=Henvoustoer
@@ -594,7 +568,6 @@
 Iffs=An Ivineg
 Ilet=Iled
 Ille=Il
-Ille=Il
 Illet=Ilan
 Illifaut=Ilifav
 Inam=Stêr Laer
@@ -604,8 +577,7 @@
 Indre=Antr
 Inguiniel=An Ignel
 Inzinzac-Lochrist=Zinzag-Lokrist
-Irlande=Iwerzhon
-Irlande=bro-Iwerzhon
+Irlande=Iwerzhon|bro-Iwerzhon
 Irodouër=Irodouer
 Iroise=Hirwazh
 Irvillac=Irvilhag
@@ -718,12 +690,9 @@
 Laurenan=Lanreunan
 Lauzach=Laozag
 Lavau-sur-Loire=Gwal-Liger
-Laïta=Laeta
-Laïta=Stêr Gemperle
+Laïta=Laeta|Stêr Gemperle
 Leff=Leñv
 Legé=Levieg
-Leon=Bro-Leon
-Leon=bro-Leon
 Lescouët-Gouarec=Leskoed-Gwareg
 Leslay=Al Leslae
 Lettonie=Latvia
@@ -770,8 +739,6 @@
 Lohéac=Lohieg
 Loire-Inférieure=Liger-Izelañ
 Loire=Liger
-Loire=Liger
-Loire=Liger
 Londres=Londrez
 Longaulnay=Hirwerneg
 Loperhet=Loperc’hed
@@ -830,8 +797,7 @@
 Massérac=Merzhereg
 Maumusson=Malvegon
 Maure-de-Bretagne=Anast
-Maurice=Maoris
-Maurice=Moris
+Maurice=Maoris|Moris
 Mauritanie=Maouritania
 Mauron=Maoron
 Mauves-sur-Loire=Malvid
@@ -904,8 +870,7 @@
 Morlaix=Montroulez
 Moréac=Mourieg
 Motreff=Motrev
-Motte=Ar Voudenn
-Motte=moudenn
+Motte=Ar Voudenn|moudenn
 Mouais=Lanvoe
 Mouazé=Moezeg
 Moulins=Melined
@@ -935,7 +900,6 @@
 Naizin=Neizin
 Namibie=Namibia
 Nantes=Naoned
-Nantes=Naoned
 Nançon=Nanton
 Neulliac=Neulieg
 Newport=Casnewydd
@@ -963,8 +927,7 @@
 Népal=Nepal
 Névez=Nevez
 Occitanie=Okitania
-Odet=Oded
-Odet=Stêr Oded
+Odet=Oded|Stêr Oded
 Orgères=An Heizeg
 Orvault=Orvez
 Ossé=Oc’heg
@@ -1072,8 +1035,7 @@
 Ploudaniel=Plouzeniel
 Ploudiry=Plouziri
 Plouescat=Ploueskad
-Plouezec=Ploueg
-Plouezec=Ploueg-ar-Mor
+Plouezec=Ploueg|Ploueg-ar-Mor
 Plougar=Gwikar
 Plougasnou=Plouganoù
 Plougastel-Daoulas=Plougastell-Daoulaz
@@ -1090,8 +1052,7 @@
 Plouguernével=Plougernevel
 Plouguiel=Priel
 Plouguin=Plougin
-Plouhinec=Pleheneg
-Plouhinec=Ploeneg
+Plouhinec=Pleheneg|Ploeneg
 Plouigneau=Plouigno
 Plouisy=Plouizi
 Ploumagoar=Plouvagor
@@ -1160,7 +1121,6 @@
 Plévin=Plevin
 Pocé-les-Bois=Pozieg
 Poher=Poc’hêr
-Poher=Poc’hêr
 Poilley=Polieg
 Poligné=Polinieg
 Pologne=Polonia
@@ -1180,7 +1140,6 @@
 Porcaro=Porzh-Karozh
 Pordic=Porzhig
 Porhoët=Porc’hoed
-Porhoët=Porc’hoed
 Pornic=Pornizh
 Pornichet=Pornizhan
 Port-Launay=Meilh-ar-Wern
@@ -1219,7 +1178,6 @@
 Quilly=Killig
 Quily=Killi
 Quimper=Kemper
-Quimper=Kemper
 Quimperlé=Kemperle
 Quintenic=Kistenid
 Quintin=Kintin
@@ -1241,8 +1199,6 @@
 Remungol=Remengol
 Renac=Ranneg
 Rennes=Roazhon
-Rennes=Roazhon
-Rennes=Roazhon
 Retiers=Rester
 Rezé=Reudied
 Rheu=Reuz
@@ -1297,10 +1253,8 @@
 Saint-Alban=Sant-Alvan
 Saint-Allouestre=Sant-Aleustr
 Saint-André-Treize-Voies=Sant-Andrev-Trizek-Hent
-Saint-André-des-Eaux=Sant-Andrev-an-Dour
-Saint-André-des-Eaux=Sant-Andrev-an-Doureier
-Saint-Armel=Sant-Armael
-Saint-Armel=Sant-Armael-ar-Gilli
+Saint-André-des-Eaux=Sant-Andrev-an-Dour|Sant-Andrev-an-Doureier
+Saint-Armel=Sant-Armael|Sant-Armael-ar-Gilli
 Saint-Aubin-des-Châteaux=Sant-Albin-ar-C’hestell
 Saint-Aubin-des-Landes=Sant-Albin-al-Lann
 Saint-Aubin-du-Cormier=Sant-Albin-an-Hiliber
@@ -1317,7 +1271,6 @@
 Saint-Brieuc-de-Mauron=Sant-Brieg-Maoron
 Saint-Brieuc-des-Iffs=Sant-Brieg-an-Ivineg
 Saint-Brieuc=Sant-Brieg
-Saint-Brieuc=Sant-Brieg
 Saint-Broladre=Sant-Brewalaer
 Saint-Brévin-les-Pins=Sant-Brewenn
 Saint-Caradec-Tréglomel=Sant-Karadeg-Tregonvael
@@ -1430,7 +1383,6 @@
 Saint-Malo-de-Phily=Sant-Maloù-Fili
 Saint-Malo-des-Troix-Fontaines=Sant-Maloù-an-Teir-Feunteun
 Saint-Malo=Sant-Maloù
-Saint-Malo=Sant-Maloù
 Saint-Malon-sur-Mel=Sant-Malon
 Saint-Marc-le-Blanc=Sant-Mezar-Elvinieg
 Saint-Marc-sur-Couesnon=Sant-Marzh-ar-C’houenon
@@ -1442,8 +1394,7 @@
 Saint-Mars-la-Jaille=Sant-Marzh-an-Olivenn
 Saint-Martin-des-Champs=Sant-Martin-war-ar-Maez
 Saint-Martin-des-Prés=Sant-Varzhin-Korle
-Saint-Martin=Sant-Martin
-Saint-Martin=Sant-Varzhin-an-Oud
+Saint-Martin=Sant-Martin|Sant-Varzhin-an-Oud
 Saint-Maudan=Sant-Maodan
 Saint-Maudez=Sant-Maodez
 Saint-Maugan=Sant-Malgant

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Live Security Virtual Conference
Exclusive live event will cover all the ways today's security and 
threat landscape has changed and how IT managers can respond. Discussions 
will include endpoint security, mobile security and the latest in malware 
threats. http://www.accelacomm.com/jaw/sfrnl04242012/114/50122263/
_______________________________________________
Languagetool-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/languagetool-cvs

Reply via email to