Revision: 7635
http://languagetool.svn.sourceforge.net/languagetool/?rev=7635&view=rev
Author: milek_pl
Date: 2012-07-02 07:57:57 +0000 (Mon, 02 Jul 2012)
Log Message:
-----------
[br] add Breton MorfologikSpeller dictionary with internal tokenizer
(separatorLength() seems to be a hack...)
Modified Paths:
--------------
trunk/JLanguageTool/src/java/org/languagetool/language/Breton.java
trunk/JLanguageTool/src/java/org/languagetool/rules/spelling/morfologik/MorfologikSpellerRule.java
trunk/JLanguageTool/src/test/org/languagetool/rules/HunspellRuleTest.java
Added Paths:
-----------
trunk/JLanguageTool/src/java/org/languagetool/rules/br/MorfologikBretonSpellerRule.java
trunk/JLanguageTool/src/resource/br/hunspell/br_FR.dict
trunk/JLanguageTool/src/resource/br/hunspell/br_FR.info
trunk/JLanguageTool/src/test/org/languagetool/rules/br/
trunk/JLanguageTool/src/test/org/languagetool/rules/br/MorfologikBretonSpellerRuleTest.java
Modified: trunk/JLanguageTool/src/java/org/languagetool/language/Breton.java
===================================================================
--- trunk/JLanguageTool/src/java/org/languagetool/language/Breton.java
2012-07-01 21:41:25 UTC (rev 7634)
+++ trunk/JLanguageTool/src/java/org/languagetool/language/Breton.java
2012-07-02 07:57:57 UTC (rev 7635)
@@ -29,7 +29,7 @@
import org.languagetool.rules.UppercaseSentenceStartRule;
import org.languagetool.rules.WhitespaceRule;
import org.languagetool.rules.br.TopoReplaceRule;
-import org.languagetool.rules.spelling.hunspell.HunspellNoSuggestionRule;
+import org.languagetool.rules.br.MorfologikBretonSpellerRule;
import org.languagetool.tagging.Tagger;
import org.languagetool.tagging.br.BretonTagger;
import org.languagetool.tagging.disambiguation.Disambiguator;
@@ -103,7 +103,7 @@
return Arrays.asList(
CommaWhitespaceRule.class,
DoublePunctuationRule.class,
- HunspellNoSuggestionRule.class,
+ MorfologikBretonSpellerRule.class,
UppercaseSentenceStartRule.class,
WhitespaceRule.class,
TopoReplaceRule.class
Added:
trunk/JLanguageTool/src/java/org/languagetool/rules/br/MorfologikBretonSpellerRule.java
===================================================================
---
trunk/JLanguageTool/src/java/org/languagetool/rules/br/MorfologikBretonSpellerRule.java
(rev 0)
+++
trunk/JLanguageTool/src/java/org/languagetool/rules/br/MorfologikBretonSpellerRule.java
2012-07-02 07:57:57 UTC (rev 7635)
@@ -0,0 +1,56 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2012 Marcin Miłkowski (http://www.languagetool.org)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+
+package org.languagetool.rules.br;
+
+import java.util.ResourceBundle;
+import java.util.regex.Pattern;
+
+import org.languagetool.Language;
+import org.languagetool.rules.spelling.morfologik.MorfologikSpellerRule;
+
+public final class MorfologikBretonSpellerRule extends MorfologikSpellerRule {
+
+ private static final String RESOURCE_FILENAME = "/br/hunspell/br_FR.dict";
+
+ private static final Pattern BRETON_TOKENIZING_CHARS = Pattern.compile("-");
+
+ public MorfologikBretonSpellerRule(ResourceBundle messages,
+ Language language) {
+ super(messages, language);
+ }
+
+ @Override
+ public String getFileName() {
+ return RESOURCE_FILENAME;
+ }
+
+ public String getId() {
+ return "MORFOLOGIK_RULE_BR_FR";
+ }
+
+ public Pattern tokenizingPattern() {
+ return BRETON_TOKENIZING_CHARS;
+ }
+
+ public int separatorLength() {
+ return 1;
+ }
+
+}
Property changes on:
trunk/JLanguageTool/src/java/org/languagetool/rules/br/MorfologikBretonSpellerRule.java
___________________________________________________________________
Added: svn:mime-type
+ text/plain
Modified:
trunk/JLanguageTool/src/java/org/languagetool/rules/spelling/morfologik/MorfologikSpellerRule.java
===================================================================
---
trunk/JLanguageTool/src/java/org/languagetool/rules/spelling/morfologik/MorfologikSpellerRule.java
2012-07-01 21:41:25 UTC (rev 7634)
+++
trunk/JLanguageTool/src/java/org/languagetool/rules/spelling/morfologik/MorfologikSpellerRule.java
2012-07-02 07:57:57 UTC (rev 7635)
@@ -21,10 +21,12 @@
import java.io.IOException;
import java.net.URL;
+import java.nio.charset.CharacterCodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.ResourceBundle;
+import java.util.regex.Pattern;
import morfologik.speller.Speller;
import morfologik.stemming.Dictionary;
@@ -46,6 +48,7 @@
private final static String LANGUAGETOOL = "LanguageTool";
+
/**
* Get the filename, e.g., <tt>/resource/pl/spelling.dict</tt>.
*/
@@ -84,34 +87,51 @@
}
for (AnalyzedTokenReadings token : tokens) {
final String word = token.getToken();
- boolean isAlphabetic = true;
- if (word.length() == 1) { // dictionaries usually do not contain punctuation
- isAlphabetic = StringTools.isAlphabetic(word.charAt(0));
- }
- if (word.length() > 0 && isAlphabetic && !token.isImmunized()
- && !containsDigit(word)
- && !LANGUAGETOOL.equals(word)
- && !speller.isInDictionary(word)
- &&
!speller.isInDictionary(word.toLowerCase(conversionLocale))) {
- final List<String> suggestions = new ArrayList<String>();
- suggestions.addAll(speller.findReplacements(word));
- if (!word.toLowerCase(conversionLocale).equals(word)) {
-
suggestions.addAll(speller.findReplacements(word.toLowerCase(conversionLocale)));
+ if (!token.isImmunized()) {
+ if (tokenizingPattern() == null) {
+ ruleMatches.addAll(getRuleMatch(word, token.getStartPos()));
+ } else {
+ int i = 0;
+ for (final String internalSplit : tokenizingPattern().split(word)) {
+ ruleMatches.addAll(getRuleMatch(internalSplit, token.getStartPos() + i));
+ i += internalSplit.length() + separatorLength();
}
- suggestions.addAll(speller.replaceRunOnWords(word));
- final RuleMatch ruleMatch = new RuleMatch(this,
- token.getStartPos(), token.getStartPos() +
word.length(),
- messages.getString("spelling"),
- messages.getString("desc_spelling_short"));
- if (!suggestions.isEmpty()) {
- ruleMatch.setSuggestedReplacements(suggestions);
- }
- ruleMatches.add(ruleMatch);
}
+ }
}
return toRuleMatchArray(ruleMatches);
}
+ private List<RuleMatch> getRuleMatch(final String word, final int startPos) throws CharacterCodingException {
+ boolean isAlphabetic = true;
+ final List<RuleMatch> ruleM = new ArrayList<RuleMatch>();
+ if (word.length() == 1) { // dictionaries usually do not contain punctuation
+ isAlphabetic = StringTools.isAlphabetic(word.charAt(0));
+ }
+ if (word.length() > 0 && isAlphabetic
+ && !containsDigit(word)
+ && !LANGUAGETOOL.equals(word)
+ && !speller.isInDictionary(word)
+ && !speller.isInDictionary(word.toLowerCase(conversionLocale))) {
+ final List<String> suggestions = new ArrayList<String>();
+ suggestions.addAll(speller.findReplacements(word));
+ if (!word.toLowerCase(conversionLocale).equals(word)) {
+ suggestions.addAll(speller.findReplacements(word.toLowerCase(conversionLocale)));
+ }
+ suggestions.addAll(speller.replaceRunOnWords(word));
+
+ final RuleMatch ruleMatch = new RuleMatch(this,
+ startPos, startPos + word.length(),
+ messages.getString("spelling"),
+ messages.getString("desc_spelling_short"));
+ if (!suggestions.isEmpty()) {
+ ruleMatch.setSuggestedReplacements(suggestions);
+ }
+ ruleM.add(ruleMatch);
+ }
+ return ruleM;
+ }
+
private final boolean containsDigit(final String s) {
for (int k = 0; k < s.length(); k++) {
if (Character.isDigit(s.charAt(k))) {
@@ -120,5 +140,13 @@
}
return false;
}
+
+ public Pattern tokenizingPattern() {
+ return null;
+ }
+
+ public int separatorLength() {
+ return 0;
+ }
}
Added: trunk/JLanguageTool/src/resource/br/hunspell/br_FR.dict
===================================================================
(Binary files differ)
Property changes on: trunk/JLanguageTool/src/resource/br/hunspell/br_FR.dict
___________________________________________________________________
Added: svn:mime-type
+ application/octet-stream
Added: trunk/JLanguageTool/src/resource/br/hunspell/br_FR.info
===================================================================
--- trunk/JLanguageTool/src/resource/br/hunspell/br_FR.info
(rev 0)
+++ trunk/JLanguageTool/src/resource/br/hunspell/br_FR.info 2012-07-02
07:57:57 UTC (rev 7635)
@@ -0,0 +1,9 @@
+#
+# Dictionary properties.
+#
+
+fsa.dict.separator=+
+fsa.dict.encoding=utf-8
+
+fsa.dict.uses-prefixes=false
+fsa.dict.uses-infixes=false
Modified:
trunk/JLanguageTool/src/test/org/languagetool/rules/HunspellRuleTest.java
===================================================================
--- trunk/JLanguageTool/src/test/org/languagetool/rules/HunspellRuleTest.java
2012-07-01 21:41:25 UTC (rev 7634)
+++ trunk/JLanguageTool/src/test/org/languagetool/rules/HunspellRuleTest.java
2012-07-02 07:57:57 UTC (rev 7635)
@@ -101,7 +101,6 @@
final HunspellRule rule = new
HunspellRule(TestTools.getMessages("Breton"), Language.BRETON);
final JLanguageTool langTool = new JLanguageTool(Language.BRETON);
- assertEquals(0, rule.match(langTool.getAnalyzedSentence("Penaos emañ kont
ganit?")).length);
assertEquals(0, rule.match(langTool.getAnalyzedSentence("C'hwerc'h merc'h
gwerc'h war c'hwerc'h marc'h kalloc'h")).length);
assertEquals(0, rule.match(langTool.getAnalyzedSentence("C’hwerc’h merc’h
gwerc‘h war c‘hwerc‘h marc'h kalloc‘h")).length);
assertEquals(0,
rule.match(langTool.getAnalyzedSentence("Evel-just")).length);
Added:
trunk/JLanguageTool/src/test/org/languagetool/rules/br/MorfologikBretonSpellerRuleTest.java
===================================================================
---
trunk/JLanguageTool/src/test/org/languagetool/rules/br/MorfologikBretonSpellerRuleTest.java
(rev 0)
+++
trunk/JLanguageTool/src/test/org/languagetool/rules/br/MorfologikBretonSpellerRuleTest.java
2012-07-02 07:57:57 UTC (rev 7635)
@@ -0,0 +1,55 @@
+package org.languagetool.rules.br;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+
+import org.junit.Test;
+import org.languagetool.JLanguageTool;
+import org.languagetool.Language;
+import org.languagetool.TestTools;
+import org.languagetool.rules.RuleMatch;
+
+public class MorfologikBretonSpellerRuleTest {
+
+ @Test
+ public void testMorfologikSpeller() throws IOException {
+ MorfologikBretonSpellerRule rule =
+ new MorfologikBretonSpellerRule
(TestTools.getMessages("Breton"), Language.BRETON);
+
+ RuleMatch[] matches;
+ JLanguageTool langTool = new JLanguageTool(Language.BRETON);
+
+
+ // correct sentences:
+ assertEquals(0, rule.match(langTool.getAnalyzedSentence("Penaos emañ
kont ganit?")).length);
+
+ assertEquals(0, rule.match(langTool.getAnalyzedSentence("C'hwerc'h
merc'h gwerc'h war c'hwerc'h marc'h kalloc'h")).length);
+ assertEquals(0, rule.match(langTool.getAnalyzedSentence("C’hwerc’h
merc’h gwerc‘h war c‘hwerc‘h marc'h kalloc‘h")).length);
+
+ //does not work: words with hyphens are not in the dictionary...
+ assertEquals(0,
rule.match(langTool.getAnalyzedSentence("Evel-just")).length);
+ assertEquals(0, rule.match(langTool.getAnalyzedSentence("Barrek-tre eo
LanguageTool")).length);
+
+ //test for "LanguageTool":
+ assertEquals(0,
rule.match(langTool.getAnalyzedSentence("LanguageTool!")).length);
+ assertEquals(0, rule.match(langTool.getAnalyzedSentence(",")).length);
+ assertEquals(0,
rule.match(langTool.getAnalyzedSentence("123454")).length);
+
+ //incorrect sentences:
+
+ assertEquals(1,
rule.match(langTool.getAnalyzedSentence("Evel-juste")).length);
+
+ matches = rule.match(langTool.getAnalyzedSentence("Evel-juste"));
+
+ // check match positions:
+ assertEquals(1, matches.length);
+ assertEquals(5, matches[0].getFromPos());
+ assertEquals(10, matches[0].getToPos());
+
+ assertEquals(1,
rule.match(langTool.getAnalyzedSentence("aõh")).length);
+ assertEquals(0, rule.match(langTool.getAnalyzedSentence("a")).length);
+
+ }
+
+}
Property changes on:
trunk/JLanguageTool/src/test/org/languagetool/rules/br/MorfologikBretonSpellerRuleTest.java
___________________________________________________________________
Added: svn:mime-type
+ text/plain
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Live Security Virtual Conference
Exclusive live event will cover all the ways today's security and
threat landscape has changed and how IT managers can respond. Discussions
will include endpoint security, mobile security and the latest in malware
threats. http://www.accelacomm.com/jaw/sfrnl04242012/114/50122263/
_______________________________________________
Languagetool-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/languagetool-cvs