Revision: 9896
http://languagetool.svn.sourceforge.net/languagetool/?rev=9896&view=rev
Author: jaumeortola
Date: 2013-04-07 19:05:24 +0000 (Sun, 07 Apr 2013)
Log Message:
-----------
Consider mixed-case words (e.g. woRd, wORD, WoRD) as spelling errors, and don't
tag them. Is this okay for all languages?
Modified Paths:
--------------
trunk/languagetool/languagetool-core/src/main/java/org/languagetool/rules/spelling/morfologik/MorfologikSpeller.java
trunk/languagetool/languagetool-core/src/main/java/org/languagetool/rules/spelling/morfologik/MorfologikSpellerRule.java
trunk/languagetool/languagetool-core/src/main/java/org/languagetool/tagging/BaseTagger.java
trunk/languagetool/languagetool-language-modules/ca/src/test/java/org/languagetool/rules/ca/MorfologikCatalanSpellerRuleTest.java
Modified:
trunk/languagetool/languagetool-core/src/main/java/org/languagetool/rules/spelling/morfologik/MorfologikSpeller.java
===================================================================
---
trunk/languagetool/languagetool-core/src/main/java/org/languagetool/rules/spelling/morfologik/MorfologikSpeller.java
2013-04-07 18:39:18 UTC (rev 9895)
+++
trunk/languagetool/languagetool-core/src/main/java/org/languagetool/rules/spelling/morfologik/MorfologikSpeller.java
2013-04-07 19:05:24 UTC (rev 9896)
@@ -80,7 +80,8 @@
&& !containsDigit(word)
&& !LANGUAGETOOL.equals(word)
&& !speller.isInDictionary(word)
- && !speller.isInDictionary(word.toLowerCase(conversionLocale));
+ && !(!StringTools.isMixedCase(word)
+ && speller.isInDictionary(word.toLowerCase(conversionLocale)));
}
public List<String> getSuggestions(String word) {
Modified:
trunk/languagetool/languagetool-core/src/main/java/org/languagetool/rules/spelling/morfologik/MorfologikSpellerRule.java
===================================================================
---
trunk/languagetool/languagetool-core/src/main/java/org/languagetool/rules/spelling/morfologik/MorfologikSpellerRule.java
2013-04-07 18:39:18 UTC (rev 9895)
+++
trunk/languagetool/languagetool-core/src/main/java/org/languagetool/rules/spelling/morfologik/MorfologikSpellerRule.java
2013-04-07 19:05:24 UTC (rev 9896)
@@ -52,6 +52,7 @@
public MorfologikSpellerRule(ResourceBundle messages, Language language)
throws IOException {
super(messages, language);
super.setCategory(new Category(messages.getString("category_typo")));
+ this.conversionLocale = conversionLocale != null ? conversionLocale :
Locale.getDefault();
init();
}
@@ -124,8 +125,8 @@
+ word.length(), messages.getString("spelling"),
messages.getString("desc_spelling_short"));
List<String> suggestions = speller.getSuggestions(word);
- //If few suggestions are found, try to get more from the word without
diacritics
- final String wordWithoutDiacritics=removeAccents(word);
+ //If few suggestions are found, try to get more from the word without
diacritics and lowercase
+ final String
wordWithoutDiacritics=removeAccents(word).toLowerCase(conversionLocale);
if (suggestions.size() < 5 && !word.equals(wordWithoutDiacritics)) {
List<String> moreSuggestions =
speller.getSuggestions(wordWithoutDiacritics);
if (!speller.isMisspelled(wordWithoutDiacritics)) {
@@ -161,7 +162,7 @@
}
/*
- * Remove any diacritical mark from a String
+ * Remove all diacritical marks from a String
*/
private static String removeAccents(String text) {
return text == null ? null
Modified:
trunk/languagetool/languagetool-core/src/main/java/org/languagetool/tagging/BaseTagger.java
===================================================================
---
trunk/languagetool/languagetool-core/src/main/java/org/languagetool/tagging/BaseTagger.java
2013-04-07 18:39:18 UTC (rev 9895)
+++
trunk/languagetool/languagetool-core/src/main/java/org/languagetool/tagging/BaseTagger.java
2013-04-07 19:05:24 UTC (rev 9896)
@@ -74,29 +74,34 @@
taggerTokens = asAnalyzedTokenList(word, dictLookup.lookup(word));
lowerTaggerTokens = asAnalyzedTokenList(word,
dictLookup.lookup(lowerWord));
final boolean isLowercase = word.equals(lowerWord);
+ final boolean isMixedcase = StringTools.isMixedCase(word);
//normal case
addTokens(taggerTokens, l);
- if (!isLowercase) {
- //lowercase
+ //tag alluppercase or startuppercase word (but not mixedcase)
+ //with lowercase word tags
+ if (!isLowercase && !isMixedcase) {
addTokens(lowerTaggerTokens, l);
}
- //uppercase
- if (lowerTaggerTokens.isEmpty() && taggerTokens.isEmpty()) {
- if (tagLowercaseWithUppercase && isLowercase) {
- upperTaggerTokens = asAnalyzedTokenList(word,
- dictLookup.lookup(StringTools.uppercaseFirstChar(word)));
- if (!upperTaggerTokens.isEmpty()) {
- addTokens(upperTaggerTokens, l);
- } else {
- l.add(new AnalyzedToken(word, null, null));
+ //tag lowercase word with startuppercase word tags
+ if (tagLowercaseWithUppercase) {
+ if (lowerTaggerTokens.isEmpty() && taggerTokens.isEmpty()) {
+ if (isLowercase) {
+ upperTaggerTokens = asAnalyzedTokenList(word,
+ dictLookup.lookup(StringTools.uppercaseFirstChar(word)));
+ if (!upperTaggerTokens.isEmpty()) {
+ addTokens(upperTaggerTokens, l);
+ }
}
- } else {
- l.add(new AnalyzedToken(word, null, null));
}
- }
+ }
+
+ if (l.isEmpty()) {
+ l.add(new AnalyzedToken(word, null, null));
+ }
+
tokenReadings.add(new AnalyzedTokenReadings(l, pos));
pos += word.length();
}
Modified:
trunk/languagetool/languagetool-language-modules/ca/src/test/java/org/languagetool/rules/ca/MorfologikCatalanSpellerRuleTest.java
===================================================================
---
trunk/languagetool/languagetool-language-modules/ca/src/test/java/org/languagetool/rules/ca/MorfologikCatalanSpellerRuleTest.java
2013-04-07 18:39:18 UTC (rev 9895)
+++
trunk/languagetool/languagetool-language-modules/ca/src/test/java/org/languagetool/rules/ca/MorfologikCatalanSpellerRuleTest.java
2013-04-07 19:05:24 UTC (rev 9896)
@@ -65,11 +65,14 @@
assertEquals(0,
rule.match(langTool.getAnalyzedSentence("LanguageTool!")).length);
assertEquals(0, rule.match(langTool.getAnalyzedSentence(",")).length);
assertEquals(0,
rule.match(langTool.getAnalyzedSentence("123454")).length);
+
+ //tests for mixed case words
+ assertEquals(0, rule.match(langTool.getAnalyzedSentence("pH")).length);
+ assertEquals(0,
rule.match(langTool.getAnalyzedSentence("McDonald")).length);
//incorrect sentences:
matches = rule.match(langTool.getAnalyzedSentence("joan"));
- // check match positions:
assertEquals(1, matches.length);
assertEquals(0, matches[0].getFromPos());
assertEquals(4, matches[0].getToPos());
@@ -81,6 +84,7 @@
assertEquals(9, matches[0].getToPos());
assertEquals("abatussats",
matches[0].getSuggestedReplacements().get(0));
+ // incomplete multiword
matches = rule.match(langTool.getAnalyzedSentence("L'statu"));
assertEquals(1, matches.length);
assertEquals(2, matches[0].getFromPos());
@@ -103,8 +107,6 @@
matches = rule.match(langTool.getAnalyzedSentence("ángel"));
assertEquals(1, matches.length);
- assertEquals(0, matches[0].getFromPos());
- assertEquals(5, matches[0].getToPos());
assertEquals("Àngel", matches[0].getSuggestedReplacements().get(0));
assertEquals("àngel", matches[0].getSuggestedReplacements().get(1));
assertEquals("angle", matches[0].getSuggestedReplacements().get(2));
@@ -112,8 +114,6 @@
matches = rule.match(langTool.getAnalyzedSentence("caçessim"));
assertEquals(1, matches.length);
- assertEquals(0, matches[0].getFromPos());
- assertEquals(8, matches[0].getToPos());
assertEquals("caçàssim", matches[0].getSuggestedReplacements().get(0));
assertEquals("cacessin", matches[0].getSuggestedReplacements().get(1));
assertEquals("cacessis", matches[0].getSuggestedReplacements().get(2));
@@ -121,16 +121,36 @@
matches = rule.match(langTool.getAnalyzedSentence("cantaríà"));
assertEquals(1, matches.length);
- assertEquals(0, matches[0].getFromPos());
- assertEquals(8, matches[0].getToPos());
assertEquals("cantarà", matches[0].getSuggestedReplacements().get(0));
assertEquals("cantaria", matches[0].getSuggestedReplacements().get(1));
+ //incorrect mixed case words
+ matches = rule.match(langTool.getAnalyzedSentence("tAula"));
+ assertEquals(1, matches.length);
+ assertEquals("taula", matches[0].getSuggestedReplacements().get(0));
+
+ matches = rule.match(langTool.getAnalyzedSentence("TAula"));
+ assertEquals(1, matches.length);
+ assertEquals("Tula", matches[0].getSuggestedReplacements().get(0));
+ assertEquals("taula", matches[0].getSuggestedReplacements().get(1));
+
+ matches = rule.match(langTool.getAnalyzedSentence("col·Labora"));
+ assertEquals(1, matches.length);
+ assertEquals("col·labora",
matches[0].getSuggestedReplacements().get(0));
+
+ matches = rule.match(langTool.getAnalyzedSentence("col·laborÀ"));
+ assertEquals(1, matches.length);
+ assertEquals("col·labor",
matches[0].getSuggestedReplacements().get(0));
+ assertEquals("col·labora",
matches[0].getSuggestedReplacements().get(1));
+ assertEquals("col·labore",
matches[0].getSuggestedReplacements().get(2));
+ assertEquals("col·labori",
matches[0].getSuggestedReplacements().get(3));
+ assertEquals("col·laboro",
matches[0].getSuggestedReplacements().get(4));
+ assertEquals("col·laborà",
matches[0].getSuggestedReplacements().get(5)); //-->Better in the first place!
+ assertEquals("col·laborí",
matches[0].getSuggestedReplacements().get(6));
+
//capitalized wrong words
matches = rule.match(langTool.getAnalyzedSentence("En la Pecra"));
assertEquals(1, matches.length);
- assertEquals(6, matches[0].getFromPos());
- assertEquals(11, matches[0].getToPos());
assertEquals("Pedra", matches[0].getSuggestedReplacements().get(0));
assertEquals("Peira", matches[0].getSuggestedReplacements().get(1));
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Minimize network downtime and maximize team effectiveness.
Reduce network management and security costs. Learn how to hire
the most talented Cisco Certified professionals. Visit the
Employer Resources Portal
http://www.cisco.com/web/learning/employer_resources/index.html
_______________________________________________
Languagetool-commits mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/languagetool-commits