OK so I have a patch that adds another Java rule and uses it for the Italian language module.
I tested the rules against Wikipedia for Italian and got no false alarms; the same goes for several other languages such as Spanish and German. English has a couple of false alarms in bibliography entries. Let me know what you think. Ciao Paolo ### Eclipse Workspace Patch 1.0 #P languagetool Index: languagetool-core/src/main/java/org/languagetool/rules/WhitespaceBeforePunctuationRule.java =================================================================== --- languagetool-core/src/main/java/org/languagetool/rules/WhitespaceBeforePunctuationRule.java (revision 0) +++ languagetool-core/src/main/java/org/languagetool/rules/WhitespaceBeforePunctuationRule.java (revision 0) @@ -0,0 +1,157 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package org.languagetool.rules; + +import java.util.ArrayList; +import java.util.List; +import java.util.ResourceBundle; + +import org.languagetool.AnalyzedSentence; +import org.languagetool.AnalyzedTokenReadings; +import org.languagetool.tools.StringTools; + +/** + * A rule that matches several punctuation signs such as : ; and % preceded by whitespace. 
+ * + * BUG ID 3607406: no space before semicolon + * + * @author Paolo Bianchini + */ + +public class WhitespaceBeforePunctuationRule extends Rule { + + public WhitespaceBeforePunctuationRule(final ResourceBundle messages) { + super(messages); + super.setCategory(new Category(messages.getString("category_misc"))); + setLocQualityIssueType("typographical"); + } + + @Override + public final String getId() { + return "WHITESPACE_PUNCTUATION"; + } + + @Override + public final String getDescription() { + return messages.getString("desc_whitespace_before_punctuation"); + } + + @Override + public final RuleMatch[] match(final AnalyzedSentence text) { + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text.getTokens(); + String prevToken = ""; + String prevPrevToken = ""; + boolean prevWhite = false; + int prevLen = 0; + for (int i = 0; i < tokens.length; i++) { + final String token = tokens[i].getToken(); + final boolean isWhitespace = tokens[i].isWhitespace() || StringTools.isNonBreakingWhitespace(token) + || tokens[i].isFieldCode(); + String msg = null; + int fixLen = 0; + String suggestionText = null; + if (prevWhite) { + if (token.equals(":")) { + msg = messages.getString("no_space_before_colon"); + suggestionText = ":"; + fixLen = 1; + // exception case for figures such as " : 0" + if (i + 2 < tokens.length + && tokens[i + 1].isWhitespace() + && isNumberOrDot(tokens[i + 2].getToken())) { + msg = null; + } + } else if (token.equals(";")) { + msg = messages.getString("no_space_before_semicolon"); + suggestionText = ";"; + fixLen = 1; + } else if (token.equals("%")) { + msg = messages.getString("no_space_before_percentage"); + suggestionText = "%"; + fixLen = 1; + } + } + if (msg != null) { + final int fromPos = tokens[i - 1].getStartPos(); + final int toPos = tokens[i - 1].getStartPos() + fixLen + prevLen; + // TODO: add some good short comment here + final RuleMatch ruleMatch = new RuleMatch(this, fromPos, 
toPos, msg); + ruleMatch.setSuggestedReplacement(suggestionText); + ruleMatches.add(ruleMatch); + } + prevPrevToken = prevToken; + prevToken = token; + prevWhite = isWhitespace && !tokens[i].isFieldCode(); //OOo code before comma/dot + prevLen = tokens[i].getToken().length(); + } + + return toRuleMatchArray(ruleMatches); + } + + static boolean isNotQuoteOrHyphen(final String str) { + if (str.length() == 1) { + final char c = str.charAt(0); + if (c =='\'' || c == '-' || c == '”' + || c =='’' || c == '"' || c == '“' + || c == ',') { + return false; + } + } else { + return containsNoNumber(str); + } + return true; + } + + static boolean isNumberOrDot(final String str) { + final char c = str.charAt(0); + return (c == '.' || Character.isDigit(c)); + } + + static boolean isLeftBracket(final String str) { + if (str.length() == 0) { + return false; + } + final char c = str.charAt(0); + return (c == '(' || c == '[' || c == '{'); + } + + static boolean isRightBracket(final String str) { + if (str.length() == 0) { + return false; + } + final char c = str.charAt(0); + return (c == ')' || c == ']' || c == '}'); + } + + static boolean containsNoNumber(final String str) { + for (int i = 0; i < str.length(); i++) { + if (Character.isDigit(str.charAt(i))) { + return false; + } + } + return true; + } + + @Override + public void reset() { + // nothing + } + +} Index: languagetool-language-modules/it/src/main/resources/org/languagetool/MessagesBundle_it.properties =================================================================== --- languagetool-language-modules/it/src/main/resources/org/languagetool/MessagesBundle_it.properties (revision 9815) +++ languagetool-language-modules/it/src/main/resources/org/languagetool/MessagesBundle_it.properties (working copy) @@ -272,3 +272,17 @@ tray_menu_enable_server = Avviare il servizio HTTP. 
tray_tooltip_server_running = LanguageTool (servizio HTTP attivo) + + +# 3607406 + + +no_space_before_colon = Non inserire uno spazio prima dei due punti + +no_space_before_semicolon = Non inserire uno spazio prima del punto e virgola + +no_space_before_percentage = Non inserire uno spazio prima del segno di percentuale + +desc_whitespace_before_punctuation = Utilizzo dello spazio prima di : ; % + + +# 3607406 - Index: languagetool-language-modules/it/src/main/java/org/languagetool/language/Italian.java =================================================================== --- languagetool-language-modules/it/src/main/java/org/languagetool/language/Italian.java (revision 9815) +++ languagetool-language-modules/it/src/main/java/org/languagetool/language/Italian.java (working copy) @@ -30,6 +30,10 @@ import org.languagetool.rules.WhitespaceRule; import org.languagetool.rules.WordRepeatRule; import org.languagetool.rules.it.MorfologikItalianSpellerRule; +// 3607406 + +import org.languagetool.rules.WhitespaceBeforePunctuationRule; +// 3607406 - + import org.languagetool.tagging.Tagger; import org.languagetool.tagging.it.ItalianTagger; @@ -79,6 +83,9 @@ @Override public List<Class<? extends Rule>> getRelevantRules() { return Arrays.asList( +// 3607406 + + WhitespaceBeforePunctuationRule.class, +// 3607406 - CommaWhitespaceRule.class, DoublePunctuationRule.class, GenericUnpairedBracketsRule.class, Index: languagetool-core/src/main/resources/org/languagetool/MessagesBundle.properties =================================================================== --- languagetool-core/src/main/resources/org/languagetool/MessagesBundle.properties (revision 9815) +++ languagetool-core/src/main/resources/org/languagetool/MessagesBundle.properties (working copy) @@ -73,6 +73,12 @@ desc_spelling_short = Spelling mistake +# 3607406 + + +desc_whitespace_before_punctuation = Use of whitespace before colon, semicolon and percentage. 
+ +# 3607406 - + double_dots_short = Two consecutive dots double_commas_short = Two consecutive comma @@ -203,6 +209,16 @@ no_space_before_dot = Don't put a space before the full stop +# 3607406 + + +no_space_before_colon = Don't put a space before the colon + +no_space_before_semicolon = Don't put a space before the semicolon + +no_space_before_percentage = Don't put a space before the percentage + +# 3607406 - + pl = Polish repetition = Possible typo: you repeated a word On Apr 5, 2013, at 11:29 AM, Dominique Pellé wrote: > It's not entirely language independent. At least French does not > follow those rules. French typography rules use a narrow non-break > space before colon, semi-colon, question mark and exclamation mark. > > There is an easy rule of thumb to remember where to put a space in > French: all punctuation characters that require to raise the pen to draw > them (? ! ; : %) require a space before them. All punctuation characters > that do not require to raise the pen to draw them should not have a > space before them (dot, comma). > > But even if it's a common Java rule, it does not have to be enabled > for French so that would be fine. > > Regards > Dominique > > R.J. Baars <[email protected]> wrote: > >> I guess this is language independent, so it should be fixed in the same rule as no space >> before comma. >> >> Ruud >> >>> All, >>> >>> I could fix this bug by coding the following rule within the Italian >>> grammar.xml (I also added a check for semicolon… I know, I could have used >>> a reg_exp :-) ). >>> >>> The question is, would this be the correct/acceptable way of handling this >>> bug or should it be handled differently (a Java rule) and/or for a wider >>> set of languages? 
>>> >>> Ciao >>> >>> Paolo >>> >>> >>> <rulegroup name="spazio prima di : o di ;" id="GR_09_005"> >>> <rule> >>> <pattern> >>> <token></token> >>> <token spacebefore="yes">:</token> >>> </pattern> >>> <message>Non lasciare uno spazio prima dei due >>> punti: >>> <suggestion><match no="1"></match>:</suggestion>.</message> >>> <example type="incorrect">Comprammo tanti >>> <marker>regali :</marker> >>> bambole, libri, vestiti.</example> >>> <example type="correct">Comprammo tanti >>> <marker>regali:</marker> >>> bambole, libri, vestiti.</example> >>> </rule> >>> <rule> >>> <pattern> >>> <token></token> >>> <token spacebefore="yes">;</token> >>> </pattern> >>> <message>Non lasciare uno spazio prima del >>> punto e virgola: >>> <suggestion><match no="1"></match>;</suggestion>.</message> >>> <example type="incorrect">Gli venne >>> <marker>sonno ;</marker> e rimandò >>> all'indomani.</example> >>> <example type="correct">Gli venne >>> <marker>sonno;</marker> e rimandò >>> all'indomani.</example> >>> </rule> >>> </rulegroup> >>> >>> ------------------------------------------------------------------------------ >>> Minimize network downtime and maximize team effectiveness. >>> Reduce network management and security costs.Learn how to hire >>> the most talented Cisco Certified professionals. Visit the >>> Employer Resources Portal >>> http://www.cisco.com/web/learning/employer_resources/index.html_______________________________________________ >>> Languagetool-devel mailing list >>> [email protected] >>> https://lists.sourceforge.net/lists/listinfo/languagetool-devel >>> >> >> >> >> ------------------------------------------------------------------------------ >> Minimize network downtime and maximize team effectiveness. >> Reduce network management and security costs.Learn how to hire >> the most talented Cisco Certified professionals. 
Visit the >> Employer Resources Portal >> http://www.cisco.com/web/learning/employer_resources/index.html >> _______________________________________________ >> Languagetool-devel mailing list >> [email protected] >> https://lists.sourceforge.net/lists/listinfo/languagetool-devel > > ------------------------------------------------------------------------------ > Minimize network downtime and maximize team effectiveness. > Reduce network management and security costs.Learn how to hire > the most talented Cisco Certified professionals. Visit the > Employer Resources Portal > http://www.cisco.com/web/learning/employer_resources/index.html > _______________________________________________ > Languagetool-devel mailing list > [email protected] > https://lists.sourceforge.net/lists/listinfo/languagetool-devel ------------------------------------------------------------------------------ Minimize network downtime and maximize team effectiveness. Reduce network management and security costs.Learn how to hire the most talented Cisco Certified professionals. Visit the Employer Resources Portal http://www.cisco.com/web/learning/employer_resources/index.html _______________________________________________ Languagetool-devel mailing list [email protected] https://lists.sourceforge.net/lists/listinfo/languagetool-devel
