Revision: 7042
          
http://languagetool.svn.sourceforge.net/languagetool/?rev=7042&view=rev
Author:   yakovru
Date:     2012-05-23 18:46:48 +0000 (Wed, 23 May 2012)
Log Message:
-----------
[ru] add new RussianWordRepeatRule, disable WordRepeatRule for Russian

Modified Paths:
--------------
    trunk/JLanguageTool/src/java/org/languagetool/language/Russian.java

Added Paths:
-----------
    
trunk/JLanguageTool/src/java/org/languagetool/rules/ru/RussianWordRepeatRule.java

Modified: trunk/JLanguageTool/src/java/org/languagetool/language/Russian.java
===================================================================
--- trunk/JLanguageTool/src/java/org/languagetool/language/Russian.java 
2012-05-23 18:43:29 UTC (rev 7041)
+++ trunk/JLanguageTool/src/java/org/languagetool/language/Russian.java 
2012-05-23 18:46:48 UTC (rev 7042)
@@ -27,6 +27,7 @@
 import org.languagetool.rules.ru.RussianSimpleReplaceRule;
 import org.languagetool.rules.ru.RussianCompoundRule;
 import org.languagetool.rules.ru.RussianUnpairedBracketsRule;
+import org.languagetool.rules.ru.RussianWordRepeatRule;
 import org.languagetool.synthesis.Synthesizer;
 import org.languagetool.synthesis.ru.RussianSynthesizer;
 import org.languagetool.tagging.Tagger;
@@ -116,12 +117,13 @@
             DoublePunctuationRule.class,
             UppercaseSentenceStartRule.class,
             HunspellRule.class,
-            WordRepeatRule.class,
+//            WordRepeatRule.class,
             WhitespaceRule.class,
             // specific to Russian :
             RussianUnpairedBracketsRule.class,
             RussianCompoundRule.class,
-            RussianSimpleReplaceRule.class
+            RussianSimpleReplaceRule.class,
+            RussianWordRepeatRule.class
     );
   }
 

Added: 
trunk/JLanguageTool/src/java/org/languagetool/rules/ru/RussianWordRepeatRule.java
===================================================================
--- 
trunk/JLanguageTool/src/java/org/languagetool/rules/ru/RussianWordRepeatRule.java
                           (rev 0)
+++ 
trunk/JLanguageTool/src/java/org/languagetool/rules/ru/RussianWordRepeatRule.java
   2012-05-23 18:46:48 UTC (rev 7042)
@@ -0,0 +1,197 @@
+/* LanguageTool, a natural language style checker 
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ * 
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+package org.languagetool.rules.ru;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.ResourceBundle;
+import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.languagetool.AnalyzedSentence;
+import org.languagetool.AnalyzedTokenReadings;
+import org.languagetool.rules.Category;
+import org.languagetool.rules.RuleMatch;
+import org.languagetool.tools.StringTools;
+
+/**
+ * Rule that flags a word whose lemma (or surface form, when no lemma is
+ * available) already occurred earlier in the same sentence, not only in a row.
+ *
+ * @author Yakov Reztsov, based on code by Marcin Miłkowski
+ */
+public class RussianWordRepeatRule extends RussianRule {
+  /**
+   * Lemmas that are never checked for repetition (the whole lemma must match).
+   */
+  private static final Pattern EXC_WORDS = Pattern
+      .compile("не|ни|а|"
+          + "на|в");
+
+  /**
+   * POS tags that exclude a token from the check (the whole tag must match).
+   */
+  private static final Pattern EXC_POS = 
Pattern.compile("INTERJECTION|PRDC|PNN:.*");
+
+  /**
+   * Non-word tokens: entity-like strings (&quot, &gt, &lt, &amp), tokens starting with a digit, Roman numerals.
+   */
+  private static final Pattern EXC_NONWORDS = Pattern
+      .compile("&quot|&gt|&lt|&amp|[0-9].*|"
+          + "M*(D?C{0,3}|C[DM])(L?X{0,3}|X[LC])(V?I{0,3}|I[VX])$");
+
+  public RussianWordRepeatRule(final ResourceBundle messages) {
+    if (messages != null) { // no category is set when no bundle is supplied
+      super.setCategory(new Category(messages.getString("category_misc")));
+    }
+//    setDefaultOff();
+  }
+
+  /*
+   * Returns the constant identifier of this rule.
+   *
+   * @see org.languagetool.rules.Rule#getId()
+   */
+  @Override
+  public final String getId() {
+    return "RU_WORD_REPEAT";
+  }
+
+  /*
+   * Returns the Russian-language description of this rule.
+   *
+   * @see org.languagetool.rules.Rule#getDescription()
+   */
+  @Override
+  public final String getDescription() {
+    return "Повтор слов в предложении";
+  }
+
+  /*
+   * Reports each token whose lemma (or surface form) was already seen earlier in the sentence.
+   */
+  @Override
+  public final RuleMatch[] match(final AnalyzedSentence text) {
+    final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
+    final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
+    boolean repetition = false; // set when the current token repeats an earlier entry
+    final TreeSet<String> inflectedWords = new TreeSet<String>(); // lemmas and surface forms seen so far
+    String prevLemma, curLemma;
+    // start from real token, 0 = SENT_START
+    for (int i = 1; i < tokens.length; i++) {
+      final String token = tokens[i].getToken();
+      // avoid "..." etc. to be matched:
+      boolean isWord = true;
+      boolean hasLemma = true;
+
+      if (token.length() < 2) { // tokens shorter than two characters are skipped
+        isWord = false;
+      }
+
+      final int readingsLen = tokens[i].getReadingsLength();
+      for (int k = 0; k < readingsLen; k++) {
+        final String posTag = tokens[i].getAnalyzedToken(k).getPOSTag();
+        if (posTag != null) {
+          if (StringTools.isEmpty(posTag)) {
+            isWord = false;
+            break;
+          }
+          // FIXME: too many false alarms here:
+          final String lemma = tokens[i].getAnalyzedToken(k).getLemma();
+          if (lemma == null) { // the surface form is compared further below instead
+            hasLemma = false;
+            break;
+          }
+          final Matcher m1 = EXC_WORDS.matcher(lemma);
+          if (m1.matches()) {
+            isWord = false;
+            break;
+          }
+
+          final Matcher m2 = EXC_POS.matcher(posTag);
+          if (m2.matches()) {
+            isWord = false;
+            break;
+          }
+        } else {
+          hasLemma = false;
+        }
+
+      }
+
+      final Matcher m1 = EXC_NONWORDS.matcher(tokens[i].getToken());
+      if (m1.matches()) {
+        isWord = false;
+      }
+
+      prevLemma = "";
+      if (isWord) {
+        boolean notSentEnd = false; // NOTE: despite the name, becomes true once a SENT_END reading is seen
+        for (int j = 0; j < readingsLen; j++) {
+          final String pos = tokens[i].getAnalyzedToken(j).getPOSTag();
+          if (pos != null) {
+            notSentEnd |= "SENT_END".equals(pos);
+          }
+          if (hasLemma) {
+            curLemma = tokens[i].getAnalyzedToken(j).getLemma();
+            if (!prevLemma.equals(curLemma) && !notSentEnd) { // ignore a reading whose lemma equals the previous reading's
+              if (inflectedWords.contains(curLemma)) {
+                repetition = true;
+              } else {
+                inflectedWords.add(tokens[i].getAnalyzedToken(j).getLemma());
+              }
+            }
+            prevLemma = curLemma;
+          } else { // no lemma available: compare surface forms
+            if (inflectedWords.contains(tokens[i].getToken()) && !notSentEnd) {
+              repetition = true;
+            } else {
+              inflectedWords.add(tokens[i].getToken()); // NOTE(review): runs once per reading, so a later reading of this same token may find it - TODO confirm intended
+            }
+          }
+
+        }
+      }
+
+      if (repetition) {
+        final String msg = "Повтор слов в предложении";
+        final int pos = tokens[i].getStartPos();
+        final RuleMatch ruleMatch = new RuleMatch(this, pos, pos
+            + token.length(), msg, "Повтор слов в предложении");        
+        ruleMatches.add(ruleMatch);
+        repetition = false; // cleared so the next token starts fresh
+      }
+
+    }
+    return toRuleMatchArray(ruleMatches);
+  }
+
+  /*
+   * No-op: the method body contains no statements.
+   *
+   * @see org.languagetool.rules.Rule#reset()
+   */
+  @Override
+  public void reset() {
+    // nothing
+
+  }
+
+}

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Live Security Virtual Conference
Exclusive live event will cover all the ways today's security and 
threat landscape has changed and how IT managers can respond. Discussions 
will include endpoint security, mobile security and the latest in malware 
threats. http://www.accelacomm.com/jaw/sfrnl04242012/114/50122263/
_______________________________________________
Languagetool-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/languagetool-cvs

Reply via email to