Revision: 10516
          http://sourceforge.net/p/languagetool/code/10516
Author:   dnaber
Date:     2013-07-16 19:07:52 +0000 (Tue, 16 Jul 2013)
Log Message:
-----------
more improvements to Wikipedia text extraction/mapping - now skipping over the 
remaining errors

Modified Paths:
--------------
    
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/LocationHelper.java
    
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/MarkupAwareWikipediaResult.java
    
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/TextConverter.java
    
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/WikipediaQuickCheck.java
    
trunk/languagetool/languagetool-wikipedia/src/test/java/org/languagetool/dev/wikipedia/SuggestionReplacerTest.java
    
trunk/languagetool/languagetool-wikipedia/src/test/java/org/languagetool/dev/wikipedia/WikipediaQuickCheckTest.java

Modified: 
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/LocationHelper.java
===================================================================
--- 
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/LocationHelper.java
  2013-07-16 18:12:09 UTC (rev 10515)
+++ 
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/LocationHelper.java
  2013-07-16 19:07:52 UTC (rev 10516)
@@ -32,29 +32,34 @@
     int line = 1;
     int col = 1;
     int pos = 0;
-    boolean ignoreMode = false;
+    int ignoreLevel = 0;
     final StringBuilder relevantLine = new StringBuilder();
     for (int i = 0; i < text.length(); i++) {
       char ch = text.charAt(i);
       if (line == location.line) {
         relevantLine.append(ch);
       }
+      //System.out.println(line  + "/" + col + ", ignoreLevel: " + 
ignoreLevel);
       if (line == location.line && col == location.column) {
         return pos;
       }
       char prevCh = i > 0 ? text.charAt(i - 1) : '-';
-      if (ignoreMode) {
-        //
-        if (ch == '}' && prevCh == '}') {
-          // ignore templates
-          ignoreMode = false;
+      if (i < text.length() - 4 && text.substring(i, i + 4).equals("<!--")) {
+        // HTML comment
+        ignoreLevel++;
+      } else if (i < text.length() - 3 && text.substring(i, i + 
3).equals("-->")) {
+        ignoreLevel--;
+      } else if (ch == '}' && prevCh == '}') {
+        if (ignoreLevel > 0) {
+          ignoreLevel--;
         }
       } else if (ch == '{' && prevCh == '{') {
-        ignoreMode = true;
-      } else if (ch == '\n') {
+        // ignore templates
+        ignoreLevel++;
+      } else if (ch == '\n' && ignoreLevel == 0) {
         line++;
         col = 1;
-      } else {
+      } else if (ignoreLevel == 0) {
         col++;
       }
       pos++;
@@ -62,7 +67,7 @@
     if (line == location.line && col == location.column) {
       return pos;
     }
-    throw new RuntimeException("Could not find location " + location + " in 
text: '" + text + "'. " +
+    throw new RuntimeException("Could not find location " + location + " in 
text. " +
             "Max line/col was: " + line + "/" + col + ", Content of relevant 
line (" + location.line + "): '"
             + relevantLine + "' (" + relevantLine.length() + " chars)");
   }

Modified: 
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/MarkupAwareWikipediaResult.java
===================================================================
--- 
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/MarkupAwareWikipediaResult.java
      2013-07-16 18:12:09 UTC (rev 10515)
+++ 
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/MarkupAwareWikipediaResult.java
      2013-07-16 19:07:52 UTC (rev 10516)
@@ -27,13 +27,24 @@
 public class MarkupAwareWikipediaResult {
 
   private final List<RuleApplication> ruleApplications;
+  private final int internalErrors;
 
-  public MarkupAwareWikipediaResult(List<RuleApplication> ruleApplications) {
+  public MarkupAwareWikipediaResult(List<RuleApplication> ruleApplications, 
int internalErrors) {
     this.ruleApplications = ruleApplications;
+    this.internalErrors = internalErrors;
   }
 
   public List<RuleApplication> getRuleApplications() {
     return ruleApplications;
   }
 
+  /**
+   * Mapping the Wikipedia syntax is complicated and it sometimes fails. We
+   * try to skip over those problems and increase this counter. Each increment
+   * here means that one rule match is missing from the rule applications.
+   */
+  public int getInternalErrorCount() {
+    return internalErrors;
+  }
+
 }

Modified: 
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/TextConverter.java
===================================================================
--- 
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/TextConverter.java
   2013-07-16 18:12:09 UTC (rev 10515)
+++ 
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/TextConverter.java
   2013-07-16 19:07:52 UTC (rev 10516)
@@ -372,6 +372,7 @@
     int textPos = contentSoFar.length() + 1 + (needSpace ? 1 : 0);
     if (loc.hasLocation()) {
       mapping.put(textPos, loc.getLocation());
+      //System.out.println("PUT " + textPos + " -> " + loc.getLocation());
     }
   }
 

Modified: 
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/WikipediaQuickCheck.java
===================================================================
--- 
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/WikipediaQuickCheck.java
     2013-07-16 18:12:09 UTC (rev 10515)
+++ 
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/WikipediaQuickCheck.java
     2013-07-16 19:07:52 UTC (rev 10516)
@@ -93,20 +93,26 @@
     final WikipediaQuickCheck check = new WikipediaQuickCheck();
     final String xml = check.getMediaWikiContent(url);
     final String wikiMarkup = getRevisionContent(xml);
-    return checkWikipediaMarkup(wikiMarkup, getLanguage(url));
+    return checkWikipediaMarkup(url, wikiMarkup, getLanguage(url));
   }
 
-  public MarkupAwareWikipediaResult checkWikipediaMarkup(String wikiMarkup, 
Language language) throws IOException {
+  public MarkupAwareWikipediaResult checkWikipediaMarkup(URL url, String 
wikiMarkup, Language language) throws IOException {
     final SwebleWikipediaTextFilter filter = new SwebleWikipediaTextFilter();
     final PlainTextMapping mapping = filter.filter(wikiMarkup);
     final JLanguageTool langTool = getLanguageTool(language);
     final List<RuleApplication> ruleApplications = new 
ArrayList<RuleApplication>();
     final List<RuleMatch> matches = langTool.check(mapping.getPlainText());
+    int internalErrors = 0;
     for (RuleMatch match : matches) {
       final SuggestionReplacer replacer = new SuggestionReplacer(mapping, 
wikiMarkup);
-      ruleApplications.addAll(replacer.applySuggestionsToOriginalText(match));
+      try {
+        
ruleApplications.addAll(replacer.applySuggestionsToOriginalText(match));
+      } catch (Exception e) {
+        System.err.println("Failed to apply suggestion for rule match '" + 
match + "' for URL " + url + ": " + e.toString());
+        internalErrors++;
+      }
     }
-    return new MarkupAwareWikipediaResult(ruleApplications);
+    return new MarkupAwareWikipediaResult(ruleApplications, internalErrors);
   }
 
   public WikipediaQuickCheckResult checkPage(String plainText, Language lang) 
throws IOException {

Modified: 
trunk/languagetool/languagetool-wikipedia/src/test/java/org/languagetool/dev/wikipedia/SuggestionReplacerTest.java
===================================================================
--- 
trunk/languagetool/languagetool-wikipedia/src/test/java/org/languagetool/dev/wikipedia/SuggestionReplacerTest.java
  2013-07-16 18:12:09 UTC (rev 10515)
+++ 
trunk/languagetool/languagetool-wikipedia/src/test/java/org/languagetool/dev/wikipedia/SuggestionReplacerTest.java
  2013-07-16 19:07:52 UTC (rev 10516)
@@ -54,6 +54,27 @@
                                       "\n\nEin <s>ökumenisch</s> 
Gottesdienst.\n");
   }
 
+  public void testNestedTemplates() throws Exception {
+    JLanguageTool langTool = getLanguageTool();
+    SwebleWikipediaTextFilter filter = new SwebleWikipediaTextFilter();
+    String markup = "{{FNBox|\n" +
+            "  {{FNZ|1|1979 und 1984}}\n" +
+            "  {{FNZ|2|[[Rundungsfehler]]}}\n" +
+            "}}\n\nEin ökonomischer Gottesdienst.\n";
+    applySuggestion(langTool, filter, markup, markup.replace("ökonomischer", 
"<s>ökumenisch</s>"));
+  }
+
+  public void testKnownBug() throws Exception {
+    JLanguageTool langTool = getLanguageTool();
+    SwebleWikipediaTextFilter filter = new SwebleWikipediaTextFilter();
+    String markup = "{{HdBG GKZ|9761000}}.";
+    try {
+      applySuggestion(langTool, filter, markup, markup);
+    } catch (RuntimeException e) {
+      // known problem - Sweble's location seems to be wrong?!
+    }
+  }
+
   public void testComplexText() throws Exception {
     String markup = "{{Dieser Artikel|behandelt die freie Onlineenzyklopädie 
Wikipedia; zu dem gleichnamigen Asteroiden siehe [[(274301) Wikipedia]].}}\n" +
             "\n" +

Modified: 
trunk/languagetool/languagetool-wikipedia/src/test/java/org/languagetool/dev/wikipedia/WikipediaQuickCheckTest.java
===================================================================
--- 
trunk/languagetool/languagetool-wikipedia/src/test/java/org/languagetool/dev/wikipedia/WikipediaQuickCheckTest.java
 2013-07-16 18:12:09 UTC (rev 10515)
+++ 
trunk/languagetool/languagetool-wikipedia/src/test/java/org/languagetool/dev/wikipedia/WikipediaQuickCheckTest.java
 2013-07-16 19:07:52 UTC (rev 10516)
@@ -36,9 +36,11 @@
     //final String url = 
"http://de.wikipedia.org/wiki/Benutzer_Diskussion:Dnaber";
     //final String url = "http://de.wikipedia.org/wiki/OpenThesaurus";
     //final String url = "http://de.wikipedia.org/wiki/Gütersloh";
-    final String url = "http://de.wikipedia.org/wiki/Bielefeld";
+    //final String url = "http://de.wikipedia.org/wiki/Bielefeld";
+    final String url = "http://de.wikipedia.org/wiki/Augsburg";
     final MarkupAwareWikipediaResult result = check.checkPage(new URL(url));
     final List<RuleApplication> ruleApplications = 
result.getRuleApplications();
+    System.out.println("ruleApplications: " + ruleApplications.size());
     for (RuleApplication ruleApplication : ruleApplications) {
       System.out.println("Rule     : " + 
ruleApplication.getRuleMatch().getRule().getDescription());
       System.out.println("Original : " + 
ruleApplication.getOriginalErrorContext().replace("\n", " "));
@@ -54,7 +56,7 @@
     final String markup = "== Beispiele ==\n\n" +
             "Eine kleine Auswahl von Fehlern.\n\n" +
             "Das Komma ist richtig, wegen dem Leerzeichen.";
-    final MarkupAwareWikipediaResult result = 
check.checkWikipediaMarkup(markup, new German());
+    final MarkupAwareWikipediaResult result = check.checkWikipediaMarkup(new 
URL("http://fake-url.org"), markup, new German());
     final List<RuleApplication> ruleApplications = 
result.getRuleApplications();
     // even though this error has no suggestion, there's a (pseudo) correction:
     assertThat(ruleApplications.size(), is(1));

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
See everything from the browser to the database with AppDynamics
Get end-to-end visibility with application monitoring from AppDynamics
Isolate bottlenecks and diagnose root cause in seconds.
Start your free trial of AppDynamics Pro today!
http://pubads.g.doubleclick.net/gampad/clk?id=48808831&iu=/4140/ostg.clktrk
_______________________________________________
Languagetool-commits mailing list
Languagetool-commits@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/languagetool-commits

Reply via email to