Revision: 10516
          http://sourceforge.net/p/languagetool/code/10516
Author:   dnaber
Date:     2013-07-16 19:07:52 +0000 (Tue, 16 Jul 2013)
Log Message:
-----------
more improvements to Wikipedia text extraction/mapping - now skipping over the remaining errors
Modified Paths: -------------- trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/LocationHelper.java trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/MarkupAwareWikipediaResult.java trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/TextConverter.java trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/WikipediaQuickCheck.java trunk/languagetool/languagetool-wikipedia/src/test/java/org/languagetool/dev/wikipedia/SuggestionReplacerTest.java trunk/languagetool/languagetool-wikipedia/src/test/java/org/languagetool/dev/wikipedia/WikipediaQuickCheckTest.java Modified: trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/LocationHelper.java =================================================================== --- trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/LocationHelper.java 2013-07-16 18:12:09 UTC (rev 10515) +++ trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/LocationHelper.java 2013-07-16 19:07:52 UTC (rev 10516) @@ -32,29 +32,34 @@ int line = 1; int col = 1; int pos = 0; - boolean ignoreMode = false; + int ignoreLevel = 0; final StringBuilder relevantLine = new StringBuilder(); for (int i = 0; i < text.length(); i++) { char ch = text.charAt(i); if (line == location.line) { relevantLine.append(ch); } + //System.out.println(line + "/" + col + ", ignoreLevel: " + ignoreLevel); if (line == location.line && col == location.column) { return pos; } char prevCh = i > 0 ? 
text.charAt(i - 1) : '-'; - if (ignoreMode) { - // - if (ch == '}' && prevCh == '}') { - // ignore templates - ignoreMode = false; + if (i < text.length() - 4 && text.substring(i, i + 4).equals("<!--")) { + // HTML comment + ignoreLevel++; + } else if (i < text.length() - 3 && text.substring(i, i + 3).equals("-->")) { + ignoreLevel--; + } else if (ch == '}' && prevCh == '}') { + if (ignoreLevel > 0) { + ignoreLevel--; } } else if (ch == '{' && prevCh == '{') { - ignoreMode = true; - } else if (ch == '\n') { + // ignore templates + ignoreLevel++; + } else if (ch == '\n' && ignoreLevel == 0) { line++; col = 1; - } else { + } else if (ignoreLevel == 0) { col++; } pos++; @@ -62,7 +67,7 @@ if (line == location.line && col == location.column) { return pos; } - throw new RuntimeException("Could not find location " + location + " in text: '" + text + "'. " + + throw new RuntimeException("Could not find location " + location + " in text. " + "Max line/col was: " + line + "/" + col + ", Content of relevant line (" + location.line + "): '" + relevantLine + "' (" + relevantLine.length() + " chars)"); } Modified: trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/MarkupAwareWikipediaResult.java =================================================================== --- trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/MarkupAwareWikipediaResult.java 2013-07-16 18:12:09 UTC (rev 10515) +++ trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/MarkupAwareWikipediaResult.java 2013-07-16 19:07:52 UTC (rev 10516) @@ -27,13 +27,24 @@ public class MarkupAwareWikipediaResult { private final List<RuleApplication> ruleApplications; + private final int internalErrors; - public MarkupAwareWikipediaResult(List<RuleApplication> ruleApplications) { + public MarkupAwareWikipediaResult(List<RuleApplication> ruleApplications, int internalErrors) { this.ruleApplications = ruleApplications; + 
this.internalErrors = internalErrors; } public List<RuleApplication> getRuleApplications() { return ruleApplications; } + /** + * Mapping the Wikipedia syntax is complicated and it sometimes fails. We + * try to skip over those problems and increase this counter. Each increment + * here means that one rule match is missing from the rule applications. + */ + public int getInternalErrorCount() { + return internalErrors; + } + } Modified: trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/TextConverter.java =================================================================== --- trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/TextConverter.java 2013-07-16 18:12:09 UTC (rev 10515) +++ trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/TextConverter.java 2013-07-16 19:07:52 UTC (rev 10516) @@ -372,6 +372,7 @@ int textPos = contentSoFar.length() + 1 + (needSpace ? 1 : 0); if (loc.hasLocation()) { mapping.put(textPos, loc.getLocation()); + //System.out.println("PUT " + textPos + " -> " + loc.getLocation()); } } Modified: trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/WikipediaQuickCheck.java =================================================================== --- trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/WikipediaQuickCheck.java 2013-07-16 18:12:09 UTC (rev 10515) +++ trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/WikipediaQuickCheck.java 2013-07-16 19:07:52 UTC (rev 10516) @@ -93,20 +93,26 @@ final WikipediaQuickCheck check = new WikipediaQuickCheck(); final String xml = check.getMediaWikiContent(url); final String wikiMarkup = getRevisionContent(xml); - return checkWikipediaMarkup(wikiMarkup, getLanguage(url)); + return checkWikipediaMarkup(url, wikiMarkup, getLanguage(url)); } - public MarkupAwareWikipediaResult checkWikipediaMarkup(String 
wikiMarkup, Language language) throws IOException { + public MarkupAwareWikipediaResult checkWikipediaMarkup(URL url, String wikiMarkup, Language language) throws IOException { final SwebleWikipediaTextFilter filter = new SwebleWikipediaTextFilter(); final PlainTextMapping mapping = filter.filter(wikiMarkup); final JLanguageTool langTool = getLanguageTool(language); final List<RuleApplication> ruleApplications = new ArrayList<RuleApplication>(); final List<RuleMatch> matches = langTool.check(mapping.getPlainText()); + int internalErrors = 0; for (RuleMatch match : matches) { final SuggestionReplacer replacer = new SuggestionReplacer(mapping, wikiMarkup); - ruleApplications.addAll(replacer.applySuggestionsToOriginalText(match)); + try { + ruleApplications.addAll(replacer.applySuggestionsToOriginalText(match)); + } catch (Exception e) { + System.err.println("Failed to apply suggestion for rule match '" + match + "' for URL " + url + ": " + e.toString()); + internalErrors++; + } } - return new MarkupAwareWikipediaResult(ruleApplications); + return new MarkupAwareWikipediaResult(ruleApplications, internalErrors); } public WikipediaQuickCheckResult checkPage(String plainText, Language lang) throws IOException { Modified: trunk/languagetool/languagetool-wikipedia/src/test/java/org/languagetool/dev/wikipedia/SuggestionReplacerTest.java =================================================================== --- trunk/languagetool/languagetool-wikipedia/src/test/java/org/languagetool/dev/wikipedia/SuggestionReplacerTest.java 2013-07-16 18:12:09 UTC (rev 10515) +++ trunk/languagetool/languagetool-wikipedia/src/test/java/org/languagetool/dev/wikipedia/SuggestionReplacerTest.java 2013-07-16 19:07:52 UTC (rev 10516) @@ -54,6 +54,27 @@ "\n\nEin <s>ökumenisch</s> Gottesdienst.\n"); } + public void testNestedTemplates() throws Exception { + JLanguageTool langTool = getLanguageTool(); + SwebleWikipediaTextFilter filter = new SwebleWikipediaTextFilter(); + String markup = "{{FNBox|\n" + 
+ " {{FNZ|1|1979 und 1984}}\n" + + " {{FNZ|2|[[Rundungsfehler]]}}\n" + + "}}\n\nEin ökonomischer Gottesdienst.\n"; + applySuggestion(langTool, filter, markup, markup.replace("ökonomischer", "<s>ökumenisch</s>")); + } + + public void testKnownBug() throws Exception { + JLanguageTool langTool = getLanguageTool(); + SwebleWikipediaTextFilter filter = new SwebleWikipediaTextFilter(); + String markup = "{{HdBG GKZ|9761000}}."; + try { + applySuggestion(langTool, filter, markup, markup); + } catch (RuntimeException e) { + // known problem - Sweble's location seems to be wrong?! + } + } + public void testComplexText() throws Exception { String markup = "{{Dieser Artikel|behandelt die freie Onlineenzyklopädie Wikipedia; zu dem gleichnamigen Asteroiden siehe [[(274301) Wikipedia]].}}\n" + "\n" + Modified: trunk/languagetool/languagetool-wikipedia/src/test/java/org/languagetool/dev/wikipedia/WikipediaQuickCheckTest.java =================================================================== --- trunk/languagetool/languagetool-wikipedia/src/test/java/org/languagetool/dev/wikipedia/WikipediaQuickCheckTest.java 2013-07-16 18:12:09 UTC (rev 10515) +++ trunk/languagetool/languagetool-wikipedia/src/test/java/org/languagetool/dev/wikipedia/WikipediaQuickCheckTest.java 2013-07-16 19:07:52 UTC (rev 10516) @@ -36,9 +36,11 @@ //final String url = "http://de.wikipedia.org/wiki/Benutzer_Diskussion:Dnaber"; //final String url = "http://de.wikipedia.org/wiki/OpenThesaurus"; //final String url = "http://de.wikipedia.org/wiki/Gütersloh"; - final String url = "http://de.wikipedia.org/wiki/Bielefeld"; + //final String url = "http://de.wikipedia.org/wiki/Bielefeld"; + final String url = "http://de.wikipedia.org/wiki/Augsburg"; final MarkupAwareWikipediaResult result = check.checkPage(new URL(url)); final List<RuleApplication> ruleApplications = result.getRuleApplications(); + System.out.println("ruleApplications: " + ruleApplications.size()); for (RuleApplication ruleApplication : ruleApplications) 
{ System.out.println("Rule : " + ruleApplication.getRuleMatch().getRule().getDescription()); System.out.println("Original : " + ruleApplication.getOriginalErrorContext().replace("\n", " ")); @@ -54,7 +56,7 @@ final String markup = "== Beispiele ==\n\n" + "Eine kleine Auswahl von Fehlern.\n\n" + "Das Komma ist richtig, wegen dem Leerzeichen."; - final MarkupAwareWikipediaResult result = check.checkWikipediaMarkup(markup, new German()); + final MarkupAwareWikipediaResult result = check.checkWikipediaMarkup(new URL("http://fake-url.org"), markup, new German()); final List<RuleApplication> ruleApplications = result.getRuleApplications(); // even though this error has no suggestion, there's a (pseudo) correction: assertThat(ruleApplications.size(), is(1)); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. ------------------------------------------------------------------------------ See everything from the browser to the database with AppDynamics Get end-to-end visibility with application monitoring from AppDynamics Isolate bottlenecks and diagnose root cause in seconds. Start your free trial of AppDynamics Pro today! http://pubads.g.doubleclick.net/gampad/clk?id=48808831&iu=/4140/ostg.clktrk _______________________________________________ Languagetool-commits mailing list Languagetool-commits@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/languagetool-commits