Revision: 7848 http://languagetool.svn.sourceforge.net/languagetool/?rev=7848&view=rev Author: dnaber Date: 2012-08-12 19:59:04 +0000 (Sun, 12 Aug 2012) Log Message: ----------- Wikipedia search: move searching to its own thread, which can be interrupted if it takes too long - just using Lucene's TimeLimitingCollector is not enough, as it doesn't take the query-rewriting step into account (which is the most expensive part for some regex patterns)
Modified Paths: -------------- trunk/JLanguageTool/src/dev/org/languagetool/dev/index/Searcher.java trunk/ltcommunity/grails-app/controllers/org/languagetool/RuleEditorController.groovy trunk/ltcommunity/grails-app/views/ruleEditor/_checkRuleProblem.gsp trunk/ltcommunity/grails-app/views/ruleEditor/_corpusResult.gsp trunk/ltcommunity/grails-app/views/ruleEditor/createXml.gsp Added Paths: ----------- trunk/JLanguageTool/src/dev/org/languagetool/dev/index/SearchTimeoutException.java trunk/ltcommunity/grails-app/views/ruleEditor/_submitRule.gsp Added: trunk/JLanguageTool/src/dev/org/languagetool/dev/index/SearchTimeoutException.java =================================================================== --- trunk/JLanguageTool/src/dev/org/languagetool/dev/index/SearchTimeoutException.java (rev 0) +++ trunk/JLanguageTool/src/dev/org/languagetool/dev/index/SearchTimeoutException.java 2012-08-12 19:59:04 UTC (rev 7848) @@ -0,0 +1,29 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2012 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package org.languagetool.dev.index; + +/** + * Thrown if Lucene index search takes too long. 
+ */ +public class SearchTimeoutException extends RuntimeException { + + public SearchTimeoutException(String message) { + super(message); + } +} Modified: trunk/JLanguageTool/src/dev/org/languagetool/dev/index/Searcher.java =================================================================== --- trunk/JLanguageTool/src/dev/org/languagetool/dev/index/Searcher.java 2012-08-12 15:45:49 UTC (rev 7847) +++ trunk/JLanguageTool/src/dev/org/languagetool/dev/index/Searcher.java 2012-08-12 19:59:04 UTC (rev 7848) @@ -95,19 +95,37 @@ this.maxSearchTimeMillis = maxSearchTimeMillis; } - public SearcherResult findRuleMatchesOnIndex(PatternRule rule, Language language, IndexSearcher indexSearcher) throws IOException { + public SearcherResult findRuleMatchesOnIndex(PatternRule rule, Language language, final IndexSearcher indexSearcher) throws IOException { final PossiblyRelaxedQuery query = createQuery(rule); - final Sort sort = new Sort(new SortField("docCount", SortField.Type.INT)); // do not sort by relevance as this will move the shortest documents to the top if (query.query == null) { throw new NullPointerException("Cannot search on null query for rule: " + rule); } - final PossiblyLimitedTopDocs limitedTopDocs = getTopDocs(indexSearcher, query, sort); - final JLanguageTool languageTool = getLanguageToolWithOneRule(language, rule); - final List<MatchingSentence> matchingSentences = findMatchingSentences(indexSearcher, limitedTopDocs.topDocs, languageTool); + + final SearchRunnable runnable = new SearchRunnable(indexSearcher, query, language, rule); + final Thread searchThread = new Thread(runnable); + searchThread.start(); + try { + searchThread.join(maxSearchTimeMillis); + searchThread.interrupt(); + } catch (InterruptedException e) { + throw new RuntimeException("Search thread got interrupted for query " + query, e); + } + + if (searchThread.isInterrupted()) { + throw new SearchTimeoutException("Search timeout of " + maxSearchTimeMillis + "ms reached"); + } + + final 
Exception exception = runnable.getException(); + if (exception != null) { + throw new RuntimeException("Exception during search for query " + query, exception); + } + + final List<MatchingSentence> matchingSentences = runnable.getMatchingSentences(); final int sentencesChecked = getSentenceCheckCount(query, indexSearcher); final SearcherResult searcherResult = new SearcherResult(matchingSentences, sentencesChecked, query); searcherResult.setDocCount(getDocCount(indexSearcher)); - searcherResult.setResultIsTimeLimited(limitedTopDocs.resultIsTimeLimited); + //TODO: the search itself could also timeout, don't just ignore that: + //searcherResult.setResultIsTimeLimited(limitedTopDocs.resultIsTimeLimited); return searcherResult; } @@ -224,6 +242,44 @@ } } + class SearchRunnable implements Runnable { + + private final IndexSearcher indexSearcher; + private final PossiblyRelaxedQuery query; + private final Language language; + private final PatternRule rule; + + private List<MatchingSentence> matchingSentences; + private Exception exception; + + SearchRunnable(IndexSearcher indexSearcher, PossiblyRelaxedQuery query, Language language, PatternRule rule) { + this.indexSearcher = indexSearcher; + this.query = query; + this.language = language; + this.rule = rule; + } + + @Override + public void run() { + try { + final Sort sort = new Sort(new SortField("docCount", SortField.Type.INT)); // do not sort by relevance as this will move the shortest documents to the top + final PossiblyLimitedTopDocs limitedTopDocs = getTopDocs(indexSearcher, query, sort); + final JLanguageTool languageTool = getLanguageToolWithOneRule(language, rule); + matchingSentences = findMatchingSentences(indexSearcher, limitedTopDocs.topDocs, languageTool); + } catch (Exception e) { + exception = e; + } + } + + Exception getException() { + return exception; + } + + List<MatchingSentence> getMatchingSentences() { + return matchingSentences; + } + } + public static void main(String[] args) throws Exception { 
ensureCorrectUsageOrExit(args); final Searcher searcher = new Searcher(); Modified: trunk/ltcommunity/grails-app/controllers/org/languagetool/RuleEditorController.groovy =================================================================== --- trunk/ltcommunity/grails-app/controllers/org/languagetool/RuleEditorController.groovy 2012-08-12 15:45:49 UTC (rev 7847) +++ trunk/ltcommunity/grails-app/controllers/org/languagetool/RuleEditorController.groovy 2012-08-12 19:59:04 UTC (rev 7848) @@ -19,13 +19,10 @@ package org.languagetool import org.languagetool.rules.patterns.PatternRule -import org.languagetool.dev.index.Searcher -import org.apache.lucene.search.IndexSearcher -import org.apache.lucene.store.FSDirectory import org.languagetool.dev.index.SearcherResult import org.languagetool.rules.patterns.PatternRuleLoader import org.languagetool.rules.IncorrectExample -import org.apache.lucene.index.DirectoryReader +import org.languagetool.dev.index.SearchTimeoutException /** * Editor that helps with creating the XML for simple rules. 
@@ -62,10 +59,17 @@ List shortProblems = [] checkExampleSentences(patternRule, language, problems, shortProblems) if (problems.size() == 0) { - SearcherResult searcherResult = searchService.checkRuleAgainstCorpus(patternRule, language, CORPUS_MATCH_LIMIT) - log.info("Checked rule: valid - LANG: ${language.getShortNameWithVariant()} - PATTERN: ${params.pattern} - BAD: ${params.incorrectExample1} - GOOD: ${params.correctExample1}") - [messagePreset: params.messageBackup, namePreset: params.nameBackup, - searcherResult: searcherResult, limit: CORPUS_MATCH_LIMIT] + SearcherResult searcherResult = null + boolean timeOut = false + try { + searcherResult = searchService.checkRuleAgainstCorpus(patternRule, language, CORPUS_MATCH_LIMIT) + } catch (SearchTimeoutException e) { + log.info("Timeout exception: " + e + " - LANG: ${language.getShortNameWithVariant()} - PATTERN: ${params.pattern}") + timeOut = true + } + log.info("Checked rule: valid - LANG: ${language.getShortNameWithVariant()} - PATTERN: ${params.pattern} - BAD: ${params.incorrectExample1} - GOOD: ${params.correctExample1}") + [messagePreset: params.messageBackup, namePreset: params.nameBackup, + searcherResult: searcherResult, limit: CORPUS_MATCH_LIMIT, timeOut: timeOut] } else { log.info("Checked rule: invalid - LANG: ${language.getShortNameWithVariant()} - PATTERN: ${params.pattern} - BAD: ${params.incorrectExample1} - GOOD: ${params.correctExample1} - ${shortProblems}") render(template: 'checkRuleProblem', model: [problems: problems, hasRegex: hasRegex(patternRule), expertMode: false]) @@ -99,10 +103,21 @@ return } long startTime = System.currentTimeMillis() - SearcherResult searcherResult = searchService.checkRuleAgainstCorpus(patternRule, language, EXPERT_MODE_CORPUS_MATCH_LIMIT) - long searchTime = System.currentTimeMillis() - startTime - log.info("Checked XML in ${language}, timeout (${SearchService.SEARCH_TIMEOUT_MILLIS}ms) triggered: ${searcherResult.resultIsTimeLimited}, time: ${searchTime}ms") - 
render(view: '_corpusResult', model: [searcherResult: searcherResult, expertMode: true, limit: EXPERT_MODE_CORPUS_MATCH_LIMIT]) + try { + SearcherResult searcherResult = searchService.checkRuleAgainstCorpus(patternRule, language, EXPERT_MODE_CORPUS_MATCH_LIMIT) + long searchTime = System.currentTimeMillis() - startTime + log.info("Checked XML in ${language}, timeout (${SearchService.SEARCH_TIMEOUT_MILLIS}ms) triggered: ${searcherResult.resultIsTimeLimited}, time: ${searchTime}ms") + render(view: '_corpusResult', model: [searcherResult: searcherResult, expertMode: true, limit: EXPERT_MODE_CORPUS_MATCH_LIMIT]) + } catch (SearchTimeoutException e) { + long searchTime = System.currentTimeMillis() - startTime + log.warn("Timeout checking XML in ${language}, timeout (${SearchService.SEARCH_TIMEOUT_MILLIS}ms), time: ${searchTime}ms, pattern: ${patternRule}") + problems.add("Sorry, there was a timeout when searching our Wikipedia data for matches. This can happen" + + " for patterns with some regular expressions, for example if the pattern starts with .*." + + " These kinds of patterns are currently not supported by this tool.") + render(template: 'checkRuleProblem', model: [problems: problems, hasRegex: hasRegex(patternRule), + expertMode: true, isOff: patternRule.isDefaultOff()]) + return + } } private void checkExampleSentences(PatternRule patternRule, Language language, List problems, List shortProblems) { Modified: trunk/ltcommunity/grails-app/views/ruleEditor/_checkRuleProblem.gsp =================================================================== --- trunk/ltcommunity/grails-app/views/ruleEditor/_checkRuleProblem.gsp 2012-08-12 15:45:49 UTC (rev 7847) +++ trunk/ltcommunity/grails-app/views/ruleEditor/_checkRuleProblem.gsp 2012-08-12 19:59:04 UTC (rev 7848) @@ -19,7 +19,7 @@ </div> - <p style="width:450px;margin-top: 5px">The examples sentences are used to test your rule. 
Your first + <p style="width:450px;margin-top: 5px">The example sentences are used to test your rule. Your first example sentence should contain the error so it can be found with the "Wrong words" pattern. The second example sentence should not contain the error. If you need help, <a target="_blank" href="http://www.languagetool.org/forum/">please ask in our forum</a>.</p> Modified: trunk/ltcommunity/grails-app/views/ruleEditor/_corpusResult.gsp =================================================================== --- trunk/ltcommunity/grails-app/views/ruleEditor/_corpusResult.gsp 2012-08-12 15:45:49 UTC (rev 7847) +++ trunk/ltcommunity/grails-app/views/ruleEditor/_corpusResult.gsp 2012-08-12 19:59:04 UTC (rev 7848) @@ -5,8 +5,14 @@ <g:if test="${searcherResult.getMatchingSentences().size() == 0}"> <p style="width:700px;">We've checked your pattern against ${docsChecked} documents - from the ${params.language.encodeAsHTML()} <a href="http://www.wikipedia.org">Wikipedia</a> and found no matches.</p> + from the ${params.language.encodeAsHTML()} <a href="http://www.wikipedia.org">Wikipedia</a> and found no matches. + That's a good sign, it means your rule doesn't trigger any false alarms at least + in the documents we checked.</p> + <p>Your example sentences are also correct.</p> + + <g:render template="submitRule"/> + </g:if> <g:else> @@ -41,4 +47,12 @@ </g:if> <g:else> + <g:if test="${timeOut}"> + <p class="warn"> + Sorry, there was a timeout when searching our Wikipedia data for matches. This can happen + for patterns with some regular expressions, for example if the pattern starts with .*. + These kinds of patterns are currently not supported by this tool. You can continue + creating the rule anyway. 
+ </p> + </g:if> </g:else> Added: trunk/ltcommunity/grails-app/views/ruleEditor/_submitRule.gsp =================================================================== --- trunk/ltcommunity/grails-app/views/ruleEditor/_submitRule.gsp (rev 0) +++ trunk/ltcommunity/grails-app/views/ruleEditor/_submitRule.gsp 2012-08-12 19:59:04 UTC (rev 7848) @@ -0,0 +1,4 @@ +<p>If your rule might be useful to others, please consider submitting it to the +LanguageTool team, either via <a target="_blank" href="http://www.languagetool.org/forum/">our forum</a> +or via <a target="_blank" href="https://lists.sourceforge.net/lists/listinfo/languagetool-devel">our mailing list</a>. +Thank you!</p> Modified: trunk/ltcommunity/grails-app/views/ruleEditor/createXml.gsp =================================================================== --- trunk/ltcommunity/grails-app/views/ruleEditor/createXml.gsp 2012-08-12 15:45:49 UTC (rev 7847) +++ trunk/ltcommunity/grails-app/views/ruleEditor/createXml.gsp 2012-08-12 19:59:04 UTC (rev 7848) @@ -20,6 +20,8 @@ can only create simple rules. See <a target="devdocumentation" href="http://www.languagetool.org/development/">our development documentation</a> for more features.</p> + <g:render template="submitRule"/> + </g:else> </td> </tr> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. ------------------------------------------------------------------------------ Live Security Virtual Conference Exclusive live event will cover all the ways today's security and threat landscape has changed and how IT managers can respond. Discussions will include endpoint security, mobile security and the latest in malware threats. http://www.accelacomm.com/jaw/sfrnl04242012/114/50122263/ _______________________________________________ Languagetool-cvs mailing list Languagetool-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/languagetool-cvs