Revision: 7834
http://languagetool.svn.sourceforge.net/languagetool/?rev=7834&view=rev
Author: dnaber
Date: 2012-08-11 22:29:29 +0000 (Sat, 11 Aug 2012)
Log Message:
-----------
wikipedia indexing: slightly more verbose output during indexing; print number of searched documents, not number of sentences
Modified Paths: -------------- trunk/JLanguageTool/src/dev/org/languagetool/dev/index/Searcher.java trunk/JLanguageTool/src/dev/org/languagetool/dev/index/SearcherResult.java trunk/JLanguageTool/src/dev/org/languagetool/dev/wikipedia/WikipediaIndexHandler.java trunk/ltcommunity/grails-app/views/ruleEditor/_corpusResult.gsp Modified: trunk/JLanguageTool/src/dev/org/languagetool/dev/index/Searcher.java =================================================================== --- trunk/JLanguageTool/src/dev/org/languagetool/dev/index/Searcher.java 2012-08-11 21:14:46 UTC (rev 7833) +++ trunk/JLanguageTool/src/dev/org/languagetool/dev/index/Searcher.java 2012-08-11 22:29:29 UTC (rev 7834) @@ -25,14 +25,8 @@ import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.ScoreDoc; -import org.apache.lucene.search.Sort; -import org.apache.lucene.search.SortField; -import org.apache.lucene.search.TimeLimitingCollector; -import org.apache.lucene.search.TopDocs; -import org.apache.lucene.search.TopFieldCollector; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.*; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Counter; import org.languagetool.JLanguageTool; @@ -42,6 +36,10 @@ import org.languagetool.rules.patterns.PatternRule; import org.languagetool.rules.patterns.PatternRuleLoader; +import static org.languagetool.dev.wikipedia.WikipediaIndexHandler.MAX_DOC_COUNT_FIELD; +import static org.languagetool.dev.wikipedia.WikipediaIndexHandler.MAX_DOC_COUNT_FIELD_VAL; +import static org.languagetool.dev.wikipedia.WikipediaIndexHandler.MAX_DOC_COUNT_VALUE; + /** * A class with a main() method that takes a rule id and the location of the * index that runs the query on that index and prints all matches. 
@@ -70,6 +68,17 @@ } } + private int getDocCount(IndexSearcher indexSearcher) throws IOException { + final Term searchTerm = new Term(MAX_DOC_COUNT_FIELD, MAX_DOC_COUNT_FIELD_VAL); + final TopDocs search = indexSearcher.search(new TermQuery(searchTerm), 1); + if (search.totalHits != 1) { + throw new RuntimeException("Got " + search.totalHits + " hits for the docCount query in " + indexSearcher.getIndexReader() + ", expected 1"); + } + final ScoreDoc scoreDoc = search.scoreDocs[0]; + final Document doc = indexSearcher.doc(scoreDoc.doc); + return Integer.parseInt(doc.get(MAX_DOC_COUNT_VALUE)); + } + public int getMaxHits() { return maxHits; } @@ -97,6 +106,7 @@ final List<MatchingSentence> matchingSentences = findMatchingSentences(indexSearcher, limitedTopDocs.topDocs, languageTool); final int sentencesChecked = getSentenceCheckCount(query, indexSearcher); final SearcherResult searcherResult = new SearcherResult(matchingSentences, sentencesChecked, query); + searcherResult.setDocCount(getDocCount(indexSearcher)); searcherResult.setResultIsTimeLimited(limitedTopDocs.resultIsTimeLimited); return searcherResult; } Modified: trunk/JLanguageTool/src/dev/org/languagetool/dev/index/SearcherResult.java =================================================================== --- trunk/JLanguageTool/src/dev/org/languagetool/dev/index/SearcherResult.java 2012-08-11 21:14:46 UTC (rev 7833) +++ trunk/JLanguageTool/src/dev/org/languagetool/dev/index/SearcherResult.java 2012-08-11 22:29:29 UTC (rev 7834) @@ -30,6 +30,7 @@ private final Searcher.PossiblyRelaxedQuery possiblyRelaxedQuery; private boolean resultIsTimeLimited; + private int docCount; public SearcherResult(List<MatchingSentence> matchingSentences, int checkedSentences, Searcher.PossiblyRelaxedQuery relaxedQuery) { this.matchingSentences = matchingSentences; @@ -60,4 +61,12 @@ public void setResultIsTimeLimited(boolean resultIsTimeLimited) { this.resultIsTimeLimited = resultIsTimeLimited; } + + public void setDocCount(int 
docCount) { + this.docCount = docCount; + } + + public int getDocCount() { + return docCount; + } } Modified: trunk/JLanguageTool/src/dev/org/languagetool/dev/wikipedia/WikipediaIndexHandler.java =================================================================== --- trunk/JLanguageTool/src/dev/org/languagetool/dev/wikipedia/WikipediaIndexHandler.java 2012-08-11 21:14:46 UTC (rev 7833) +++ trunk/JLanguageTool/src/dev/org/languagetool/dev/wikipedia/WikipediaIndexHandler.java 2012-08-11 22:29:29 UTC (rev 7834) @@ -20,10 +20,14 @@ import java.io.File; import java.io.FileInputStream; +import java.io.IOException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.StringField; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.languagetool.Language; @@ -35,14 +39,18 @@ /** * - * Wikipedia handler for indexing. + * Wikipedia handler for indexing. See {@link org.languagetool.dev.index.Searcher} for a + * class that lets you use this index. 
* * @author Tao Lin */ public class WikipediaIndexHandler extends DefaultHandler { + public static final String MAX_DOC_COUNT_VALUE = "maxDocCountValue"; + public static final String MAX_DOC_COUNT_FIELD = "maxDocCount"; + public static final String MAX_DOC_COUNT_FIELD_VAL = "1"; + private final Indexer indexer; - private int articleCount = 0; // the number of the wiki page to start indexing @@ -51,9 +59,7 @@ private int end = 0; private boolean inText = false; - private StringBuilder text = new StringBuilder(); - private TextFilter textFilter = new BlikiWikipediaTextFilter(); // =========================================================== @@ -118,6 +124,13 @@ indexer.close(); } + private void writeMetaDocuments() throws IOException { + final Document doc = new Document(); + doc.add(new StringField(MAX_DOC_COUNT_FIELD, MAX_DOC_COUNT_FIELD_VAL, Field.Store.YES)); + doc.add(new StringField(MAX_DOC_COUNT_VALUE, articleCount + "", Field.Store.YES)); + indexer.add(doc); + } + public static void main(String... 
args) throws Exception { if (args.length != 4) { System.out.println("Usage: " + WikipediaIndexHandler.class.getSimpleName() + " <wikipediaDump> <indexDir> <languageCode> <maxDocs>"); @@ -127,28 +140,37 @@ System.out.println("\t<maxDocs> maximum number of documents to be indexed, use 0 for no limit"); System.exit(1); } + final File dumpFile = new File(args[0]); + final File indexDir = new File(args[1]); final String languageCode = args[2]; + final int maxDocs = Integer.parseInt(args[3]); + final Language language = Language.getLanguageForShortName(languageCode); if (language == null) { throw new RuntimeException("Could not find language '" + languageCode + "'"); } - final int maxDocs = Integer.parseInt(args[3]); if (maxDocs == 0) { - System.out.println("Going to index all documents from input"); + System.out.println("Going to index all documents from " + dumpFile); } else { - System.out.println("Going to index up to " + maxDocs + " documents"); + System.out.println("Going to index up to " + maxDocs + " documents from " + dumpFile); } + System.out.println("Output index dir: " + indexDir); final long start = System.currentTimeMillis(); final SAXParserFactory factory = SAXParserFactory.newInstance(); final SAXParser saxParser = factory.newSAXParser(); - final FSDirectory fsDirectory = FSDirectory.open(new File(args[1])); - final WikipediaIndexHandler handler = new WikipediaIndexHandler(fsDirectory, language, 1, maxDocs); + final FSDirectory fsDirectory = FSDirectory.open(indexDir); try { - saxParser.parse(new FileInputStream(new File(args[0])), handler); - } catch (DocumentLimitReachedException e) { - System.out.println("Document limit (" + e.limit + ") reached, stopping indexing"); + final WikipediaIndexHandler handler = new WikipediaIndexHandler(fsDirectory, language, 1, maxDocs); + try { + saxParser.parse(new FileInputStream(dumpFile), handler); + } catch (DocumentLimitReachedException e) { + System.out.println("Document limit (" + e.limit + ") reached, stopping 
indexing"); + } finally { + handler.writeMetaDocuments(); + handler.close(); + } } finally { - handler.close(); + fsDirectory.close(); } final long end = System.currentTimeMillis(); final float minutes = (end - start) / (float)(1000 * 60); Modified: trunk/ltcommunity/grails-app/views/ruleEditor/_corpusResult.gsp =================================================================== --- trunk/ltcommunity/grails-app/views/ruleEditor/_corpusResult.gsp 2012-08-11 21:14:46 UTC (rev 7833) +++ trunk/ltcommunity/grails-app/views/ruleEditor/_corpusResult.gsp 2012-08-11 22:29:29 UTC (rev 7834) @@ -1,10 +1,11 @@ <g:if test="${searcherResult}"> <g:set var="sentencesChecked" value="${formatNumber(number:searcherResult.getCheckedSentences(), type: 'number')}"/> + <g:set var="docsChecked" value="${formatNumber(number:searcherResult.getDocCount(), type: 'number')}"/> <g:if test="${searcherResult.getMatchingSentences().size() == 0}"> - <p style="width:700px;">We've checked your pattern against ${sentencesChecked} sentences - from <a href="http://www.wikipedia.org">Wikipedia</a> and found no matches.</p> + <p style="width:700px;">We've checked your pattern against ${docsChecked} documents + from the ${params.language.encodeAsHTML()} <a href="http://www.wikipedia.org">Wikipedia</a> and found no matches.</p> </g:if> <g:else> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. ------------------------------------------------------------------------------ Live Security Virtual Conference Exclusive live event will cover all the ways today's security and threat landscape has changed and how IT managers can respond. Discussions will include endpoint security, mobile security and the latest in malware threats. 
http://www.accelacomm.com/jaw/sfrnl04242012/114/50122263/
_______________________________________________
Languagetool-cvs mailing list
Languagetool-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/languagetool-cvs