Revision: 7834
          
http://languagetool.svn.sourceforge.net/languagetool/?rev=7834&view=rev
Author:   dnaber
Date:     2012-08-11 22:29:29 +0000 (Sat, 11 Aug 2012)
Log Message:
-----------
wikipedia indexing: slightly more verbose output during indexing; print number 
of searched documents, not number of sentences

Modified Paths:
--------------
    trunk/JLanguageTool/src/dev/org/languagetool/dev/index/Searcher.java
    trunk/JLanguageTool/src/dev/org/languagetool/dev/index/SearcherResult.java
    
trunk/JLanguageTool/src/dev/org/languagetool/dev/wikipedia/WikipediaIndexHandler.java
    trunk/ltcommunity/grails-app/views/ruleEditor/_corpusResult.gsp

Modified: trunk/JLanguageTool/src/dev/org/languagetool/dev/index/Searcher.java
===================================================================
--- trunk/JLanguageTool/src/dev/org/languagetool/dev/index/Searcher.java        
2012-08-11 21:14:46 UTC (rev 7833)
+++ trunk/JLanguageTool/src/dev/org/languagetool/dev/index/Searcher.java        
2012-08-11 22:29:29 UTC (rev 7834)
@@ -25,14 +25,8 @@
 
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.ScoreDoc;
-import org.apache.lucene.search.Sort;
-import org.apache.lucene.search.SortField;
-import org.apache.lucene.search.TimeLimitingCollector;
-import org.apache.lucene.search.TopDocs;
-import org.apache.lucene.search.TopFieldCollector;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.*;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.util.Counter;
 import org.languagetool.JLanguageTool;
@@ -42,6 +36,10 @@
 import org.languagetool.rules.patterns.PatternRule;
 import org.languagetool.rules.patterns.PatternRuleLoader;
 
+import static 
org.languagetool.dev.wikipedia.WikipediaIndexHandler.MAX_DOC_COUNT_FIELD;
+import static 
org.languagetool.dev.wikipedia.WikipediaIndexHandler.MAX_DOC_COUNT_FIELD_VAL;
+import static 
org.languagetool.dev.wikipedia.WikipediaIndexHandler.MAX_DOC_COUNT_VALUE;
+
 /**
  * A class with a main() method that takes a rule id  and the location of the
  * index that runs the query on that index and prints all matches.
@@ -70,6 +68,17 @@
     }
   }
 
+  private int getDocCount(IndexSearcher indexSearcher) throws IOException {
+    final Term searchTerm = new Term(MAX_DOC_COUNT_FIELD, 
MAX_DOC_COUNT_FIELD_VAL);
+    final TopDocs search = indexSearcher.search(new TermQuery(searchTerm), 1);
+    if (search.totalHits != 1) {
+      throw new RuntimeException("Got " + search.totalHits + " hits for the 
docCount query in " + indexSearcher.getIndexReader() + ", expected 1");
+    }
+    final ScoreDoc scoreDoc = search.scoreDocs[0];
+    final Document doc = indexSearcher.doc(scoreDoc.doc);
+    return Integer.parseInt(doc.get(MAX_DOC_COUNT_VALUE));
+  }
+
   public int getMaxHits() {
     return maxHits;
   }
@@ -97,6 +106,7 @@
     final List<MatchingSentence> matchingSentences = 
findMatchingSentences(indexSearcher, limitedTopDocs.topDocs, languageTool);
     final int sentencesChecked = getSentenceCheckCount(query, indexSearcher);
     final SearcherResult searcherResult = new 
SearcherResult(matchingSentences, sentencesChecked, query);
+    searcherResult.setDocCount(getDocCount(indexSearcher));
     searcherResult.setResultIsTimeLimited(limitedTopDocs.resultIsTimeLimited);
     return searcherResult;
   }

Modified: 
trunk/JLanguageTool/src/dev/org/languagetool/dev/index/SearcherResult.java
===================================================================
--- trunk/JLanguageTool/src/dev/org/languagetool/dev/index/SearcherResult.java  
2012-08-11 21:14:46 UTC (rev 7833)
+++ trunk/JLanguageTool/src/dev/org/languagetool/dev/index/SearcherResult.java  
2012-08-11 22:29:29 UTC (rev 7834)
@@ -30,6 +30,7 @@
   private final Searcher.PossiblyRelaxedQuery possiblyRelaxedQuery;
 
   private boolean resultIsTimeLimited;
+  private int docCount;
 
   public SearcherResult(List<MatchingSentence> matchingSentences, int 
checkedSentences, Searcher.PossiblyRelaxedQuery relaxedQuery) {
     this.matchingSentences = matchingSentences;
@@ -60,4 +61,12 @@
   public void setResultIsTimeLimited(boolean resultIsTimeLimited) {
     this.resultIsTimeLimited = resultIsTimeLimited;
   }
+
+  public void setDocCount(int docCount) {
+    this.docCount = docCount;
+  }
+
+  public int getDocCount() {
+    return docCount;
+  }
 }

Modified: 
trunk/JLanguageTool/src/dev/org/languagetool/dev/wikipedia/WikipediaIndexHandler.java
===================================================================
--- 
trunk/JLanguageTool/src/dev/org/languagetool/dev/wikipedia/WikipediaIndexHandler.java
       2012-08-11 21:14:46 UTC (rev 7833)
+++ 
trunk/JLanguageTool/src/dev/org/languagetool/dev/wikipedia/WikipediaIndexHandler.java
       2012-08-11 22:29:29 UTC (rev 7834)
@@ -20,10 +20,14 @@
 
 import java.io.File;
 import java.io.FileInputStream;
+import java.io.IOException;
 
 import javax.xml.parsers.SAXParser;
 import javax.xml.parsers.SAXParserFactory;
 
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
 import org.languagetool.Language;
@@ -35,14 +39,18 @@
 
 /**
  * 
- * Wikipedia handler for indexing.
+ * Wikipedia handler for indexing. See {@link 
org.languagetool.dev.index.Searcher} for a
+ * class that lets you use this index.
  * 
  * @author Tao Lin
  */
 public class WikipediaIndexHandler extends DefaultHandler {
 
+  public static final String MAX_DOC_COUNT_VALUE = "maxDocCountValue";
+  public static final String MAX_DOC_COUNT_FIELD = "maxDocCount";
+  public static final String MAX_DOC_COUNT_FIELD_VAL = "1";
+
   private final Indexer indexer;
-
   private int articleCount = 0;
   
   // the number of the wiki page to start indexing
@@ -51,9 +59,7 @@
   private int end = 0;
 
   private boolean inText = false;
-
   private StringBuilder text = new StringBuilder();
-
   private TextFilter textFilter = new BlikiWikipediaTextFilter();
 
   // ===========================================================
@@ -118,6 +124,13 @@
     indexer.close();
   }
 
+  private void writeMetaDocuments() throws IOException {
+    final Document doc = new Document();
+    doc.add(new StringField(MAX_DOC_COUNT_FIELD, MAX_DOC_COUNT_FIELD_VAL, 
Field.Store.YES));
+    doc.add(new StringField(MAX_DOC_COUNT_VALUE, articleCount + "", 
Field.Store.YES));
+    indexer.add(doc);
+  }
+
   public static void main(String... args) throws Exception {
     if (args.length != 4) {
       System.out.println("Usage: " + 
WikipediaIndexHandler.class.getSimpleName() + " <wikipediaDump> <indexDir> 
<languageCode> <maxDocs>");
@@ -127,28 +140,37 @@
       System.out.println("\t<maxDocs> maximum number of documents to be 
indexed, use 0 for no limit");
       System.exit(1);
     }
+    final File dumpFile = new File(args[0]);
+    final File indexDir = new File(args[1]);
     final String languageCode = args[2];
+    final int maxDocs = Integer.parseInt(args[3]);
+
     final Language language = Language.getLanguageForShortName(languageCode);
     if (language == null) {
       throw new RuntimeException("Could not find language '" + languageCode + 
"'");
     }
-    final int maxDocs = Integer.parseInt(args[3]);
     if (maxDocs == 0) {
-      System.out.println("Going to index all documents from input");
+      System.out.println("Going to index all documents from " + dumpFile);
     } else {
-      System.out.println("Going to index up to " + maxDocs + " documents");
+      System.out.println("Going to index up to " + maxDocs + " documents from 
" + dumpFile);
     }
+    System.out.println("Output index dir: " + indexDir);
     final long start = System.currentTimeMillis();
     final SAXParserFactory factory = SAXParserFactory.newInstance();
     final SAXParser saxParser = factory.newSAXParser();
-    final FSDirectory fsDirectory = FSDirectory.open(new File(args[1]));
-    final WikipediaIndexHandler handler = new 
WikipediaIndexHandler(fsDirectory, language, 1, maxDocs);
+    final FSDirectory fsDirectory = FSDirectory.open(indexDir);
     try {
-      saxParser.parse(new FileInputStream(new File(args[0])), handler);
-    } catch (DocumentLimitReachedException e) {
-      System.out.println("Document limit (" + e.limit + ") reached, stopping 
indexing");
+      final WikipediaIndexHandler handler = new 
WikipediaIndexHandler(fsDirectory, language, 1, maxDocs);
+      try {
+        saxParser.parse(new FileInputStream(dumpFile), handler);
+      } catch (DocumentLimitReachedException e) {
+        System.out.println("Document limit (" + e.limit + ") reached, stopping 
indexing");
+      } finally {
+        handler.writeMetaDocuments();
+        handler.close();
+      }
     } finally {
-      handler.close();
+      fsDirectory.close();
     }
     final long end = System.currentTimeMillis();
     final float minutes = (end - start) / (float)(1000 * 60);

Modified: trunk/ltcommunity/grails-app/views/ruleEditor/_corpusResult.gsp
===================================================================
--- trunk/ltcommunity/grails-app/views/ruleEditor/_corpusResult.gsp     
2012-08-11 21:14:46 UTC (rev 7833)
+++ trunk/ltcommunity/grails-app/views/ruleEditor/_corpusResult.gsp     
2012-08-11 22:29:29 UTC (rev 7834)
@@ -1,10 +1,11 @@
 <g:if test="${searcherResult}">
     <g:set var="sentencesChecked" 
value="${formatNumber(number:searcherResult.getCheckedSentences(), type: 
'number')}"/>
+    <g:set var="docsChecked" 
value="${formatNumber(number:searcherResult.getDocCount(), type: 'number')}"/>
 
     <g:if test="${searcherResult.getMatchingSentences().size() == 0}">
 
-        <p style="width:700px;">We've checked your pattern against 
${sentencesChecked} sentences
-        from <a href="http://www.wikipedia.org">Wikipedia</a> and found no 
matches.</p>
+        <p style="width:700px;">We've checked your pattern against 
${docsChecked} documents
+        from the ${params.language.encodeAsHTML()} <a 
href="http://www.wikipedia.org">Wikipedia</a> and found no matches.</p>
 
     </g:if>
     <g:else>

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Live Security Virtual Conference
Exclusive live event will cover all the ways today's security and 
threat landscape has changed and how IT managers can respond. Discussions 
will include endpoint security, mobile security and the latest in malware 
threats. http://www.accelacomm.com/jaw/sfrnl04242012/114/50122263/
_______________________________________________
Languagetool-cvs mailing list
Languagetool-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/languagetool-cvs

Reply via email to