Author: ab
Date: Wed May 9 12:36:54 2007
New Revision: 536629

URL: http://svn.apache.org/viewvc?view=rev&rev=536629
Log:
NUTCH-393 - Indexer should handle null documents returned by filters.
Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=536629&r1=536628&r2=536629 ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Wed May 9 12:36:54 2007 @@ -7,6 +7,9 @@ 2. NUTCH-443 - Allow parsers to return multiple Parse objects. (Dogacan Guney et al, via ab) + 3. NUTCH-393 - Indexer should handle null documents returned by filters. + (Eelco Lempsink via ab) + Release 0.9 - 2007-04-02 Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?view=diff&rev=536629&r1=536628&r2=536629 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Wed May 9 12:36:54 2007 @@ -218,6 +218,9 @@ return; } + // skip documents discarded by indexing filters + if (doc == null) return; + float boost = 1.0f; // run scoring filters try { Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java?view=diff&rev=536629&r1=536628&r2=536629 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java Wed May 9 12:36:54 2007 @@ -41,14 +41,15 @@ /** * Adds fields or 
otherwise modifies the document that will be indexed for a - * parse. + * parse. Unwanted documents can be removed from indexing by returning a null value. * * @param doc document instance for collecting fields * @param parse parse data instance * @param url page url * @param datum crawl datum for the page * @param inlinks page inlinks - * @return modified (or a new) document instance + * @return modified (or a new) document instance, or null (meaning the document + * should be discarded) * @throws IndexingException */ Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java?view=diff&rev=536629&r1=536628&r2=536629 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java Wed May 9 12:36:54 2007 @@ -108,6 +108,8 @@ Inlinks inlinks) throws IndexingException { for (int i = 0; i < this.indexingFilters.length; i++) { doc = this.indexingFilters[i].filter(doc, parse, url, datum, inlinks); + // break the loop if an indexing filter discards the doc + if (doc == null) return null; } return doc; ------------------------------------------------------------------------- This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. http://sourceforge.net/powerbar/db2/ _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs