Author: dogacan Date: Sun Jun 17 13:27:17 2007 New Revision: 548103 URL: http://svn.apache.org/viewvc?view=rev&rev=548103 Log: NUTCH-485 - Change HtmlParseFilter 's to return ParseResult object instead of Parse object.
Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilter.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseResult.java lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=548103&r1=548102&r2=548103 ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Sun Jun 17 13:27:17 2007 @@ -39,6 +39,8 @@ datums. This patch addresses that issue. Now, if Fetcher gets a null content, instead of pushing an empty content, it filters the null content. +13. NUTCH-485 - Change HtmlParseFilter 's to return ParseResult object instead of Parse object. (Gal Nitzan via dogacan) + Release 0.9 - 2007-04-02 1. Changed log4j confiquration to log to stdout on commandline Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilter.java?view=diff&rev=548103&r1=548102&r2=548103 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilter.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilter.java Sun Jun 17 13:27:17 2007 @@ -38,5 +38,5 @@ /** Adds metadata or otherwise modifies a parse of HTML content, given * the DOM tree of a page. 
*/ - Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc); + ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc); } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java?view=diff&rev=548103&r1=548102&r2=548103 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java Sun Jun 17 13:27:17 2007 @@ -59,18 +59,23 @@ /** Run all defined filters. */ public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) { - ParseResult filteredParseResult = new ParseResult(content.getUrl()); - - for (java.util.Map.Entry<Text, Parse> entry : parseResult) { - Parse parse = entry.getValue(); - for (int i = 0 ; i < this.htmlParseFilters.length; i++) { - parse = this.htmlParseFilters[i].filter(content, parse, metaTags, doc); - if (!parse.getData().getStatus().isSuccess()) break; + // loop on each filter + for (int i = 0 ; i < this.htmlParseFilters.length; i++) { + // call filter interface + parseResult = + htmlParseFilters[i].filter(content, parseResult, metaTags, doc); + + // any failure on parse obj, return + if (!parseResult.isSuccess()) { + // TODO: What happens when parseResult.isEmpty() ? + // Maybe clone parseResult and use parseResult as backup... 
+ + // remove failed parse before return + parseResult.filter(); + return parseResult; } - filteredParseResult.put(entry.getKey(), - new ParseText(parse.getText()), parse.getData()); } - return filteredParseResult; + return parseResult; } } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseResult.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseResult.java?view=diff&rev=548103&r1=548102&r2=548103 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseResult.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseResult.java Sun Jun 17 13:27:17 2007 @@ -139,4 +139,18 @@ } } + + /** + * A convenience method which returns true only if all parses are successful. + * Parse success is determined by {@link ParseStatus#isSuccess()} + */ + public boolean isSuccess() { + for(Iterator<Entry<Text, Parse>> i = iterator(); i.hasNext();) { + Entry<Text, Parse> entry = i.next(); + if (!entry.getValue().getData().getStatus().isSuccess()) { + return false; + } + } + return true; + } } Modified: lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java?view=diff&rev=548103&r1=548102&r2=548103 ============================================================================== --- lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java (original) +++ lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java Sun Jun 17 13:27:17 2007 @@ -22,6 +22,7 @@ import org.apache.nutch.protocol.Content; import org.apache.nutch.metadata.Metadata; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; import org.apache.commons.logging.Log; import 
org.apache.commons.logging.LogFactory; @@ -263,24 +264,35 @@ /** Adds metadata or otherwise modifies a parse of an HTML document, given * the DOM tree of a page. */ - public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) { + public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) { + + // get parse obj + Parse parse = parseResult.get(content.getUrl()); // construct base url URL base; try { base = new URL(content.getBaseUrl()); } catch (MalformedURLException e) { - return new ParseStatus(e).getEmptyParse(getConf()); + Parse emptyParse = new ParseStatus(e).getEmptyParse(getConf()); + parseResult.put(content.getUrl(), + new ParseText(emptyParse.getText()), + emptyParse.getData()); + return parseResult; } try { // extract license metadata Walker.walk(doc, base, parse.getData().getParseMeta(), getConf()); } catch (ParseException e) { - return new ParseStatus(e).getEmptyParse(getConf()); + Parse emptyParse = new ParseStatus(e).getEmptyParse(getConf()); + parseResult.put(content.getUrl(), + new ParseText(emptyParse.getText()), + emptyParse.getData()); + return parseResult; } - return parse; + return parseResult; } public void setConf(Configuration conf) { Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java?view=diff&rev=548103&r1=548102&r2=548103 ============================================================================== --- lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java (original) +++ lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java Sun Jun 17 13:27:17 2007 @@ -30,11 +30,13 @@ import org.apache.nutch.metadata.Metadata; import 
org.apache.nutch.parse.HTMLMetaTags; import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseResult; import org.apache.nutch.parse.HtmlParseFilter; import org.apache.nutch.protocol.Content; // Hadoop imports import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; // DOM imports import org.w3c.dom.DocumentFragment; @@ -84,8 +86,10 @@ * <li>3. meta http-equiv (content-language) (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2) * <br>Only the first occurence of language is stored. */ - public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) { + public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) { + Parse parse = parseResult.get(content.getUrl()); + // Trying to find the document's language LanguageParser parser = new LanguageParser(doc); String lang = parser.getLanguage(); @@ -93,7 +97,7 @@ if (lang != null) { parse.getData().getParseMeta().set(Metadata.LANGUAGE, lang); } - return parse; + return parseResult; } static class LanguageParser { Modified: lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java?view=diff&rev=548103&r1=548102&r2=548103 ============================================================================== --- lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java (original) +++ lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java Sun Jun 17 13:27:17 2007 @@ -35,12 +35,14 @@ import org.apache.nutch.metadata.Metadata; import org.apache.nutch.parse.HTMLMetaTags; import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseResult; import 
org.apache.nutch.parse.HtmlParseFilter; import org.apache.nutch.protocol.Content; import org.apache.nutch.util.StringUtil; // Hadoop imports import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; /** @@ -63,8 +65,11 @@ /** * Scan the HTML document looking at possible rel-tags */ - public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) { + public ParseResult filter(Content content, ParseResult parseResult, + HTMLMetaTags metaTags, DocumentFragment doc) { + // get parse obj + Parse parse = parseResult.get(content.getUrl()); // Trying to find the document's rel-tags Parser parser = new Parser(doc); Set tags = parser.getRelTags(); @@ -73,7 +78,7 @@ while (iter.hasNext()) { metadata.add(REL_TAG, (String) iter.next()); } - return parse; + return parseResult; } private static class Parser { Modified: lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?view=diff&rev=548103&r1=548102&r2=548103 ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (original) +++ lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Sun Jun 17 13:27:17 2007 @@ -36,11 +36,13 @@ import org.apache.nutch.parse.ParseData; import org.apache.nutch.parse.ParseImpl; import org.apache.nutch.parse.ParseResult; +import org.apache.nutch.parse.ParseText; import org.apache.nutch.parse.ParseStatus; import org.apache.nutch.parse.Parser; import org.apache.nutch.protocol.Content; import org.apache.nutch.util.NutchConfiguration; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; import org.apache.oro.text.regex.MatchResult; import org.apache.oro.text.regex.Pattern; import 
org.apache.oro.text.regex.PatternCompiler; @@ -69,7 +71,11 @@ private Configuration conf; - public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) { + public ParseResult filter(Content content, ParseResult parseResult, + HTMLMetaTags metaTags, DocumentFragment doc) { + + Parse parse = parseResult.get(content.getUrl()); + String url = content.getBaseUrl(); ArrayList outlinks = new ArrayList(); walk(doc, parse, metaTags, url, outlinks); @@ -85,9 +91,11 @@ parse.getData().getContentMeta(), parse.getData().getParseMeta()); parseData.setConf(this.conf); - parse = new ParseImpl(text, parseData); + + // replace original parse obj with new one + parseResult.put(content.getUrl(), new ParseText(text), parseData); } - return parse; + return parseResult; } private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base, List outlinks) { ------------------------------------------------------------------------- This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. http://sourceforge.net/powerbar/db2/ _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs