Author: mattmann Date: Sun Jun 17 10:19:14 2007 New Revision: 548076 URL: http://svn.apache.org/viewvc?view=rev&rev=548076 Log: - fix for NUTCH-443 (contributed by Dogacan)
Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=548076&r1=548075&r2=548076 ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Sun Jun 17 10:19:14 2007 @@ -32,6 +32,13 @@ 11. NUTCH-495 - Unnecessary delays in Fetcher2 (dogacan) +12. NUTCH-443 - allow parsers to return multiple Parse objects, this will speed + up the rss parser (dogacan via mattmann). This update is a fix and semantics + change from the original patch for NUTCH-443. The original patch did not tell + the Indexer to read crawl_parse too so that it can pick up sub-urls' fetch + datums. This patch addresses that issue. Now, if Fetcher gets a null content, + instead of pushing an empty content, it filters the null content. + Release 0.9 - 2007-04-02 1. 
Changed log4j confiquration to log to stdout on commandline Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?view=diff&rev=548076&r1=548075&r2=548076 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Sun Jun 17 10:19:14 2007 @@ -288,78 +288,75 @@ datum.setFetchTime(System.currentTimeMillis()); if (pstatus != null) datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus); - if (content == null) { - String url = key.toString(); - content = new Content(url, url, new byte[0], "", new Metadata(), this.conf); - } - Metadata metadata = content.getMetadata(); - // add segment to metadata - metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName); - // add score to content metadata so that ParseSegment can pick it up. - try { - scfilters.passScoreBeforeParsing(key, datum, content); - } catch (Exception e) { - if (LOG.isWarnEnabled()) { - e.printStackTrace(LogUtil.getWarnStream(LOG)); - LOG.warn("Couldn't pass score, url " + key + " (" + e + ")"); - } - } - - /* Note: Fetcher will only follow meta-redirects coming from the - * original URL. */ ParseResult parseResult = null; - if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) { + if (content != null) { + Metadata metadata = content.getMetadata(); + // add segment to metadata + metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName); + // add score to content metadata so that ParseSegment can pick it up. 
try { - parseResult = this.parseUtil.parse(content); + scfilters.passScoreBeforeParsing(key, datum, content); } catch (Exception e) { - LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e)); + if (LOG.isWarnEnabled()) { + e.printStackTrace(LogUtil.getWarnStream(LOG)); + LOG.warn("Couldn't pass score, url " + key + " (" + e + ")"); + } } + /* Note: Fetcher will only follow meta-redirects coming from the + * original URL. */ + if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) { + try { + parseResult = this.parseUtil.parse(content); + } catch (Exception e) { + LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e)); + } - if (parseResult != null) { - for (Entry<Text, Parse> entry : parseResult) { - Text url = entry.getKey(); - Parse parse = entry.getValue(); - ParseStatus parseStatus = parse.getData().getStatus(); - - if (!parseStatus.isSuccess()) { - LOG.warn("Error parsing: " + key + ": " + parseStatus); - parse = parseStatus.getEmptyParse(getConf()); - } + if (parseResult != null) { + for (Entry<Text, Parse> entry : parseResult) { + Text url = entry.getKey(); + Parse parse = entry.getValue(); + ParseStatus parseStatus = parse.getData().getStatus(); + + if (!parseStatus.isSuccess()) { + LOG.warn("Error parsing: " + key + ": " + parseStatus); + parse = parseStatus.getEmptyParse(getConf()); + } - // Calculate page signature. 
For non-parsing fetchers this will - // be done in ParseSegment - byte[] signature = - SignatureFactory.getSignature(getConf()).calculate(content, parse); - // Ensure segment name and score are in parseData metadata - parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, - segmentName); - parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, - StringUtil.toHexString(signature)); - // Pass fetch time to content meta - parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY, - Long.toString(datum.getFetchTime())); - if (url.equals(key)) - datum.setSignature(signature); - try { - scfilters.passScoreAfterParsing(url, content, parse); - } catch (Exception e) { - if (LOG.isWarnEnabled()) { - e.printStackTrace(LogUtil.getWarnStream(LOG)); - LOG.warn("Couldn't pass score, url " + key + " (" + e + ")"); + // Calculate page signature. For non-parsing fetchers this will + // be done in ParseSegment + byte[] signature = + SignatureFactory.getSignature(getConf()).calculate(content, parse); + // Ensure segment name and score are in parseData metadata + parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, + segmentName); + parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, + StringUtil.toHexString(signature)); + // Pass fetch time to content meta + parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY, + Long.toString(datum.getFetchTime())); + if (url.equals(key)) + datum.setSignature(signature); + try { + scfilters.passScoreAfterParsing(url, content, parse); + } catch (Exception e) { + if (LOG.isWarnEnabled()) { + e.printStackTrace(LogUtil.getWarnStream(LOG)); + LOG.warn("Couldn't pass score, url " + key + " (" + e + ")"); + } } } + } else { + byte[] signature = + SignatureFactory.getSignature(getConf()).calculate(content, + new ParseStatus().getEmptyParse(conf)); + datum.setSignature(signature); } - } else { - byte[] signature = - SignatureFactory.getSignature(getConf()).calculate(content, - new ParseStatus().getEmptyParse(conf)); - 
datum.setSignature(signature); } } try { output.collect(key, new ObjectWritable(datum)); - if (storingContent) + if (content != null && storingContent) output.collect(key, new ObjectWritable(content)); if (parseResult != null) { for (Entry<Text, Parse> entry : parseResult) { Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java?view=diff&rev=548076&r1=548075&r2=548076 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Sun Jun 17 10:19:14 2007 @@ -662,72 +662,69 @@ datum.setFetchTime(System.currentTimeMillis()); if (pstatus != null) datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus); - if (content == null) { - String url = key.toString(); - content = new Content(url, url, new byte[0], "", new Metadata(), this.conf); - } - Metadata metadata = content.getMetadata(); - // add segment to metadata - metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName); - // add score to content metadata so that ParseSegment can pick it up. - try { - scfilters.passScoreBeforeParsing(key, datum, content); - } catch (Exception e) { - if (LOG.isWarnEnabled()) { - e.printStackTrace(LogUtil.getWarnStream(LOG)); - LOG.warn("Couldn't pass score, url " + key + " (" + e + ")"); - } - } - - /* Note: Fetcher will only follow meta-redirects coming from the - * original URL. */ ParseResult parseResult = null; - if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) { + if (content != null) { + Metadata metadata = content.getMetadata(); + // add segment to metadata + metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName); + // add score to content metadata so that ParseSegment can pick it up. 
try { - parseResult = this.parseUtil.parse(content); + scfilters.passScoreBeforeParsing(key, datum, content); } catch (Exception e) { - LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e)); + if (LOG.isWarnEnabled()) { + e.printStackTrace(LogUtil.getWarnStream(LOG)); + LOG.warn("Couldn't pass score, url " + key + " (" + e + ")"); + } } + /* Note: Fetcher will only follow meta-redirects coming from the + * original URL. */ + if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) { + try { + parseResult = this.parseUtil.parse(content); + } catch (Exception e) { + LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e)); + } - if (parseResult != null) { - for (Entry<Text, Parse> entry : parseResult) { - Text url = entry.getKey(); - Parse parse = entry.getValue(); - ParseStatus parseStatus = parse.getData().getStatus(); + if (parseResult != null) { + for (Entry<Text, Parse> entry : parseResult) { + Text url = entry.getKey(); + Parse parse = entry.getValue(); + ParseStatus parseStatus = parse.getData().getStatus(); - if (!parseStatus.isSuccess()) { - LOG.warn("Error parsing: " + key + ": " + parseStatus); - parse = parseStatus.getEmptyParse(getConf()); - } + if (!parseStatus.isSuccess()) { + LOG.warn("Error parsing: " + key + ": " + parseStatus); + parse = parseStatus.getEmptyParse(getConf()); + } - // Calculate page signature. 
For non-parsing fetchers this will - // be done in ParseSegment - byte[] signature = - SignatureFactory.getSignature(getConf()).calculate(content, parse); - // Ensure segment name and score are in parseData metadata - parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, - segmentName); - parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, - StringUtil.toHexString(signature)); - // Pass fetch time to content meta - parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY, - Long.toString(datum.getFetchTime())); - if (url.equals(key)) - datum.setSignature(signature); - try { - scfilters.passScoreAfterParsing(url, content, parse); - } catch (Exception e) { - if (LOG.isWarnEnabled()) { - e.printStackTrace(LogUtil.getWarnStream(LOG)); - LOG.warn("Couldn't pass score, url " + key + " (" + e + ")"); + // Calculate page signature. For non-parsing fetchers this will + // be done in ParseSegment + byte[] signature = + SignatureFactory.getSignature(getConf()).calculate(content, parse); + // Ensure segment name and score are in parseData metadata + parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, + segmentName); + parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, + StringUtil.toHexString(signature)); + // Pass fetch time to content meta + parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY, + Long.toString(datum.getFetchTime())); + if (url.equals(key)) + datum.setSignature(signature); + try { + scfilters.passScoreAfterParsing(url, content, parse); + } catch (Exception e) { + if (LOG.isWarnEnabled()) { + e.printStackTrace(LogUtil.getWarnStream(LOG)); + LOG.warn("Couldn't pass score, url " + key + " (" + e + ")"); + } } } + } else { + byte[] signature = + SignatureFactory.getSignature(getConf()).calculate(content, + new ParseStatus().getEmptyParse(conf)); + datum.setSignature(signature); } - } else { - byte[] signature = - SignatureFactory.getSignature(getConf()).calculate(content, - new ParseStatus().getEmptyParse(conf)); - 
datum.setSignature(signature); } } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?view=diff&rev=548076&r1=548075&r2=548076 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Sun Jun 17 10:19:14 2007 @@ -153,7 +153,6 @@ Inlinks inlinks = null; CrawlDatum dbDatum = null; CrawlDatum fetchDatum = null; - CrawlDatum redir = null; ParseData parseData = null; ParseText parseText = null; while (values.hasNext()) { @@ -168,11 +167,12 @@ // don't index unmodified (empty) pages if (datum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED) fetchDatum = datum; - } else if (CrawlDatum.STATUS_LINKED == datum.getStatus()) - // redirected page - redir = datum; - else + } else if (CrawlDatum.STATUS_LINKED == datum.getStatus() || + CrawlDatum.STATUS_SIGNATURE == datum.getStatus()) { + continue; + } else { throw new RuntimeException("Unexpected status: "+datum.getStatus()); + } } else if (value instanceof ParseData) { parseData = (ParseData)value; } else if (value instanceof ParseText) { @@ -181,11 +181,6 @@ LOG.warn("Unrecognized type: "+value.getClass()); } } - if (redir != null) { - // XXX page was redirected - what should we do? 
- // XXX discard it for now - return; - } if (fetchDatum == null || dbDatum == null || parseText == null || parseData == null) { @@ -260,6 +255,7 @@ LOG.info("Indexer: adding segment: " + segments[i]); } job.addInputPath(new Path(segments[i], CrawlDatum.FETCH_DIR_NAME)); + job.addInputPath(new Path(segments[i], CrawlDatum.PARSE_DIR_NAME)); job.addInputPath(new Path(segments[i], ParseData.DIR_NAME)); job.addInputPath(new Path(segments[i], ParseText.DIR_NAME)); } ------------------------------------------------------------------------- This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. http://sourceforge.net/powerbar/db2/ _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs