Author: dogacan Date: Fri Apr 18 11:52:38 2008 New Revision: 649652 URL: http://svn.apache.org/viewvc?rev=649652&view=rev Log: NUTCH-596 - ParseSegments parse content even if it's not CrawlDatum.STATUS_FETCH_SUCCESS.
Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=649652&r1=649651&r2=649652&view=diff ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Fri Apr 18 11:52:38 2008 @@ -248,6 +248,8 @@ 90. NUTCH-500 - Add hadoop masters configuration file into conf folder. (Emmanuel Joke via kubes) +91. NUTCH-596 - ParseSegments parse content even if its not + CrawlDatum.STATUS_FETCH_SUCCESS (dogacan) Release 0.9 - 2007-04-02 Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=649652&r1=649651&r2=649652&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Fri Apr 18 11:52:38 2008 @@ -344,6 +344,14 @@ datum.setSignature(signature); } } + + if (!parsing) { + /* Store status code in content if we are not parsing in fetcher. + * So we can read this value during parsing (as a separate job) and + * decide to parse or not. 
+ * */ + content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status)); + } } try { Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java?rev=649652&r1=649651&r2=649652&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Fri Apr 18 11:52:38 2008 @@ -746,6 +746,14 @@ datum.setSignature(signature); } } + + if (!parsing) { + /* Store status code in content if we are not parsing in fetcher. + * So we can read this value during parsing (as a separate job) and + * decide to parse or not. + * */ + content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status)); + } } try { Modified: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java?rev=649652&r1=649651&r2=649652&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java Fri Apr 18 11:52:38 2008 @@ -48,6 +48,8 @@ public static final Text WRITABLE_PROTO_STATUS_KEY = new Text(PROTO_STATUS_KEY); public static final String FETCH_TIME_KEY = "_ftk_"; + + public static final String FETCH_STATUS_KEY = "_fst_"; /** Sites may request that search engines don't provide access to cached documents. 
*/ public static final String CACHING_FORBIDDEN_KEY = "caching.forbidden"; Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=649652&r1=649651&r2=649652&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Fri Apr 18 11:52:38 2008 @@ -20,6 +20,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.SignatureFactory; import org.apache.hadoop.io.*; import org.apache.hadoop.mapred.*; @@ -67,6 +68,14 @@ if (key instanceof UTF8) { newKey.set(key.toString()); key = newKey; + } + + int status = + Integer.parseInt(content.getMetadata().get(Nutch.FETCH_STATUS_KEY)); + if (status != CrawlDatum.STATUS_FETCH_SUCCESS) { + // content not fetched successfully, skip document + LOG.debug("Skipping " + key + " as content is not fetched successfully"); + return; } ParseResult parseResult = null;