Author: dogacan Date: Thu Nov 8 05:18:05 2007 New Revision: 593151 URL: http://svn.apache.org/viewvc?rev=593151&view=rev Log: NUTCH-547 - Redirection handling: YahooSlurp's algorithm.
Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java lucene/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=593151&r1=593150&r2=593151&view=diff ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Thu Nov 8 05:18:05 2007 @@ -158,6 +158,9 @@ 54. NUTCH-565 - Arc File to Nutch Segments Converter. (kubes) +55. NUTCH-547 - Redirection handling: YahooSlurp's algorithm. + (dogacan, kubes via dogacan) + Release 0.9 - 2007-04-02 1. 
Changed log4j confiquration to log to stdout on commandline Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=593151&r1=593150&r2=593151&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Thu Nov 8 05:18:05 2007 @@ -18,6 +18,7 @@ package org.apache.nutch.fetcher; import java.io.IOException; +import java.net.MalformedURLException; import java.util.Map.Entry; // Commons Logging imports @@ -48,6 +49,12 @@ public static final Log LOG = LogFactory.getLog(Fetcher.class); + public static final int PERM_REFRESH_TIME = 5; + + public static final String CONTENT_REDIR = "content"; + + public static final String PROTOCOL_REDIR = "protocol"; + public static class InputFormat extends SequenceFileInputFormat { /** Don't split inputs, to keep things polite. */ public InputSplit[] getSplits(JobConf job, int nSplits) @@ -87,6 +94,9 @@ private ParseUtil parseUtil; private URLNormalizers normalizers; private ProtocolFactory protocolFactory; + private boolean redirecting; + private int redirectCount; + private String reprUrl; public FetcherThread(Configuration conf) { this.setDaemon(true); // don't hang JVM on exit @@ -130,14 +140,21 @@ } // url may be changed through redirects. 
- Text url = new Text(); - url.set(key); + Text url = new Text(key); + + Text reprUrlWritable = + (Text) datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY); + if (reprUrlWritable == null) { + reprUrl = key.toString(); + } else { + reprUrl = reprUrlWritable.toString(); + } + try { if (LOG.isInfoEnabled()) { LOG.info("fetching " + url); } // fetch the page - boolean redirecting; - int redirectCount = 0; + redirectCount = 0; do { if (LOG.isDebugEnabled()) { LOG.debug("redirectCount=" + redirectCount); @@ -149,6 +166,12 @@ Content content = output.getContent(); ParseStatus pstatus = null; + String urlString = url.toString(); + if (reprUrl != null && !reprUrl.equals(urlString)) { + datum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, + new Text(reprUrl)); + } + switch(status.getCode()) { case ProtocolStatus.SUCCESS: // got a page @@ -157,61 +180,28 @@ if (pstatus != null && pstatus.isSuccess() && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) { String newUrl = pstatus.getMessage(); - newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER); - newUrl = this.urlFilters.filter(newUrl); - if (newUrl != null && !newUrl.equals(url.toString())) { - // record that we were redirected - output(url, datum, null, status, CrawlDatum.STATUS_FETCH_REDIR_PERM); - url = new Text(newUrl); - if (maxRedirect > 0) { - redirecting = true; - redirectCount++; - if (LOG.isDebugEnabled()) { - LOG.debug(" - content redirect to " + url + " (fetching now)"); - } - } else { - output(url, new CrawlDatum(), null, null, CrawlDatum.STATUS_LINKED); - if (LOG.isDebugEnabled()) { - LOG.debug(" - content redirect to " + url + " (fetching later)"); - } - } - } else if (LOG.isDebugEnabled()) { - LOG.debug(" - content redirect skipped: " + - (newUrl != null ? 
"to same url" : "filtered")); - } + int refreshTime = Integer.valueOf(pstatus.getArgs()[1]); + url = handleRedirect(url, datum, urlString, newUrl, + refreshTime < PERM_REFRESH_TIME, + CONTENT_REDIR); } break; case ProtocolStatus.MOVED: // redirect case ProtocolStatus.TEMP_MOVED: int code; + boolean temp; if (status.getCode() == ProtocolStatus.MOVED) { code = CrawlDatum.STATUS_FETCH_REDIR_PERM; + temp = false; } else { code = CrawlDatum.STATUS_FETCH_REDIR_TEMP; + temp = true; } output(url, datum, content, status, code); String newUrl = status.getMessage(); - newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER); - newUrl = this.urlFilters.filter(newUrl); - if (newUrl != null && !newUrl.equals(url.toString())) { - url = new Text(newUrl); - if (maxRedirect > 0) { - redirecting = true; - redirectCount++; - if (LOG.isDebugEnabled()) { - LOG.debug(" - protocol redirect to " + url + " (fetching now)"); - } - } else { - output(url, new CrawlDatum(), null, null, CrawlDatum.STATUS_LINKED); - if (LOG.isDebugEnabled()) { - LOG.debug(" - protocol redirect to " + url + " (fetching later)"); - } - } - } else if (LOG.isDebugEnabled()) { - LOG.debug(" - protocol redirect skipped: " + - (newUrl != null ? 
"to same url" : "filtered")); - } + url = handleRedirect(url, datum, urlString, newUrl, + temp, PROTOCOL_REDIR); break; // failures - increase the retry counter @@ -270,6 +260,43 @@ } } finally { synchronized (Fetcher.this) {activeThreads--;} // count threads + } + } + + private Text handleRedirect(Text url, CrawlDatum datum, + String urlString, String newUrl, + boolean temp, String redirType) + throws MalformedURLException, URLFilterException { + newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER); + newUrl = urlFilters.filter(newUrl); + if (newUrl != null && !newUrl.equals(urlString)) { + reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp); + url = new Text(newUrl); + if (maxRedirect > 0) { + redirecting = true; + redirectCount++; + if (LOG.isDebugEnabled()) { + LOG.debug(" - " + redirType + " redirect to " + + url + " (fetching now)"); + } + return url; + } else { + CrawlDatum newDatum = new CrawlDatum(); + newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, + new Text(reprUrl)); + output(url, newDatum, null, null, CrawlDatum.STATUS_LINKED); + if (LOG.isDebugEnabled()) { + LOG.debug(" - " + redirType + " redirect to " + + url + " (fetching later)"); + } + return null; + } + } else { + if (LOG.isDebugEnabled()) { + LOG.debug(" - " + redirType + " redirect skipped: " + + (newUrl != null ? 
"to same url" : "filtered")); + } + return null; } } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java?rev=593151&r1=593150&r2=593151&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Thu Nov 8 05:18:05 2007 @@ -18,6 +18,7 @@ import java.io.IOException; import java.net.InetAddress; +import java.net.MalformedURLException; import java.net.URL; import java.net.UnknownHostException; import java.util.*; @@ -435,6 +436,9 @@ private long maxCrawlDelay; private boolean byIP; private int maxRedirect; + private String reprUrl; + private boolean redirecting; + private int redirectCount; public FetcherThread(Configuration conf) { this.setDaemon(true); // don't hang JVM on exit @@ -475,12 +479,19 @@ } } lastRequestStart.set(System.currentTimeMillis()); + Text reprUrlWritable = + (Text) fit.datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY); + if (reprUrlWritable == null) { + reprUrl = fit.url.toString(); + } else { + reprUrl = reprUrlWritable.toString(); + } try { if (LOG.isInfoEnabled()) { LOG.info("fetching " + fit.url); } // fetch the page - boolean redirecting = false; - int redirectCount = 0; + redirecting = false; + redirectCount = 0; do { if (LOG.isDebugEnabled()) { LOG.debug("redirectCount=" + redirectCount); @@ -516,6 +527,8 @@ // unblock queue fetchQueues.finishFetchItem(fit); + String urlString = fit.url.toString(); + switch(status.getCode()) { case ProtocolStatus.WOULDBLOCK: @@ -529,29 +542,20 @@ if (pstatus != null && pstatus.isSuccess() && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) { String newUrl = pstatus.getMessage(); - newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER); - newUrl = this.urlFilters.filter(newUrl); - 
if (newUrl != null && !newUrl.equals(fit.url.toString())) { - output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_REDIR_PERM); - Text redirUrl = new Text(newUrl); - if (maxRedirect > 0) { - redirecting = true; - redirectCount++; - fit = FetchItem.create(redirUrl, new CrawlDatum(), byIP); - FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID); - fiq.addInProgressFetchItem(fit); - if (LOG.isDebugEnabled()) { - LOG.debug(" - content redirect to " + redirUrl + " (fetching now)"); - } - } else { - output(redirUrl, new CrawlDatum(), null, null, CrawlDatum.STATUS_LINKED); - if (LOG.isDebugEnabled()) { - LOG.debug(" - content redirect to " + redirUrl + " (fetching later)"); - } - } - } else if (LOG.isDebugEnabled()) { - LOG.debug(" - content redirect skipped: " + - (newUrl != null ? "to same url" : "filtered")); + int refreshTime = Integer.valueOf(pstatus.getArgs()[1]); + Text redirUrl = + handleRedirect(fit.url, fit.datum, + urlString, newUrl, + refreshTime < Fetcher.PERM_REFRESH_TIME, + Fetcher.CONTENT_REDIR); + if (redirUrl != null) { + CrawlDatum newDatum = new CrawlDatum(); + newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, + new Text(reprUrl)); + fit = FetchItem.create(redirUrl, newDatum, byIP); + FetchItemQueue fiq = + fetchQueues.getFetchItemQueue(fit.queueID); + fiq.addInProgressFetchItem(fit); } } break; @@ -559,36 +563,27 @@ case ProtocolStatus.MOVED: // redirect case ProtocolStatus.TEMP_MOVED: int code; + boolean temp; if (status.getCode() == ProtocolStatus.MOVED) { code = CrawlDatum.STATUS_FETCH_REDIR_PERM; + temp = false; } else { code = CrawlDatum.STATUS_FETCH_REDIR_TEMP; + temp = true; } output(fit.url, fit.datum, content, status, code); String newUrl = status.getMessage(); - newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER); - newUrl = this.urlFilters.filter(newUrl); - if (newUrl != null && !newUrl.equals(fit.url.toString())) { - Text redirUrl = new Text(newUrl); - if (maxRedirect > 0) { - redirecting = 
true; - redirectCount++; - fit = FetchItem.create(redirUrl, new CrawlDatum(), byIP); - FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID); - fiq.addInProgressFetchItem(fit); - if (LOG.isDebugEnabled()) { - LOG.debug(" - protocol redirect to " + redirUrl + " (fetching now)"); - } - } else { - output(redirUrl, new CrawlDatum(), null, null, CrawlDatum.STATUS_LINKED); - if (LOG.isDebugEnabled()) { - LOG.debug(" - protocol redirect to " + redirUrl + " (fetching later)"); - } - } - } else if (LOG.isDebugEnabled()) { - LOG.debug(" - protocol redirect skipped: " + - (newUrl != null ? "to same url" : "filtered")); - } + Text redirUrl = + handleRedirect(fit.url, fit.datum, + urlString, newUrl, temp, + Fetcher.PROTOCOL_REDIR); + CrawlDatum newDatum = new CrawlDatum(); + newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, + new Text(reprUrl)); + fit = FetchItem.create(redirUrl, newDatum, byIP); + FetchItemQueue fiq = + fetchQueues.getFetchItemQueue(fit.queueID); + fiq.addInProgressFetchItem(fit); break; case ProtocolStatus.EXCEPTION: @@ -647,6 +642,43 @@ if (fit != null) fetchQueues.finishFetchItem(fit); activeThreads.decrementAndGet(); // count threads LOG.info("-finishing thread " + getName() + ", activeThreads=" + activeThreads); + } + } + + private Text handleRedirect(Text url, CrawlDatum datum, + String urlString, String newUrl, + boolean temp, String redirType) + throws MalformedURLException, URLFilterException { + newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER); + newUrl = urlFilters.filter(newUrl); + if (newUrl != null && !newUrl.equals(urlString)) { + reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp); + url = new Text(newUrl); + if (maxRedirect > 0) { + redirecting = true; + redirectCount++; + if (LOG.isDebugEnabled()) { + LOG.debug(" - " + redirType + " redirect to " + + url + " (fetching now)"); + } + return url; + } else { + CrawlDatum newDatum = new CrawlDatum(); + newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, + 
new Text(reprUrl)); + output(url, newDatum, null, null, CrawlDatum.STATUS_LINKED); + if (LOG.isDebugEnabled()) { + LOG.debug(" - " + redirType + " redirect to " + + url + " (fetching later)"); + } + return null; + } + } else { + if (LOG.isDebugEnabled()) { + LOG.debug(" - " + redirType + " redirect skipped: " + + (newUrl != null ? "to same url" : "filtered")); + } + return null; } } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?rev=593151&r1=593150&r2=593151&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Thu Nov 8 05:18:05 2007 @@ -241,6 +241,12 @@ Parse parse = new ParseImpl(parseText, parseData); try { + // extract information from dbDatum and pass it to + // fetchDatum so that indexing filters can use it + Text url = (Text) dbDatum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY); + if (url != null) { + fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, url); + } // run indexing filters doc = this.filters.filter(doc, parse, (Text)key, fetchDatum, inlinks); } catch (IndexingException e) { Modified: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java?rev=593151&r1=593150&r2=593151&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java Thu Nov 8 05:18:05 2007 @@ -61,4 +61,7 @@ /** Don't show original forbidden content, but show summaries. 
*/ public static final String CACHING_FORBIDDEN_CONTENT = "content"; + public static final String REPR_URL_KEY = "_repr_"; + + public static final Text WRITABLE_REPR_URL_KEY = new Text(REPR_URL_KEY); } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=593151&r1=593150&r2=593151&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Thu Nov 8 05:18:05 2007 @@ -24,11 +24,13 @@ import org.apache.hadoop.io.*; import org.apache.hadoop.io.SequenceFile.CompressionType; import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.fetcher.Fetcher; import org.apache.hadoop.fs.*; import org.apache.hadoop.mapred.*; import org.apache.nutch.scoring.ScoringFilterException; import org.apache.nutch.scoring.ScoringFilters; import org.apache.nutch.util.StringUtil; +import org.apache.nutch.util.URLUtil; import org.apache.nutch.metadata.Nutch; import org.apache.nutch.net.*; @@ -45,6 +47,7 @@ public class ParseOutputFormat implements OutputFormat { private static final Log LOG = LogFactory.getLog(ParseOutputFormat.class); + private URLNormalizers normalizers; private URLFilters filters; private ScoringFilters scfilters; @@ -79,6 +82,8 @@ public RecordWriter getRecordWriter(FileSystem fs, JobConf job, String name, Progressable progress) throws IOException { + this.normalizers = new URLNormalizers(job, + URLNormalizers.SCOPE_OUTLINK); this.filters = new URLFilters(job); this.scfilters = new ScoringFilters(job); final int interval = job.getInt("db.fetch.interval.default", 2592000); @@ -130,6 +135,33 @@ d.setSignature(signature); crawlOut.append(key, d); } + } + + try { + ParseStatus pstatus = parseData.getStatus(); + if (pstatus != null && 
pstatus.isSuccess() && + pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) { + String newUrl = pstatus.getMessage(); + int refreshTime = Integer.valueOf(pstatus.getArgs()[1]); + newUrl = normalizers.normalize(newUrl, + URLNormalizers.SCOPE_FETCHER); + newUrl = filters.filter(newUrl); + String url = key.toString(); + if (newUrl != null && !newUrl.equals(url)) { + String reprUrl = + URLUtil.chooseRepr(url, newUrl, + refreshTime < Fetcher.PERM_REFRESH_TIME); + CrawlDatum newDatum = new CrawlDatum(); + newDatum.setStatus(CrawlDatum.STATUS_LINKED); + if (!reprUrl.equals(newUrl)) { + newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, + new Text(reprUrl)); + } + crawlOut.append(new Text(newUrl), newDatum); + } + } + } catch (URLFilterException e) { + // ignore } // collect outlinks for subsequent db update Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java?rev=593151&r1=593150&r2=593151&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java Thu Nov 8 05:18:05 2007 @@ -24,6 +24,7 @@ import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; +import java.util.Arrays; import org.apache.hadoop.io.VersionMismatchException; import org.apache.hadoop.io.Writable; @@ -178,7 +179,7 @@ */ public String getMessage() { if (args != null && args.length > 0 && args[0] != null) - return args[0].toString(); + return args[0]; return null; } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java?rev=593151&r1=593150&r2=593151&view=diff ============================================================================== --- 
lucene/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java Thu Nov 8 05:18:05 2007 @@ -141,6 +141,59 @@ return getHostSegments(new URL(url)); } + /** Given two urls (source and destination of the redirect), + * returns the representative one. + * + * <p>Implements the algorithm described here: + * <br> + * <a href="http://help.yahoo.com/l/nz/yahooxtra/search/webcrawler/slurp-11.html"> + * How does the Yahoo! webcrawler handle redirects?</a> + * <br><br> + * The algorithm is as follows: + * <ol> + * <li>Choose target url if either url is malformed.</li> + * <li>When a page in one domain redirects to a page in another domain, + * choose the "target" URL.</li> + * <li>When a top-level page in a domain presents a permanent redirect + * to a page deep within the same domain, choose the "source" URL.</li> + * <li>When a page deep within a domain presents a permanent redirect + * to a page deep within the same domain, choose the "target" URL.</li> + * <li>When a page in a domain presents a temporary redirect to + * another page in the same domain, choose the "source" URL.</li> + * </ol> + * </p> + * + * @param src Source url of redirect + * @param dst Destination url of redirect + * @param temp Flag to indicate if redirect is temporary + * @return Representative url (either src or dst) + */ + public static String chooseRepr(String src, String dst, boolean temp) { + URL srcUrl; + URL dstUrl; + try { + srcUrl = new URL(src); + dstUrl = new URL(dst); + } catch (MalformedURLException e) { + return dst; + } + + String srcDomain = URLUtil.getDomainName(srcUrl); + String dstDomain = URLUtil.getDomainName(dstUrl); + + if (!srcDomain.equals(dstDomain)) { + return dst; + } + + String srcFile = srcUrl.getFile(); + + if (!temp && srcFile.equals("/")) { + return src; + } + + return temp ? 
src : dst; + } + /** For testing */ public static void main(String[] args){ Modified: lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=593151&r1=593150&r2=593151&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (original) +++ lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Thu Nov 8 05:18:05 2007 @@ -48,10 +48,19 @@ public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { + + Text reprUrl = (Text) datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY); + String reprUrlString = reprUrl != null ? reprUrl.toString() : null; + String urlString = url.toString(); String host = null; try { - URL u = new URL(url.toString()); + URL u; + if (reprUrlString != null) { + u = new URL(reprUrlString); + } else { + u = new URL(urlString); + } host = u.getHost(); } catch (MalformedURLException e) { throw new IndexingException(e); @@ -64,10 +73,17 @@ doc.add(new Field("site", host, Field.Store.NO, Field.Index.UN_TOKENIZED)); } - // url is both stored and indexed, so it's both searchable and returned - doc.add(new Field("url", url.toString(), Field.Store.YES, Field.Index.TOKENIZED)); + doc.add(new Field("url", + reprUrlString == null ? 
urlString : reprUrlString, + Field.Store.YES, Field.Index.TOKENIZED)); + if (reprUrlString != null) { + // also store original url as both stored and indexed + doc.add(new Field("orig", urlString, + Field.Store.YES, Field.Index.TOKENIZED)); + } + // content is indexed, so that it's searchable, but not stored in index doc.add(new Field("content", parse.getText(), Field.Store.NO, Field.Index.TOKENIZED)); Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=593151&r1=593150&r2=593151&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Thu Nov 8 05:18:05 2007 @@ -185,7 +185,8 @@ ParseStatus status = new ParseStatus(ParseStatus.SUCCESS); if (metaTags.getRefresh()) { status.setMinorCode(ParseStatus.SUCCESS_REDIRECT); - status.setMessage(metaTags.getRefreshHref().toString()); + status.setArgs(new String[] {metaTags.getRefreshHref().toString(), + Integer.toString(metaTags.getRefreshTime())}); } ParseData parseData = new ParseData(status, title, outlinks, content.getMetadata(), metadata);