Author: dogacan
Date: Thu Nov  8 05:18:05 2007
New Revision: 593151

URL: http://svn.apache.org/viewvc?rev=593151&view=rev
Log:
NUTCH-547 - Redirection handling: YahooSlurp's algorithm.

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
    lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java
    lucene/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java
    
lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
    
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=593151&r1=593150&r2=593151&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Thu Nov  8 05:18:05 2007
@@ -158,6 +158,9 @@
 
 54. NUTCH-565 - Arc File to Nutch Segments Converter. (kubes)
 
+55. NUTCH-547 - Redirection handling: YahooSlurp's algorithm.
+    (dogacan, kubes via dogacan)
+
 Release 0.9 - 2007-04-02
 
  1. Changed log4j confiquration to log to stdout on commandline

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=593151&r1=593150&r2=593151&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Thu Nov  
8 05:18:05 2007
@@ -18,6 +18,7 @@
 package org.apache.nutch.fetcher;
 
 import java.io.IOException;
+import java.net.MalformedURLException;
 import java.util.Map.Entry;
 
 // Commons Logging imports
@@ -48,6 +49,12 @@
 
   public static final Log LOG = LogFactory.getLog(Fetcher.class);
   
+  public static final int PERM_REFRESH_TIME = 5;
+
+  public static final String CONTENT_REDIR = "content";
+
+  public static final String PROTOCOL_REDIR = "protocol";
+
   public static class InputFormat extends SequenceFileInputFormat {
     /** Don't split inputs, to keep things polite. */
     public InputSplit[] getSplits(JobConf job, int nSplits)
@@ -87,6 +94,9 @@
     private ParseUtil parseUtil;
     private URLNormalizers normalizers;
     private ProtocolFactory protocolFactory;
+    private boolean redirecting;
+    private int redirectCount;
+    private String reprUrl;
 
     public FetcherThread(Configuration conf) {
       this.setDaemon(true);                       // don't hang JVM on exit
@@ -130,14 +140,21 @@
           }
 
           // url may be changed through redirects.
-          Text url = new Text();
-          url.set(key);
+          Text url = new Text(key);
+
+          Text reprUrlWritable =
+            (Text) datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
+          if (reprUrlWritable == null) {
+            reprUrl = key.toString();
+          } else {
+            reprUrl = reprUrlWritable.toString();
+          }
+
           try {
             if (LOG.isInfoEnabled()) { LOG.info("fetching " + url); }
 
             // fetch the page
-            boolean redirecting;
-            int redirectCount = 0;
+            redirectCount = 0;
             do {
               if (LOG.isDebugEnabled()) {
                 LOG.debug("redirectCount=" + redirectCount);
@@ -149,6 +166,12 @@
               Content content = output.getContent();
               ParseStatus pstatus = null;
 
+              String urlString = url.toString();
+              if (reprUrl != null && !reprUrl.equals(urlString)) {
+                datum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
+                    new Text(reprUrl));
+              }
+
               switch(status.getCode()) {
 
               case ProtocolStatus.SUCCESS:        // got a page
@@ -157,61 +180,28 @@
                 if (pstatus != null && pstatus.isSuccess() &&
                         pstatus.getMinorCode() == 
ParseStatus.SUCCESS_REDIRECT) {
                   String newUrl = pstatus.getMessage();
-                  newUrl = normalizers.normalize(newUrl, 
URLNormalizers.SCOPE_FETCHER);
-                  newUrl = this.urlFilters.filter(newUrl);
-                  if (newUrl != null && !newUrl.equals(url.toString())) {
-                    // record that we were redirected
-                    output(url, datum, null, status, 
CrawlDatum.STATUS_FETCH_REDIR_PERM);
-                    url = new Text(newUrl);
-                    if (maxRedirect > 0) {
-                      redirecting = true;
-                      redirectCount++;
-                      if (LOG.isDebugEnabled()) {
-                        LOG.debug(" - content redirect to " + url + " 
(fetching now)");
-                      }
-                    } else {
-                      output(url, new CrawlDatum(), null, null, 
CrawlDatum.STATUS_LINKED);
-                      if (LOG.isDebugEnabled()) {
-                        LOG.debug(" - content redirect to " + url + " 
(fetching later)");
-                      }
-                    }
-                  } else if (LOG.isDebugEnabled()) {
-                    LOG.debug(" - content redirect skipped: " +
-                             (newUrl != null ? "to same url" : "filtered"));
-                  }
+                  int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
+                  url = handleRedirect(url, datum, urlString, newUrl,
+                                       refreshTime < PERM_REFRESH_TIME,
+                                       CONTENT_REDIR);
                 }
                 break;
 
               case ProtocolStatus.MOVED:         // redirect
               case ProtocolStatus.TEMP_MOVED:
                 int code;
+                boolean temp;
                 if (status.getCode() == ProtocolStatus.MOVED) {
                   code = CrawlDatum.STATUS_FETCH_REDIR_PERM;
+                  temp = false;
                 } else {
                   code = CrawlDatum.STATUS_FETCH_REDIR_TEMP;
+                  temp = true;
                 }
                 output(url, datum, content, status, code);
                 String newUrl = status.getMessage();
-                newUrl = normalizers.normalize(newUrl, 
URLNormalizers.SCOPE_FETCHER);
-                newUrl = this.urlFilters.filter(newUrl);
-                if (newUrl != null && !newUrl.equals(url.toString())) {
-                  url = new Text(newUrl);
-                  if (maxRedirect > 0) {
-                    redirecting = true;
-                    redirectCount++;
-                    if (LOG.isDebugEnabled()) {
-                      LOG.debug(" - protocol redirect to " + url + " (fetching 
now)");
-                    }
-                  } else {
-                    output(url, new CrawlDatum(), null, null, 
CrawlDatum.STATUS_LINKED);
-                    if (LOG.isDebugEnabled()) {
-                      LOG.debug(" - protocol redirect to " + url + " (fetching 
later)");
-                    }
-                  }
-                } else if (LOG.isDebugEnabled()) {
-                  LOG.debug(" - protocol redirect skipped: " +
-                           (newUrl != null ? "to same url" : "filtered"));
-                }
+                url = handleRedirect(url, datum, urlString, newUrl,
+                                     temp, PROTOCOL_REDIR);
                 break;
 
               // failures - increase the retry counter
@@ -270,6 +260,43 @@
         }
       } finally {
         synchronized (Fetcher.this) {activeThreads--;} // count threads
+      }
+    }
+
+    private Text handleRedirect(Text url, CrawlDatum datum,
+                                String urlString, String newUrl,
+                                boolean temp, String redirType)
+    throws MalformedURLException, URLFilterException {
+      newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
+      newUrl = urlFilters.filter(newUrl);
+      if (newUrl != null && !newUrl.equals(urlString)) {
+        reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp);
+        url = new Text(newUrl);
+        if (maxRedirect > 0) {
+          redirecting = true;
+          redirectCount++;
+          if (LOG.isDebugEnabled()) {
+            LOG.debug(" - " + redirType + " redirect to " +
+                      url + " (fetching now)");
+          }
+          return url;
+        } else {
+          CrawlDatum newDatum = new CrawlDatum();
+          newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
+              new Text(reprUrl));
+          output(url, newDatum, null, null, CrawlDatum.STATUS_LINKED);
+          if (LOG.isDebugEnabled()) {
+            LOG.debug(" - " + redirType + " redirect to " +
+                      url + " (fetching later)");
+          }
+          return null;
+        }
+      } else {
+        if (LOG.isDebugEnabled()) {
+          LOG.debug(" - " + redirType + " redirect skipped: " +
+              (newUrl != null ? "to same url" : "filtered"));
+        }
+        return null;
       }
     }
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java?rev=593151&r1=593150&r2=593151&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Thu Nov  
8 05:18:05 2007
@@ -18,6 +18,7 @@
 
 import java.io.IOException;
 import java.net.InetAddress;
+import java.net.MalformedURLException;
 import java.net.URL;
 import java.net.UnknownHostException;
 import java.util.*;
@@ -435,6 +436,9 @@
     private long maxCrawlDelay;
     private boolean byIP;
     private int maxRedirect;
+    private String reprUrl;
+    private boolean redirecting;
+    private int redirectCount;
 
     public FetcherThread(Configuration conf) {
       this.setDaemon(true);                       // don't hang JVM on exit
@@ -475,12 +479,19 @@
             }
           }
           lastRequestStart.set(System.currentTimeMillis());
+          Text reprUrlWritable =
+            (Text) fit.datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
+          if (reprUrlWritable == null) {
+            reprUrl = fit.url.toString();
+          } else {
+            reprUrl = reprUrlWritable.toString();
+          }
           try {
             if (LOG.isInfoEnabled()) { LOG.info("fetching " + fit.url); }
 
             // fetch the page
-            boolean redirecting = false;
-            int redirectCount = 0;
+            redirecting = false;
+            redirectCount = 0;
             do {
               if (LOG.isDebugEnabled()) {
                 LOG.debug("redirectCount=" + redirectCount);
@@ -516,6 +527,8 @@
               // unblock queue
               fetchQueues.finishFetchItem(fit);
 
+              String urlString = fit.url.toString();
+
               switch(status.getCode()) {
                 
               case ProtocolStatus.WOULDBLOCK:
@@ -529,29 +542,20 @@
                 if (pstatus != null && pstatus.isSuccess() &&
                         pstatus.getMinorCode() == 
ParseStatus.SUCCESS_REDIRECT) {
                   String newUrl = pstatus.getMessage();
-                  newUrl = normalizers.normalize(newUrl, 
URLNormalizers.SCOPE_FETCHER);
-                  newUrl = this.urlFilters.filter(newUrl);
-                  if (newUrl != null && !newUrl.equals(fit.url.toString())) {
-                    output(fit.url, fit.datum, null, status, 
CrawlDatum.STATUS_FETCH_REDIR_PERM);
-                    Text redirUrl = new Text(newUrl);
-                    if (maxRedirect > 0) {
-                      redirecting = true;
-                      redirectCount++;
-                      fit = FetchItem.create(redirUrl, new CrawlDatum(), byIP);
-                      FetchItemQueue fiq = 
fetchQueues.getFetchItemQueue(fit.queueID);
-                      fiq.addInProgressFetchItem(fit);
-                      if (LOG.isDebugEnabled()) {
-                        LOG.debug(" - content redirect to " + redirUrl + " 
(fetching now)");
-                      }
-                    } else {
-                      output(redirUrl, new CrawlDatum(), null, null, 
CrawlDatum.STATUS_LINKED);
-                      if (LOG.isDebugEnabled()) {
-                        LOG.debug(" - content redirect to " + redirUrl + " 
(fetching later)");
-                      }
-                    }
-                  } else if (LOG.isDebugEnabled()) {
-                    LOG.debug(" - content redirect skipped: " +
-                             (newUrl != null ? "to same url" : "filtered"));
+                  int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
+                  Text redirUrl =
+                    handleRedirect(fit.url, fit.datum,
+                                   urlString, newUrl,
+                                   refreshTime < Fetcher.PERM_REFRESH_TIME,
+                                   Fetcher.CONTENT_REDIR);
+                  if (redirUrl != null) {
+                    CrawlDatum newDatum = new CrawlDatum();
+                    newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
+                        new Text(reprUrl));
+                    fit = FetchItem.create(redirUrl, newDatum, byIP);
+                    FetchItemQueue fiq =
+                      fetchQueues.getFetchItemQueue(fit.queueID);
+                    fiq.addInProgressFetchItem(fit);
                   }
                 }
                 break;
@@ -559,36 +563,27 @@
               case ProtocolStatus.MOVED:         // redirect
               case ProtocolStatus.TEMP_MOVED:
                 int code;
+                boolean temp;
                 if (status.getCode() == ProtocolStatus.MOVED) {
                   code = CrawlDatum.STATUS_FETCH_REDIR_PERM;
+                  temp = false;
                 } else {
                   code = CrawlDatum.STATUS_FETCH_REDIR_TEMP;
+                  temp = true;
                 }
                 output(fit.url, fit.datum, content, status, code);
                 String newUrl = status.getMessage();
-                newUrl = normalizers.normalize(newUrl, 
URLNormalizers.SCOPE_FETCHER);
-                newUrl = this.urlFilters.filter(newUrl);
-                if (newUrl != null && !newUrl.equals(fit.url.toString())) {
-                  Text redirUrl = new Text(newUrl);
-                  if (maxRedirect > 0) {
-                    redirecting = true;
-                    redirectCount++;
-                    fit = FetchItem.create(redirUrl, new CrawlDatum(), byIP);
-                    FetchItemQueue fiq = 
fetchQueues.getFetchItemQueue(fit.queueID);
-                    fiq.addInProgressFetchItem(fit);
-                    if (LOG.isDebugEnabled()) {
-                      LOG.debug(" - protocol redirect to " + redirUrl + " 
(fetching now)");
-                    }
-                  } else {
-                    output(redirUrl, new CrawlDatum(), null, null, 
CrawlDatum.STATUS_LINKED);
-                    if (LOG.isDebugEnabled()) {
-                      LOG.debug(" - protocol redirect to " + redirUrl + " 
(fetching later)");
-                    }
-                  }
-                } else if (LOG.isDebugEnabled()) {
-                  LOG.debug(" - protocol redirect skipped: " +
-                           (newUrl != null ? "to same url" : "filtered"));
-                }
+                Text redirUrl =
+                  handleRedirect(fit.url, fit.datum,
+                                 urlString, newUrl, temp,
+                                 Fetcher.PROTOCOL_REDIR);
+                CrawlDatum newDatum = new CrawlDatum();
+                newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
+                    new Text(reprUrl));
+                fit = FetchItem.create(redirUrl, newDatum, byIP);
+                FetchItemQueue fiq =
+                  fetchQueues.getFetchItemQueue(fit.queueID);
+                fiq.addInProgressFetchItem(fit);
                 break;
 
               case ProtocolStatus.EXCEPTION:
@@ -647,6 +642,43 @@
         if (fit != null) fetchQueues.finishFetchItem(fit);
         activeThreads.decrementAndGet(); // count threads
         LOG.info("-finishing thread " + getName() + ", activeThreads=" + 
activeThreads);
+      }
+    }
+
+    private Text handleRedirect(Text url, CrawlDatum datum,
+                                String urlString, String newUrl,
+                                boolean temp, String redirType)
+    throws MalformedURLException, URLFilterException {
+      newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
+      newUrl = urlFilters.filter(newUrl);
+      if (newUrl != null && !newUrl.equals(urlString)) {
+        reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp);
+        url = new Text(newUrl);
+        if (maxRedirect > 0) {
+          redirecting = true;
+          redirectCount++;
+          if (LOG.isDebugEnabled()) {
+            LOG.debug(" - " + redirType + " redirect to " +
+                url + " (fetching now)");
+          }
+          return url;
+        } else {
+          CrawlDatum newDatum = new CrawlDatum();
+          newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
+              new Text(reprUrl));
+          output(url, newDatum, null, null, CrawlDatum.STATUS_LINKED);
+          if (LOG.isDebugEnabled()) {
+            LOG.debug(" - " + redirType + " redirect to " +
+                url + " (fetching later)");
+          }
+          return null;
+        }
+      } else {
+        if (LOG.isDebugEnabled()) {
+          LOG.debug(" - " + redirType + " redirect skipped: " +
+              (newUrl != null ? "to same url" : "filtered"));
+        }
+        return null;
       }
     }
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?rev=593151&r1=593150&r2=593151&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Thu Nov  
8 05:18:05 2007
@@ -241,6 +241,12 @@
 
     Parse parse = new ParseImpl(parseText, parseData);
     try {
+      // extract information from dbDatum and pass it to
+      // fetchDatum so that indexing filters can use it
+      Text url = (Text) dbDatum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
+      if (url != null) {
+        fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, url);
+      }
       // run indexing filters
       doc = this.filters.filter(doc, parse, (Text)key, fetchDatum, inlinks);
     } catch (IndexingException e) {

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java?rev=593151&r1=593150&r2=593151&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java Thu Nov  8 
05:18:05 2007
@@ -61,4 +61,7 @@
   /** Don't show original forbidden content, but show summaries. */
   public static final String CACHING_FORBIDDEN_CONTENT = "content";
 
+  public static final String REPR_URL_KEY = "_repr_";
+
+  public static final Text WRITABLE_REPR_URL_KEY = new Text(REPR_URL_KEY);
 }

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=593151&r1=593150&r2=593151&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java 
Thu Nov  8 05:18:05 2007
@@ -24,11 +24,13 @@
 import org.apache.hadoop.io.*;
 import org.apache.hadoop.io.SequenceFile.CompressionType;
 import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.fetcher.Fetcher;
 import org.apache.hadoop.fs.*;
 import org.apache.hadoop.mapred.*;
 import org.apache.nutch.scoring.ScoringFilterException;
 import org.apache.nutch.scoring.ScoringFilters;
 import org.apache.nutch.util.StringUtil;
+import org.apache.nutch.util.URLUtil;
 import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.net.*;
 
@@ -45,6 +47,7 @@
 public class ParseOutputFormat implements OutputFormat {
   private static final Log LOG = LogFactory.getLog(ParseOutputFormat.class);
 
+  private URLNormalizers normalizers;
   private URLFilters filters;
   private ScoringFilters scfilters;
   
@@ -79,6 +82,8 @@
   public RecordWriter getRecordWriter(FileSystem fs, JobConf job,
                                       String name, Progressable progress) 
throws IOException {
 
+    this.normalizers = new URLNormalizers(job,
+                                          URLNormalizers.SCOPE_OUTLINK);
     this.filters = new URLFilters(job);
     this.scfilters = new ScoringFilters(job);
     final int interval = job.getInt("db.fetch.interval.default", 2592000);
@@ -130,6 +135,33 @@
               d.setSignature(signature);
               crawlOut.append(key, d);
             }
+          }
+
+          try {
+            ParseStatus pstatus = parseData.getStatus();
+            if (pstatus != null && pstatus.isSuccess() &&
+                pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
+              String newUrl = pstatus.getMessage();
+              int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
+              newUrl = normalizers.normalize(newUrl,
+                                             URLNormalizers.SCOPE_FETCHER);
+              newUrl = filters.filter(newUrl);
+              String url = key.toString();
+              if (newUrl != null && !newUrl.equals(url)) {
+                String reprUrl =
+                  URLUtil.chooseRepr(url, newUrl,
+                                     refreshTime < Fetcher.PERM_REFRESH_TIME);
+                CrawlDatum newDatum = new CrawlDatum();
+                newDatum.setStatus(CrawlDatum.STATUS_LINKED);
+                if (!reprUrl.equals(newUrl)) {
+                  newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
+                                             new Text(reprUrl));
+                }
+                crawlOut.append(new Text(newUrl), newDatum);
+              }
+            }
+          } catch (URLFilterException e) {
+            // ignore
           }
 
           // collect outlinks for subsequent db update

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java?rev=593151&r1=593150&r2=593151&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java Thu Nov 
 8 05:18:05 2007
@@ -24,6 +24,7 @@
 import java.io.DataInput;
 import java.io.DataOutput;
 import java.io.IOException;
+import java.util.Arrays;
 
 import org.apache.hadoop.io.VersionMismatchException;
 import org.apache.hadoop.io.Writable;
@@ -178,7 +179,7 @@
    */
   public String getMessage() {
     if (args != null && args.length > 0 && args[0] != null)
-      return args[0].toString();
+      return args[0];
     return null;
   }
   

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java?rev=593151&r1=593150&r2=593151&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java Thu Nov  8 
05:18:05 2007
@@ -141,6 +141,59 @@
    return getHostSegments(new URL(url));
   }
 
+  /** Given two urls (source and destination of the redirect),
+   * returns the representative one.
+   *
+   * <p>Implements the algorithm described here:
+   * <br>
+   * <a 
href="http://help.yahoo.com/l/nz/yahooxtra/search/webcrawler/slurp-11.html";>
+   * How does the Yahoo! webcrawler handle redirects?</a>
+   * <br><br>
+   * The algorithm is as follows:
+   * <ol>
+   *  <li>Choose target url if either url is malformed.</li>
+   *  <li>When a page in one domain redirects to a page in another domain,
+   *  choose the "target" URL.</li>
+   *  <li>When a top-level page in a domain presents a permanent redirect
+   *  to a page deep within the same domain, choose the "source" URL.</li>
+   *  <li>When a page deep within a domain presents a permanent redirect
+   *  to a page deep within the same domain, choose the "target" URL.</li>
+   *  <li>When a page in a domain presents a temporary redirect to
+   *  another page in the same domain, choose the "source" URL.</li>
+   * </ol>
+   * </p>
+   *
+   * @param src Source url of redirect
+   * @param dst Destination url of redirect
+   * @param temp Flag to indicate if redirect is temporary
+   * @return Representative url (either src or dst)
+   */
+  public static String chooseRepr(String src, String dst, boolean temp) {
+    URL srcUrl;
+    URL dstUrl;
+    try {
+      srcUrl = new URL(src);
+      dstUrl = new URL(dst);
+    } catch (MalformedURLException e) {
+      return dst;
+    }
+
+    String srcDomain = URLUtil.getDomainName(srcUrl);
+    String dstDomain = URLUtil.getDomainName(dstUrl);
+
+    if (!srcDomain.equals(dstDomain)) {
+      return dst;
+    }
+
+    String srcFile = srcUrl.getFile();
+
+    if (!temp && srcFile.equals("/")) {
+      return src;
+    }
+
+    return temp ? src : dst;
+  }
+
   /** For testing */
   public static void main(String[] args){
     

Modified: 
lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=593151&r1=593150&r2=593151&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
 Thu Nov  8 05:18:05 2007
@@ -48,10 +48,19 @@
 
   public Document filter(Document doc, Parse parse, Text url, CrawlDatum 
datum, Inlinks inlinks)
     throws IndexingException {
+
+    Text reprUrl = (Text) datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
+    String reprUrlString = reprUrl != null ? reprUrl.toString() : null;
+    String urlString = url.toString();
     
     String host = null;
     try {
-      URL u = new URL(url.toString());
+      URL u;
+      if (reprUrlString != null) {
+        u = new URL(reprUrlString);
+      } else {
+        u = new URL(urlString);
+      }
       host = u.getHost();
     } catch (MalformedURLException e) {
       throw new IndexingException(e);
@@ -64,10 +73,17 @@
       doc.add(new Field("site", host, Field.Store.NO, 
Field.Index.UN_TOKENIZED));
     }
 
-
     // url is both stored and indexed, so it's both searchable and returned
-    doc.add(new Field("url", url.toString(), Field.Store.YES, 
Field.Index.TOKENIZED));
+    doc.add(new Field("url",
+                      reprUrlString == null ? urlString : reprUrlString,
+                      Field.Store.YES, Field.Index.TOKENIZED));
     
+    if (reprUrlString != null) {
+      // also store original url as both stored and indexed
+      doc.add(new Field("orig", urlString,
+                        Field.Store.YES, Field.Index.TOKENIZED));
+    }
+
     // content is indexed, so that it's searchable, but not stored in index
     doc.add(new Field("content", parse.getText(), Field.Store.NO, 
Field.Index.TOKENIZED));
     

Modified: 
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=593151&r1=593150&r2=593151&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
 Thu Nov  8 05:18:05 2007
@@ -185,7 +185,8 @@
     ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
     if (metaTags.getRefresh()) {
       status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
-      status.setMessage(metaTags.getRefreshHref().toString());
+      status.setArgs(new String[] {metaTags.getRefreshHref().toString(),
+        Integer.toString(metaTags.getRefreshTime())});      
     }
     ParseData parseData = new ParseData(status, title, outlinks,
                                         content.getMetadata(), metadata);


Reply via email to