Author: dogacan
Date: Thu Nov  8 07:08:47 2007
New Revision: 593186

URL: http://svn.apache.org/viewvc?rev=593186&view=rev
Log:
NUTCH-548 - Move URLNormalizer from Outlink to ParseOutputFormat. Contributed 
by Emmanuel Joke.

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
    lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
    lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
    lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=593186&r1=593185&r2=593186&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Thu Nov  8 07:08:47 2007
@@ -161,6 +161,9 @@
 55. NUTCH-547 - Redirection handling: YahooSlurp's algorithm.
     (dogacan, kubes via dogacan)
 
+56. NUTCH-548 - Move URLNormalizer from Outlink to ParseOutputFormat.
+    (Emmanuel Joke via dogacan)
+
 Release 0.9 - 2007-04-02
 
  1. Changed log4j confiquration to log to stdout on commandline

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java?rev=593186&r1=593185&r2=593186&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java Thu Nov  8 07:08:47 2007
@@ -32,8 +32,8 @@
 
   public Outlink() {}
 
-  public Outlink(String toUrl, String anchor, Configuration conf) throws MalformedURLException {
-    this.toUrl = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK).normalize(toUrl, URLNormalizers.SCOPE_OUTLINK);
+  public Outlink(String toUrl, String anchor) throws MalformedURLException {
+    this.toUrl = toUrl;
     if (anchor == null) anchor = "";
     this.anchor = anchor;
   }

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java?rev=593186&r1=593185&r2=593186&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java Thu Nov  8 07:08:47 2007
@@ -111,7 +111,7 @@
         result = matcher.getMatch();
         url = result.group(0);
         try {
-          outlinks.add(new Outlink(url, anchor, conf));
+          outlinks.add(new Outlink(url, anchor));
         } catch (MalformedURLException mue) {
           LOG.warn("Invalid url: '" + url + "', skipping.");
         }

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=593186&r1=593185&r2=593186&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Thu Nov  8 07:08:47 2007
@@ -47,8 +47,8 @@
 public class ParseOutputFormat implements OutputFormat {
   private static final Log LOG = LogFactory.getLog(ParseOutputFormat.class);
 
-  private URLNormalizers normalizers;
   private URLFilters filters;
+  private URLNormalizers normalizers;
   private ScoringFilters scfilters;
   
   private static class SimpleEntry implements Entry<Text, CrawlDatum> {
@@ -82,9 +82,8 @@
   public RecordWriter getRecordWriter(FileSystem fs, JobConf job,
                                       String name, Progressable progress) throws IOException {
 
-    this.normalizers = new URLNormalizers(job,
-                                          URLNormalizers.SCOPE_OUTLINK);
     this.filters = new URLFilters(job);
+    this.normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_OUTLINK);
     this.scfilters = new ScoringFilters(job);
     final int interval = job.getInt("db.fetch.interval.default", 2592000);
     final boolean ignoreExternalLinks = job.getBoolean("db.ignore.external.links", false);
@@ -198,8 +197,8 @@
               }
             }
             try {
-              // normalizing here is not necessary since outlinks 
-              // are already normalized in Outlink's constructor
+              toUrl = normalizers.normalize(toUrl,
+                          URLNormalizers.SCOPE_OUTLINK); // normalize the url
               toUrl = filters.filter(toUrl);   // filter the url
               if (toUrl == null) {
                 continue;

Modified: 
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=593186&r1=593185&r2=593186&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Thu Nov  8 07:08:47 2007
@@ -403,7 +403,7 @@
                 URL url = (base.toString().indexOf(';') > 0) ? 
                   fixEmbeddedParams(base, target) :  new URL(base, target);
                 outlinks.add(new Outlink(url.toString(),
-                                         linkText.toString().trim(), conf));
+                                         linkText.toString().trim()));
               } catch (MalformedURLException e) {
                 // don't care
               }

Modified: 
lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?rev=593186&r1=593185&r2=593186&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Thu Nov  8 07:08:47 2007
@@ -241,55 +241,55 @@
     try {
      answerOutlinks = new Outlink[][]{ 
          {
-           new Outlink("http://www.nutch.org", "anchor", conf),
+           new Outlink("http://www.nutch.org", "anchor"),
          },
          {
-           new Outlink("http://www.nutch.org/", "home", conf),
-           new Outlink("http://www.nutch.org/docs/bot.html", "bots", conf),
+           new Outlink("http://www.nutch.org/", "home"),
+           new Outlink("http://www.nutch.org/docs/bot.html", "bots"),
          },
          {
-           new Outlink("http://www.nutch.org/", "separate this", conf),
-           new Outlink("http://www.nutch.org/docs/ok", "from this", conf),
+           new Outlink("http://www.nutch.org/", "separate this"),
+           new Outlink("http://www.nutch.org/docs/ok", "from this"),
          },
          {
-           new Outlink("http://www.nutch.org/", "home", conf),
-           new Outlink("http://www.nutch.org/docs/1", "1", conf),
-           new Outlink("http://www.nutch.org/docs/2", "2", conf),
+           new Outlink("http://www.nutch.org/", "home"),
+           new Outlink("http://www.nutch.org/docs/1", "1"),
+           new Outlink("http://www.nutch.org/docs/2", "2"),
          },
          {
-           new Outlink("http://www.nutch.org/frames/top.html", "", conf),
-           new Outlink("http://www.nutch.org/frames/left.html", "", conf),
-           new Outlink("http://www.nutch.org/frames/invalid.html", "", conf),
-           new Outlink("http://www.nutch.org/frames/right.html", "", conf),
+           new Outlink("http://www.nutch.org/frames/top.html", ""),
+           new Outlink("http://www.nutch.org/frames/left.html", ""),
+           new Outlink("http://www.nutch.org/frames/invalid.html", ""),
+           new Outlink("http://www.nutch.org/frames/right.html", ""),
          },
          {
-           new Outlink("http://www.nutch.org/maps/logo.gif", "", conf),
-           new Outlink("http://www.nutch.org/index.html", "", conf),
-           new Outlink("http://www.nutch.org/maps/#bottom", "", conf),
-           new Outlink("http://www.nutch.org/bot.html", "", conf),
-           new Outlink("http://www.nutch.org/docs/index.html", "", conf),
+           new Outlink("http://www.nutch.org/maps/logo.gif", ""),
+           new Outlink("http://www.nutch.org/index.html", ""),
+           new Outlink("http://www.nutch.org/maps/#bottom", ""),
+           new Outlink("http://www.nutch.org/bot.html", ""),
+           new Outlink("http://www.nutch.org/docs/index.html", ""),
          },
          {
-             new Outlink("http://www.nutch.org/index.html", "whitespace test", conf),
+             new Outlink("http://www.nutch.org/index.html", "whitespace test"),
          },
          {
          },
          {
-           new Outlink("http://www.nutch.org/dummy.jsp", "test2", conf),
+           new Outlink("http://www.nutch.org/dummy.jsp", "test2"),
          },
          {
          },
          {
-           new Outlink("http://www.nutch.org/;x", "anchor1", conf),
-           new Outlink("http://www.nutch.org/g;x", "anchor2", conf),
-           new Outlink("http://www.nutch.org/g;x?y#s", "anchor3", conf)
+           new Outlink("http://www.nutch.org/;x", "anchor1"),
+           new Outlink("http://www.nutch.org/g;x", "anchor2"),
+           new Outlink("http://www.nutch.org/g;x?y#s", "anchor3")
          },
          {
-           new Outlink("http://www.nutch.org/g;something", "anchor1", conf),
-           new Outlink("http://www.nutch.org/g;something?y#s", "anchor2", conf),
-           new Outlink("http://www.nutch.org/;something?y=1", "anchor3", conf),
-           new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4", conf),
-           new Outlink("http://www.nutch.org/?y=1;somethingelse", "anchor5", conf)
+           new Outlink("http://www.nutch.org/g;something", "anchor1"),
+           new Outlink("http://www.nutch.org/g;something?y#s", "anchor2"),
+           new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
+           new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
+           new Outlink("http://www.nutch.org/?y=1;somethingelse", "anchor5")
          }
       };
    

Modified: 
lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?rev=593186&r1=593185&r2=593186&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Thu Nov  8 07:08:47 2007
@@ -236,7 +236,7 @@
         if (LOG.isTraceEnabled()) {
           LOG.trace(" - outlink from JS: '" + url + "'");
         }
-        outlinks.add(new Outlink(url, anchor, getConf()));
+        outlinks.add(new Outlink(url, anchor));
       }
     } catch (Exception ex) {
       // if it is a malformed URL we just throw it away and continue with


Reply via email to