nutc...

dogacan Wed, 11 Jul 2007 03:55:17 -0700

Author: dogacan
Date: Wed Jul 11 03:54:37 2007
New Revision: 555237

URL: http://svn.apache.org/viewvc?view=rev&rev=555237
Log:
NUTCH-505 - Outlink urls should be validated.


Added:
    lucene/nutch/trunk/src/java/org/apache/nutch/net/UrlValidator.java
Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseImpl.java
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java
    
lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java
    
lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
    
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
    
lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
    
lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
    
lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java
    
lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
    
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
    lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=555237&r1=555236&r2=555237
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Jul 11 03:54:37 2007
@@ -81,6 +81,8 @@
 26. NUTCH-503 - Generator exits incorrectly for small fetchlists. 
     (Vishal Shah via dogacan)
 
+27. NUTCH-505 - Outlink urls should be validated. (dogacan)
+
 Release 0.9 - 2007-04-02
 
  1. Changed log4j confiquration to log to stdout on commandline

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java?view=diff&rev=555237&r1=555236&r2=555237
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java Wed 
Jul 11 03:54:37 2007
@@ -23,15 +23,12 @@
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.parse.*;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configurable;
 
 /* An entry in the fetcher's output. */
-public final class FetcherOutput implements Writable, Configurable {
+public final class FetcherOutput implements Writable {
   private CrawlDatum crawlDatum;
   private Content content;
   private ParseImpl parse;
-  private Configuration conf;
 
   public FetcherOutput() {}
 
@@ -45,7 +42,7 @@
   public final void readFields(DataInput in) throws IOException {
     this.crawlDatum = CrawlDatum.read(in);
     this.content = in.readBoolean() ? Content.read(in) : null;
-    this.parse = in.readBoolean() ? ParseImpl.read(in, this.conf) : null;
+    this.parse = in.readBoolean() ? ParseImpl.read(in) : null;
   }
 
   public final void write(DataOutput out) throws IOException {
@@ -79,14 +76,6 @@
     StringBuffer buffer = new StringBuffer();
     buffer.append("CrawlDatum: " + crawlDatum+"\n" );
     return buffer.toString();
-  }
-
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-  }
-
-  public Configuration getConf() {
-    return this.conf;
   }
 
 }

Added: lucene/nutch/trunk/src/java/org/apache/nutch/net/UrlValidator.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/net/UrlValidator.java?view=auto&rev=555237
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/net/UrlValidator.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/net/UrlValidator.java Wed Jul 
11 03:54:37 2007
@@ -0,0 +1,377 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net;
+
+import org.apache.oro.text.perl.Perl5Util;
+
+/**
+ * <p>Validates URLs.</p>
+ *
+ * <p>Originally based on a PHP script by Debbie Dyer, validation.php v1.2b, Date: 03/07/02,
+ * http://javascript.internet.com. However, this validation now bears little resemblance
+ * to the PHP original.</p>
+ * <pre>
+ *   Example of usage:
+ *    UrlValidator urlValidator = UrlValidator.get();
+ *    if (urlValidator.isValid("ftp://foo.bar.com/")) {
+ *       System.out.println("url is valid");
+ *    } else {
+ *       System.out.println("url is invalid");
+ *    }
+ *
+ *   prints out "url is valid"
+ *  </pre>
+ *  
+ * <p>Based on UrlValidator code from Apache commons-validator.</p>
+ *
+ * @see
+ * <a href='http://www.ietf.org/rfc/rfc2396.txt' >
+ *  Uniform Resource Identifiers (URI): Generic Syntax
+ * </a>
+ * 
+ */
+public class UrlValidator {
+
+  private static final String ALPHA_CHARS = "a-zA-Z";
+
+  private static final String ALPHA_NUMERIC_CHARS = ALPHA_CHARS + "\\d";
+
+  private static final String SPECIAL_CHARS = ";/@&=,.?:+$";
+
+  private static final String VALID_CHARS = "[^\\s" + SPECIAL_CHARS + "]";
+
+  private static final String SCHEME_CHARS = ALPHA_CHARS;
+
+  // Drop numeric, and  "+-." for now
+  private static final String AUTHORITY_CHARS = ALPHA_NUMERIC_CHARS + "\\-\\.";
+
+  private static final String ATOM = VALID_CHARS + '+';
+
+  /**
+   * This expression derived/taken from the BNF for URI (RFC2396).
+   */
+  private static final String URL_PATTERN =
+    "/^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?/";
+  //   12            3  4          5       6   7        8 9
+
+  /**
+   * Scheme/protocol (e.g. http:, ftp:, file:, etc.).
+   */
+  private static final int PARSE_URL_SCHEME = 2;
+
+  /**
+   * Includes hostname/ip and port number.
+   */
+  private static final int PARSE_URL_AUTHORITY = 4;
+
+  private static final int PARSE_URL_PATH = 5;
+
+  private static final int PARSE_URL_QUERY = 7;
+
+  /**
+   * Protocol (e.g. http:, ftp:, https:).
+   */
+  private static final String SCHEME_PATTERN = "/^[" + SCHEME_CHARS + "]/";
+
+  private static final String AUTHORITY_PATTERN =
+    "/^([" + AUTHORITY_CHARS + "]*)(:\\d*)?(.*)?/";
+  //   1                           2       3
+
+  private static final int PARSE_AUTHORITY_HOST_IP = 1;
+
+  private static final int PARSE_AUTHORITY_PORT = 2;
+
+  /**
+   * Should always be empty.
+   */
+  private static final int PARSE_AUTHORITY_EXTRA = 3;
+
+  private static final String PATH_PATTERN = 
"/^(/[-\\w:@&?=+,.!/~*'%$_;]*)?$/";
+
+  private static final String QUERY_PATTERN = "/^(.*)$/";
+
+  private static final String LEGAL_ASCII_PATTERN = "/^[\\000-\\177]+$/";
+
+  private static final String IP_V4_DOMAIN_PATTERN =
+    "/^(\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})$/";
+
+  private static final String DOMAIN_PATTERN =
+    "/^" + ATOM + "(\\." + ATOM + ")*$/";
+
+  private static final String PORT_PATTERN = "/^:(\\d{1,5})$/";
+
+  private static final String ATOM_PATTERN = "/(" + ATOM + ")/";
+
+  private static final String ALPHA_PATTERN = "/^[" + ALPHA_CHARS + "]/";
+  
+  private static final UrlValidator VALIDATOR = new UrlValidator();
+
+  private UrlValidator() { 
+  }
+  
+  public static UrlValidator get() {
+    return VALIDATOR;
+  }
+
+  /**
+   * <p>Checks if a field has a valid url address.</p>
+   *
+   * @param value The value validation is being performed on.  A 
<code>null</code>
+   * value is considered invalid.
+   * @return true if the url is valid.
+   */
+  public boolean isValid(String value) {
+    if (value == null) {
+      return false;
+    }
+
+    Perl5Util matchUrlPat = new Perl5Util();
+    Perl5Util matchAsciiPat = new Perl5Util();
+
+    if (!matchAsciiPat.match(LEGAL_ASCII_PATTERN, value)) {
+      return false;
+    }
+
+    // Check the whole url address structure
+    if (!matchUrlPat.match(URL_PATTERN, value)) {
+      return false;
+    }
+
+    if (!isValidScheme(matchUrlPat.group(PARSE_URL_SCHEME))) {
+      return false;
+    }
+
+    if (!isValidAuthority(matchUrlPat.group(PARSE_URL_AUTHORITY))) {
+      return false;
+    }
+
+    if (!isValidPath(matchUrlPat.group(PARSE_URL_PATH))) {
+      return false;
+    }
+
+    if (!isValidQuery(matchUrlPat.group(PARSE_URL_QUERY))) {
+      return false;
+    }
+
+    return true;
+  }
+
+  /**
+   * Validates the scheme. A scheme is accepted when it starts with an
+   * ASCII letter; no list of allowed schemes is consulted.
+   *
+   * @param scheme The scheme to validate.  A <code>null</code> value is considered
+   * invalid.
+   * @return true if valid.
+   */
+  protected boolean isValidScheme(String scheme) {
+    if (scheme == null) {
+      return false;
+    }
+
+    Perl5Util schemeMatcher = new Perl5Util();
+    if (!schemeMatcher.match(SCHEME_PATTERN, scheme)) {
+      return false;
+    }
+
+    return true;
+  }
+
+  /**
+   * Returns true if the authority is properly formatted.  An authority is the 
combination
+   * of hostname and port.  A <code>null</code> authority value is considered 
invalid.
+   * @param authority Authority value to validate.
+   * @return true if authority (hostname and port) is valid.
+   */
+  protected boolean isValidAuthority(String authority) {
+    if (authority == null) {
+      return false;
+    }
+
+    Perl5Util authorityMatcher = new Perl5Util();
+    Perl5Util matchIPV4Pat = new Perl5Util();
+
+    if (!authorityMatcher.match(AUTHORITY_PATTERN, authority)) {
+      return false;
+    }
+
+    boolean ipV4Address = false;
+    boolean hostname = false;
+    // check if authority is IP address or hostname
+    String hostIP = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP);
+    ipV4Address = matchIPV4Pat.match(IP_V4_DOMAIN_PATTERN, hostIP);
+
+    if (ipV4Address) {
+      // this is an IP address so check components
+      for (int i = 1; i <= 4; i++) {
+        String ipSegment = matchIPV4Pat.group(i);
+        if (ipSegment == null || ipSegment.length() <= 0) {
+          return false;
+        }
+
+        try {
+          if (Integer.parseInt(ipSegment) > 255) {
+            return false;
+          }
+        } catch(NumberFormatException e) {
+          return false;
+        }
+
+      }
+    } else {
+      // Domain is a hostname
+      Perl5Util domainMatcher = new Perl5Util();
+      hostname = domainMatcher.match(DOMAIN_PATTERN, hostIP);
+    }
+
+    // rightmost hostname will never start with a digit.
+    if (hostname) {
+      // LOW-TECH FIX FOR VALIDATOR-202
+      // TODO: Rewrite to use ArrayList and .add semantics: see VALIDATOR-203
+      char[] chars = hostIP.toCharArray();
+      int size = 1;
+      for(int i=0; i<chars.length; i++) {
+        if(chars[i] == '.') {
+          size++;
+        }
+      }
+      String[] domainSegment = new String[size];
+      boolean match = true;
+      int segCount = 0;
+      int segLen = 0;
+      Perl5Util atomMatcher = new Perl5Util();
+
+      while (match) {
+        match = atomMatcher.match(ATOM_PATTERN, hostIP);
+        if (match) {
+          domainSegment[segCount] = atomMatcher.group(1);
+          segLen = domainSegment[segCount].length() + 1;
+          hostIP = (segLen >= hostIP.length()) ? "" 
+                                               : hostIP.substring(segLen);
+          segCount++;
+        }
+      }
+      String topLevel = domainSegment[segCount - 1];
+      if (topLevel.length() < 2 || topLevel.length() > 4) {
+        return false;
+      }
+
+      // First letter of top level must be an alpha character
+      Perl5Util alphaMatcher = new Perl5Util();
+      if (!alphaMatcher.match(ALPHA_PATTERN, topLevel.substring(0, 1))) {
+        return false;
+      }
+
+      // Make sure there's a host name preceding the authority.
+      if (segCount < 2) {
+        return false;
+      }
+    }
+
+    if (!hostname && !ipV4Address) {
+      return false;
+    }
+
+    String port = authorityMatcher.group(PARSE_AUTHORITY_PORT);
+    if (port != null) {
+      Perl5Util portMatcher = new Perl5Util();
+      if (!portMatcher.match(PORT_PATTERN, port)) {
+        return false;
+      }
+    }
+
+    String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA);
+    if (!isBlankOrNull(extra)) {
+      return false;
+    }
+
+    return true;
+  }
+
+  /**
+   * <p>Checks if the field is null or if its length is zero once leading
+   * and trailing whitespace is removed.</p>
+   *
+   * @param value The value validation is being performed on.
+   * @return true if blank or null.
+   */
+  private boolean isBlankOrNull(String value) {
+    return ((value == null) || (value.trim().length() == 0));
+  }
+
+  /**
+   * Returns true if the path is valid.  A <code>null</code> value is 
considered invalid.
+   * @param path Path value to validate.
+   * @return true if path is valid.
+   */
+  protected boolean isValidPath(String path) {
+    if (path == null) {
+      return false;
+    }
+
+    Perl5Util pathMatcher = new Perl5Util();
+
+    if (!pathMatcher.match(PATH_PATTERN, path)) {
+      return false;
+    }
+
+    int slash2Count = countToken("//", path);
+
+    int slashCount = countToken("/", path);
+    int dot2Count = countToken("..", path);
+    if (dot2Count > 0) {
+      if ((slashCount - slash2Count - 1) <= dot2Count) {
+        return false;
+      }
+    }
+
+    return true;
+  }
+
+  /**
+   * Returns true if the query is null or it's a properly formatted query 
string.
+   * @param query Query value to validate.
+   * @return true if query is valid.
+   */
+  protected boolean isValidQuery(String query) {
+    if (query == null) {
+      return true;
+    }
+
+    Perl5Util queryMatcher = new Perl5Util();
+    return queryMatcher.match(QUERY_PATTERN, query);
+  }
+
+  /**
+   * Returns the number of times the token appears in the target.
+   * @param token Token value to be counted.
+   * @param target Target value to count tokens in.
+   * @return the number of tokens.
+   */
+  protected int countToken(String token, String target) {
+    int tokenIndex = 0;
+    int count = 0;
+    while (tokenIndex != -1) {
+      tokenIndex = target.indexOf(token, tokenIndex);
+      if (tokenIndex > -1) {
+        tokenIndex++;
+        count++;
+      }
+    }
+    return count;
+  }
+}

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java?view=diff&rev=555237&r1=555236&r2=555237
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java Wed Jul 
11 03:54:37 2007
@@ -21,9 +21,8 @@
 import java.util.*;
 
 import org.apache.hadoop.io.*;
-import org.apache.hadoop.fs.*;
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.fs.*;
 
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.util.NutchConfiguration;
@@ -32,7 +31,7 @@
 /** Data extracted from a page's content.
  * @see Parse#getData()
  */
-public final class ParseData extends VersionedWritable implements Configurable 
{
+public final class ParseData extends VersionedWritable {
   public static final String DIR_NAME = "parse_data";
 
   private final static byte VERSION = 5;
@@ -42,13 +41,8 @@
   private Metadata contentMeta;
   private Metadata parseMeta;
   private ParseStatus status;
-  private Configuration conf;
   private byte version = VERSION;
   
-  // TODO [EMAIL PROTECTED]: should we really implement Configurable or should 
we add the
-  // parameter Configuration to the default-constructor. NOTE: The test
-  // TestWriteable instantiates ParseData with Class.newInstance() -> the 
default
-  // constructor is called -> conf is null. The programmer which use this 
object may not forget to set the conf.
   public ParseData() {}
 
   public ParseData(ParseStatus status, String title, Outlink[] outlinks,
@@ -123,19 +117,11 @@
     status = ParseStatus.read(in);
     title = Text.readString(in);                   // read title
 
-    int totalOutlinks = in.readInt();             // read outlinks
-    int maxOutlinksPerPage = this.conf.getInt("db.max.outlinks.per.page", 100);
-    int outlinksToRead = totalOutlinks;
-    if (maxOutlinksPerPage >= 0) {
-      outlinksToRead = Math.min(maxOutlinksPerPage, totalOutlinks);
-    }
-    outlinks = new Outlink[outlinksToRead];
-    for (int i = 0; i < outlinksToRead; i++) {
+    int numOutlinks = in.readInt();    
+    outlinks = new Outlink[numOutlinks];
+    for (int i = 0; i < numOutlinks; i++) {
       outlinks[i] = Outlink.read(in);
     }
-    for (int i = outlinksToRead; i < totalOutlinks; i++) {
-      Outlink.skip(in);
-    }
     
     if (version < 3) {
       int propertyCount = in.readInt();             // read metadata
@@ -239,11 +225,4 @@
     }
   }
 
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
 }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseImpl.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseImpl.java?view=diff&rev=555237&r1=555236&r2=555237
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseImpl.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseImpl.java Wed Jul 
11 03:54:37 2007
@@ -19,18 +19,15 @@
 
 import java.io.*;
 import org.apache.hadoop.io.*;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configurable;
 
 
 /** The result of parsing a page's raw content.
  * @see Parser#getParse(Content)
  */
-public class ParseImpl implements Parse, Writable, Configurable {
+public class ParseImpl implements Parse, Writable {
   private ParseText text;
   private ParseData data;
   private boolean isCanonical;
-  private Configuration conf;
 
   public ParseImpl() {}
 
@@ -70,25 +67,13 @@
     text.readFields(in);
 
     data = new ParseData();
-    data.setConf(this.conf);
     data.readFields(in);
   }
 
-  public static ParseImpl read(DataInput in, Configuration conf) throws 
IOException {
+  public static ParseImpl read(DataInput in) throws IOException {
     ParseImpl parseImpl = new ParseImpl();
-    parseImpl.setConf(conf);
     parseImpl.readFields(in);
     return parseImpl;
   }
-
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
-
 
 }

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?view=diff&rev=555237&r1=555236&r2=555237
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java 
Wed Jul 11 03:54:37 2007
@@ -45,7 +45,6 @@
 public class ParseOutputFormat implements OutputFormat {
   private static final Log LOG = LogFactory.getLog(ParseOutputFormat.class);
 
-  private URLNormalizers urlNormalizers;
   private URLFilters filters;
   private ScoringFilters scfilters;
   
@@ -80,11 +79,12 @@
   public RecordWriter getRecordWriter(FileSystem fs, JobConf job,
                                       String name, Progressable progress) 
throws IOException {
 
-    this.urlNormalizers = new URLNormalizers(job, 
URLNormalizers.SCOPE_OUTLINK);
     this.filters = new URLFilters(job);
     this.scfilters = new ScoringFilters(job);
+    final UrlValidator validator = UrlValidator.get();
     final float interval = job.getFloat("db.default.fetch.interval", 30f);
     final boolean ignoreExternalLinks = 
job.getBoolean("db.ignore.external.links", false);
+    final int maxOutlinks = job.getInt("db.max.outlinks.per.page", 100);
     
     Path text =
       new Path(new Path(job.getOutputPath(), ParseText.DIR_NAME), name);
@@ -132,6 +132,7 @@
 
           // collect outlinks for subsequent db update
           Outlink[] links = parseData.getOutlinks();
+          int outlinksToStore = Math.min(maxOutlinks, links.length);
           if (ignoreExternalLinks) {
             try {
               fromHost = new URL(fromUrl).getHost().toLowerCase();
@@ -142,29 +143,33 @@
             fromHost = null;
           }
 
-          String[] toUrls = new String[links.length];
           int validCount = 0;
-          for (int i = 0; i < links.length; i++) {
+          CrawlDatum adjust = null;
+          List<Entry<Text, CrawlDatum>> targets = new ArrayList<Entry<Text, 
CrawlDatum>>();
+          List<Outlink> outlinkList = new ArrayList<Outlink>();
+          for (int i = 0; i < links.length && validCount < outlinksToStore; 
i++) {
             String toUrl = links[i].getToUrl();
+            if (!validator.isValid(toUrl)) {
+              continue;
+            }
             try {
-              toUrl = urlNormalizers.normalize(toUrl, 
URLNormalizers.SCOPE_OUTLINK); // normalize the url
+              // normalizing here is not necessary since outlinks 
+              // are already normalized in Outlink's constructor
               toUrl = filters.filter(toUrl);   // filter the url
+              if (toUrl == null) {
+                continue;
+              }
             } catch (Exception e) {
-              toUrl = null;
+              continue;
             }
+            
             // ignore links to self (or anchors within the page)
-            if (fromUrl.equals(toUrl)) toUrl = null;
-            if (toUrl != null) validCount++;
-            toUrls[i] = toUrl;
-          }
-          CrawlDatum adjust = null;
-          List<Entry<Text, CrawlDatum>> targets = new ArrayList<Entry<Text, 
CrawlDatum>>();
-          // compute score contributions and adjustment to the original score
-          for (int i = 0; i < toUrls.length; i++) {
-            if (toUrls[i] == null) continue;
+            if (fromUrl.equals(toUrl)) {
+              continue;
+            }
             if (ignoreExternalLinks) {
               try {
-                toHost = new URL(toUrls[i]).getHost().toLowerCase();
+                toHost = new URL(toUrl).getHost().toLowerCase();
               } catch (MalformedURLException e) {
                 toHost = null;
               }
@@ -173,7 +178,7 @@
               }
             }
             CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, 
interval);
-            Text targetUrl = new Text(toUrls[i]);
+            Text targetUrl = new Text(toUrl);
             try {
               scfilters.initialScore(targetUrl, target);
             } catch (ScoringFilterException e) {
@@ -183,8 +188,11 @@
             }
             
             targets.add(new SimpleEntry(targetUrl, target));
+            outlinkList.add(links[i]);
+            validCount++;
           }
           try {
+            // compute score contributions and adjustment to the original score
             adjust = scfilters.distributeScoreToOutlinks((Text)key, parseData, 
                       targets, null, links.length);
           } catch (ScoringFilterException e) {
@@ -195,6 +203,10 @@
           }
           if (adjust != null) crawlOut.append(key, adjust);
 
+          Outlink[] filteredLinks = outlinkList.toArray(new 
Outlink[outlinkList.size()]);
+          parseData = new ParseData(parseData.getStatus(), 
parseData.getTitle(), 
+                                    filteredLinks, parseData.getContentMeta(), 
+                                    parseData.getParseMeta());
           dataOut.append(key, parseData);
           if (!parse.isCanonical()) {
             CrawlDatum datum = new CrawlDatum();

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java?view=diff&rev=555237&r1=555236&r2=555237
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java Wed Jul 
11 03:54:37 2007
@@ -258,7 +258,6 @@
     public EmptyParseImpl(ParseStatus status, Configuration conf) {
       data = new ParseData(status, "", new Outlink[0],
                            new Metadata(), new Metadata());
-      data.setConf(conf);
     }
     
     public ParseData getData() {

Modified: 
lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java?view=diff&rev=555237&r1=555236&r2=555237
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java
 Wed Jul 11 03:54:37 2007
@@ -104,7 +104,6 @@
     ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
                                         outlinks, content.getMetadata(),
                                         metadata);
-    parseData.setConf(this.conf);
     return ParseResult.createParseResult(content.getUrl(), 
                                          new ParseImpl(text, parseData));
   }

Modified: 
lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java?view=diff&rev=555237&r1=555236&r2=555237
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
 Wed Jul 11 03:54:37 2007
@@ -134,7 +134,6 @@
 
     ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
                                         outlinks, content.getMetadata());
-    parseData.setConf(this.conf);
     return ParseResult.createParseResult(content.getUrl(), 
                                          new ParseImpl(text, parseData));
   }

Modified: 
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?view=diff&rev=555237&r1=555236&r2=555237
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
 Wed Jul 11 03:54:37 2007
@@ -213,7 +213,6 @@
     }
     ParseData parseData = new ParseData(status, title, outlinks,
                                         content.getMetadata(), metadata);
-    parseData.setConf(this.conf);
     ParseResult parseResult = ParseResult.createParseResult(content.getUrl(), 
                                                  new ParseImpl(text, 
parseData));
 

Modified: 
lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?view=diff&rev=555237&r1=555236&r2=555237
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
 Wed Jul 11 03:54:37 2007
@@ -90,7 +90,6 @@
       ParseData parseData = new ParseData(status, title, newlinks,
                                           parse.getData().getContentMeta(),
                                           parse.getData().getParseMeta());
-      parseData.setConf(this.conf);
 
       // replace original parse obj with new one
       parseResult.put(content.getUrl(), new ParseText(text), parseData);
@@ -170,7 +169,6 @@
     }
     ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks,
                                  c.getMetadata());
-    pd.setConf(this.conf);
     return ParseResult.createParseResult(c.getUrl(), new ParseImpl(script, 
pd));
   }
   

Modified: 
lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java?view=diff&rev=555237&r1=555236&r2=555237
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
 Wed Jul 11 03:54:37 2007
@@ -153,7 +153,6 @@
     ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
                                         outlinks, content.getMetadata(),
                                         metadata);
-    parseData.setConf(this.conf);
     return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, 
parseData));
     // any filter?
     //return HtmlParseFilters.filter(content, parse, root);

Modified: 
lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java?view=diff&rev=555237&r1=555236&r2=555237
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java
 Wed Jul 11 03:54:37 2007
@@ -199,7 +199,6 @@
 
         ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
                 contentTitle.toString(), outlinks, content.getMetadata());
-        parseData.setConf(this.conf);
         return ParseResult.createParseResult(content.getUrl(), new ParseImpl(indexText.toString(), parseData));
     }
 

Modified: lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java?view=diff&rev=555237&r1=555236&r2=555237
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java Wed Jul 11 03:54:37 2007
@@ -53,7 +53,6 @@
     
     ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "",
         OutlinkExtractor.getOutlinks(text, getConf()), content.getMetadata());
-    parseData.setConf(this.conf);
     return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
   }
 

Modified: lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java?view=diff&rev=555237&r1=555236&r2=555237
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java Wed Jul 11 03:54:37 2007
@@ -100,7 +100,6 @@
     final ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
                                               resultTitle, outlinks,
                                               content.getMetadata());
-    parseData.setConf(this.conf);
 
     if (LOG.isTraceEnabled()) { LOG.trace("Zip file parsed sucessfully !!"); }
     return ParseResult.createParseResult(content.getUrl(), new ParseImpl(resultText, parseData));

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java?view=diff&rev=555237&r1=555236&r2=555237
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java Wed Jul 11 03:54:37 2007
@@ -47,9 +47,8 @@
     metaData.add("Charset", "UTF-8");
 
     ParseData r = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData);
-    r.setConf(conf);
                         
-    WritableTestUtils.testWritable(r, conf);
+    WritableTestUtils.testWritable(r, null);
   }
        
   public void testMaxOutlinks() throws Exception {
@@ -61,22 +60,7 @@
                                        "Max Outlinks Title",
                                        outlinks,
                                        new Metadata());
-    Configuration conf = NutchConfiguration.create();
-    // No Outlinks
-    conf.setInt("db.max.outlinks.per.page", 0);
-    ParseData data = (ParseData) WritableTestUtils.writeRead(original, conf);
-    assertEquals(0, data.getOutlinks().length);
-    // Only 100 Outlinks
-    conf.setInt("db.max.outlinks.per.page", 100);
-    data = (ParseData) WritableTestUtils.writeRead(original, conf);
-    assertEquals(100, data.getOutlinks().length);
-    // 256 Outlinks
-    conf.setInt("db.max.outlinks.per.page", 256);
-    data = (ParseData) WritableTestUtils.writeRead(original, conf);
-    assertEquals(outlinks.length, data.getOutlinks().length);
-    // All Outlinks
-    conf.setInt("db.max.outlinks.per.page", -1);
-    data = (ParseData) WritableTestUtils.writeRead(original, conf);
+    ParseData data = (ParseData) WritableTestUtils.writeRead(original, null);
     assertEquals(outlinks.length, data.getOutlinks().length);
   }
 }



-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
_______________________________________________
Nutch-cvs mailing list
Nutch-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

[Nutch-cvs] svn commit: r555237 - in /lucene/nutch/trunk: ./ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/net/ src/java/org/apache/nutch/parse/ src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/ src/plugin/parse-ext/src/java/org/apache/nutc...

Reply via email to