Author: dogacan
Date: Thu Nov 8 07:08:47 2007
New Revision: 593186

URL: http://svn.apache.org/viewvc?rev=593186&view=rev
Log:
NUTCH-548 - Move URLNormalizer from Outlink to ParseOutputFormat. Contributed by Emmanuel Joke.
Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=593186&r1=593185&r2=593186&view=diff ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Thu Nov 8 07:08:47 2007 @@ -161,6 +161,9 @@ 55. NUTCH-547 - Redirection handling: YahooSlurp's algorithm. (dogacan, kubes via dogacan) +56. NUTCH-548 - Move URLNormalizer from Outlink to ParseOutputFormat. + (Emmanuel Joke via dogacan) + Release 0.9 - 2007-04-02 1. 
Changed log4j confiquration to log to stdout on commandline Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java?rev=593186&r1=593185&r2=593186&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java Thu Nov 8 07:08:47 2007 @@ -32,8 +32,8 @@ public Outlink() {} - public Outlink(String toUrl, String anchor, Configuration conf) throws MalformedURLException { - this.toUrl = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK).normalize(toUrl, URLNormalizers.SCOPE_OUTLINK); + public Outlink(String toUrl, String anchor) throws MalformedURLException { + this.toUrl = toUrl; if (anchor == null) anchor = ""; this.anchor = anchor; } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java?rev=593186&r1=593185&r2=593186&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java Thu Nov 8 07:08:47 2007 @@ -111,7 +111,7 @@ result = matcher.getMatch(); url = result.group(0); try { - outlinks.add(new Outlink(url, anchor, conf)); + outlinks.add(new Outlink(url, anchor)); } catch (MalformedURLException mue) { LOG.warn("Invalid url: '" + url + "', skipping."); } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=593186&r1=593185&r2=593186&view=diff 
============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Thu Nov 8 07:08:47 2007 @@ -47,8 +47,8 @@ public class ParseOutputFormat implements OutputFormat { private static final Log LOG = LogFactory.getLog(ParseOutputFormat.class); - private URLNormalizers normalizers; private URLFilters filters; + private URLNormalizers normalizers; private ScoringFilters scfilters; private static class SimpleEntry implements Entry<Text, CrawlDatum> { @@ -82,9 +82,8 @@ public RecordWriter getRecordWriter(FileSystem fs, JobConf job, String name, Progressable progress) throws IOException { - this.normalizers = new URLNormalizers(job, - URLNormalizers.SCOPE_OUTLINK); this.filters = new URLFilters(job); + this.normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_OUTLINK); this.scfilters = new ScoringFilters(job); final int interval = job.getInt("db.fetch.interval.default", 2592000); final boolean ignoreExternalLinks = job.getBoolean("db.ignore.external.links", false); @@ -198,8 +197,8 @@ } } try { - // normalizing here is not necessary since outlinks - // are already normalized in Outlink's constructor + toUrl = normalizers.normalize(toUrl, + URLNormalizers.SCOPE_OUTLINK); // normalize the url toUrl = filters.filter(toUrl); // filter the url if (toUrl == null) { continue; Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=593186&r1=593185&r2=593186&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original) +++ 
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Thu Nov 8 07:08:47 2007 @@ -403,7 +403,7 @@ URL url = (base.toString().indexOf(';') > 0) ? fixEmbeddedParams(base, target) : new URL(base, target); outlinks.add(new Outlink(url.toString(), - linkText.toString().trim(), conf)); + linkText.toString().trim())); } catch (MalformedURLException e) { // don't care } Modified: lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?rev=593186&r1=593185&r2=593186&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (original) +++ lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Thu Nov 8 07:08:47 2007 @@ -241,55 +241,55 @@ try { answerOutlinks = new Outlink[][]{ { - new Outlink("http://www.nutch.org", "anchor", conf), + new Outlink("http://www.nutch.org", "anchor"), }, { - new Outlink("http://www.nutch.org/", "home", conf), - new Outlink("http://www.nutch.org/docs/bot.html", "bots", conf), + new Outlink("http://www.nutch.org/", "home"), + new Outlink("http://www.nutch.org/docs/bot.html", "bots"), }, { - new Outlink("http://www.nutch.org/", "separate this", conf), - new Outlink("http://www.nutch.org/docs/ok", "from this", conf), + new Outlink("http://www.nutch.org/", "separate this"), + new Outlink("http://www.nutch.org/docs/ok", "from this"), }, { - new Outlink("http://www.nutch.org/", "home", conf), - new Outlink("http://www.nutch.org/docs/1", "1", conf), - new Outlink("http://www.nutch.org/docs/2", "2", conf), + new Outlink("http://www.nutch.org/", "home"), + new Outlink("http://www.nutch.org/docs/1", "1"), + new Outlink("http://www.nutch.org/docs/2", 
"2"), }, { - new Outlink("http://www.nutch.org/frames/top.html", "", conf), - new Outlink("http://www.nutch.org/frames/left.html", "", conf), - new Outlink("http://www.nutch.org/frames/invalid.html", "", conf), - new Outlink("http://www.nutch.org/frames/right.html", "", conf), + new Outlink("http://www.nutch.org/frames/top.html", ""), + new Outlink("http://www.nutch.org/frames/left.html", ""), + new Outlink("http://www.nutch.org/frames/invalid.html", ""), + new Outlink("http://www.nutch.org/frames/right.html", ""), }, { - new Outlink("http://www.nutch.org/maps/logo.gif", "", conf), - new Outlink("http://www.nutch.org/index.html", "", conf), - new Outlink("http://www.nutch.org/maps/#bottom", "", conf), - new Outlink("http://www.nutch.org/bot.html", "", conf), - new Outlink("http://www.nutch.org/docs/index.html", "", conf), + new Outlink("http://www.nutch.org/maps/logo.gif", ""), + new Outlink("http://www.nutch.org/index.html", ""), + new Outlink("http://www.nutch.org/maps/#bottom", ""), + new Outlink("http://www.nutch.org/bot.html", ""), + new Outlink("http://www.nutch.org/docs/index.html", ""), }, { - new Outlink("http://www.nutch.org/index.html", "whitespace test", conf), + new Outlink("http://www.nutch.org/index.html", "whitespace test"), }, { }, { - new Outlink("http://www.nutch.org/dummy.jsp", "test2", conf), + new Outlink("http://www.nutch.org/dummy.jsp", "test2"), }, { }, { - new Outlink("http://www.nutch.org/;x", "anchor1", conf), - new Outlink("http://www.nutch.org/g;x", "anchor2", conf), - new Outlink("http://www.nutch.org/g;x?y#s", "anchor3", conf) + new Outlink("http://www.nutch.org/;x", "anchor1"), + new Outlink("http://www.nutch.org/g;x", "anchor2"), + new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") }, { - new Outlink("http://www.nutch.org/g;something", "anchor1", conf), - new Outlink("http://www.nutch.org/g;something?y#s", "anchor2", conf), - new Outlink("http://www.nutch.org/;something?y=1", "anchor3", conf), - new 
Outlink("http://www.nutch.org/;something?y=1#s", "anchor4", conf), - new Outlink("http://www.nutch.org/?y=1;somethingelse", "anchor5", conf) + new Outlink("http://www.nutch.org/g;something", "anchor1"), + new Outlink("http://www.nutch.org/g;something?y#s", "anchor2"), + new Outlink("http://www.nutch.org/;something?y=1", "anchor3"), + new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"), + new Outlink("http://www.nutch.org/?y=1;somethingelse", "anchor5") } }; Modified: lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?rev=593186&r1=593185&r2=593186&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (original) +++ lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Thu Nov 8 07:08:47 2007 @@ -236,7 +236,7 @@ if (LOG.isTraceEnabled()) { LOG.trace(" - outlink from JS: '" + url + "'"); } - outlinks.add(new Outlink(url, anchor, getConf())); + outlinks.add(new Outlink(url, anchor)); } } catch (Exception ex) { // if it is a malformed URL we just throw it away and continue with