Author: dogacan Date: Wed Jul 11 03:54:37 2007 New Revision: 555237 URL: http://svn.apache.org/viewvc?view=rev&rev=555237 Log: NUTCH-505 - Outlink urls should be validated.
Added: lucene/nutch/trunk/src/java/org/apache/nutch/net/UrlValidator.java Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseImpl.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=555237&r1=555236&r2=555237 ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Wed Jul 11 03:54:37 2007 @@ -81,6 +81,8 @@ 26. NUTCH-503 - Generator exits incorrectly for small fetchlists. (Vishal Shah via dogacan) +27. NUTCH-505 - Outlink urls should be validated. (dogacan) + Release 0.9 - 2007-04-02 1. Changed log4j confiquration to log to stdout on commandline Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java?view=diff&rev=555237&r1=555236&r2=555237 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java Wed Jul 11 03:54:37 2007 @@ -23,15 +23,12 @@ import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.protocol.Content; import org.apache.nutch.parse.*; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.conf.Configurable; /* An entry in the fetcher's output. */ -public final class FetcherOutput implements Writable, Configurable { +public final class FetcherOutput implements Writable { private CrawlDatum crawlDatum; private Content content; private ParseImpl parse; - private Configuration conf; public FetcherOutput() {} @@ -45,7 +42,7 @@ public final void readFields(DataInput in) throws IOException { this.crawlDatum = CrawlDatum.read(in); this.content = in.readBoolean() ? Content.read(in) : null; - this.parse = in.readBoolean() ? ParseImpl.read(in, this.conf) : null; + this.parse = in.readBoolean() ? ParseImpl.read(in) : null; } public final void write(DataOutput out) throws IOException { @@ -79,14 +76,6 @@ StringBuffer buffer = new StringBuffer(); buffer.append("CrawlDatum: " + crawlDatum+"\n" ); return buffer.toString(); - } - - public void setConf(Configuration conf) { - this.conf = conf; - } - - public Configuration getConf() { - return this.conf; } } Added: lucene/nutch/trunk/src/java/org/apache/nutch/net/UrlValidator.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/net/UrlValidator.java?view=auto&rev=555237 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/net/UrlValidator.java (added) +++ lucene/nutch/trunk/src/java/org/apache/nutch/net/UrlValidator.java Wed Jul 11 03:54:37 2007 @@ -0,0 +1,377 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.net; + +import org.apache.oro.text.perl.Perl5Util; + +/** + * <p>Validates URLs.</p> + * + * <p>Originally based in on php script by Debbie Dyer, validation.php v1.2b, Date: 03/07/02, + * http://javascript.internet.com. However, this validation now bears little resemblance + * to the php original.</p> + * <pre> + * Example of usage: + * UrlValidator urlValidator = UrlValidator.get(); + * if (urlValidator.isValid("ftp://foo.bar.com/")) { + * System.out.println("url is valid"); + * } else { + * System.out.println("url is invalid"); + * } + * + * prints out "url is valid" + * </pre> + * + * <p>Based on UrlValidator code from Apache commons-validator.</p> + * + * @see + * <a href='http://www.ietf.org/rfc/rfc2396.txt' > + * Uniform Resource Identifiers (URI): Generic Syntax + * </a> + * + */ +public class UrlValidator { + + private static final String ALPHA_CHARS = "a-zA-Z"; + + private static final String ALPHA_NUMERIC_CHARS = ALPHA_CHARS + "\\d"; + + private static final String SPECIAL_CHARS = ";/@&=,.?:+$"; + + private static final String VALID_CHARS = "[^\\s" + SPECIAL_CHARS + "]"; + + private static final String SCHEME_CHARS = ALPHA_CHARS; + + // Drop numeric, and "+-." for now + private static final String AUTHORITY_CHARS = ALPHA_NUMERIC_CHARS + "\\-\\."; + + private static final String ATOM = VALID_CHARS + '+'; + + /** + * This expression derived/taken from the BNF for URI (RFC2396). + */ + private static final String URL_PATTERN = + "/^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?/"; + // 12 3 4 5 6 7 8 9 + + /** + * Schema/Protocol (ie. http:, ftp:, file:, etc). + */ + private static final int PARSE_URL_SCHEME = 2; + + /** + * Includes hostname/ip and port number. + */ + private static final int PARSE_URL_AUTHORITY = 4; + + private static final int PARSE_URL_PATH = 5; + + private static final int PARSE_URL_QUERY = 7; + + /** + * Protocol (ie. http:, ftp:,https:). + */ + private static final String SCHEME_PATTERN = "/^[" + SCHEME_CHARS + "]/"; + + private static final String AUTHORITY_PATTERN = + "/^([" + AUTHORITY_CHARS + "]*)(:\\d*)?(.*)?/"; + // 1 2 3 4 + + private static final int PARSE_AUTHORITY_HOST_IP = 1; + + private static final int PARSE_AUTHORITY_PORT = 2; + + /** + * Should always be empty. + */ + private static final int PARSE_AUTHORITY_EXTRA = 3; + + private static final String PATH_PATTERN = "/^(/[-\\w:@&?=+,.!/~*'%$_;]*)?$/"; + + private static final String QUERY_PATTERN = "/^(.*)$/"; + + private static final String LEGAL_ASCII_PATTERN = "/^[\\000-\\177]+$/"; + + private static final String IP_V4_DOMAIN_PATTERN = + "/^(\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})$/"; + + private static final String DOMAIN_PATTERN = + "/^" + ATOM + "(\\." + ATOM + ")*$/"; + + private static final String PORT_PATTERN = "/^:(\\d{1,5})$/"; + + private static final String ATOM_PATTERN = "/(" + ATOM + ")/"; + + private static final String ALPHA_PATTERN = "/^[" + ALPHA_CHARS + "]/"; + + private static final UrlValidator VALIDATOR = new UrlValidator(); + + private UrlValidator() { + } + + public static UrlValidator get() { + return VALIDATOR; + } + + /** + * <p>Checks if a field has a valid url address.</p> + * + * @param value The value validation is being performed on. A <code>null</code> + * value is considered invalid. + * @return true if the url is valid. + */ + public boolean isValid(String value) { + if (value == null) { + return false; + } + + Perl5Util matchUrlPat = new Perl5Util(); + Perl5Util matchAsciiPat = new Perl5Util(); + + if (!matchAsciiPat.match(LEGAL_ASCII_PATTERN, value)) { + return false; + } + + // Check the whole url address structure + if (!matchUrlPat.match(URL_PATTERN, value)) { + return false; + } + + if (!isValidScheme(matchUrlPat.group(PARSE_URL_SCHEME))) { + return false; + } + + if (!isValidAuthority(matchUrlPat.group(PARSE_URL_AUTHORITY))) { + return false; + } + + if (!isValidPath(matchUrlPat.group(PARSE_URL_PATH))) { + return false; + } + + if (!isValidQuery(matchUrlPat.group(PARSE_URL_QUERY))) { + return false; + } + + return true; + } + + /** + * Validate scheme. If schemes[] was initialized to a non null, + * then only those scheme's are allowed. Note this is slightly different + * than for the constructor. + * @param scheme The scheme to validate. A <code>null</code> value is considered + * invalid. + * @return true if valid. + */ + protected boolean isValidScheme(String scheme) { + if (scheme == null) { + return false; + } + + Perl5Util schemeMatcher = new Perl5Util(); + if (!schemeMatcher.match(SCHEME_PATTERN, scheme)) { + return false; + } + + return true; + } + + /** + * Returns true if the authority is properly formatted. An authority is the combination + * of hostname and port. A <code>null</code> authority value is considered invalid. + * @param authority Authority value to validate. + * @return true if authority (hostname and port) is valid. + */ + protected boolean isValidAuthority(String authority) { + if (authority == null) { + return false; + } + + Perl5Util authorityMatcher = new Perl5Util(); + Perl5Util matchIPV4Pat = new Perl5Util(); + + if (!authorityMatcher.match(AUTHORITY_PATTERN, authority)) { + return false; + } + + boolean ipV4Address = false; + boolean hostname = false; + // check if authority is IP address or hostname + String hostIP = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP); + ipV4Address = matchIPV4Pat.match(IP_V4_DOMAIN_PATTERN, hostIP); + + if (ipV4Address) { + // this is an IP address so check components + for (int i = 1; i <= 4; i++) { + String ipSegment = matchIPV4Pat.group(i); + if (ipSegment == null || ipSegment.length() <= 0) { + return false; + } + + try { + if (Integer.parseInt(ipSegment) > 255) { + return false; + } + } catch(NumberFormatException e) { + return false; + } + + } + } else { + // Domain is hostname name + Perl5Util domainMatcher = new Perl5Util(); + hostname = domainMatcher.match(DOMAIN_PATTERN, hostIP); + } + + // rightmost hostname will never start with a digit. + if (hostname) { + // LOW-TECH FIX FOR VALIDATOR-202 + // TODO: Rewrite to use ArrayList and .add semantics: see VALIDATOR-203 + char[] chars = hostIP.toCharArray(); + int size = 1; + for(int i=0; i<chars.length; i++) { + if(chars[i] == '.') { + size++; + } + } + String[] domainSegment = new String[size]; + boolean match = true; + int segCount = 0; + int segLen = 0; + Perl5Util atomMatcher = new Perl5Util(); + + while (match) { + match = atomMatcher.match(ATOM_PATTERN, hostIP); + if (match) { + domainSegment[segCount] = atomMatcher.group(1); + segLen = domainSegment[segCount].length() + 1; + hostIP = (segLen >= hostIP.length()) ? "" + : hostIP.substring(segLen); + segCount++; + } + } + String topLevel = domainSegment[segCount - 1]; + if (topLevel.length() < 2 || topLevel.length() > 4) { + return false; + } + + // First letter of top level must be a alpha + Perl5Util alphaMatcher = new Perl5Util(); + if (!alphaMatcher.match(ALPHA_PATTERN, topLevel.substring(0, 1))) { + return false; + } + + // Make sure there's a host name preceding the authority. + if (segCount < 2) { + return false; + } + } + + if (!hostname && !ipV4Address) { + return false; + } + + String port = authorityMatcher.group(PARSE_AUTHORITY_PORT); + if (port != null) { + Perl5Util portMatcher = new Perl5Util(); + if (!portMatcher.match(PORT_PATTERN, port)) { + return false; + } + } + + String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA); + if (!isBlankOrNull(extra)) { + return false; + } + + return true; + } + + /** + * <p>Checks if the field isn't null and length of the field is greater + * than zero not including whitespace.</p> + * + * @param value The value validation is being performed on. + * @return true if blank or null. + */ + private boolean isBlankOrNull(String value) { + return ((value == null) || (value.trim().length() == 0)); + } + + /** + * Returns true if the path is valid. A <code>null</code> value is considered invalid. + * @param path Path value to validate. + * @return true if path is valid. + */ + protected boolean isValidPath(String path) { + if (path == null) { + return false; + } + + Perl5Util pathMatcher = new Perl5Util(); + + if (!pathMatcher.match(PATH_PATTERN, path)) { + return false; + } + + int slash2Count = countToken("//", path); + + int slashCount = countToken("/", path); + int dot2Count = countToken("..", path); + if (dot2Count > 0) { + if ((slashCount - slash2Count - 1) <= dot2Count) { + return false; + } + } + + return true; + } + + /** + * Returns true if the query is null or it's a properly formatted query string. + * @param query Query value to validate. + * @return true if query is valid. + */ + protected boolean isValidQuery(String query) { + if (query == null) { + return true; + } + + Perl5Util queryMatcher = new Perl5Util(); + return queryMatcher.match(QUERY_PATTERN, query); + } + + /** + * Returns the number of times the token appears in the target. + * @param token Token value to be counted. + * @param target Target value to count tokens in. + * @return the number of tokens. + */ + protected int countToken(String token, String target) { + int tokenIndex = 0; + int count = 0; + while (tokenIndex != -1) { + tokenIndex = target.indexOf(token, tokenIndex); + if (tokenIndex > -1) { + tokenIndex++; + count++; + } + } + return count; + } +} Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java?view=diff&rev=555237&r1=555236&r2=555237 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java Wed Jul 11 03:54:37 2007 @@ -21,9 +21,8 @@ import java.util.*; import org.apache.hadoop.io.*; -import org.apache.hadoop.fs.*; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.conf.Configurable; +import org.apache.hadoop.fs.*; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.util.NutchConfiguration; @@ -32,7 +31,7 @@ /** Data extracted from a page's content. * @see Parse#getData() */ -public final class ParseData extends VersionedWritable implements Configurable { +public final class ParseData extends VersionedWritable { public static final String DIR_NAME = "parse_data"; private final static byte VERSION = 5; @@ -42,13 +41,8 @@ private Metadata contentMeta; private Metadata parseMeta; private ParseStatus status; - private Configuration conf; private byte version = VERSION; - // TODO [EMAIL PROTECTED]: should we really implement Configurable or should we add the - // parameter Configuration to the default-constructor. NOTE: The test - // TestWriteable instantiates ParseData with Class.newInstance() -> the default - // constructor is called -> conf is null. The programmer which use this object may not forget to set the conf. public ParseData() {} public ParseData(ParseStatus status, String title, Outlink[] outlinks, @@ -123,19 +117,11 @@ status = ParseStatus.read(in); title = Text.readString(in); // read title - int totalOutlinks = in.readInt(); // read outlinks - int maxOutlinksPerPage = this.conf.getInt("db.max.outlinks.per.page", 100); - int outlinksToRead = totalOutlinks; - if (maxOutlinksPerPage >= 0) { - outlinksToRead = Math.min(maxOutlinksPerPage, totalOutlinks); - } - outlinks = new Outlink[outlinksToRead]; - for (int i = 0; i < outlinksToRead; i++) { + int numOutlinks = in.readInt(); + outlinks = new Outlink[numOutlinks]; + for (int i = 0; i < numOutlinks; i++) { outlinks[i] = Outlink.read(in); } - for (int i = outlinksToRead; i < totalOutlinks; i++) { - Outlink.skip(in); - } if (version < 3) { int propertyCount = in.readInt(); // read metadata @@ -239,11 +225,4 @@ } } - public void setConf(Configuration conf) { - this.conf = conf; - } - - public Configuration getConf() { - return this.conf; - } } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseImpl.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseImpl.java?view=diff&rev=555237&r1=555236&r2=555237 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseImpl.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseImpl.java Wed Jul 11 03:54:37 2007 @@ -19,18 +19,15 @@ import java.io.*; import org.apache.hadoop.io.*; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.conf.Configurable; /** The result of parsing a page's raw content. * @see Parser#getParse(Content) */ -public class ParseImpl implements Parse, Writable, Configurable { +public class ParseImpl implements Parse, Writable { private ParseText text; private ParseData data; private boolean isCanonical; - private Configuration conf; public ParseImpl() {} @@ -70,25 +67,13 @@ text.readFields(in); data = new ParseData(); - data.setConf(this.conf); data.readFields(in); } - public static ParseImpl read(DataInput in, Configuration conf) throws IOException { + public static ParseImpl read(DataInput in) throws IOException { ParseImpl parseImpl = new ParseImpl(); - parseImpl.setConf(conf); parseImpl.readFields(in); return parseImpl; } - - public void setConf(Configuration conf) { - this.conf = conf; - - } - - public Configuration getConf() { - return this.conf; - } - } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?view=diff&rev=555237&r1=555236&r2=555237 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Wed Jul 11 03:54:37 2007 @@ -45,7 +45,6 @@ public class ParseOutputFormat implements OutputFormat { private static final Log LOG = LogFactory.getLog(ParseOutputFormat.class); - private URLNormalizers urlNormalizers; private URLFilters filters; private ScoringFilters scfilters; @@ -80,11 +79,12 @@ public RecordWriter getRecordWriter(FileSystem fs, JobConf job, String name, Progressable progress) throws IOException { - this.urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_OUTLINK); this.filters = new URLFilters(job); this.scfilters = new ScoringFilters(job); + final UrlValidator validator = UrlValidator.get(); final float interval = job.getFloat("db.default.fetch.interval", 30f); final boolean ignoreExternalLinks = job.getBoolean("db.ignore.external.links", false); + final int maxOutlinks = job.getInt("db.max.outlinks.per.page", 100); Path text = new Path(new Path(job.getOutputPath(), ParseText.DIR_NAME), name); @@ -132,6 +132,7 @@ // collect outlinks for subsequent db update Outlink[] links = parseData.getOutlinks(); + int outlinksToStore = Math.min(maxOutlinks, links.length); if (ignoreExternalLinks) { try { fromHost = new URL(fromUrl).getHost().toLowerCase(); @@ -142,29 +143,33 @@ fromHost = null; } - String[] toUrls = new String[links.length]; int validCount = 0; - for (int i = 0; i < links.length; i++) { + CrawlDatum adjust = null; + List<Entry<Text, CrawlDatum>> targets = new ArrayList<Entry<Text, CrawlDatum>>(); + List<Outlink> outlinkList = new ArrayList<Outlink>(); + for (int i = 0; i < links.length && validCount < outlinksToStore; i++) { String toUrl = links[i].getToUrl(); + if (!validator.isValid(toUrl)) { + continue; + } try { - toUrl = urlNormalizers.normalize(toUrl, URLNormalizers.SCOPE_OUTLINK); // normalize the url + // normalizing here is not necessary since outlinks + // are already normalized in Outlink's constructor toUrl = filters.filter(toUrl); // filter the url + if (toUrl == null) { + continue; + } } catch (Exception e) { - toUrl = null; + continue; } + // ignore links to self (or anchors within the page) - if (fromUrl.equals(toUrl)) toUrl = null; - if (toUrl != null) validCount++; - toUrls[i] = toUrl; - } - CrawlDatum adjust = null; - List<Entry<Text, CrawlDatum>> targets = new ArrayList<Entry<Text, CrawlDatum>>(); - // compute score contributions and adjustment to the original score - for (int i = 0; i < toUrls.length; i++) { - if (toUrls[i] == null) continue; + if (fromUrl.equals(toUrl)) { + continue; + } if (ignoreExternalLinks) { try { - toHost = new URL(toUrls[i]).getHost().toLowerCase(); + toHost = new URL(toUrl).getHost().toLowerCase(); } catch (MalformedURLException e) { toHost = null; } @@ -173,7 +178,7 @@ } } CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval); - Text targetUrl = new Text(toUrls[i]); + Text targetUrl = new Text(toUrl); try { scfilters.initialScore(targetUrl, target); } catch (ScoringFilterException e) { @@ -183,8 +188,11 @@ } targets.add(new SimpleEntry(targetUrl, target)); + outlinkList.add(links[i]); + validCount++; } try { + // compute score contributions and adjustment to the original score adjust = scfilters.distributeScoreToOutlinks((Text)key, parseData, targets, null, links.length); } catch (ScoringFilterException e) { @@ -195,6 +203,10 @@ } if (adjust != null) crawlOut.append(key, adjust); + Outlink[] filteredLinks = outlinkList.toArray(new Outlink[outlinkList.size()]); + parseData = new ParseData(parseData.getStatus(), parseData.getTitle(), + filteredLinks, parseData.getContentMeta(), + parseData.getParseMeta()); dataOut.append(key, parseData); if (!parse.isCanonical()) { CrawlDatum datum = new CrawlDatum(); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java?view=diff&rev=555237&r1=555236&r2=555237 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java Wed Jul 11 03:54:37 2007 @@ -258,7 +258,6 @@ public EmptyParseImpl(ParseStatus status, Configuration conf) { data = new ParseData(status, "", new Outlink[0], new Metadata(), new Metadata()); - data.setConf(conf); } public ParseData getData() { Modified: lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java?view=diff&rev=555237&r1=555236&r2=555237 ============================================================================== --- lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java (original) +++ lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java Wed Jul 11 03:54:37 2007 @@ -104,7 +104,6 @@ ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, content.getMetadata(), metadata); - parseData.setConf(this.conf); return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData)); } Modified: lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java?view=diff&rev=555237&r1=555236&r2=555237 ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java Wed Jul 11 03:54:37 2007 @@ -134,7 +134,6 @@ ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, content.getMetadata()); - parseData.setConf(this.conf); return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData)); } Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?view=diff&rev=555237&r1=555236&r2=555237 ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Wed Jul 11 03:54:37 2007 @@ -213,7 +213,6 @@ } ParseData parseData = new ParseData(status, title, outlinks, content.getMetadata(), metadata); - parseData.setConf(this.conf); ParseResult parseResult = ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData)); Modified: lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?view=diff&rev=555237&r1=555236&r2=555237 ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (original) +++ lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Wed Jul 11 03:54:37 2007 @@ -90,7 +90,6 @@ ParseData parseData = new ParseData(status, title, newlinks, parse.getData().getContentMeta(), parse.getData().getParseMeta()); - parseData.setConf(this.conf); // replace original parse obj with new one parseResult.put(content.getUrl(), new ParseText(text), parseData); @@ -170,7 +169,6 @@ } ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, c.getMetadata()); - pd.setConf(this.conf); return ParseResult.createParseResult(c.getUrl(), new ParseImpl(script, pd)); } Modified: lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java?view=diff&rev=555237&r1=555236&r2=555237 ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java Wed Jul 11 03:54:37 2007 @@ -153,7 +153,6 @@ ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, content.getMetadata(), metadata); - parseData.setConf(this.conf); return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData)); // any filter? //return HtmlParseFilters.filter(content, parse, root); Modified: lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java?view=diff&rev=555237&r1=555236&r2=555237 ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java Wed Jul 11 03:54:37 2007 @@ -199,7 +199,6 @@ ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, contentTitle.toString(), outlinks, content.getMetadata()); - parseData.setConf(this.conf); return ParseResult.createParseResult(content.getUrl(), new ParseImpl(indexText.toString(), parseData)); } Modified: lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java?view=diff&rev=555237&r1=555236&r2=555237 ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java Wed Jul 11 03:54:37 2007 @@ -53,7 +53,6 @@ ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", OutlinkExtractor.getOutlinks(text, getConf()), content.getMetadata()); - parseData.setConf(this.conf); return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData)); } Modified: lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java?view=diff&rev=555237&r1=555236&r2=555237 ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java Wed Jul 11 03:54:37 2007 @@ -100,7 +100,6 @@ final ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, resultTitle, outlinks, content.getMetadata()); - parseData.setConf(this.conf); if (LOG.isTraceEnabled()) { LOG.trace("Zip file parsed sucessfully !!"); } return ParseResult.createParseResult(content.getUrl(), new ParseImpl(resultText, parseData)); Modified: lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java?view=diff&rev=555237&r1=555236&r2=555237 ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java Wed Jul 11 03:54:37 2007 @@ -47,9 +47,8 @@ metaData.add("Charset", "UTF-8"); ParseData r = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData); - r.setConf(conf); - WritableTestUtils.testWritable(r, conf); + WritableTestUtils.testWritable(r, null); } public void testMaxOutlinks() throws Exception { @@ -61,22 +60,7 @@ "Max Outlinks Title", outlinks, new Metadata()); - Configuration conf = NutchConfiguration.create(); - // No Outlinks - conf.setInt("db.max.outlinks.per.page", 0); - ParseData data = (ParseData) WritableTestUtils.writeRead(original, conf); - assertEquals(0, data.getOutlinks().length); - // Only 100 Outlinks - conf.setInt("db.max.outlinks.per.page", 100); - data = (ParseData) WritableTestUtils.writeRead(original, conf); - assertEquals(100, data.getOutlinks().length); - // 256 Outlinks - conf.setInt("db.max.outlinks.per.page", 256); - data = (ParseData) WritableTestUtils.writeRead(original, conf); - assertEquals(outlinks.length, data.getOutlinks().length); - // All Outlinks - conf.setInt("db.max.outlinks.per.page", -1); - data = (ParseData) WritableTestUtils.writeRead(original, conf); + ParseData data = (ParseData) WritableTestUtils.writeRead(original, null); assertEquals(outlinks.length, data.getOutlinks().length); } } ------------------------------------------------------------------------- This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. http://sourceforge.net/powerbar/db2/ _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs