Added: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=359668&view=auto ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (added) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Wed Dec 28 16:37:13 2005 @@ -0,0 +1,100 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse; + +import org.apache.nutch.io.*; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.fetcher.Fetcher; +import org.apache.nutch.fs.*; +import org.apache.nutch.mapred.*; +import org.apache.nutch.parse.*; +import org.apache.nutch.net.*; + +import java.io.*; +import java.util.*; + +/* Parse content in a segment. 
*/ +public class ParseOutputFormat implements OutputFormat { + + private UrlNormalizer urlNormalizer = UrlNormalizerFactory.getNormalizer(); + + public RecordWriter getRecordWriter(NutchFileSystem fs, JobConf job, + String name) throws IOException { + + final float interval = job.getFloat("db.default.fetch.interval", 30f); + + File text = + new File(new File(job.getOutputDir(), ParseText.DIR_NAME), name); + File data = + new File(new File(job.getOutputDir(), ParseData.DIR_NAME), name); + File crawl = + new File(new File(job.getOutputDir(), CrawlDatum.PARSE_DIR_NAME), name); + + final MapFile.Writer textOut = + new MapFile.Writer(fs, text.toString(), UTF8.class, ParseText.class); + + final MapFile.Writer dataOut = + new MapFile.Writer(fs, data.toString(), UTF8.class,ParseData.class,true); + + final SequenceFile.Writer crawlOut = + new SequenceFile.Writer(fs, crawl.toString(), + UTF8.class, CrawlDatum.class); + + return new RecordWriter() { + + public void write(WritableComparable key, Writable value) + throws IOException { + + Parse parse = (Parse)value; + + textOut.append(key, new ParseText(parse.getText())); + dataOut.append(key, parse.getData()); + + // collect outlinks for subsequent db update + Outlink[] links = parse.getData().getOutlinks(); + + // compute OPIC score contribution + float score = + Float.parseFloat(parse.getData().get(Fetcher.SCORE_KEY)); + score /= links.length; + + for (int i = 0; i < links.length; i++) { + String toUrl = links[i].getToUrl(); + try { + toUrl = urlNormalizer.normalize(toUrl); // normalize the url + toUrl = URLFilters.filter(toUrl); // filter the url + } catch (Exception e) { + toUrl = null; + } + if (toUrl != null) + crawlOut.append(new UTF8(toUrl), + new CrawlDatum(CrawlDatum.STATUS_LINKED, + interval, score)); + } + } + + public void close(Reporter reporter) throws IOException { + textOut.close(); + dataOut.close(); + crawlOut.close(); + } + + }; + + } + +}
Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=359668&view=auto ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (added) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Wed Dec 28 16:37:13 2005 @@ -0,0 +1,109 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse; + +import org.apache.nutch.io.*; +import org.apache.nutch.parse.ParseOutputFormat; +import org.apache.nutch.mapred.*; +import org.apache.nutch.util.*; +import org.apache.nutch.protocol.*; + +import java.io.*; +import java.util.*; +import java.util.logging.*; + +/* Parse content in a segment. 
*/ +public class ParseSegment extends NutchConfigured implements Mapper, Reducer { + + public static final Logger LOG = + LogFormatter.getLogger(Parser.class.getName()); + + public ParseSegment() { super(null); } + + public ParseSegment(NutchConf conf) { + super(conf); + } + + public void configure(JobConf job) { + } + + public void map(WritableComparable key, Writable value, + OutputCollector output, Reporter reporter) + throws IOException { + Content content = (Content)value; + + Parse parse = null; + ParseStatus status; + try { + parse = ParseUtil.parse(content); + status = parse.getData().getStatus(); + } catch (Exception e) { + status = new ParseStatus(e); + } + + if (status.isSuccess()) { + output.collect(key, new ParseImpl(parse.getText(), parse.getData())); + } else { + LOG.warning("Error parsing: "+key+": "+status.toString()); + } + } + + public void reduce(WritableComparable key, Iterator values, + OutputCollector output, Reporter reporter) + throws IOException { + output.collect(key, (Writable)values.next()); // collect first value + } + + public void parse(File segment) throws IOException { + LOG.info("Parse: starting"); + LOG.info("Parse: segment: " + segment); + + JobConf job = new JobConf(getConf()); + + job.setInputDir(new File(segment, Content.DIR_NAME)); + job.setInputFormat(SequenceFileInputFormat.class); + job.setInputKeyClass(UTF8.class); + job.setInputValueClass(Content.class); + job.setMapperClass(ParseSegment.class); + job.setReducerClass(ParseSegment.class); + + job.setOutputDir(segment); + job.setOutputFormat(ParseOutputFormat.class); + job.setOutputKeyClass(UTF8.class); + job.setOutputValueClass(ParseImpl.class); + + JobClient.runJob(job); + LOG.info("Parse: done"); + } + + + public static void main(String[] args) throws Exception { + File segment; + + String usage = "Usage: ParseSegment segment"; + + if (args.length == 0) { + System.err.println(usage); + System.exit(-1); + } + + segment = new File(args[0]); + + ParseSegment parseSegment 
= new ParseSegment(NutchConf.get()); + parseSegment.parse(segment); + } +} Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java ------------------------------------------------------------------------------ svn:eol-style = native Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java?rev=359668&r1=359667&r2=359668&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java Wed Dec 28 16:37:13 2005 @@ -18,6 +18,8 @@ import org.apache.nutch.util.LogFormatter; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.io.UTF8; import org.apache.nutch.parse.ParseUtil; import org.apache.nutch.protocol.ProtocolFactory; @@ -69,7 +71,7 @@ LOG.info("fetching: "+url); Protocol protocol = ProtocolFactory.getProtocol(url); - Content content = protocol.getProtocolOutput(url).getContent(); + Content content = protocol.getProtocolOutput(new UTF8(url), new CrawlDatum()).getContent(); if (force) { content.setContentType(contentType); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java?rev=359668&r1=359667&r2=359668&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java Wed Dec 28 16:37:13 2005 @@ -18,21 +18,15 @@ import java.io.IOException; -import org.apache.nutch.pagedb.FetchListEntry; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.io.UTF8; /** A retriever of url content. 
Implemented by protocol extensions. */ public interface Protocol { /** The name of the extension point. */ public final static String X_POINT_ID = Protocol.class.getName(); - /** Returns the {@link Content} for a url. This method may be - * more limited than {@link #getProtocolOutput(FetchListEntry)}. - * @throws IOException for any errors. - */ - ProtocolOutput getProtocolOutput(String url); - /** Returns the {@link Content} for a fetchlist entry. - * @throws IOException for any errors. */ - ProtocolOutput getProtocolOutput(FetchListEntry fle); + ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum); } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java?rev=359668&r1=359667&r2=359668&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java Wed Dec 28 16:37:13 2005 @@ -20,17 +20,11 @@ import java.io.File; import java.util.HashMap; -import java.util.Arrays; import org.apache.nutch.io.*; import org.apache.nutch.fs.*; -import org.apache.nutch.db.*; -import org.apache.nutch.util.*; -import org.apache.nutch.fetcher.*; import org.apache.nutch.protocol.*; import org.apache.nutch.parse.*; -import org.apache.nutch.pagedb.*; -import org.apache.nutch.indexer.*; import org.apache.nutch.mapred.*; import org.apache.nutch.mapred.lib.*; import org.apache.nutch.crawl.*; Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java?rev=359668&r1=359667&r2=359668&view=diff ============================================================================== ---
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java Wed Dec 28 16:37:13 2005 @@ -28,7 +28,6 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultiReader; -import org.apache.lucene.search.MultiSearcher; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.FieldDoc; @@ -40,11 +39,7 @@ import org.apache.nutch.fs.*; import org.apache.nutch.io.*; import org.apache.nutch.util.*; -import org.apache.nutch.db.*; -import org.apache.nutch.fetcher.*; -import org.apache.nutch.linkdb.*; import org.apache.nutch.indexer.*; -import org.apache.nutch.analysis.NutchDocumentAnalyzer; /** Implements {@link Searcher} and {@link HitDetailer} for either a single * merged index, or a set of indexes. */ Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java?rev=359668&r1=359667&r2=359668&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java Wed Dec 28 16:37:13 2005 @@ -26,7 +26,6 @@ import org.apache.nutch.parse.*; import org.apache.nutch.indexer.*; import org.apache.nutch.crawl.Inlinks; -import org.apache.nutch.crawl.LinkDbReader; /** * One stop shopping for search-related functionality.
@@ -103,7 +102,7 @@ Vector vDirs=new Vector(); File [] directories = fs.listFiles(indexesDir); for(int i = 0; i < fs.listFiles(indexesDir).length; i++) { - File indexdone = new File(directories[i], IndexSegment.DONE_NAME); + File indexdone = new File(directories[i], Indexer.DONE_NAME); if(fs.isFile(indexdone)) { vDirs.add(directories[i]); } Added: lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java?rev=359668&view=auto ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java (added) +++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java Wed Dec 28 16:37:13 2005 @@ -0,0 +1,384 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.crawl; + +import java.io.*; +import java.net.*; +import java.util.*; +import java.util.logging.*; +import java.net.MalformedURLException; +import java.util.regex.*; + +import javax.xml.parsers.*; +import org.xml.sax.*; +import org.xml.sax.helpers.*; +import org.apache.xerces.util.XMLChar; + +import org.apache.nutch.io.*; +import org.apache.nutch.fs.*; +import org.apache.nutch.net.*; +import org.apache.nutch.util.*; +import org.apache.nutch.util.NutchConf; + +/** Utility that converts DMOZ RDF into a flat file of URLs to be injected. */ +public class DmozParser { + public static final Logger LOG = LogFormatter.getLogger("org.apache.nutch.crawl.DmozParser"); + + long pages = 0; + + /** + * This filter fixes characters that might offend our parser. + * This lets us be tolerant of errors that might appear in the input XML. + */ + private static class XMLCharFilter extends FilterReader { + private boolean lastBad = false; + + public XMLCharFilter(Reader reader) { + super(reader); + } + + public int read() throws IOException { + int c = in.read(); + int value = c; + if (c != -1 && !(XMLChar.isValid(c))) // fix invalid characters + value = 'X'; + else if (lastBad && c == '<') { // fix mis-matched brackets + in.mark(1); + if (in.read() != '/') + value = 'X'; + in.reset(); + } + lastBad = (c == 65533); + + return value; + } + + public int read(char[] cbuf, int off, int len) + throws IOException { + int n = in.read(cbuf, off, len); + if (n != -1) { + for (int i = 0; i < n; i++) { + char c = cbuf[off+i]; + char value = c; + if (!(XMLChar.isValid(c))) // fix invalid characters + value = 'X'; + else if (lastBad && c == '<') { // fix mis-matched brackets + if (i != n-1 && cbuf[off+i+1] != '/') + value = 'X'; + } + lastBad = (c == 65533); + cbuf[off+i] = value; + } + } + return n; + } + } + + + /** + * The RDFProcessor receives tag messages during a parse + * of RDF XML data. We build whatever structures we need + * from these messages. 
+ */ + private class RDFProcessor extends DefaultHandler { + String curURL = null, curSection = null; + boolean titlePending = false, descPending = false, insideAdultSection = false; + Pattern topicPattern = null; + StringBuffer title = new StringBuffer(), desc = new StringBuffer(); + XMLReader reader; + int subsetDenom; + int hashSkew; + boolean includeAdult; + Locator location; + + /** + * Pass in an XMLReader, plus a flag as to whether we + * should include adult material. + */ + public RDFProcessor(XMLReader reader, int subsetDenom, boolean includeAdult, int skew, Pattern topicPattern) throws IOException { + this.reader = reader; + this.subsetDenom = subsetDenom; + this.includeAdult = includeAdult; + this.topicPattern = topicPattern; + + this.hashSkew = skew != 0 ? skew : new Random().nextInt(); + } + + // + // Interface ContentHandler + // + + /** + * Start of an XML elt + */ + public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException { + if ("Topic".equals(qName)) { + curSection = atts.getValue("r:id"); + } else if ("ExternalPage".equals(qName)) { + // Porn filter + if ((! includeAdult) && curSection.startsWith("Top/Adult")) { + return; + } + + if (topicPattern != null && !topicPattern.matcher(curSection).matches()) { + return; + } + + // Subset denominator filter. + // Only emit with a chance of 1/denominator. + String url = atts.getValue("about"); + int hashValue = MD5Hash.digest(url).hashCode(); + hashValue = Math.abs(hashValue ^ hashSkew); + if ((hashValue % subsetDenom) != 0) { + return; + } + + // We actually claim the URL! 
+ curURL = url; + } else if (curURL != null && "d:Title".equals(qName)) { + titlePending = true; + } else if (curURL != null && "d:Description".equals(qName)) { + descPending = true; + } + } + + /** + * The contents of an XML elt + */ + public void characters(char ch[], int start, int length) { + if (titlePending) { + title.append(ch, start, length); + } else if (descPending) { + desc.append(ch, start, length); + } + } + + /** + * Termination of XML elt + */ + public void endElement(String namespaceURI, String localName, String qName) + throws SAXException { + if (curURL != null) { + if ("ExternalPage".equals(qName)) { + // + // Inc the number of pages, insert the page, and + // possibly print status. + // + System.out.println(curURL); + pages++; + + // + // Clear out the link text. This is what + // you would use for adding to the linkdb. + // + if (title.length() > 0) { + title.delete(0, title.length()); + } + if (desc.length() > 0) { + desc.delete(0, desc.length()); + } + + // Null out the URL. + curURL = null; + } else if ("d:Title".equals(qName)) { + titlePending = false; + } else if ("d:Description".equals(qName)) { + descPending = false; + } + } + } + + /** + * When parsing begins + */ + public void startDocument() { + LOG.info("Begin parse"); + } + + /** + * When parsing ends + */ + public void endDocument() { + LOG.info("Completed parse. Found " + pages + " pages."); + } + + /** + * From time to time the Parser will set the "current location" + * by calling this function. It's useful for emitting locations + * for error messages. 
+ */ + public void setDocumentLocator(Locator locator) { + location = locator; + } + + + // + // Interface ErrorHandler + // + + /** + * Emit the exception message + */ + public void error(SAXParseException spe) { + LOG.severe("Error: " + spe.toString() + ": " + spe.getMessage()); + spe.printStackTrace(System.err); + } + + /** + * Emit the exception message, with line numbers + */ + public void fatalError(SAXParseException spe) { + LOG.severe("Fatal err: " + spe.toString() + ": " + spe.getMessage()); + LOG.severe("Last known line is " + location.getLineNumber() + ", column " + location.getColumnNumber()); + spe.printStackTrace(System.err); + } + + /** + * Emit exception warning message + */ + public void warning(SAXParseException spe) { + LOG.warning("Warning: " + spe.toString() + ": " + spe.getMessage()); + spe.printStackTrace(System.err); + } + } + + /** + * Iterate through all the items in this structured DMOZ file. + * Add each URL to the web db. + */ + public void parseDmozFile(File dmozFile, int subsetDenom, + boolean includeAdult, + int skew, + Pattern topicPattern) + + throws IOException, SAXException, ParserConfigurationException { + + SAXParserFactory parserFactory = SAXParserFactory.newInstance(); + SAXParser parser = parserFactory.newSAXParser(); + XMLReader reader = parser.getXMLReader(); + + // Create our own processor to receive SAX events + RDFProcessor rp = + new RDFProcessor(reader, subsetDenom, includeAdult, + skew, topicPattern); + reader.setContentHandler(rp); + reader.setErrorHandler(rp); + LOG.info("skew = " + rp.hashSkew); + + // + // Open filtered text stream. The UTF8Filter makes sure that + // only appropriate XML-approved UTF8 characters are received. + // Any non-conforming characters are silently skipped. 
+ // + XMLCharFilter in = new XMLCharFilter(new BufferedReader(new InputStreamReader(new BufferedInputStream(new FileInputStream(dmozFile)), "UTF-8"))); + try { + InputSource is = new InputSource(in); + reader.parse(is); + } catch (Exception e) { + LOG.severe(e.toString()); + e.printStackTrace(System.err); + System.exit(0); + } finally { + in.close(); + } + } + + private static void addTopicsFromFile(String topicFile, Vector topics) + throws IOException { + BufferedReader in = null; + try { + in = new BufferedReader(new InputStreamReader(new FileInputStream(topicFile), "UTF-8")); + String line = null; + while ((line = in.readLine()) != null) { + topics.addElement(new String(line)); + } + } + catch (Exception e) { + LOG.severe(e.toString()); + e.printStackTrace(System.out); + System.exit(0); + } finally { + in.close(); + } + } + + /** + * Command-line access. User may add URLs via a flat text file + * or the structured DMOZ file. By default, we ignore Adult + * material (as categorized by DMOZ). 
+ */ + public static void main(String argv[]) throws Exception { + if (argv.length < 1) { + System.err.println("Usage: DmozParser <dmoz_file> [-subset <subsetDenominator>] [-includeAdultMaterial] [-skew skew] [-topicFile <topic list file>] [-topic <topic> [-topic <topic> [...]]]"); + return; + } + + // + // Parse the command line, figure out what kind of + // URL file we need to load + // + int subsetDenom = 1; + int skew = 0; + String dmozFile = argv[0]; + boolean includeAdult = false; + Pattern topicPattern = null; + Vector topics = new Vector(); + + NutchFileSystem nfs = NutchFileSystem.get(); + try { + for (int i = 1; i < argv.length; i++) { + if ("-includeAdultMaterial".equals(argv[i])) { + includeAdult = true; + } else if ("-subset".equals(argv[i])) { + subsetDenom = Integer.parseInt(argv[i+1]); + i++; + } else if ("-topic".equals(argv[i])) { + topics.addElement(argv[i+1]); + i++; + } else if ("-topicFile".equals(argv[i])) { + addTopicsFromFile(argv[i+1], topics); + i++; + } else if ("-skew".equals(argv[i])) { + skew = Integer.parseInt(argv[i+1]); + i++; + } + } + + DmozParser parser = new DmozParser(); + + if (!topics.isEmpty()) { + String regExp = new String("^("); + int j = 0; + for ( ; j < topics.size() - 1; ++j) { + regExp = regExp.concat((String) topics.get(j)); + regExp = regExp.concat("|"); + } + regExp = regExp.concat((String) topics.get(j)); + regExp = regExp.concat(").*"); + LOG.info("Topic selection pattern = " + regExp); + topicPattern = Pattern.compile(regExp); + } + + parser.parseDmozFile(new File(dmozFile), subsetDenom, + includeAdult, skew, topicPattern); + + } finally { + nfs.close(); + } + } + +} Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java ------------------------------------------------------------------------------ svn:eol-style = native Modified: lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java?rev=359668&r1=359667&r2=359668&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java (original) +++ lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java Wed Dec 28 16:37:13 2005 @@ -18,7 +18,7 @@ import org.apache.nutch.io.*; import org.apache.nutch.util.LogFormatter; -import org.apache.nutch.indexer.IndexSegment; +import org.apache.nutch.indexer.Indexer; import org.apache.lucene.index.IndexReader; import org.apache.lucene.document.Document; @@ -80,7 +80,7 @@ Vector vReaders=new Vector(); int maxDoc = 0; for (int i = 0; i < directories.length; i++) { - File indexDone = new File(directories[i], IndexSegment.DONE_NAME); + File indexDone = new File(directories[i], Indexer.DONE_NAME); if (indexDone.exists() && indexDone.isFile()){ File indexDir = new File(directories[i], "index"); IndexReader reader = IndexReader.open(indexDir); Modified: lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java?rev=359668&r1=359667&r2=359668&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java (original) +++ lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java Wed Dec 28 16:37:13 2005 @@ -27,8 +27,6 @@ import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.Inlinks; -import org.apache.nutch.fetcher.FetcherOutput; -import org.apache.nutch.pagedb.FetchListEntry; 
import java.util.logging.Logger; import org.apache.nutch.util.LogFormatter; Modified: lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=359668&r1=359667&r2=359668&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (original) +++ lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Wed Dec 28 16:37:13 2005 @@ -27,8 +27,6 @@ import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.Inlinks; -import org.apache.nutch.fetcher.FetcherOutput; -import org.apache.nutch.pagedb.FetchListEntry; import java.io.IOException; import java.net.MalformedURLException; Modified: lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=359668&r1=359667&r2=359668&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original) +++ lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Wed Dec 28 16:37:13 2005 @@ -37,7 +37,6 @@ import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.Inlinks; -import org.apache.nutch.fetcher.FetcherOutput; import org.apache.nutch.util.NutchConf; import org.apache.nutch.util.mime.MimeType; Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java?rev=359668&r1=359667&r2=359668&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java (original) +++ lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java Wed Dec 28 16:37:13 2005 @@ -34,6 +34,8 @@ // Nutch imports import org.apache.nutch.analysis.lang.NGramProfile.NGramEntry; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.io.UTF8; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.Parser; import org.apache.nutch.parse.ParserFactory; @@ -350,7 +352,7 @@ Protocol protocol; try { protocol = ProtocolFactory.getProtocol(url); - Content content = protocol.getProtocolOutput(url).getContent(); + Content content = protocol.getProtocolOutput(new UTF8(url), new CrawlDatum()).getContent(); String contentType = content.getContentType(); Parser parser = ParserFactory.getParser(contentType, url); Parse parse = parser.getParse(content); Modified: lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java?rev=359668&r1=359667&r2=359668&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java Wed Dec 28 16:37:13 2005 @@ -17,13 +17,13 @@ package org.apache.nutch.protocol.file; -import org.apache.nutch.db.Page; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.io.UTF8; import 
org.apache.nutch.net.protocols.HttpDateFormat; import org.apache.nutch.util.LogFormatter; import org.apache.nutch.util.NutchConf; -import org.apache.nutch.pagedb.FetchListEntry; import org.apache.nutch.protocol.Content; import org.apache.nutch.protocol.Protocol; import org.apache.nutch.protocol.ProtocolOutput; @@ -50,7 +50,7 @@ static final int MAX_REDIRECTS = 5; - static int maxContentLength = NutchConf.get().getInt("file.content.limit",64*1024); + static int maxContentLength = NutchConf.get().getInt("file.content.limit", 64 * 1024); // 20040412, xing // the following three: HttpDateFormat, MimetypesFileTypeMap, MagicFile @@ -67,26 +67,16 @@ /** Set the point at which content is truncated. */ public void setMaxContentLength(int length) {maxContentLength = length;} - public ProtocolOutput getProtocolOutput(String urlString) { - ProtocolOutput output = null; + public ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum) { + String urlString = url.toString(); try { - return getProtocolOutput(new FetchListEntry(true, - new Page(urlString, 1.0f), new String[0])); - } catch (MalformedURLException mue) { - return new ProtocolOutput(null, new ProtocolStatus(mue)); - } - } - - public ProtocolOutput getProtocolOutput(FetchListEntry fle) { - String urlString = fle.getUrl().toString(); - try { - URL url = new URL(urlString); + URL u = new URL(urlString); int redirects = 0; while (true) { FileResponse response; - response = new FileResponse(urlString, url, this); // make a request + response = new FileResponse(u, datum, this); // make a request int code = response.getCode(); @@ -96,10 +86,10 @@ } else if (code >= 300 && code < 400) { // handle redirect if (redirects == MAX_REDIRECTS) throw new FileException("Too many redirects: " + url); - url = new URL(response.getHeader("Location")); + u = new URL(response.getHeader("Location")); redirects++; if (LOG.isLoggable(Level.FINE)) - LOG.fine("redirect to " + url); + LOG.fine("redirect to " + u); } else { // convert to 
exception throw new FileError(code); @@ -150,7 +140,7 @@ // set log level LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase())); - Content content = file.getProtocolOutput(urlString).getContent(); + Content content = file.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent(); System.err.println("Content-Type: " + content.getContentType()); System.err.println("Content-Length: " + content.get("Content-Length")); Modified: lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=359668&r1=359667&r2=359668&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java Wed Dec 28 16:37:13 2005 @@ -19,11 +19,11 @@ // JDK imports import java.net.URL; import java.util.TreeMap; -import java.util.Properties; import java.util.logging.Level; import java.io.IOException; // Nutch imports +import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.protocol.Content; import org.apache.nutch.protocol.ContentProperties; @@ -80,15 +80,10 @@ headers); } - public FileResponse(URL url, File file) - throws FileException, IOException { - this(url.toString(), url, file); - } - - public FileResponse(String orig, URL url, File file) + public FileResponse(URL url, CrawlDatum datum, File file) throws FileException, IOException { - this.orig = orig; + this.orig = url.toString(); this.base = url.toString(); this.file = file; Modified: lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java?rev=359668&r1=359667&r2=359668&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java Wed Dec 28 16:37:13 2005 @@ -19,13 +19,13 @@ import org.apache.commons.net.ftp.FTPFileEntryParser; -import org.apache.nutch.db.Page; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.io.UTF8; import org.apache.nutch.net.protocols.HttpDateFormat; import org.apache.nutch.util.LogFormatter; import org.apache.nutch.util.NutchConf; -import org.apache.nutch.pagedb.FetchListEntry; import org.apache.nutch.protocol.Content; import org.apache.nutch.protocol.Protocol; import org.apache.nutch.protocol.ProtocolOutput; @@ -112,26 +112,16 @@ this.keepConnection = keepConnection; } - public ProtocolOutput getProtocolOutput(String urlString) { - ProtocolOutput output = null; + public ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum) { + String urlString = url.toString(); try { - return getProtocolOutput(new FetchListEntry(true, - new Page(urlString, 1.0f), new String[0])); - } catch (MalformedURLException mue) { - return new ProtocolOutput(null, new ProtocolStatus(mue)); - } - } - - public ProtocolOutput getProtocolOutput(FetchListEntry fle) { - String urlString = fle.getUrl().toString(); - try { - URL url = new URL(urlString); + URL u = new URL(urlString); int redirects = 0; while (true) { FtpResponse response; - response = new FtpResponse(urlString, url, this); // make a request + response = new FtpResponse(u, datum, this); // make a request int code = response.getCode(); @@ -141,10 +131,10 @@ } else if (code >= 300 && code < 400) { // handle redirect if (redirects == MAX_REDIRECTS) throw new FtpException("Too many 
redirects: " + url); - url = new URL(response.getHeader("Location")); + u = new URL(response.getHeader("Location")); redirects++; if (LOG.isLoggable(Level.FINE)) - LOG.fine("redirect to " + url); + LOG.fine("redirect to " + u); } else { // convert to exception throw new FtpError(code); @@ -218,7 +208,7 @@ // set log level LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase())); - Content content = ftp.getProtocolOutput(urlString).getContent(); + Content content = ftp.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent(); System.err.println("Content-Type: " + content.getContentType()); System.err.println("Content-Length: " + content.get("Content-Length")); Modified: lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java?rev=359668&r1=359667&r2=359668&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java Wed Dec 28 16:37:13 2005 @@ -24,6 +24,7 @@ import org.apache.commons.net.ftp.parser.DefaultFTPFileEntryParserFactory; import org.apache.commons.net.ftp.parser.ParserInitializationException; +import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.protocol.Content; import org.apache.nutch.protocol.ContentProperties; @@ -80,15 +81,10 @@ headers); } - public FtpResponse(URL url, Ftp ftp) - throws FtpException, IOException { - this(url.toString(), url, ftp); - } - - public FtpResponse(String orig, URL url, Ftp ftp) + public FtpResponse(URL url, CrawlDatum datum, Ftp ftp) throws FtpException, IOException { - this.orig = orig; + this.orig = url.toString(); this.base = url.toString(); this.ftp = ftp; Modified: 
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java?rev=359668&r1=359667&r2=359668&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java Wed Dec 28 16:37:13 2005 @@ -28,8 +28,8 @@ import org.apache.nutch.util.LogFormatter; import org.apache.nutch.util.NutchConf; -import org.apache.nutch.db.Page; -import org.apache.nutch.pagedb.FetchListEntry; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.io.UTF8; import org.apache.nutch.protocol.*; /** An implementation of the Http protocol. */ @@ -123,7 +123,7 @@ } if (delays == MAX_DELAYS) - throw new RetryLater(url, "Exceeded http.max.delays: retry later."); + throw new HttpException("Exceeded http.max.delays: retry later."); long done = time.longValue(); long now = System.currentTimeMillis(); @@ -172,31 +172,21 @@ } } - public ProtocolOutput getProtocolOutput(String urlString) { - ProtocolOutput output = null; + public ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum) { + String urlString = url.toString(); try { - return getProtocolOutput(new FetchListEntry(true, - new Page(urlString, 1.0f), new String[0])); - } catch (MalformedURLException mue) { - return new ProtocolOutput(null, new ProtocolStatus(mue)); - } - } - - public ProtocolOutput getProtocolOutput(FetchListEntry fle) { - String urlString = fle.getUrl().toString(); - try { - URL url = new URL(urlString); + URL u = new URL(urlString); int redirects = 0; while (true) { - if (!RobotRulesParser.isAllowed(url)) - throw new ResourceGone(url, "Blocked by robots.txt"); + if (!RobotRulesParser.isAllowed(u)) + throw new 
HttpException("Blocked by robots.txt"); - InetAddress addr = blockAddr(url); + InetAddress addr = blockAddr(u); HttpResponse response; try { - response = new HttpResponse(urlString, url); // make a request + response = new HttpResponse(u, datum); // make a request } finally { unblockAddr(addr); } @@ -207,14 +197,14 @@ return new ProtocolOutput(response.toContent()); // return it } else if (code == 410) { // page is gone - throw new ResourceGone(url, "Http: " + code); + throw new HttpException("Http: " + code); } else if (code >= 300 && code < 400) { // handle redirect if (redirects == MAX_REDIRECTS) throw new HttpException("Too many redirects: " + urlString); - url = new URL(url, response.getHeader("Location")); + u = new URL(u, response.getHeader("Location")); redirects++; - LOG.fine("redirect to " + url); + LOG.fine("redirect to " + u); } else { // convert to exception throw new HttpError(code); @@ -298,7 +288,7 @@ LOG.setLevel(Level.FINE); } - Content content = http.getProtocolOutput(url).getContent(); + Content content = http.getProtocolOutput(new UTF8(url), new CrawlDatum()).getContent(); System.out.println("Content Type: " + content.getContentType()); System.out.println("Content Length: " + content.get("Content-Length")); Modified: lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=359668&r1=359667&r2=359668&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Wed Dec 28 16:37:13 2005 @@ -31,6 +31,7 @@ import java.util.Properties; import java.util.logging.Level; +import org.apache.nutch.crawl.CrawlDatum; import 
org.apache.nutch.protocol.Content; import org.apache.nutch.protocol.ContentProperties; import org.apache.nutch.protocol.ProtocolException; @@ -63,14 +64,10 @@ headers); } - public HttpResponse(URL url) throws ProtocolException, IOException { - this(url.toString(), url); - } - - public HttpResponse(String orig, URL url) + public HttpResponse(URL url, CrawlDatum datum) throws ProtocolException, IOException { - this.orig = orig; + this.orig = url.toString(); this.base = url.toString(); if (!"http".equals(url.getProtocol())) Modified: lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java?rev=359668&r1=359667&r2=359668&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java Wed Dec 28 16:37:13 2005 @@ -35,6 +35,7 @@ import org.apache.nutch.util.NutchConf; import org.apache.nutch.util.LogFormatter; +import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.protocol.ProtocolException; /** @@ -382,7 +383,7 @@ if (robotRules == null) { // cache miss int redirects = 0; do { - HttpResponse response = new HttpResponse(new URL(url, "/robots.txt")); + HttpResponse response = new HttpResponse(new URL(url, "/robots.txt"), new CrawlDatum()); int code = response.getCode(); Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=359668&r1=359667&r2=359668&view=diff 
============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java Wed Dec 28 16:37:13 2005 @@ -20,13 +20,9 @@ import org.apache.commons.httpclient.auth.AuthScope; import org.apache.commons.httpclient.params.HttpConnectionManagerParams; import org.apache.commons.httpclient.protocol.Protocol; -import org.apache.nutch.db.Page; -import org.apache.nutch.pagedb.FetchListEntry; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.ProtocolException; -import org.apache.nutch.protocol.ProtocolOutput; -import org.apache.nutch.protocol.ProtocolStatus; -import org.apache.nutch.protocol.RetryLater; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.io.UTF8; +import org.apache.nutch.protocol.*; import org.apache.nutch.util.LogFormatter; import org.apache.nutch.util.NutchConf; @@ -130,7 +126,7 @@ } } - if (delays == MAX_DELAYS) throw new RetryLater(url, "Exceeded http.max.delays: retry later."); + if (delays == MAX_DELAYS) throw new HttpException("Exceeded http.max.delays: retry later."); long done = time.longValue(); long now = System.currentTimeMillis(); @@ -177,31 +173,23 @@ } } - public ProtocolOutput getProtocolOutput(String urlString) { + public ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum) { + String urlString = url.toString(); try { - return getProtocolOutput(new FetchListEntry(true, new Page(urlString, 1.0f), new String[0])); - } catch (MalformedURLException mue) { - return new ProtocolOutput(null, new ProtocolStatus(mue)); - } - } - - public ProtocolOutput getProtocolOutput(FetchListEntry fle) { - String urlString = fle.getUrl().toString(); - try { - URL url = new URL(urlString); + URL u = new URL(urlString); try { - if (!RobotRulesParser.isAllowed(url)) + if 
(!RobotRulesParser.isAllowed(u)) return new ProtocolOutput(null, new ProtocolStatus(ProtocolStatus.ROBOTS_DENIED, url)); } catch (Throwable e) { // XXX Maybe bogus: assume this is allowed. LOG.fine("Exception checking robot rules for " + url + ": " + e); } - InetAddress addr = blockAddr(url); + InetAddress addr = blockAddr(u); HttpResponse response; try { - response = new HttpResponse(url); // make a request + response = new HttpResponse(u, datum); // make a request } finally { unblockAddr(addr); } @@ -220,7 +208,7 @@ // some broken servers, such as MS IIS, use lowercase header name... if (location == null) location = response.getHeader("location"); if (location == null) location = ""; - url = new URL(url, location); + u = new URL(u, location); int protocolStatusCode; switch (code) { case 300: // multiple choices, preferred value in Location @@ -242,21 +230,21 @@ protocolStatusCode = ProtocolStatus.MOVED; } // handle this in the higher layer. - return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode, url)); + return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode, u)); } else if (code == 400) { // bad request, mark as GONE - LOG.fine("400 Bad request: " + url); - return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, url)); + LOG.fine("400 Bad request: " + u); + return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u)); } else if (code == 401) { // requires authorization, but no valid auth provided. 
LOG.fine("401 Authentication Required"); return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authentication required: " + urlString)); } else if (code == 404) { - return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.NOTFOUND, url)); + return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.NOTFOUND, u)); } else if (code == 410) { // permanently GONE - return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, url)); + return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u)); } else { return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.EXCEPTION, "Http code=" + code + ", url=" - + url)); + + u)); } } catch (Throwable e) { e.printStackTrace(); @@ -333,7 +321,7 @@ LOG.setLevel(Level.FINE); } - ProtocolOutput out = http.getProtocolOutput(url); + ProtocolOutput out = http.getProtocolOutput(new UTF8(url), new CrawlDatum()); Content content = out.getContent(); System.out.println("Status: " + out.getStatus()); Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java?rev=359668&r1=359667&r2=359668&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java Wed Dec 28 16:37:13 2005 @@ -51,7 +51,7 @@ try { Collection challenge = null; - if (header instanceof MultiProperties) { + if (header instanceof ContentProperties) { Object o = header.get(AUTH_HEADER); if (o instanceof Collection) { challenge = (Collection) o; Modified: 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=359668&r1=359667&r2=359668&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Wed Dec 28 16:37:13 2005 @@ -3,7 +3,9 @@ package org.apache.nutch.protocol.httpclient; +import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ContentProperties; import org.apache.commons.httpclient.Header; import org.apache.commons.httpclient.HttpVersion; @@ -32,7 +34,7 @@ private int code; - private MultiProperties headers = new MultiProperties(); + private ContentProperties headers = new ContentProperties(); /** * Returns the response code. 
@@ -59,11 +61,11 @@ headers); } - public HttpResponse(URL url) throws IOException { - this(url, false); + public HttpResponse(URL url, CrawlDatum datum) throws IOException { + this(url, datum, false); } - HttpResponse(URL url, boolean followRedirects) throws IOException { + HttpResponse(URL url, CrawlDatum datum, boolean followRedirects) throws IOException { this.base = url.toString(); this.orig = url.toString(); GetMethod get = new GetMethod(this.orig); Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java?rev=359668&r1=359667&r2=359668&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java Wed Dec 28 16:37:13 2005 @@ -35,6 +35,7 @@ import org.apache.nutch.util.NutchConf; import org.apache.nutch.util.LogFormatter; +import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.protocol.ProtocolException; /** @@ -380,7 +381,7 @@ LOG.fine("cache miss " + url); try { HttpResponse response = new HttpResponse(new URL(url, "/robots.txt"), - true); + new CrawlDatum(), true); if (response.getCode() == 200) // found rules: parse them robotRules = new RobotRulesParser().parseRules(response.getContent()); ------------------------------------------------------- This SF.net email is sponsored by: Splunk Inc. Do you grep through log files for problems? Stop! Download the new AJAX search engine that makes searching your log files as easy as surfing the web. DOWNLOAD SPLUNK! 
http://ads.osdn.com/?ad_id=7637&alloc_id=16865&op=click _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs