Added: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=359668&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
(added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
Wed Dec 28 16:37:13 2005
@@ -0,0 +1,100 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import org.apache.nutch.io.*;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.fetcher.Fetcher;
+import org.apache.nutch.fs.*;
+import org.apache.nutch.mapred.*;
+import org.apache.nutch.parse.*;
+import org.apache.nutch.net.*;
+
+import java.io.*;
+import java.util.*;
+
+/* Parse content in a segment. */
+public class ParseOutputFormat implements OutputFormat {
+
+ private UrlNormalizer urlNormalizer = UrlNormalizerFactory.getNormalizer();
+
+ public RecordWriter getRecordWriter(NutchFileSystem fs, JobConf job,
+ String name) throws IOException {
+
+ final float interval = job.getFloat("db.default.fetch.interval", 30f);
+
+ File text =
+ new File(new File(job.getOutputDir(), ParseText.DIR_NAME), name);
+ File data =
+ new File(new File(job.getOutputDir(), ParseData.DIR_NAME), name);
+ File crawl =
+ new File(new File(job.getOutputDir(), CrawlDatum.PARSE_DIR_NAME), name);
+
+ final MapFile.Writer textOut =
+ new MapFile.Writer(fs, text.toString(), UTF8.class, ParseText.class);
+
+ final MapFile.Writer dataOut =
+ new MapFile.Writer(fs, data.toString(), UTF8.class,ParseData.class,true);
+
+ final SequenceFile.Writer crawlOut =
+ new SequenceFile.Writer(fs, crawl.toString(),
+ UTF8.class, CrawlDatum.class);
+
+ return new RecordWriter() {
+
+ public void write(WritableComparable key, Writable value)
+ throws IOException {
+
+ Parse parse = (Parse)value;
+
+ textOut.append(key, new ParseText(parse.getText()));
+ dataOut.append(key, parse.getData());
+
+ // collect outlinks for subsequent db update
+ Outlink[] links = parse.getData().getOutlinks();
+
+ // compute OPIC score contribution
+ float score =
+ Float.parseFloat(parse.getData().get(Fetcher.SCORE_KEY));
+ score /= links.length;
+
+ for (int i = 0; i < links.length; i++) {
+ String toUrl = links[i].getToUrl();
+ try {
+ toUrl = urlNormalizer.normalize(toUrl); // normalize the url
+ toUrl = URLFilters.filter(toUrl); // filter the url
+ } catch (Exception e) {
+ toUrl = null;
+ }
+ if (toUrl != null)
+ crawlOut.append(new UTF8(toUrl),
+ new CrawlDatum(CrawlDatum.STATUS_LINKED,
+ interval, score));
+ }
+ }
+
+ public void close(Reporter reporter) throws IOException {
+ textOut.close();
+ dataOut.close();
+ crawlOut.close();
+ }
+
+ };
+
+ }
+
+}
Propchange:
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=359668&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Wed
Dec 28 16:37:13 2005
@@ -0,0 +1,109 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import org.apache.nutch.io.*;
+import org.apache.nutch.parse.ParseOutputFormat;
+import org.apache.nutch.mapred.*;
+import org.apache.nutch.util.*;
+import org.apache.nutch.protocol.*;
+
+import java.io.*;
+import java.util.*;
+import java.util.logging.*;
+
+/* Parse content in a segment. */
+public class ParseSegment extends NutchConfigured implements Mapper, Reducer {
+
+ public static final Logger LOG =
+ LogFormatter.getLogger(Parser.class.getName());
+
+ public ParseSegment() { super(null); }
+
+ public ParseSegment(NutchConf conf) {
+ super(conf);
+ }
+
+ public void configure(JobConf job) {
+ }
+
+ public void map(WritableComparable key, Writable value,
+ OutputCollector output, Reporter reporter)
+ throws IOException {
+ Content content = (Content)value;
+
+ Parse parse = null;
+ ParseStatus status;
+ try {
+ parse = ParseUtil.parse(content);
+ status = parse.getData().getStatus();
+ } catch (Exception e) {
+ status = new ParseStatus(e);
+ }
+
+ if (status.isSuccess()) {
+ output.collect(key, new ParseImpl(parse.getText(), parse.getData()));
+ } else {
+ LOG.warning("Error parsing: "+key+": "+status.toString());
+ }
+ }
+
+ public void reduce(WritableComparable key, Iterator values,
+ OutputCollector output, Reporter reporter)
+ throws IOException {
+ output.collect(key, (Writable)values.next()); // collect first value
+ }
+
+ public void parse(File segment) throws IOException {
+ LOG.info("Parse: starting");
+ LOG.info("Parse: segment: " + segment);
+
+ JobConf job = new JobConf(getConf());
+
+ job.setInputDir(new File(segment, Content.DIR_NAME));
+ job.setInputFormat(SequenceFileInputFormat.class);
+ job.setInputKeyClass(UTF8.class);
+ job.setInputValueClass(Content.class);
+ job.setMapperClass(ParseSegment.class);
+ job.setReducerClass(ParseSegment.class);
+
+ job.setOutputDir(segment);
+ job.setOutputFormat(ParseOutputFormat.class);
+ job.setOutputKeyClass(UTF8.class);
+ job.setOutputValueClass(ParseImpl.class);
+
+ JobClient.runJob(job);
+ LOG.info("Parse: done");
+ }
+
+
+ public static void main(String[] args) throws Exception {
+ File segment;
+
+ String usage = "Usage: ParseSegment segment";
+
+ if (args.length == 0) {
+ System.err.println(usage);
+ System.exit(-1);
+ }
+
+ segment = new File(args[0]);
+
+ ParseSegment parseSegment = new ParseSegment(NutchConf.get());
+ parseSegment.parse(segment);
+ }
+}
Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java Wed
Dec 28 16:37:13 2005
@@ -18,6 +18,8 @@
import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.io.UTF8;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.ProtocolFactory;
@@ -69,7 +71,7 @@
LOG.info("fetching: "+url);
Protocol protocol = ProtocolFactory.getProtocol(url);
- Content content = protocol.getProtocolOutput(url).getContent();
+ Content content = protocol.getProtocolOutput(new UTF8(url), new
CrawlDatum()).getContent();
if (force) {
content.setContentType(contentType);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java Wed Dec
28 16:37:13 2005
@@ -18,21 +18,15 @@
import java.io.IOException;
-import org.apache.nutch.pagedb.FetchListEntry;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.io.UTF8;
/** A retriever of url content. Implemented by protocol extensions. */
public interface Protocol {
/** The name of the extension point. */
public final static String X_POINT_ID = Protocol.class.getName();
- /** Returns the {@link Content} for a url. This method may be
- * more limited than {@link #getProtocolOutput(FetchListEntry)}.
- * @throws IOException for any errors.
- */
- ProtocolOutput getProtocolOutput(String url);
-
/** Returns the {@link Content} for a fetchlist entry.
- * @throws IOException for any errors.
*/
- ProtocolOutput getProtocolOutput(FetchListEntry fle);
+ ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum);
}
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java
Wed Dec 28 16:37:13 2005
@@ -20,17 +20,11 @@
import java.io.File;
import java.util.HashMap;
-import java.util.Arrays;
import org.apache.nutch.io.*;
import org.apache.nutch.fs.*;
-import org.apache.nutch.db.*;
-import org.apache.nutch.util.*;
-import org.apache.nutch.fetcher.*;
import org.apache.nutch.protocol.*;
import org.apache.nutch.parse.*;
-import org.apache.nutch.pagedb.*;
-import org.apache.nutch.indexer.*;
import org.apache.nutch.mapred.*;
import org.apache.nutch.mapred.lib.*;
import org.apache.nutch.crawl.*;
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java
Wed Dec 28 16:37:13 2005
@@ -28,7 +28,6 @@
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
-import org.apache.lucene.search.MultiSearcher;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.FieldDoc;
@@ -40,11 +39,7 @@
import org.apache.nutch.fs.*;
import org.apache.nutch.io.*;
import org.apache.nutch.util.*;
-import org.apache.nutch.db.*;
-import org.apache.nutch.fetcher.*;
-import org.apache.nutch.linkdb.*;
import org.apache.nutch.indexer.*;
-import org.apache.nutch.analysis.NutchDocumentAnalyzer;
/** Implements {@link Searcher} and {@link HitDetailer}
for either a single
* merged index, or a set of indexes. */
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java Wed
Dec 28 16:37:13 2005
@@ -26,7 +26,6 @@
import org.apache.nutch.parse.*;
import org.apache.nutch.indexer.*;
import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.crawl.LinkDbReader;
/**
* One stop shopping for search-related functionality.
@@ -103,7 +102,7 @@
Vector vDirs=new Vector();
File [] directories = fs.listFiles(indexesDir);
for(int i = 0; i < fs.listFiles(indexesDir).length; i++) {
- File indexdone = new File(directories[i], IndexSegment.DONE_NAME);
+ File indexdone = new File(directories[i], Indexer.DONE_NAME);
if(fs.isFile(indexdone)) {
vDirs.add(directories[i]);
}
Added: lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java?rev=359668&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java Wed Dec
28 16:37:13 2005
@@ -0,0 +1,384 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.*;
+import java.net.*;
+import java.util.*;
+import java.util.logging.*;
+import java.net.MalformedURLException;
+import java.util.regex.*;
+
+import javax.xml.parsers.*;
+import org.xml.sax.*;
+import org.xml.sax.helpers.*;
+import org.apache.xerces.util.XMLChar;
+
+import org.apache.nutch.io.*;
+import org.apache.nutch.fs.*;
+import org.apache.nutch.net.*;
+import org.apache.nutch.util.*;
+import org.apache.nutch.util.NutchConf;
+
+/** Utility that converts DMOZ RDF into a flat file of URLs to be injected. */
+public class DmozParser {
+ public static final Logger LOG =
LogFormatter.getLogger("org.apache.nutch.crawl.DmozParser");
+
+ long pages = 0;
+
+ /**
+ * This filter fixes characters that might offend our parser.
+ * This lets us be tolerant of errors that might appear in the input XML.
+ */
+ private static class XMLCharFilter extends FilterReader {
+ private boolean lastBad = false;
+
+ public XMLCharFilter(Reader reader) {
+ super(reader);
+ }
+
+ public int read() throws IOException {
+ int c = in.read();
+ int value = c;
+ if (c != -1 && !(XMLChar.isValid(c))) // fix invalid characters
+ value = 'X';
+ else if (lastBad && c == '<') { // fix mis-matched brackets
+ in.mark(1);
+ if (in.read() != '/')
+ value = 'X';
+ in.reset();
+ }
+ lastBad = (c == 65533);
+
+ return value;
+ }
+
+ public int read(char[] cbuf, int off, int len)
+ throws IOException {
+ int n = in.read(cbuf, off, len);
+ if (n != -1) {
+ for (int i = 0; i < n; i++) {
+ char c = cbuf[off+i];
+ char value = c;
+ if (!(XMLChar.isValid(c))) // fix invalid characters
+ value = 'X';
+ else if (lastBad && c == '<') { // fix mis-matched brackets
+ if (i != n-1 && cbuf[off+i+1] != '/')
+ value = 'X';
+ }
+ lastBad = (c == 65533);
+ cbuf[off+i] = value;
+ }
+ }
+ return n;
+ }
+ }
+
+
+ /**
+ * The RDFProcessor receives tag messages during a parse
+ * of RDF XML data. We build whatever structures we need
+ * from these messages.
+ */
+ private class RDFProcessor extends DefaultHandler {
+ String curURL = null, curSection = null;
+ boolean titlePending = false, descPending = false, insideAdultSection =
false;
+ Pattern topicPattern = null;
+ StringBuffer title = new StringBuffer(), desc = new StringBuffer();
+ XMLReader reader;
+ int subsetDenom;
+ int hashSkew;
+ boolean includeAdult;
+ Locator location;
+
+ /**
+ * Pass in an XMLReader, plus a flag as to whether we
+ * should include adult material.
+ */
+ public RDFProcessor(XMLReader reader, int subsetDenom, boolean
includeAdult, int skew, Pattern topicPattern) throws IOException {
+ this.reader = reader;
+ this.subsetDenom = subsetDenom;
+ this.includeAdult = includeAdult;
+ this.topicPattern = topicPattern;
+
+ this.hashSkew = skew != 0 ? skew : new Random().nextInt();
+ }
+
+ //
+ // Interface ContentHandler
+ //
+
+ /**
+ * Start of an XML elt
+ */
+ public void startElement(String namespaceURI, String localName, String
qName, Attributes atts) throws SAXException {
+ if ("Topic".equals(qName)) {
+ curSection = atts.getValue("r:id");
+ } else if ("ExternalPage".equals(qName)) {
+ // Porn filter
+ if ((! includeAdult) && curSection.startsWith("Top/Adult")) {
+ return;
+ }
+
+ if (topicPattern != null &&
!topicPattern.matcher(curSection).matches()) {
+ return;
+ }
+
+ // Subset denominator filter.
+ // Only emit with a chance of 1/denominator.
+ String url = atts.getValue("about");
+ int hashValue = MD5Hash.digest(url).hashCode();
+ hashValue = Math.abs(hashValue ^ hashSkew);
+ if ((hashValue % subsetDenom) != 0) {
+ return;
+ }
+
+ // We actually claim the URL!
+ curURL = url;
+ } else if (curURL != null && "d:Title".equals(qName)) {
+ titlePending = true;
+ } else if (curURL != null && "d:Description".equals(qName)) {
+ descPending = true;
+ }
+ }
+
+ /**
+ * The contents of an XML elt
+ */
+ public void characters(char ch[], int start, int length) {
+ if (titlePending) {
+ title.append(ch, start, length);
+ } else if (descPending) {
+ desc.append(ch, start, length);
+ }
+ }
+
+ /**
+ * Termination of XML elt
+ */
+ public void endElement(String namespaceURI, String localName, String qName)
+ throws SAXException {
+ if (curURL != null) {
+ if ("ExternalPage".equals(qName)) {
+ //
+ // Inc the number of pages, insert the page, and
+ // possibly print status.
+ //
+ System.out.println(curURL);
+ pages++;
+
+ //
+ // Clear out the link text. This is what
+ // you would use for adding to the linkdb.
+ //
+ if (title.length() > 0) {
+ title.delete(0, title.length());
+ }
+ if (desc.length() > 0) {
+ desc.delete(0, desc.length());
+ }
+
+ // Null out the URL.
+ curURL = null;
+ } else if ("d:Title".equals(qName)) {
+ titlePending = false;
+ } else if ("d:Description".equals(qName)) {
+ descPending = false;
+ }
+ }
+ }
+
+ /**
+ * When parsing begins
+ */
+ public void startDocument() {
+ LOG.info("Begin parse");
+ }
+
+ /**
+ * When parsing ends
+ */
+ public void endDocument() {
+ LOG.info("Completed parse. Found " + pages + " pages.");
+ }
+
+ /**
+ * From time to time the Parser will set the "current location"
+ * by calling this function. It's useful for emitting locations
+ * for error messages.
+ */
+ public void setDocumentLocator(Locator locator) {
+ location = locator;
+ }
+
+
+ //
+ // Interface ErrorHandler
+ //
+
+ /**
+ * Emit the exception message
+ */
+ public void error(SAXParseException spe) {
+ LOG.severe("Error: " + spe.toString() + ": " + spe.getMessage());
+ spe.printStackTrace(System.err);
+ }
+
+ /**
+ * Emit the exception message, with line numbers
+ */
+ public void fatalError(SAXParseException spe) {
+ LOG.severe("Fatal err: " + spe.toString() + ": " + spe.getMessage());
+ LOG.severe("Last known line is " + location.getLineNumber() + ", column
" + location.getColumnNumber());
+ spe.printStackTrace(System.err);
+ }
+
+ /**
+ * Emit exception warning message
+ */
+ public void warning(SAXParseException spe) {
+ LOG.warning("Warning: " + spe.toString() + ": " + spe.getMessage());
+ spe.printStackTrace(System.err);
+ }
+ }
+
+ /**
+ * Iterate through all the items in this structured DMOZ file.
+ * Add each URL to the web db.
+ */
+ public void parseDmozFile(File dmozFile, int subsetDenom,
+ boolean includeAdult,
+ int skew,
+ Pattern topicPattern)
+
+ throws IOException, SAXException, ParserConfigurationException {
+
+ SAXParserFactory parserFactory = SAXParserFactory.newInstance();
+ SAXParser parser = parserFactory.newSAXParser();
+ XMLReader reader = parser.getXMLReader();
+
+ // Create our own processor to receive SAX events
+ RDFProcessor rp =
+ new RDFProcessor(reader, subsetDenom, includeAdult,
+ skew, topicPattern);
+ reader.setContentHandler(rp);
+ reader.setErrorHandler(rp);
+ LOG.info("skew = " + rp.hashSkew);
+
+ //
+ // Open filtered text stream. The UTF8Filter makes sure that
+ // only appropriate XML-approved UTF8 characters are received.
+ // Any non-conforming characters are silently skipped.
+ //
+ XMLCharFilter in = new XMLCharFilter(new BufferedReader(new
InputStreamReader(new BufferedInputStream(new FileInputStream(dmozFile)),
"UTF-8")));
+ try {
+ InputSource is = new InputSource(in);
+ reader.parse(is);
+ } catch (Exception e) {
+ LOG.severe(e.toString());
+ e.printStackTrace(System.err);
+ System.exit(0);
+ } finally {
+ in.close();
+ }
+ }
+
+ private static void addTopicsFromFile(String topicFile, Vector topics)
+ throws IOException {
+ BufferedReader in = null;
+ try {
+ in = new BufferedReader(new InputStreamReader(new
FileInputStream(topicFile), "UTF-8"));
+ String line = null;
+ while ((line = in.readLine()) != null) {
+ topics.addElement(new String(line));
+ }
+ }
+ catch (Exception e) {
+ LOG.severe(e.toString());
+ e.printStackTrace(System.out);
+ System.exit(0);
+ } finally {
+ in.close();
+ }
+ }
+
+ /**
+ * Command-line access. User may add URLs via a flat text file
+ * or the structured DMOZ file. By default, we ignore Adult
+ * material (as categorized by DMOZ).
+ */
+ public static void main(String argv[]) throws Exception {
+ if (argv.length < 1) {
+ System.err.println("Usage: DmozParser <dmoz_file> [-subset
<subsetDenominator>] [-includeAdultMaterial] [-skew skew] [-topicFile <topic
list file>] [-topic <topic> [-topic <topic> [...]]]");
+ return;
+ }
+
+ //
+ // Parse the command line, figure out what kind of
+ // URL file we need to load
+ //
+ int subsetDenom = 1;
+ int skew = 0;
+ String dmozFile = argv[0];
+ boolean includeAdult = false;
+ Pattern topicPattern = null;
+ Vector topics = new Vector();
+
+ NutchFileSystem nfs = NutchFileSystem.get();
+ try {
+ for (int i = 1; i < argv.length; i++) {
+ if ("-includeAdultMaterial".equals(argv[i])) {
+ includeAdult = true;
+ } else if ("-subset".equals(argv[i])) {
+ subsetDenom = Integer.parseInt(argv[i+1]);
+ i++;
+ } else if ("-topic".equals(argv[i])) {
+ topics.addElement(argv[i+1]);
+ i++;
+ } else if ("-topicFile".equals(argv[i])) {
+ addTopicsFromFile(argv[i+1], topics);
+ i++;
+ } else if ("-skew".equals(argv[i])) {
+ skew = Integer.parseInt(argv[i+1]);
+ i++;
+ }
+ }
+
+ DmozParser parser = new DmozParser();
+
+ if (!topics.isEmpty()) {
+ String regExp = new String("^(");
+ int j = 0;
+ for ( ; j < topics.size() - 1; ++j) {
+ regExp = regExp.concat((String) topics.get(j));
+ regExp = regExp.concat("|");
+ }
+ regExp = regExp.concat((String) topics.get(j));
+ regExp = regExp.concat(").*");
+ LOG.info("Topic selection pattern = " + regExp);
+ topicPattern = Pattern.compile(regExp);
+ }
+
+ parser.parseDmozFile(new File(dmozFile), subsetDenom,
+ includeAdult, skew, topicPattern);
+
+ } finally {
+ nfs.close();
+ }
+ }
+
+}
Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified:
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java
(original)
+++
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java
Wed Dec 28 16:37:13 2005
@@ -18,7 +18,7 @@
import org.apache.nutch.io.*;
import org.apache.nutch.util.LogFormatter;
-import org.apache.nutch.indexer.IndexSegment;
+import org.apache.nutch.indexer.Indexer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.document.Document;
@@ -80,7 +80,7 @@
Vector vReaders=new Vector();
int maxDoc = 0;
for (int i = 0; i < directories.length; i++) {
- File indexDone = new File(directories[i], IndexSegment.DONE_NAME);
+ File indexDone = new File(directories[i], Indexer.DONE_NAME);
if (indexDone.exists() && indexDone.isFile()){
File indexDir = new File(directories[i], "index");
IndexReader reader = IndexReader.open(indexDir);
Modified:
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
(original)
+++
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
Wed Dec 28 16:37:13 2005
@@ -27,8 +27,6 @@
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.fetcher.FetcherOutput;
-import org.apache.nutch.pagedb.FetchListEntry;
import java.util.logging.Logger;
import org.apache.nutch.util.LogFormatter;
Modified:
lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
(original)
+++
lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
Wed Dec 28 16:37:13 2005
@@ -27,8 +27,6 @@
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.fetcher.FetcherOutput;
-import org.apache.nutch.pagedb.FetchListEntry;
import java.io.IOException;
import java.net.MalformedURLException;
Modified:
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
(original)
+++
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
Wed Dec 28 16:37:13 2005
@@ -37,7 +37,6 @@
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.fetcher.FetcherOutput;
import org.apache.nutch.util.NutchConf;
import org.apache.nutch.util.mime.MimeType;
Modified:
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
(original)
+++
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
Wed Dec 28 16:37:13 2005
@@ -34,6 +34,8 @@
// Nutch imports
import org.apache.nutch.analysis.lang.NGramProfile.NGramEntry;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.io.UTF8;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.ParserFactory;
@@ -350,7 +352,7 @@
Protocol protocol;
try {
protocol = ProtocolFactory.getProtocol(url);
- Content content = protocol.getProtocolOutput(url).getContent();
+ Content content = protocol.getProtocolOutput(new UTF8(url), new
CrawlDatum()).getContent();
String contentType = content.getContentType();
Parser parser = ParserFactory.getParser(contentType, url);
Parse parse = parser.getParse(content);
Modified:
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
(original)
+++
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
Wed Dec 28 16:37:13 2005
@@ -17,13 +17,13 @@
package org.apache.nutch.protocol.file;
-import org.apache.nutch.db.Page;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.io.UTF8;
import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.util.LogFormatter;
import org.apache.nutch.util.NutchConf;
-import org.apache.nutch.pagedb.FetchListEntry;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolOutput;
@@ -50,7 +50,7 @@
static final int MAX_REDIRECTS = 5;
- static int maxContentLength =
NutchConf.get().getInt("file.content.limit",64*1024);
+ static int maxContentLength = NutchConf.get().getInt("file.content.limit",
64 * 1024);
// 20040412, xing
// the following three: HttpDateFormat, MimetypesFileTypeMap, MagicFile
@@ -67,26 +67,16 @@
/** Set the point at which content is truncated. */
public void setMaxContentLength(int length) {maxContentLength = length;}
- public ProtocolOutput getProtocolOutput(String urlString) {
- ProtocolOutput output = null;
+ public ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum) {
+ String urlString = url.toString();
try {
- return getProtocolOutput(new FetchListEntry(true,
- new Page(urlString, 1.0f), new String[0]));
- } catch (MalformedURLException mue) {
- return new ProtocolOutput(null, new ProtocolStatus(mue));
- }
- }
-
- public ProtocolOutput getProtocolOutput(FetchListEntry fle) {
- String urlString = fle.getUrl().toString();
- try {
- URL url = new URL(urlString);
+ URL u = new URL(urlString);
int redirects = 0;
while (true) {
FileResponse response;
- response = new FileResponse(urlString, url, this); // make a request
+ response = new FileResponse(u, datum, this); // make a request
int code = response.getCode();
@@ -96,10 +86,10 @@
} else if (code >= 300 && code < 400) { // handle redirect
if (redirects == MAX_REDIRECTS)
throw new FileException("Too many redirects: " + url);
- url = new URL(response.getHeader("Location"));
+ u = new URL(response.getHeader("Location"));
redirects++;
if (LOG.isLoggable(Level.FINE))
- LOG.fine("redirect to " + url);
+ LOG.fine("redirect to " + u);
} else { // convert to exception
throw new FileError(code);
@@ -150,7 +140,7 @@
// set log level
LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
- Content content = file.getProtocolOutput(urlString).getContent();
+ Content content = file.getProtocolOutput(new UTF8(urlString), new
CrawlDatum()).getContent();
System.err.println("Content-Type: " + content.getContentType());
System.err.println("Content-Length: " + content.get("Content-Length"));
Modified:
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
(original)
+++
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
Wed Dec 28 16:37:13 2005
@@ -19,11 +19,11 @@
// JDK imports
import java.net.URL;
import java.util.TreeMap;
-import java.util.Properties;
import java.util.logging.Level;
import java.io.IOException;
// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ContentProperties;
@@ -80,15 +80,10 @@
headers);
}
- public FileResponse(URL url, File file)
- throws FileException, IOException {
- this(url.toString(), url, file);
- }
-
- public FileResponse(String orig, URL url, File file)
+ public FileResponse(URL url, CrawlDatum datum, File file)
throws FileException, IOException {
- this.orig = orig;
+ this.orig = url.toString();
this.base = url.toString();
this.file = file;
Modified:
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
(original)
+++
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
Wed Dec 28 16:37:13 2005
@@ -19,13 +19,13 @@
import org.apache.commons.net.ftp.FTPFileEntryParser;
-import org.apache.nutch.db.Page;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.io.UTF8;
import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.util.LogFormatter;
import org.apache.nutch.util.NutchConf;
-import org.apache.nutch.pagedb.FetchListEntry;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolOutput;
@@ -112,26 +112,16 @@
this.keepConnection = keepConnection;
}
- public ProtocolOutput getProtocolOutput(String urlString) {
- ProtocolOutput output = null;
+ public ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum) {
+ String urlString = url.toString();
try {
- return getProtocolOutput(new FetchListEntry(true,
- new Page(urlString, 1.0f), new String[0]));
- } catch (MalformedURLException mue) {
- return new ProtocolOutput(null, new ProtocolStatus(mue));
- }
- }
-
- public ProtocolOutput getProtocolOutput(FetchListEntry fle) {
- String urlString = fle.getUrl().toString();
- try {
- URL url = new URL(urlString);
+ URL u = new URL(urlString);
int redirects = 0;
while (true) {
FtpResponse response;
- response = new FtpResponse(urlString, url, this); // make a request
+ response = new FtpResponse(u, datum, this); // make a request
int code = response.getCode();
@@ -141,10 +131,10 @@
} else if (code >= 300 && code < 400) { // handle redirect
if (redirects == MAX_REDIRECTS)
throw new FtpException("Too many redirects: " + url);
- url = new URL(response.getHeader("Location"));
+ u = new URL(response.getHeader("Location"));
redirects++;
if (LOG.isLoggable(Level.FINE))
- LOG.fine("redirect to " + url);
+ LOG.fine("redirect to " + u);
} else { // convert to exception
throw new FtpError(code);
@@ -218,7 +208,7 @@
// set log level
LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
- Content content = ftp.getProtocolOutput(urlString).getContent();
+ Content content = ftp.getProtocolOutput(new UTF8(urlString), new
CrawlDatum()).getContent();
System.err.println("Content-Type: " + content.getContentType());
System.err.println("Content-Length: " + content.get("Content-Length"));
Modified:
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
(original)
+++
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
Wed Dec 28 16:37:13 2005
@@ -24,6 +24,7 @@
import org.apache.commons.net.ftp.parser.DefaultFTPFileEntryParserFactory;
import org.apache.commons.net.ftp.parser.ParserInitializationException;
+import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ContentProperties;
@@ -80,15 +81,10 @@
headers);
}
- public FtpResponse(URL url, Ftp ftp)
- throws FtpException, IOException {
- this(url.toString(), url, ftp);
- }
-
- public FtpResponse(String orig, URL url, Ftp ftp)
+ public FtpResponse(URL url, CrawlDatum datum, Ftp ftp)
throws FtpException, IOException {
- this.orig = orig;
+ this.orig = url.toString();
this.base = url.toString();
this.ftp = ftp;
Modified:
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
(original)
+++
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
Wed Dec 28 16:37:13 2005
@@ -28,8 +28,8 @@
import org.apache.nutch.util.LogFormatter;
import org.apache.nutch.util.NutchConf;
-import org.apache.nutch.db.Page;
-import org.apache.nutch.pagedb.FetchListEntry;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.io.UTF8;
import org.apache.nutch.protocol.*;
/** An implementation of the Http protocol. */
@@ -123,7 +123,7 @@
}
if (delays == MAX_DELAYS)
- throw new RetryLater(url, "Exceeded http.max.delays: retry later.");
+ throw new HttpException("Exceeded http.max.delays: retry later.");
long done = time.longValue();
long now = System.currentTimeMillis();
@@ -172,31 +172,21 @@
}
}
- public ProtocolOutput getProtocolOutput(String urlString) {
- ProtocolOutput output = null;
+ public ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum) {
+ String urlString = url.toString();
try {
- return getProtocolOutput(new FetchListEntry(true,
- new Page(urlString, 1.0f), new String[0]));
- } catch (MalformedURLException mue) {
- return new ProtocolOutput(null, new ProtocolStatus(mue));
- }
- }
-
- public ProtocolOutput getProtocolOutput(FetchListEntry fle) {
- String urlString = fle.getUrl().toString();
- try {
- URL url = new URL(urlString);
+ URL u = new URL(urlString);
int redirects = 0;
while (true) {
- if (!RobotRulesParser.isAllowed(url))
- throw new ResourceGone(url, "Blocked by robots.txt");
+ if (!RobotRulesParser.isAllowed(u))
+ throw new HttpException("Blocked by robots.txt");
- InetAddress addr = blockAddr(url);
+ InetAddress addr = blockAddr(u);
HttpResponse response;
try {
- response = new HttpResponse(urlString, url); // make a request
+ response = new HttpResponse(u, datum); // make a request
} finally {
unblockAddr(addr);
}
@@ -207,14 +197,14 @@
return new ProtocolOutput(response.toContent()); //
return it
} else if (code == 410) { // page is gone
- throw new ResourceGone(url, "Http: " + code);
+ throw new HttpException("Http: " + code);
} else if (code >= 300 && code < 400) { // handle redirect
if (redirects == MAX_REDIRECTS)
throw new HttpException("Too many redirects: " + urlString);
- url = new URL(url, response.getHeader("Location"));
+ u = new URL(u, response.getHeader("Location"));
redirects++;
- LOG.fine("redirect to " + url);
+ LOG.fine("redirect to " + u);
} else { // convert to exception
throw new HttpError(code);
@@ -298,7 +288,7 @@
LOG.setLevel(Level.FINE);
}
- Content content = http.getProtocolOutput(url).getContent();
+ Content content = http.getProtocolOutput(new UTF8(url), new
CrawlDatum()).getContent();
System.out.println("Content Type: " + content.getContentType());
System.out.println("Content Length: " + content.get("Content-Length"));
Modified:
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
(original)
+++
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
Wed Dec 28 16:37:13 2005
@@ -31,6 +31,7 @@
import java.util.Properties;
import java.util.logging.Level;
+import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.protocol.ProtocolException;
@@ -63,14 +64,10 @@
headers);
}
- public HttpResponse(URL url) throws ProtocolException, IOException {
- this(url.toString(), url);
- }
-
- public HttpResponse(String orig, URL url)
+ public HttpResponse(URL url, CrawlDatum datum)
throws ProtocolException, IOException {
- this.orig = orig;
+ this.orig = url.toString();
this.base = url.toString();
if (!"http".equals(url.getProtocol()))
Modified:
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java
(original)
+++
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java
Wed Dec 28 16:37:13 2005
@@ -35,6 +35,7 @@
import org.apache.nutch.util.NutchConf;
import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.ProtocolException;
/**
@@ -382,7 +383,7 @@
if (robotRules == null) { // cache miss
int redirects = 0;
do {
- HttpResponse response = new HttpResponse(new URL(url, "/robots.txt"));
+ HttpResponse response = new HttpResponse(new URL(url, "/robots.txt"),
new CrawlDatum());
int code = response.getCode();
Modified:
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
(original)
+++
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
Wed Dec 28 16:37:13 2005
@@ -20,13 +20,9 @@
import org.apache.commons.httpclient.auth.AuthScope;
import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
import org.apache.commons.httpclient.protocol.Protocol;
-import org.apache.nutch.db.Page;
-import org.apache.nutch.pagedb.FetchListEntry;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.protocol.ProtocolOutput;
-import org.apache.nutch.protocol.ProtocolStatus;
-import org.apache.nutch.protocol.RetryLater;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.io.UTF8;
+import org.apache.nutch.protocol.*;
import org.apache.nutch.util.LogFormatter;
import org.apache.nutch.util.NutchConf;
@@ -130,7 +126,7 @@
}
}
- if (delays == MAX_DELAYS) throw new RetryLater(url, "Exceeded
http.max.delays: retry later.");
+ if (delays == MAX_DELAYS) throw new HttpException("Exceeded
http.max.delays: retry later.");
long done = time.longValue();
long now = System.currentTimeMillis();
@@ -177,31 +173,23 @@
}
}
- public ProtocolOutput getProtocolOutput(String urlString) {
+ public ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum) {
+ String urlString = url.toString();
try {
- return getProtocolOutput(new FetchListEntry(true, new Page(urlString,
1.0f), new String[0]));
- } catch (MalformedURLException mue) {
- return new ProtocolOutput(null, new ProtocolStatus(mue));
- }
- }
-
- public ProtocolOutput getProtocolOutput(FetchListEntry fle) {
- String urlString = fle.getUrl().toString();
- try {
- URL url = new URL(urlString);
+ URL u = new URL(urlString);
try {
- if (!RobotRulesParser.isAllowed(url))
+ if (!RobotRulesParser.isAllowed(u))
return new ProtocolOutput(null, new
ProtocolStatus(ProtocolStatus.ROBOTS_DENIED, url));
} catch (Throwable e) {
// XXX Maybe bogus: assume this is allowed.
LOG.fine("Exception checking robot rules for " + url + ": " + e);
}
- InetAddress addr = blockAddr(url);
+ InetAddress addr = blockAddr(u);
HttpResponse response;
try {
- response = new HttpResponse(url); // make a request
+ response = new HttpResponse(u, datum); // make a request
} finally {
unblockAddr(addr);
}
@@ -220,7 +208,7 @@
// some broken servers, such as MS IIS, use lowercase header name...
if (location == null) location = response.getHeader("location");
if (location == null) location = "";
- url = new URL(url, location);
+ u = new URL(u, location);
int protocolStatusCode;
switch (code) {
case 300: // multiple choices, preferred value in Location
@@ -242,21 +230,21 @@
protocolStatusCode = ProtocolStatus.MOVED;
}
// handle this in the higher layer.
- return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode,
url));
+ return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode,
u));
} else if (code == 400) { // bad request, mark as GONE
- LOG.fine("400 Bad request: " + url);
- return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE,
url));
+ LOG.fine("400 Bad request: " + u);
+ return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE,
u));
} else if (code == 401) { // requires authorization, but no valid auth
provided.
LOG.fine("401 Authentication Required");
return new ProtocolOutput(c, new
ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authentication required: "
+ urlString));
} else if (code == 404) {
- return new ProtocolOutput(c, new
ProtocolStatus(ProtocolStatus.NOTFOUND, url));
+ return new ProtocolOutput(c, new
ProtocolStatus(ProtocolStatus.NOTFOUND, u));
} else if (code == 410) { // permanently GONE
- return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE,
url));
+ return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE,
u));
} else {
return new ProtocolOutput(c, new
ProtocolStatus(ProtocolStatus.EXCEPTION, "Http code=" + code + ", url="
- + url));
+ + u));
}
} catch (Throwable e) {
e.printStackTrace();
@@ -333,7 +321,7 @@
LOG.setLevel(Level.FINE);
}
- ProtocolOutput out = http.getProtocolOutput(url);
+ ProtocolOutput out = http.getProtocolOutput(new UTF8(url), new
CrawlDatum());
Content content = out.getContent();
System.out.println("Status: " + out.getStatus());
Modified:
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
(original)
+++
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
Wed Dec 28 16:37:13 2005
@@ -51,7 +51,7 @@
try {
Collection challenge = null;
- if (header instanceof MultiProperties) {
+ if (header instanceof ContentProperties) {
Object o = header.get(AUTH_HEADER);
if (o instanceof Collection) {
challenge = (Collection) o;
Modified:
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
(original)
+++
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
Wed Dec 28 16:37:13 2005
@@ -3,7 +3,9 @@
package org.apache.nutch.protocol.httpclient;
+import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpVersion;
@@ -32,7 +34,7 @@
private int code;
- private MultiProperties headers = new MultiProperties();
+ private ContentProperties headers = new ContentProperties();
/**
* Returns the response code.
@@ -59,11 +61,11 @@
headers);
}
- public HttpResponse(URL url) throws IOException {
- this(url, false);
+ public HttpResponse(URL url, CrawlDatum datum) throws IOException {
+ this(url, datum, false);
}
- HttpResponse(URL url, boolean followRedirects) throws IOException {
+ HttpResponse(URL url, CrawlDatum datum, boolean followRedirects) throws
IOException {
this.base = url.toString();
this.orig = url.toString();
GetMethod get = new GetMethod(this.orig);
Modified:
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java
(original)
+++
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java
Wed Dec 28 16:37:13 2005
@@ -35,6 +35,7 @@
import org.apache.nutch.util.NutchConf;
import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.ProtocolException;
/**
@@ -380,7 +381,7 @@
LOG.fine("cache miss " + url);
try {
HttpResponse response = new HttpResponse(new URL(url, "/robots.txt"),
- true);
+ new CrawlDatum(), true);
if (response.getCode() == 200) // found rules: parse them
robotRules = new
RobotRulesParser().parseRules(response.getContent());