Added: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=359668&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
(added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
Wed Dec 28 16:37:13 2005
@@ -0,0 +1,100 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import org.apache.nutch.io.*;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.fetcher.Fetcher;
+import org.apache.nutch.fs.*;
+import org.apache.nutch.mapred.*;
+import org.apache.nutch.parse.*;
+import org.apache.nutch.net.*;
+
+import java.io.*;
+import java.util.*;
+
+/* Parse content in a segment. */
+public class ParseOutputFormat implements OutputFormat {
+
+ private UrlNormalizer urlNormalizer = UrlNormalizerFactory.getNormalizer();
+
+ public RecordWriter getRecordWriter(NutchFileSystem fs, JobConf job,
+ String name) throws IOException {
+
+ final float interval = job.getFloat("db.default.fetch.interval", 30f);
+
+ File text =
+ new File(new File(job.getOutputDir(), ParseText.DIR_NAME), name);
+ File data =
+ new File(new File(job.getOutputDir(), ParseData.DIR_NAME), name);
+ File crawl =
+ new File(new File(job.getOutputDir(), CrawlDatum.PARSE_DIR_NAME), name);
+
+ final MapFile.Writer textOut =
+ new MapFile.Writer(fs, text.toString(), UTF8.class, ParseText.class);
+
+ final MapFile.Writer dataOut =
+ new MapFile.Writer(fs, data.toString(), UTF8.class,ParseData.class,true);
+
+ final SequenceFile.Writer crawlOut =
+ new SequenceFile.Writer(fs, crawl.toString(),
+ UTF8.class, CrawlDatum.class);
+
+ return new RecordWriter() {
+
+ public void write(WritableComparable key, Writable value)
+ throws IOException {
+
+ Parse parse = (Parse)value;
+
+ textOut.append(key, new ParseText(parse.getText()));
+ dataOut.append(key, parse.getData());
+
+ // collect outlinks for subsequent db update
+ Outlink[] links = parse.getData().getOutlinks();
+
+ // compute OPIC score contribution
+ float score =
+ Float.parseFloat(parse.getData().get(Fetcher.SCORE_KEY));
+ score /= links.length;
+
+ for (int i = 0; i < links.length; i++) {
+ String toUrl = links[i].getToUrl();
+ try {
+ toUrl = urlNormalizer.normalize(toUrl); // normalize the url
+ toUrl = URLFilters.filter(toUrl); // filter the url
+ } catch (Exception e) {
+ toUrl = null;
+ }
+ if (toUrl != null)
+ crawlOut.append(new UTF8(toUrl),
+ new CrawlDatum(CrawlDatum.STATUS_LINKED,
+ interval, score));
+ }
+ }
+
+ public void close(Reporter reporter) throws IOException {
+ textOut.close();
+ dataOut.close();
+ crawlOut.close();
+ }
+
+ };
+
+ }
+
+}
Propchange:
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=359668&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Wed
Dec 28 16:37:13 2005
@@ -0,0 +1,109 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import org.apache.nutch.io.*;
+import org.apache.nutch.parse.ParseOutputFormat;
+import org.apache.nutch.mapred.*;
+import org.apache.nutch.util.*;
+import org.apache.nutch.protocol.*;
+
+import java.io.*;
+import java.util.*;
+import java.util.logging.*;
+
+/* Parse content in a segment. */
+public class ParseSegment extends NutchConfigured implements Mapper, Reducer {
+
+ public static final Logger LOG =
+ LogFormatter.getLogger(Parser.class.getName());
+
+ public ParseSegment() { super(null); }
+
+ public ParseSegment(NutchConf conf) {
+ super(conf);
+ }
+
+ public void configure(JobConf job) {
+ }
+
+ public void map(WritableComparable key, Writable value,
+ OutputCollector output, Reporter reporter)
+ throws IOException {
+ Content content = (Content)value;
+
+ Parse parse = null;
+ ParseStatus status;
+ try {
+ parse = ParseUtil.parse(content);
+ status = parse.getData().getStatus();
+ } catch (Exception e) {
+ status = new ParseStatus(e);
+ }
+
+ if (status.isSuccess()) {
+ output.collect(key, new ParseImpl(parse.getText(), parse.getData()));
+ } else {
+ LOG.warning("Error parsing: "+key+": "+status.toString());
+ }
+ }
+
+ public void reduce(WritableComparable key, Iterator values,
+ OutputCollector output, Reporter reporter)
+ throws IOException {
+ output.collect(key, (Writable)values.next()); // collect first value
+ }
+
+ public void parse(File segment) throws IOException {
+ LOG.info("Parse: starting");
+ LOG.info("Parse: segment: " + segment);
+
+ JobConf job = new JobConf(getConf());
+
+ job.setInputDir(new File(segment, Content.DIR_NAME));
+ job.setInputFormat(SequenceFileInputFormat.class);
+ job.setInputKeyClass(UTF8.class);
+ job.setInputValueClass(Content.class);
+ job.setMapperClass(ParseSegment.class);
+ job.setReducerClass(ParseSegment.class);
+
+ job.setOutputDir(segment);
+ job.setOutputFormat(ParseOutputFormat.class);
+ job.setOutputKeyClass(UTF8.class);
+ job.setOutputValueClass(ParseImpl.class);
+
+ JobClient.runJob(job);
+ LOG.info("Parse: done");
+ }
+
+
+ public static void main(String[] args) throws Exception {
+ File segment;
+
+ String usage = "Usage: ParseSegment segment";
+
+ if (args.length == 0) {
+ System.err.println(usage);
+ System.exit(-1);
+ }
+
+ segment = new File(args[0]);
+
+ ParseSegment parseSegment = new ParseSegment(NutchConf.get());
+ parseSegment.parse(segment);
+ }
+}
Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java Wed
Dec 28 16:37:13 2005
@@ -18,6 +18,8 @@
import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.io.UTF8;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.ProtocolFactory;
@@ -69,7 +71,7 @@
LOG.info("fetching: "+url);
Protocol protocol = ProtocolFactory.getProtocol(url);
- Content content = protocol.getProtocolOutput(url).getContent();
+ Content content = protocol.getProtocolOutput(new UTF8(url), new
CrawlDatum()).getContent();
if (force) {
content.setContentType(contentType);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java Wed Dec
28 16:37:13 2005
@@ -18,21 +18,15 @@
import java.io.IOException;
-import org.apache.nutch.pagedb.FetchListEntry;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.io.UTF8;
/** A retriever of url content. Implemented by protocol extensions. */
public interface Protocol {
/** The name of the extension point. */
public final static String X_POINT_ID = Protocol.class.getName();
- /** Returns the {@link Content} for a url. This method may be
- * more limited than {@link #getProtocolOutput(FetchListEntry)}.
- * @throws IOException for any errors.
- */
- ProtocolOutput getProtocolOutput(String url);
-
/** Returns the {@link Content} for a fetchlist entry.
- * @throws IOException for any errors.
*/
- ProtocolOutput getProtocolOutput(FetchListEntry fle);
+ ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum);
}
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java
Wed Dec 28 16:37:13 2005
@@ -20,17 +20,11 @@
import java.io.File;
import java.util.HashMap;
-import java.util.Arrays;
import org.apache.nutch.io.*;
import org.apache.nutch.fs.*;
-import org.apache.nutch.db.*;
-import org.apache.nutch.util.*;
-import org.apache.nutch.fetcher.*;
import org.apache.nutch.protocol.*;
import org.apache.nutch.parse.*;
-import org.apache.nutch.pagedb.*;
-import org.apache.nutch.indexer.*;
import org.apache.nutch.mapred.*;
import org.apache.nutch.mapred.lib.*;
import org.apache.nutch.crawl.*;
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java
Wed Dec 28 16:37:13 2005
@@ -28,7 +28,6 @@
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
-import org.apache.lucene.search.MultiSearcher;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.FieldDoc;
@@ -40,11 +39,7 @@
import org.apache.nutch.fs.*;
import org.apache.nutch.io.*;
import org.apache.nutch.util.*;
-import org.apache.nutch.db.*;
-import org.apache.nutch.fetcher.*;
-import org.apache.nutch.linkdb.*;
import org.apache.nutch.indexer.*;
-import org.apache.nutch.analysis.NutchDocumentAnalyzer;
/** Implements {@link Searcher} and {@link HitDetailer}
for either a single
* merged index, or a set of indexes. */
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java Wed
Dec 28 16:37:13 2005
@@ -26,7 +26,6 @@
import org.apache.nutch.parse.*;
import org.apache.nutch.indexer.*;
import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.crawl.LinkDbReader;
/**
* One stop shopping for search-related functionality.
@@ -103,7 +102,7 @@
Vector vDirs=new Vector();
File [] directories = fs.listFiles(indexesDir);
for(int i = 0; i < fs.listFiles(indexesDir).length; i++) {
- File indexdone = new File(directories[i], IndexSegment.DONE_NAME);
+ File indexdone = new File(directories[i], Indexer.DONE_NAME);
if(fs.isFile(indexdone)) {
vDirs.add(directories[i]);
}
Added: lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java?rev=359668&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java Wed Dec
28 16:37:13 2005
@@ -0,0 +1,384 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.*;
+import java.net.*;
+import java.util.*;
+import java.util.logging.*;
+import java.net.MalformedURLException;
+import java.util.regex.*;
+
+import javax.xml.parsers.*;
+import org.xml.sax.*;
+import org.xml.sax.helpers.*;
+import org.apache.xerces.util.XMLChar;
+
+import org.apache.nutch.io.*;
+import org.apache.nutch.fs.*;
+import org.apache.nutch.net.*;
+import org.apache.nutch.util.*;
+import org.apache.nutch.util.NutchConf;
+
+/** Utility that converts DMOZ RDF into a flat file of URLs to be injected. */
+public class DmozParser {
+ public static final Logger LOG =
LogFormatter.getLogger("org.apache.nutch.crawl.DmozParser");
+
+ long pages = 0;
+
+ /**
+ * This filter fixes characters that might offend our parser.
+ * This lets us be tolerant of errors that might appear in the input XML.
+ */
+ private static class XMLCharFilter extends FilterReader {
+ private boolean lastBad = false;
+
+ public XMLCharFilter(Reader reader) {
+ super(reader);
+ }
+
+ public int read() throws IOException {
+ int c = in.read();
+ int value = c;
+ if (c != -1 && !(XMLChar.isValid(c))) // fix invalid characters
+ value = 'X';
+ else if (lastBad && c == '<') { // fix mis-matched brackets
+ in.mark(1);
+ if (in.read() != '/')
+ value = 'X';
+ in.reset();
+ }
+ lastBad = (c == 65533);
+
+ return value;
+ }
+
+ public int read(char[] cbuf, int off, int len)
+ throws IOException {
+ int n = in.read(cbuf, off, len);
+ if (n != -1) {
+ for (int i = 0; i < n; i++) {
+ char c = cbuf[off+i];
+ char value = c;
+ if (!(XMLChar.isValid(c))) // fix invalid characters
+ value = 'X';
+ else if (lastBad && c == '<') { // fix mis-matched brackets
+ if (i != n-1 && cbuf[off+i+1] != '/')
+ value = 'X';
+ }
+ lastBad = (c == 65533);
+ cbuf[off+i] = value;
+ }
+ }
+ return n;
+ }
+ }
+
+
+ /**
+ * The RDFProcessor receives tag messages during a parse
+ * of RDF XML data. We build whatever structures we need
+ * from these messages.
+ */
+ private class RDFProcessor extends DefaultHandler {
+ String curURL = null, curSection = null;
+ boolean titlePending = false, descPending = false, insideAdultSection =
false;
+ Pattern topicPattern = null;
+ StringBuffer title = new StringBuffer(), desc = new StringBuffer();
+ XMLReader reader;
+ int subsetDenom;
+ int hashSkew;
+ boolean includeAdult;
+ Locator location;
+
+ /**
+ * Pass in an XMLReader, plus a flag as to whether we
+ * should include adult material.
+ */
+ public RDFProcessor(XMLReader reader, int subsetDenom, boolean
includeAdult, int skew, Pattern topicPattern) throws IOException {
+ this.reader = reader;
+ this.subsetDenom = subsetDenom;
+ this.includeAdult = includeAdult;
+ this.topicPattern = topicPattern;
+
+ this.hashSkew = skew != 0 ? skew : new Random().nextInt();
+ }
+
+ //
+ // Interface ContentHandler
+ //
+
+ /**
+ * Start of an XML elt
+ */
+ public void startElement(String namespaceURI, String localName, String
qName, Attributes atts) throws SAXException {
+ if ("Topic".equals(qName)) {
+ curSection = atts.getValue("r:id");
+ } else if ("ExternalPage".equals(qName)) {
+ // Porn filter
+ if ((! includeAdult) && curSection.startsWith("Top/Adult")) {
+ return;
+ }
+
+ if (topicPattern != null &&
!topicPattern.matcher(curSection).matches()) {
+ return;
+ }
+
+ // Subset denominator filter.
+ // Only emit with a chance of 1/denominator.
+ String url = atts.getValue("about");
+ int hashValue = MD5Hash.digest(url).hashCode();
+ hashValue = Math.abs(hashValue ^ hashSkew);
+ if ((hashValue % subsetDenom) != 0) {
+ return;
+ }
+
+ // We actually claim the URL!
+ curURL = url;
+ } else if (curURL != null && "d:Title".equals(qName)) {
+ titlePending = true;
+ } else if (curURL != null && "d:Description".equals(qName)) {
+ descPending = true;
+ }
+ }
+
+ /**
+ * The contents of an XML elt
+ */
+ public void characters(char ch[], int start, int length) {
+ if (titlePending) {
+ title.append(ch, start, length);
+ } else if (descPending) {
+ desc.append(ch, start, length);
+ }
+ }
+
+ /**
+ * Termination of XML elt
+ */
+ public void endElement(String namespaceURI, String localName, String qName)
+ throws SAXException {
+ if (curURL != null) {
+ if ("ExternalPage".equals(qName)) {
+ //
+ // Inc the number of pages, insert the page, and
+ // possibly print status.
+ //
+ System.out.println(curURL);
+ pages++;
+
+ //
+ // Clear out the link text. This is what
+ // you would use for adding to the linkdb.
+ //
+ if (title.length() > 0) {
+ title.delete(0, title.length());
+ }
+ if (desc.length() > 0) {
+ desc.delete(0, desc.length());
+ }
+
+ // Null out the URL.
+ curURL = null;
+ } else if ("d:Title".equals(qName)) {
+ titlePending = false;
+ } else if ("d:Description".equals(qName)) {
+ descPending = false;
+ }
+ }
+ }
+
+ /**
+ * When parsing begins
+ */
+ public void startDocument() {
+ LOG.info("Begin parse");
+ }
+
+ /**
+ * When parsing ends
+ */
+ public void endDocument() {
+ LOG.info("Completed parse. Found " + pages + " pages.");
+ }
+
+ /**
+ * From time to time the Parser will set the "current location"
+ * by calling this function. It's useful for emitting locations
+ * for error messages.
+ */
+ public void setDocumentLocator(Locator locator) {
+ location = locator;
+ }
+
+
+ //
+ // Interface ErrorHandler
+ //
+
+ /**
+ * Emit the exception message
+ */
+ public void error(SAXParseException spe) {
+ LOG.severe("Error: " + spe.toString() + ": " + spe.getMessage());
+ spe.printStackTrace(System.err);
+ }
+
+ /**
+ * Emit the exception message, with line numbers
+ */
+ public void fatalError(SAXParseException spe) {
+ LOG.severe("Fatal err: " + spe.toString() + ": " + spe.getMessage());
+ LOG.severe("Last known line is " + location.getLineNumber() + ", column
" + location.getColumnNumber());
+ spe.printStackTrace(System.err);
+ }
+
+ /**
+ * Emit exception warning message
+ */
+ public void warning(SAXParseException spe) {
+ LOG.warning("Warning: " + spe.toString() + ": " + spe.getMessage());
+ spe.printStackTrace(System.err);
+ }
+ }
+
+ /**
+ * Iterate through all the items in this structured DMOZ file.
+ * Add each URL to the web db.
+ */
+ public void parseDmozFile(File dmozFile, int subsetDenom,
+ boolean includeAdult,
+ int skew,
+ Pattern topicPattern)
+
+ throws IOException, SAXException, ParserConfigurationException {
+
+ SAXParserFactory parserFactory = SAXParserFactory.newInstance();
+ SAXParser parser = parserFactory.newSAXParser();
+ XMLReader reader = parser.getXMLReader();
+
+ // Create our own processor to receive SAX events
+ RDFProcessor rp =
+ new RDFProcessor(reader, subsetDenom, includeAdult,
+ skew, topicPattern);
+ reader.setContentHandler(rp);
+ reader.setErrorHandler(rp);
+ LOG.info("skew = " + rp.hashSkew);
+
+ //
+ // Open filtered text stream. The UTF8Filter makes sure that
+ // only appropriate XML-approved UTF8 characters are received.
+ // Any non-conforming characters are silently skipped.
+ //
+ XMLCharFilter in = new XMLCharFilter(new BufferedReader(new
InputStreamReader(new BufferedInputStream(new FileInputStream(dmozFile)),
"UTF-8")));
+ try {
+ InputSource is = new InputSource(in);
+ reader.parse(is);
+ } catch (Exception e) {
+ LOG.severe(e.toString());
+ e.printStackTrace(System.err);
+ System.exit(0);
+ } finally {
+ in.close();
+ }
+ }
+
+ private static void addTopicsFromFile(String topicFile, Vector topics)
+ throws IOException {
+ BufferedReader in = null;
+ try {
+ in = new BufferedReader(new InputStreamReader(new
FileInputStream(topicFile), "UTF-8"));
+ String line = null;
+ while ((line = in.readLine()) != null) {
+ topics.addElement(new String(line));
+ }
+ }
+ catch (Exception e) {
+ LOG.severe(e.toString());
+ e.printStackTrace(System.out);
+ System.exit(0);
+ } finally {
+ in.close();
+ }
+ }
+
+ /**
+ * Command-line access. User may add URLs via a flat text file
+ * or the structured DMOZ file. By default, we ignore Adult
+ * material (as categorized by DMOZ).
+ */
+ public static void main(String argv[]) throws Exception {
+ if (argv.length < 1) {
+ System.err.println("Usage: DmozParser <dmoz_file> [-subset
<subsetDenominator>] [-includeAdultMaterial] [-skew skew] [-topicFile <topic
list file>] [-topic <topic> [-topic <topic> [...]]]");
+ return;
+ }
+
+ //
+ // Parse the command line, figure out what kind of
+ // URL file we need to load
+ //
+ int subsetDenom = 1;
+ int skew = 0;
+ String dmozFile = argv[0];
+ boolean includeAdult = false;
+ Pattern topicPattern = null;
+ Vector topics = new Vector();
+
+ NutchFileSystem nfs = NutchFileSystem.get();
+ try {
+ for (int i = 1; i < argv.length; i++) {
+ if ("-includeAdultMaterial".equals(argv[i])) {
+ includeAdult = true;
+ } else if ("-subset".equals(argv[i])) {
+ subsetDenom = Integer.parseInt(argv[i+1]);
+ i++;
+ } else if ("-topic".equals(argv[i])) {
+ topics.addElement(argv[i+1]);
+ i++;
+ } else if ("-topicFile".equals(argv[i])) {
+ addTopicsFromFile(argv[i+1], topics);
+ i++;
+ } else if ("-skew".equals(argv[i])) {
+ skew = Integer.parseInt(argv[i+1]);
+ i++;
+ }
+ }
+
+ DmozParser parser = new DmozParser();
+
+ if (!topics.isEmpty()) {
+ String regExp = new String("^(");
+ int j = 0;
+ for ( ; j < topics.size() - 1; ++j) {
+ regExp = regExp.concat((String) topics.get(j));
+ regExp = regExp.concat("|");
+ }
+ regExp = regExp.concat((String) topics.get(j));
+ regExp = regExp.concat(").*");
+ LOG.info("Topic selection pattern = " + regExp);
+ topicPattern = Pattern.compile(regExp);
+ }
+
+ parser.parseDmozFile(new File(dmozFile), subsetDenom,
+ includeAdult, skew, topicPattern);
+
+ } finally {
+ nfs.close();
+ }
+ }
+
+}
Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified:
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java
(original)
+++
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java
Wed Dec 28 16:37:13 2005
@@ -18,7 +18,7 @@
import org.apache.nutch.io.*;
import org.apache.nutch.util.LogFormatter;
-import org.apache.nutch.indexer.IndexSegment;
+import org.apache.nutch.indexer.Indexer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.document.Document;
@@ -80,7 +80,7 @@
Vector vReaders=new Vector();
int maxDoc = 0;
for (int i = 0; i < directories.length; i++) {
- File indexDone = new File(directories[i], IndexSegment.DONE_NAME);
+ File indexDone = new File(directories[i], Indexer.DONE_NAME);
if (indexDone.exists() && indexDone.isFile()){
File indexDir = new File(directories[i], "index");
IndexReader reader = IndexReader.open(indexDir);
Modified:
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
(original)
+++
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
Wed Dec 28 16:37:13 2005
@@ -27,8 +27,6 @@
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.fetcher.FetcherOutput;
-import org.apache.nutch.pagedb.FetchListEntry;
import java.util.logging.Logger;
import org.apache.nutch.util.LogFormatter;
Modified:
lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
(original)
+++
lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
Wed Dec 28 16:37:13 2005
@@ -27,8 +27,6 @@
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.fetcher.FetcherOutput;
-import org.apache.nutch.pagedb.FetchListEntry;
import java.io.IOException;
import java.net.MalformedURLException;
Modified:
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
(original)
+++
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
Wed Dec 28 16:37:13 2005
@@ -37,7 +37,6 @@
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.fetcher.FetcherOutput;
import org.apache.nutch.util.NutchConf;
import org.apache.nutch.util.mime.MimeType;
Modified:
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
(original)
+++
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
Wed Dec 28 16:37:13 2005
@@ -34,6 +34,8 @@
// Nutch imports
import org.apache.nutch.analysis.lang.NGramProfile.NGramEntry;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.io.UTF8;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.ParserFactory;
@@ -350,7 +352,7 @@
Protocol protocol;
try {
protocol = ProtocolFactory.getProtocol(url);
- Content content = protocol.getProtocolOutput(url).getContent();
+ Content content = protocol.getProtocolOutput(new UTF8(url), new
CrawlDatum()).getContent();
String contentType = content.getContentType();
Parser parser = ParserFactory.getParser(contentType, url);
Parse parse = parser.getParse(content);
Modified:
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
(original)
+++
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
Wed Dec 28 16:37:13 2005
@@ -17,13 +17,13 @@
package org.apache.nutch.protocol.file;
-import org.apache.nutch.db.Page;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.io.UTF8;
import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.util.LogFormatter;
import org.apache.nutch.util.NutchConf;
-import org.apache.nutch.pagedb.FetchListEntry;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolOutput;
@@ -50,7 +50,7 @@
static final int MAX_REDIRECTS = 5;
- static int maxContentLength =
NutchConf.get().getInt("file.content.limit",64*1024);
+ static int maxContentLength = NutchConf.get().getInt("file.content.limit",
64 * 1024);
// 20040412, xing
// the following three: HttpDateFormat, MimetypesFileTypeMap, MagicFile
@@ -67,26 +67,16 @@
/** Set the point at which content is truncated. */
public void setMaxContentLength(int length) {maxContentLength = length;}
- public ProtocolOutput getProtocolOutput(String urlString) {
- ProtocolOutput output = null;
+ public ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum) {
+ String urlString = url.toString();
try {
- return getProtocolOutput(new FetchListEntry(true,
- new Page(urlString, 1.0f), new String[0]));
- } catch (MalformedURLException mue) {
- return new ProtocolOutput(null, new ProtocolStatus(mue));
- }
- }
-
- public ProtocolOutput getProtocolOutput(FetchListEntry fle) {
- String urlString = fle.getUrl().toString();
- try {
- URL url = new URL(urlString);
+ URL u = new URL(urlString);
int redirects = 0;
while (true) {
FileResponse response;
- response = new FileResponse(urlString, url, this); // make a request
+ response = new FileResponse(u, datum, this); // make a request
int code = response.getCode();
@@ -96,10 +86,10 @@
} else if (code >= 300 && code < 400) { // handle redirect
if (redirects == MAX_REDIRECTS)
throw new FileException("Too many redirects: " + url);
- url = new URL(response.getHeader("Location"));
+ u = new URL(response.getHeader("Location"));
redirects++;
if (LOG.isLoggable(Level.FINE))
- LOG.fine("redirect to " + url);
+ LOG.fine("redirect to " + u);
} else { // convert to exception
throw new FileError(code);
@@ -150,7 +140,7 @@
// set log level
LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
- Content content = file.getProtocolOutput(urlString).getContent();
+ Content content = file.getProtocolOutput(new UTF8(urlString), new
CrawlDatum()).getContent();
System.err.println("Content-Type: " + content.getContentType());
System.err.println("Content-Length: " + content.get("Content-Length"));
Modified:
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
(original)
+++
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
Wed Dec 28 16:37:13 2005
@@ -19,11 +19,11 @@
// JDK imports
import java.net.URL;
import java.util.TreeMap;
-import java.util.Properties;
import java.util.logging.Level;
import java.io.IOException;
// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ContentProperties;
@@ -80,15 +80,10 @@
headers);
}
- public FileResponse(URL url, File file)
- throws FileException, IOException {
- this(url.toString(), url, file);
- }
-
- public FileResponse(String orig, URL url, File file)
+ public FileResponse(URL url, CrawlDatum datum, File file)
throws FileException, IOException {
- this.orig = orig;
+ this.orig = url.toString();
this.base = url.toString();
this.file = file;
Modified:
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
(original)
+++
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
Wed Dec 28 16:37:13 2005
@@ -19,13 +19,13 @@
import org.apache.commons.net.ftp.FTPFileEntryParser;
-import org.apache.nutch.db.Page;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.io.UTF8;
import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.util.LogFormatter;
import org.apache.nutch.util.NutchConf;
-import org.apache.nutch.pagedb.FetchListEntry;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolOutput;
@@ -112,26 +112,16 @@
this.keepConnection = keepConnection;
}
- public ProtocolOutput getProtocolOutput(String urlString) {
- ProtocolOutput output = null;
+ public ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum) {
+ String urlString = url.toString();
try {
- return getProtocolOutput(new FetchListEntry(true,
- new Page(urlString, 1.0f), new String[0]));
- } catch (MalformedURLException mue) {
- return new ProtocolOutput(null, new ProtocolStatus(mue));
- }
- }
-
- public ProtocolOutput getProtocolOutput(FetchListEntry fle) {
- String urlString = fle.getUrl().toString();
- try {
- URL url = new URL(urlString);
+ URL u = new URL(urlString);
int redirects = 0;
while (true) {
FtpResponse response;
- response = new FtpResponse(urlString, url, this); // make a request
+ response = new FtpResponse(u, datum, this); // make a request
int code = response.getCode();
@@ -141,10 +131,10 @@
} else if (code >= 300 && code < 400) { // handle redirect
if (redirects == MAX_REDIRECTS)
throw new FtpException("Too many redirects: " + url);
- url = new URL(response.getHeader("Location"));
+ u = new URL(response.getHeader("Location"));
redirects++;
if (LOG.isLoggable(Level.FINE))
- LOG.fine("redirect to " + url);
+ LOG.fine("redirect to " + u);
} else { // convert to exception
throw new FtpError(code);
@@ -218,7 +208,7 @@
// set log level
LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
- Content content = ftp.getProtocolOutput(urlString).getContent();
+ Content content = ftp.getProtocolOutput(new UTF8(urlString), new
CrawlDatum()).getContent();
System.err.println("Content-Type: " + content.getContentType());
System.err.println("Content-Length: " + content.get("Content-Length"));
Modified:
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
(original)
+++
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
Wed Dec 28 16:37:13 2005
@@ -24,6 +24,7 @@
import org.apache.commons.net.ftp.parser.DefaultFTPFileEntryParserFactory;
import org.apache.commons.net.ftp.parser.ParserInitializationException;
+import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ContentProperties;
@@ -80,15 +81,10 @@
headers);
}
- public FtpResponse(URL url, Ftp ftp)
- throws FtpException, IOException {
- this(url.toString(), url, ftp);
- }
-
- public FtpResponse(String orig, URL url, Ftp ftp)
+ public FtpResponse(URL url, CrawlDatum datum, Ftp ftp)
throws FtpException, IOException {
- this.orig = orig;
+ this.orig = url.toString();
this.base = url.toString();
this.ftp = ftp;
Modified:
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
(original)
+++
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
Wed Dec 28 16:37:13 2005
@@ -28,8 +28,8 @@
import org.apache.nutch.util.LogFormatter;
import org.apache.nutch.util.NutchConf;
-import org.apache.nutch.db.Page;
-import org.apache.nutch.pagedb.FetchListEntry;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.io.UTF8;
import org.apache.nutch.protocol.*;
/** An implementation of the Http protocol. */
@@ -123,7 +123,7 @@
}
if (delays == MAX_DELAYS)
- throw new RetryLater(url, "Exceeded http.max.delays: retry later.");
+ throw new HttpException("Exceeded http.max.delays: retry later.");
long done = time.longValue();
long now = System.currentTimeMillis();
@@ -172,31 +172,21 @@
}
}
- public ProtocolOutput getProtocolOutput(String urlString) {
- ProtocolOutput output = null;
+ public ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum) {
+ String urlString = url.toString();
try {
- return getProtocolOutput(new FetchListEntry(true,
- new Page(urlString, 1.0f), new String[0]));
- } catch (MalformedURLException mue) {
- return new ProtocolOutput(null, new ProtocolStatus(mue));
- }
- }
-
- public ProtocolOutput getProtocolOutput(FetchListEntry fle) {
- String urlString = fle.getUrl().toString();
- try {
- URL url = new URL(urlString);
+ URL u = new URL(urlString);
int redirects = 0;
while (true) {
- if (!RobotRulesParser.isAllowed(url))
- throw new ResourceGone(url, "Blocked by robots.txt");
+ if (!RobotRulesParser.isAllowed(u))
+ throw new HttpException("Blocked by robots.txt");
- InetAddress addr = blockAddr(url);
+ InetAddress addr = blockAddr(u);
HttpResponse response;
try {
- response = new HttpResponse(urlString, url); // make a request
+ response = new HttpResponse(u, datum); // make a request
} finally {
unblockAddr(addr);
}
@@ -207,14 +197,14 @@
return new ProtocolOutput(response.toContent()); //
return it
} else if (code == 410) { // page is gone
- throw new ResourceGone(url, "Http: " + code);
+ throw new HttpException("Http: " + code);
} else if (code >= 300 && code < 400) { // handle redirect
if (redirects == MAX_REDIRECTS)
throw new HttpException("Too many redirects: " + urlString);
- url = new URL(url, response.getHeader("Location"));
+ u = new URL(u, response.getHeader("Location"));
redirects++;
- LOG.fine("redirect to " + url);
+ LOG.fine("redirect to " + u);
} else { // convert to exception
throw new HttpError(code);
@@ -298,7 +288,7 @@
LOG.setLevel(Level.FINE);
}
- Content content = http.getProtocolOutput(url).getContent();
+ Content content = http.getProtocolOutput(new UTF8(url), new
CrawlDatum()).getContent();
System.out.println("Content Type: " + content.getContentType());
System.out.println("Content Length: " + content.get("Content-Length"));
Modified:
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
(original)
+++
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
Wed Dec 28 16:37:13 2005
@@ -31,6 +31,7 @@
import java.util.Properties;
import java.util.logging.Level;
+import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.protocol.ProtocolException;
@@ -63,14 +64,10 @@
headers);
}
- public HttpResponse(URL url) throws ProtocolException, IOException {
- this(url.toString(), url);
- }
-
- public HttpResponse(String orig, URL url)
+ public HttpResponse(URL url, CrawlDatum datum)
throws ProtocolException, IOException {
- this.orig = orig;
+ this.orig = url.toString();
this.base = url.toString();
if (!"http".equals(url.getProtocol()))
Modified:
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java
(original)
+++
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java
Wed Dec 28 16:37:13 2005
@@ -35,6 +35,7 @@
import org.apache.nutch.util.NutchConf;
import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.ProtocolException;
/**
@@ -382,7 +383,7 @@
if (robotRules == null) { // cache miss
int redirects = 0;
do {
- HttpResponse response = new HttpResponse(new URL(url, "/robots.txt"));
+ HttpResponse response = new HttpResponse(new URL(url, "/robots.txt"),
new CrawlDatum());
int code = response.getCode();
Modified:
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
(original)
+++
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
Wed Dec 28 16:37:13 2005
@@ -20,13 +20,9 @@
import org.apache.commons.httpclient.auth.AuthScope;
import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
import org.apache.commons.httpclient.protocol.Protocol;
-import org.apache.nutch.db.Page;
-import org.apache.nutch.pagedb.FetchListEntry;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.protocol.ProtocolOutput;
-import org.apache.nutch.protocol.ProtocolStatus;
-import org.apache.nutch.protocol.RetryLater;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.io.UTF8;
+import org.apache.nutch.protocol.*;
import org.apache.nutch.util.LogFormatter;
import org.apache.nutch.util.NutchConf;
@@ -130,7 +126,7 @@
}
}
- if (delays == MAX_DELAYS) throw new RetryLater(url, "Exceeded
http.max.delays: retry later.");
+ if (delays == MAX_DELAYS) throw new HttpException("Exceeded
http.max.delays: retry later.");
long done = time.longValue();
long now = System.currentTimeMillis();
@@ -177,31 +173,23 @@
}
}
- public ProtocolOutput getProtocolOutput(String urlString) {
+ public ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum) {
+ String urlString = url.toString();
try {
- return getProtocolOutput(new FetchListEntry(true, new Page(urlString,
1.0f), new String[0]));
- } catch (MalformedURLException mue) {
- return new ProtocolOutput(null, new ProtocolStatus(mue));
- }
- }
-
- public ProtocolOutput getProtocolOutput(FetchListEntry fle) {
- String urlString = fle.getUrl().toString();
- try {
- URL url = new URL(urlString);
+ URL u = new URL(urlString);
try {
- if (!RobotRulesParser.isAllowed(url))
+ if (!RobotRulesParser.isAllowed(u))
return new ProtocolOutput(null, new
ProtocolStatus(ProtocolStatus.ROBOTS_DENIED, url));
} catch (Throwable e) {
// XXX Maybe bogus: assume this is allowed.
LOG.fine("Exception checking robot rules for " + url + ": " + e);
}
- InetAddress addr = blockAddr(url);
+ InetAddress addr = blockAddr(u);
HttpResponse response;
try {
- response = new HttpResponse(url); // make a request
+ response = new HttpResponse(u, datum); // make a request
} finally {
unblockAddr(addr);
}
@@ -220,7 +208,7 @@
// some broken servers, such as MS IIS, use lowercase header name...
if (location == null) location = response.getHeader("location");
if (location == null) location = "";
- url = new URL(url, location);
+ u = new URL(u, location);
int protocolStatusCode;
switch (code) {
case 300: // multiple choices, preferred value in Location
@@ -242,21 +230,21 @@
protocolStatusCode = ProtocolStatus.MOVED;
}
// handle this in the higher layer.
- return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode,
url));
+ return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode,
u));
} else if (code == 400) { // bad request, mark as GONE
- LOG.fine("400 Bad request: " + url);
- return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE,
url));
+ LOG.fine("400 Bad request: " + u);
+ return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE,
u));
} else if (code == 401) { // requires authorization, but no valid auth
provided.
LOG.fine("401 Authentication Required");
return new ProtocolOutput(c, new
ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authentication required: "
+ urlString));
} else if (code == 404) {
- return new ProtocolOutput(c, new
ProtocolStatus(ProtocolStatus.NOTFOUND, url));
+ return new ProtocolOutput(c, new
ProtocolStatus(ProtocolStatus.NOTFOUND, u));
} else if (code == 410) { // permanently GONE
- return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE,
url));
+ return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE,
u));
} else {
return new ProtocolOutput(c, new
ProtocolStatus(ProtocolStatus.EXCEPTION, "Http code=" + code + ", url="
- + url));
+ + u));
}
} catch (Throwable e) {
e.printStackTrace();
@@ -333,7 +321,7 @@
LOG.setLevel(Level.FINE);
}
- ProtocolOutput out = http.getProtocolOutput(url);
+ ProtocolOutput out = http.getProtocolOutput(new UTF8(url), new
CrawlDatum());
Content content = out.getContent();
System.out.println("Status: " + out.getStatus());
Modified:
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
(original)
+++
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
Wed Dec 28 16:37:13 2005
@@ -51,7 +51,7 @@
try {
Collection challenge = null;
- if (header instanceof MultiProperties) {
+ if (header instanceof ContentProperties) {
Object o = header.get(AUTH_HEADER);
if (o instanceof Collection) {
challenge = (Collection) o;
Modified:
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
(original)
+++
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
Wed Dec 28 16:37:13 2005
@@ -3,7 +3,9 @@
package org.apache.nutch.protocol.httpclient;
+import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpVersion;
@@ -32,7 +34,7 @@
private int code;
- private MultiProperties headers = new MultiProperties();
+ private ContentProperties headers = new ContentProperties();
/**
* Returns the response code.
@@ -59,11 +61,11 @@
headers);
}
- public HttpResponse(URL url) throws IOException {
- this(url, false);
+ public HttpResponse(URL url, CrawlDatum datum) throws IOException {
+ this(url, datum, false);
}
- HttpResponse(URL url, boolean followRedirects) throws IOException {
+ HttpResponse(URL url, CrawlDatum datum, boolean followRedirects) throws
IOException {
this.base = url.toString();
this.orig = url.toString();
GetMethod get = new GetMethod(this.orig);
Modified:
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java
(original)
+++
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java
Wed Dec 28 16:37:13 2005
@@ -35,6 +35,7 @@
import org.apache.nutch.util.NutchConf;
import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.ProtocolException;
/**
@@ -380,7 +381,7 @@
LOG.fine("cache miss " + url);
try {
HttpResponse response = new HttpResponse(new URL(url, "/robots.txt"),
- true);
+ new CrawlDatum(), true);
if (response.getCode() == 200) // found rules: parse them
robotRules = new
RobotRulesParser().parseRules(response.getContent());