Added: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=359668&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java 
(added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java 
Wed Dec 28 16:37:13 2005
@@ -0,0 +1,100 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import org.apache.nutch.io.*;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.fetcher.Fetcher;
+import org.apache.nutch.fs.*;
+import org.apache.nutch.mapred.*;
+import org.apache.nutch.parse.*;
+import org.apache.nutch.net.*;
+
+import java.io.*;
+import java.util.*;
+
+/* Parse content in a segment. */
+public class ParseOutputFormat implements OutputFormat {
+
+  private UrlNormalizer urlNormalizer = UrlNormalizerFactory.getNormalizer();
+
+  public RecordWriter getRecordWriter(NutchFileSystem fs, JobConf job,
+                                      String name) throws IOException {
+
+    final float interval = job.getFloat("db.default.fetch.interval", 30f);
+
+    File text =
+      new File(new File(job.getOutputDir(), ParseText.DIR_NAME), name);
+    File data =
+      new File(new File(job.getOutputDir(), ParseData.DIR_NAME), name);
+    File crawl =
+      new File(new File(job.getOutputDir(), CrawlDatum.PARSE_DIR_NAME), name);
+    
+    final MapFile.Writer textOut =
+      new MapFile.Writer(fs, text.toString(), UTF8.class, ParseText.class);
+    
+    final MapFile.Writer dataOut =
+      new MapFile.Writer(fs, data.toString(), UTF8.class,ParseData.class,true);
+    
+    final SequenceFile.Writer crawlOut =
+      new SequenceFile.Writer(fs, crawl.toString(),
+                              UTF8.class, CrawlDatum.class);
+    
+    return new RecordWriter() {
+
+        public void write(WritableComparable key, Writable value)
+          throws IOException {
+          
+          Parse parse = (Parse)value;
+          
+          textOut.append(key, new ParseText(parse.getText()));
+          dataOut.append(key, parse.getData());
+
+          // collect outlinks for subsequent db update
+          Outlink[] links = parse.getData().getOutlinks();
+
+          // compute OPIC score contribution
+          float score =
+            Float.parseFloat(parse.getData().get(Fetcher.SCORE_KEY));
+          score /= links.length;
+                          
+          for (int i = 0; i < links.length; i++) {
+            String toUrl = links[i].getToUrl();
+            try {
+              toUrl = urlNormalizer.normalize(toUrl); // normalize the url
+              toUrl = URLFilters.filter(toUrl);   // filter the url
+            } catch (Exception e) {
+              toUrl = null;
+            }
+            if (toUrl != null)
+              crawlOut.append(new UTF8(toUrl),
+                              new CrawlDatum(CrawlDatum.STATUS_LINKED,
+                                             interval, score));
+          }
+        }
+        
+        public void close(Reporter reporter) throws IOException {
+          textOut.close();
+          dataOut.close();
+          crawlOut.close();
+        }
+        
+      };
+    
+  }
+
+}

Propchange: 
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=359668&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Wed 
Dec 28 16:37:13 2005
@@ -0,0 +1,109 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import org.apache.nutch.io.*;
+import org.apache.nutch.parse.ParseOutputFormat;
+import org.apache.nutch.mapred.*;
+import org.apache.nutch.util.*;
+import org.apache.nutch.protocol.*;
+
+import java.io.*;
+import java.util.*;
+import java.util.logging.*;
+
+/* Parse content in a segment. */
+public class ParseSegment extends NutchConfigured implements Mapper, Reducer {
+
+  public static final Logger LOG =
+    LogFormatter.getLogger(Parser.class.getName());
+
+  public ParseSegment() { super(null); }
+
+  public ParseSegment(NutchConf conf) {
+    super(conf);
+  }
+
+  public void configure(JobConf job) {
+  }
+
+  public void map(WritableComparable key, Writable value,
+                  OutputCollector output, Reporter reporter)
+    throws IOException {
+    Content content = (Content)value;
+
+    Parse parse = null;
+    ParseStatus status;
+    try {
+      parse = ParseUtil.parse(content);
+      status = parse.getData().getStatus();
+    } catch (Exception e) {
+      status = new ParseStatus(e);
+    }
+
+    if (status.isSuccess()) {
+      output.collect(key, new ParseImpl(parse.getText(), parse.getData()));
+    } else {
+      LOG.warning("Error parsing: "+key+": "+status.toString());
+    }
+  }
+
+  public void reduce(WritableComparable key, Iterator values,
+                     OutputCollector output, Reporter reporter)
+    throws IOException {
+    output.collect(key, (Writable)values.next()); // collect first value
+  }
+
+  public void parse(File segment) throws IOException {
+    LOG.info("Parse: starting");
+    LOG.info("Parse: segment: " + segment);
+
+    JobConf job = new JobConf(getConf());
+
+    job.setInputDir(new File(segment, Content.DIR_NAME));
+    job.setInputFormat(SequenceFileInputFormat.class);
+    job.setInputKeyClass(UTF8.class);
+    job.setInputValueClass(Content.class);
+    job.setMapperClass(ParseSegment.class);
+    job.setReducerClass(ParseSegment.class);
+    
+    job.setOutputDir(segment);
+    job.setOutputFormat(ParseOutputFormat.class);
+    job.setOutputKeyClass(UTF8.class);
+    job.setOutputValueClass(ParseImpl.class);
+
+    JobClient.runJob(job);
+    LOG.info("Parse: done");
+  }
+
+
+  public static void main(String[] args) throws Exception {
+    File segment;
+
+    String usage = "Usage: ParseSegment segment";
+
+    if (args.length == 0) {
+      System.err.println(usage);
+      System.exit(-1);
+    }
+      
+    segment = new File(args[0]);
+
+    ParseSegment parseSegment = new ParseSegment(NutchConf.get());
+    parseSegment.parse(segment);
+  }
+}

Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java Wed 
Dec 28 16:37:13 2005
@@ -18,6 +18,8 @@
 
 import org.apache.nutch.util.LogFormatter;
 
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.io.UTF8;
 import org.apache.nutch.parse.ParseUtil;
 
 import org.apache.nutch.protocol.ProtocolFactory;
@@ -69,7 +71,7 @@
     LOG.info("fetching: "+url);
 
     Protocol protocol = ProtocolFactory.getProtocol(url);
-    Content content = protocol.getProtocolOutput(url).getContent();
+    Content content = protocol.getProtocolOutput(new UTF8(url), new 
CrawlDatum()).getContent();
 
     if (force) {
       content.setContentType(contentType);

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java Wed Dec 
28 16:37:13 2005
@@ -18,21 +18,15 @@
 
 import java.io.IOException;
 
-import org.apache.nutch.pagedb.FetchListEntry;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.io.UTF8;
 
 /** A retriever of url content.  Implemented by protocol extensions. */
 public interface Protocol {
   /** The name of the extension point. */
   public final static String X_POINT_ID = Protocol.class.getName();
 
-  /** Returns the {@link Content} for a url. This method may be
-   * more limited than {@link #getProtocolOutput(FetchListEntry)}.
-   * @throws IOException for any errors.
-   */
-  ProtocolOutput getProtocolOutput(String url);
-
   /** Returns the {@link Content} for a fetchlist entry.
-   * @throws IOException for any errors.
    */
-  ProtocolOutput getProtocolOutput(FetchListEntry fle);
+  ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum);
 }

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java 
Wed Dec 28 16:37:13 2005
@@ -20,17 +20,11 @@
 import java.io.File;
 
 import java.util.HashMap;
-import java.util.Arrays;
 
 import org.apache.nutch.io.*;
 import org.apache.nutch.fs.*;
-import org.apache.nutch.db.*;
-import org.apache.nutch.util.*;
-import org.apache.nutch.fetcher.*;
 import org.apache.nutch.protocol.*;
 import org.apache.nutch.parse.*;
-import org.apache.nutch.pagedb.*;
-import org.apache.nutch.indexer.*;
 import org.apache.nutch.mapred.*;
 import org.apache.nutch.mapred.lib.*;
 import org.apache.nutch.crawl.*;

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java 
Wed Dec 28 16:37:13 2005
@@ -28,7 +28,6 @@
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.MultiReader;
 
-import org.apache.lucene.search.MultiSearcher;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.FieldDoc;
@@ -40,11 +39,7 @@
 import org.apache.nutch.fs.*;
 import org.apache.nutch.io.*;
 import org.apache.nutch.util.*;
-import org.apache.nutch.db.*;
-import org.apache.nutch.fetcher.*;
-import org.apache.nutch.linkdb.*;
 import org.apache.nutch.indexer.*;
-import org.apache.nutch.analysis.NutchDocumentAnalyzer;
 
 /** Implements {@link Searcher} and {@link HitDetailer} 
for either a single
  * merged index, or a set of indexes. */

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java Wed 
Dec 28 16:37:13 2005
@@ -26,7 +26,6 @@
 import org.apache.nutch.parse.*;
 import org.apache.nutch.indexer.*;
 import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.crawl.LinkDbReader;
 
 /** 
  * One stop shopping for search-related functionality.
@@ -103,7 +102,7 @@
       Vector vDirs=new Vector();
       File [] directories = fs.listFiles(indexesDir);
       for(int i = 0; i < fs.listFiles(indexesDir).length; i++) {
-        File indexdone = new File(directories[i], IndexSegment.DONE_NAME);
+        File indexdone = new File(directories[i], Indexer.DONE_NAME);
         if(fs.isFile(indexdone)) {
           vDirs.add(directories[i]);
         }

Added: lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java?rev=359668&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java Wed Dec 
28 16:37:13 2005
@@ -0,0 +1,384 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.*;
+import java.net.*;
+import java.util.*;
+import java.util.logging.*;
+import java.net.MalformedURLException;
+import java.util.regex.*;
+
+import javax.xml.parsers.*;
+import org.xml.sax.*;
+import org.xml.sax.helpers.*;
+import org.apache.xerces.util.XMLChar;
+
+import org.apache.nutch.io.*;
+import org.apache.nutch.fs.*;
+import org.apache.nutch.net.*;
+import org.apache.nutch.util.*;
+import org.apache.nutch.util.NutchConf;
+
+/** Utility that converts DMOZ RDF into a flat file of URLs to be injected. */
+public class DmozParser {
+  public static final Logger LOG = 
LogFormatter.getLogger("org.apache.nutch.crawl.DmozParser");
+  
+    long pages = 0;
+
+  /**
+   * This filter fixes characters that might offend our parser.
+   * This lets us be tolerant of errors that might appear in the input XML.
+   */
+  private static class XMLCharFilter extends FilterReader {
+    private boolean lastBad = false;
+
+    public XMLCharFilter(Reader reader) {
+      super(reader);
+    }
+
+    public int read() throws IOException {
+      int c = in.read();
+      int value = c;
+      if (c != -1 && !(XMLChar.isValid(c)))     // fix invalid characters
+        value = 'X';
+      else if (lastBad && c == '<') {           // fix mis-matched brackets
+        in.mark(1);
+        if (in.read() != '/')
+          value = 'X';
+        in.reset();
+      }
+      lastBad = (c == 65533);
+
+      return value;
+    }
+
+    public int read(char[] cbuf, int off, int len)
+      throws IOException {
+      int n = in.read(cbuf, off, len);
+      if (n != -1) {
+        for (int i = 0; i < n; i++) {
+          char c = cbuf[off+i];
+          char value = c;
+          if (!(XMLChar.isValid(c)))            // fix invalid characters
+            value = 'X';
+          else if (lastBad && c == '<') {       // fix mis-matched brackets
+            if (i != n-1 && cbuf[off+i+1] != '/')
+              value = 'X';
+          }
+          lastBad = (c == 65533);
+          cbuf[off+i] = value;
+        }
+      }
+      return n;
+    }
+  }
+
+
+  /**
+   * The RDFProcessor receives tag messages during a parse
+   * of RDF XML data.  We build whatever structures we need
+   * from these messages.
+   */
+  private class RDFProcessor extends DefaultHandler {
+    String curURL = null, curSection = null;
+    boolean titlePending = false, descPending = false, insideAdultSection = 
false;
+    Pattern topicPattern = null; 
+    StringBuffer title = new StringBuffer(), desc = new StringBuffer();
+    XMLReader reader;
+    int subsetDenom;
+    int hashSkew;
+    boolean includeAdult;
+    Locator location;
+
+    /**
+     * Pass in an XMLReader, plus a flag as to whether we 
+     * should include adult material.
+     */
+    public RDFProcessor(XMLReader reader, int subsetDenom, boolean 
includeAdult, int skew, Pattern topicPattern) throws IOException {
+      this.reader = reader;
+      this.subsetDenom = subsetDenom;
+      this.includeAdult = includeAdult;
+      this.topicPattern = topicPattern;
+
+      this.hashSkew = skew != 0 ? skew : new Random().nextInt();
+    }
+
+    //
+    // Interface ContentHandler
+    //
+
+    /**
+     * Start of an XML elt
+     */
+    public void startElement(String namespaceURI, String localName, String 
qName, Attributes atts) throws SAXException {
+      if ("Topic".equals(qName)) {
+        curSection = atts.getValue("r:id");
+      } else if ("ExternalPage".equals(qName)) {
+        // Porn filter
+        if ((! includeAdult) && curSection.startsWith("Top/Adult")) {
+          return;
+        }
+          
+        if (topicPattern != null && 
!topicPattern.matcher(curSection).matches()) {
+          return;
+        }
+
+        // Subset denominator filter.  
+        // Only emit with a chance of 1/denominator.
+        String url = atts.getValue("about");
+        int hashValue = MD5Hash.digest(url).hashCode();
+        hashValue = Math.abs(hashValue ^ hashSkew);
+        if ((hashValue % subsetDenom) != 0) {
+          return;
+        }
+
+        // We actually claim the URL!
+        curURL = url;
+      } else if (curURL != null && "d:Title".equals(qName)) {
+        titlePending = true;
+      } else if (curURL != null && "d:Description".equals(qName)) {
+        descPending = true;
+      }
+    }
+
+    /**
+     * The contents of an XML elt
+     */
+    public void characters(char ch[], int start, int length) {
+      if (titlePending) {
+        title.append(ch, start, length);
+      } else if (descPending) {
+        desc.append(ch, start, length);
+      }
+    }
+
+    /**
+     * Termination of XML elt
+     */
+    public void endElement(String namespaceURI, String localName, String qName)
+      throws SAXException {
+      if (curURL != null) {
+        if ("ExternalPage".equals(qName)) {
+          //
+          // Inc the number of pages, insert the page, and 
+          // possibly print status.
+          //
+          System.out.println(curURL);
+          pages++;
+
+          //
+          // Clear out the link text.  This is what
+          // you would use for adding to the linkdb.
+          //
+          if (title.length() > 0) {
+            title.delete(0, title.length());
+          }
+          if (desc.length() > 0) {
+            desc.delete(0, desc.length());
+          }
+
+          // Null out the URL.
+          curURL = null;
+        } else if ("d:Title".equals(qName)) {
+          titlePending = false;
+        } else if ("d:Description".equals(qName)) {
+          descPending = false;
+        }
+      }
+    }
+
+    /**
+     * When parsing begins
+     */
+    public void startDocument() {
+      LOG.info("Begin parse");
+    }
+
+    /**
+     * When parsing ends
+     */
+    public void endDocument() {
+      LOG.info("Completed parse.  Found " + pages + " pages.");
+    }
+
+    /**
+     * From time to time the Parser will set the "current location"
+     * by calling this function.  It's useful for emitting locations
+     * for error messages.
+     */
+    public void setDocumentLocator(Locator locator) {
+      location = locator;
+    }
+
+
+    //
+    // Interface ErrorHandler
+    //
+
+    /**
+     * Emit the exception message
+     */
+    public void error(SAXParseException spe) {
+      LOG.severe("Error: " + spe.toString() + ": " + spe.getMessage());
+      spe.printStackTrace(System.err);
+    }
+
+    /**
+     * Emit the exception message, with line numbers
+     */
+    public void fatalError(SAXParseException spe) {
+      LOG.severe("Fatal err: " + spe.toString() + ": " + spe.getMessage());
+      LOG.severe("Last known line is " + location.getLineNumber() + ", column 
" + location.getColumnNumber());
+      spe.printStackTrace(System.err);
+    }
+        
+    /**
+     * Emit exception warning message
+     */
+    public void warning(SAXParseException spe) {
+      LOG.warning("Warning: " + spe.toString() + ": " + spe.getMessage());
+      spe.printStackTrace(System.err);
+    }
+  }
+
+  /**
+   * Iterate through all the items in this structured DMOZ file.
+   * Add each URL to the web db.
+   */
+  public void parseDmozFile(File dmozFile, int subsetDenom,
+                            boolean includeAdult,
+                            int skew,
+                            Pattern topicPattern)
+
+    throws IOException, SAXException, ParserConfigurationException {
+
+    SAXParserFactory parserFactory = SAXParserFactory.newInstance();
+    SAXParser parser = parserFactory.newSAXParser();
+    XMLReader reader = parser.getXMLReader();
+
+    // Create our own processor to receive SAX events
+    RDFProcessor rp =
+      new RDFProcessor(reader, subsetDenom, includeAdult,
+                       skew, topicPattern);
+    reader.setContentHandler(rp);
+    reader.setErrorHandler(rp);
+    LOG.info("skew = " + rp.hashSkew);
+
+    //
+    // Open filtered text stream.  The UTF8Filter makes sure that
+    // only appropriate XML-approved UTF8 characters are received.
+    // Any non-conforming characters are silently skipped.
+    //
+    XMLCharFilter in = new XMLCharFilter(new BufferedReader(new 
InputStreamReader(new BufferedInputStream(new FileInputStream(dmozFile)), 
"UTF-8")));
+    try {
+      InputSource is = new InputSource(in);
+      reader.parse(is);
+    } catch (Exception e) {
+      LOG.severe(e.toString());
+      e.printStackTrace(System.err);
+      System.exit(0);
+    } finally {
+      in.close();
+    }
+  }
+
+  private static void addTopicsFromFile(String topicFile, Vector topics)
+    throws IOException {
+    BufferedReader in = null;
+    try {
+      in = new BufferedReader(new InputStreamReader(new 
FileInputStream(topicFile), "UTF-8"));
+      String line = null;
+      while ((line = in.readLine()) != null) {
+        topics.addElement(new String(line));
+      }
+    } 
+    catch (Exception e) {
+      LOG.severe(e.toString());
+      e.printStackTrace(System.out);
+      System.exit(0);
+    } finally {
+      in.close();
+    }
+  }
+    
+  /**
+   * Command-line access.  User may add URLs via a flat text file
+   * or the structured DMOZ file.  By default, we ignore Adult
+   * material (as categorized by DMOZ).
+   */
+  public static void main(String argv[]) throws Exception {
+    if (argv.length < 1) {
+      System.err.println("Usage: DmozParser <dmoz_file> [-subset 
<subsetDenominator>] [-includeAdultMaterial] [-skew skew] [-topicFile <topic 
list file>] [-topic <topic> [-topic <topic> [...]]]");
+      return;
+    }
+    
+    //
+    // Parse the command line, figure out what kind of
+    // URL file we need to load
+    //
+    int subsetDenom = 1;
+    int skew = 0;
+    String dmozFile = argv[0];
+    boolean includeAdult = false;
+    Pattern topicPattern = null; 
+    Vector topics = new Vector(); 
+    
+    NutchFileSystem nfs = NutchFileSystem.get();
+    try {
+      for (int i = 1; i < argv.length; i++) {
+        if ("-includeAdultMaterial".equals(argv[i])) {
+          includeAdult = true;
+        } else if ("-subset".equals(argv[i])) {
+          subsetDenom = Integer.parseInt(argv[i+1]);
+          i++;
+        } else if ("-topic".equals(argv[i])) {
+          topics.addElement(argv[i+1]); 
+          i++;
+        } else if ("-topicFile".equals(argv[i])) {
+          addTopicsFromFile(argv[i+1], topics);
+          i++;
+        } else if ("-skew".equals(argv[i])) {
+          skew = Integer.parseInt(argv[i+1]);
+          i++;
+        }
+      }
+
+      DmozParser parser = new DmozParser();
+
+      if (!topics.isEmpty()) {
+        String regExp = new String("^("); 
+        int j = 0;
+        for ( ; j < topics.size() - 1; ++j) {
+          regExp = regExp.concat((String) topics.get(j));
+          regExp = regExp.concat("|");
+        }
+        regExp = regExp.concat((String) topics.get(j));
+        regExp = regExp.concat(").*"); 
+        LOG.info("Topic selection pattern = " + regExp);
+        topicPattern = Pattern.compile(regExp); 
+      }
+
+      parser.parseDmozFile(new File(dmozFile), subsetDenom,
+                           includeAdult, skew, topicPattern);
+      
+    } finally {
+      nfs.close();
+    }
+  }
+
+}

Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: 
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java
 Wed Dec 28 16:37:13 2005
@@ -18,7 +18,7 @@
 
 import org.apache.nutch.io.*;
 import org.apache.nutch.util.LogFormatter;
-import org.apache.nutch.indexer.IndexSegment;
+import org.apache.nutch.indexer.Indexer;
 
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.document.Document;
@@ -80,7 +80,7 @@
     Vector vReaders=new Vector();
     int maxDoc = 0;
     for (int i = 0; i < directories.length; i++) {
-      File indexDone = new File(directories[i], IndexSegment.DONE_NAME);
+      File indexDone = new File(directories[i], Indexer.DONE_NAME);
       if (indexDone.exists() && indexDone.isFile()){
         File indexDir = new File(directories[i], "index");
        IndexReader reader = IndexReader.open(indexDir);

Modified: 
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
 Wed Dec 28 16:37:13 2005
@@ -27,8 +27,6 @@
 
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.fetcher.FetcherOutput;
-import org.apache.nutch.pagedb.FetchListEntry;
 
 import java.util.logging.Logger;
 import org.apache.nutch.util.LogFormatter;

Modified: 
lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
 Wed Dec 28 16:37:13 2005
@@ -27,8 +27,6 @@
 
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.fetcher.FetcherOutput;
-import org.apache.nutch.pagedb.FetchListEntry;
 
 import java.io.IOException;
 import java.net.MalformedURLException;

Modified: 
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
 Wed Dec 28 16:37:13 2005
@@ -37,7 +37,6 @@
 
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.fetcher.FetcherOutput;
 
 import org.apache.nutch.util.NutchConf;
 import org.apache.nutch.util.mime.MimeType;

Modified: 
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
 Wed Dec 28 16:37:13 2005
@@ -34,6 +34,8 @@
 
 // Nutch imports
 import org.apache.nutch.analysis.lang.NGramProfile.NGramEntry;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.io.UTF8;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.Parser;
 import org.apache.nutch.parse.ParserFactory;
@@ -350,7 +352,7 @@
     Protocol protocol;
     try {
       protocol = ProtocolFactory.getProtocol(url);
-      Content content = protocol.getProtocolOutput(url).getContent();
+      Content content = protocol.getProtocolOutput(new UTF8(url), new 
CrawlDatum()).getContent();
       String contentType = content.getContentType();
       Parser parser = ParserFactory.getParser(contentType, url);
       Parse parse = parser.getParse(content);

Modified: 
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
 Wed Dec 28 16:37:13 2005
@@ -17,13 +17,13 @@
 package org.apache.nutch.protocol.file;
 
 
-import org.apache.nutch.db.Page;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.io.UTF8;
 import org.apache.nutch.net.protocols.HttpDateFormat;
 
 import org.apache.nutch.util.LogFormatter;
 import org.apache.nutch.util.NutchConf;
 
-import org.apache.nutch.pagedb.FetchListEntry;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolOutput;
@@ -50,7 +50,7 @@
 
   static final int MAX_REDIRECTS = 5;
 
-  static int maxContentLength = NutchConf.get().getInt("file.content.limit",64*1024);
+  static int maxContentLength = NutchConf.get().getInt("file.content.limit", 64 * 1024);
 
   // 20040412, xing
   // the following three: HttpDateFormat, MimetypesFileTypeMap, MagicFile
@@ -67,26 +67,16 @@
   /** Set the point at which content is truncated. */
   public void setMaxContentLength(int length) {maxContentLength = length;}
 
-  public ProtocolOutput getProtocolOutput(String urlString) {
-    ProtocolOutput output = null;
+  public ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum) {
+    String urlString = url.toString();
     try {
-      return getProtocolOutput(new FetchListEntry(true,
-            new Page(urlString, 1.0f), new String[0]));
-    } catch (MalformedURLException mue) {
-      return new ProtocolOutput(null, new ProtocolStatus(mue));
-    }
-  }
-  
-  public ProtocolOutput getProtocolOutput(FetchListEntry fle) {
-    String urlString = fle.getUrl().toString();
-    try {
-      URL url = new URL(urlString);
+      URL u = new URL(urlString);
   
       int redirects = 0;
   
       while (true) {
         FileResponse response;
-        response = new FileResponse(urlString, url, this);   // make a request
+        response = new FileResponse(u, datum, this);   // make a request
   
         int code = response.getCode();
   
@@ -96,10 +86,10 @@
         } else if (code >= 300 && code < 400) {     // handle redirect
           if (redirects == MAX_REDIRECTS)
             throw new FileException("Too many redirects: " + url);
-          url = new URL(response.getHeader("Location"));
+          u = new URL(response.getHeader("Location"));
           redirects++;                
           if (LOG.isLoggable(Level.FINE))
-            LOG.fine("redirect to " + url); 
+            LOG.fine("redirect to " + u); 
   
         } else {                                    // convert to exception
           throw new FileError(code);
@@ -150,7 +140,7 @@
     // set log level
     LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
 
-    Content content = file.getProtocolOutput(urlString).getContent();
+    Content content = file.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
 
     System.err.println("Content-Type: " + content.getContentType());
     System.err.println("Content-Length: " + content.get("Content-Length"));

Modified: 
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
 Wed Dec 28 16:37:13 2005
@@ -19,11 +19,11 @@
 // JDK imports
 import java.net.URL;
 import java.util.TreeMap;
-import java.util.Properties;
 import java.util.logging.Level;
 import java.io.IOException;
 
 // Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.ContentProperties;
 
@@ -80,15 +80,10 @@
                        headers);
   }
 
-  public FileResponse(URL url, File file)
-    throws FileException, IOException {
-    this(url.toString(), url, file);
-  }
-
-  public FileResponse(String orig, URL url, File file)
+  public FileResponse(URL url, CrawlDatum datum, File file)
     throws FileException, IOException {
 
-    this.orig = orig;
+    this.orig = url.toString();
     this.base = url.toString();
     this.file = file;
 

Modified: 
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
 Wed Dec 28 16:37:13 2005
@@ -19,13 +19,13 @@
 
 import org.apache.commons.net.ftp.FTPFileEntryParser;
 
-import org.apache.nutch.db.Page;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.io.UTF8;
 import org.apache.nutch.net.protocols.HttpDateFormat;
 
 import org.apache.nutch.util.LogFormatter;
 import org.apache.nutch.util.NutchConf;
 
-import org.apache.nutch.pagedb.FetchListEntry;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolOutput;
@@ -112,26 +112,16 @@
     this.keepConnection = keepConnection;
   }
 
-  public ProtocolOutput getProtocolOutput(String urlString) {
-    ProtocolOutput output = null;
+  public ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum) {
+    String urlString = url.toString();
     try {
-      return getProtocolOutput(new FetchListEntry(true,
-            new Page(urlString, 1.0f), new String[0]));
-    } catch (MalformedURLException mue) {
-      return new ProtocolOutput(null, new ProtocolStatus(mue));
-    }
-  }
-  
-  public ProtocolOutput getProtocolOutput(FetchListEntry fle) {
-    String urlString = fle.getUrl().toString();
-    try {
-      URL url = new URL(urlString);
+      URL u = new URL(urlString);
   
       int redirects = 0;
   
       while (true) {
         FtpResponse response;
-        response = new FtpResponse(urlString, url, this);   // make a request
+        response = new FtpResponse(u, datum, this);   // make a request
   
         int code = response.getCode();
   
@@ -141,10 +131,10 @@
         } else if (code >= 300 && code < 400) {     // handle redirect
           if (redirects == MAX_REDIRECTS)
             throw new FtpException("Too many redirects: " + url);
-          url = new URL(response.getHeader("Location"));
+          u = new URL(response.getHeader("Location"));
           redirects++;                
           if (LOG.isLoggable(Level.FINE))
-            LOG.fine("redirect to " + url); 
+            LOG.fine("redirect to " + u); 
   
         } else {                                    // convert to exception
           throw new FtpError(code);
@@ -218,7 +208,7 @@
     // set log level
     LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
 
-    Content content = ftp.getProtocolOutput(urlString).getContent();
+    Content content = ftp.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
 
     System.err.println("Content-Type: " + content.getContentType());
     System.err.println("Content-Length: " + content.get("Content-Length"));

Modified: 
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
 Wed Dec 28 16:37:13 2005
@@ -24,6 +24,7 @@
 import org.apache.commons.net.ftp.parser.DefaultFTPFileEntryParserFactory;
 import org.apache.commons.net.ftp.parser.ParserInitializationException;
 
+import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.ContentProperties;
 
@@ -80,15 +81,10 @@
                        headers);
   }
 
-  public FtpResponse(URL url, Ftp ftp)
-    throws FtpException, IOException {
-    this(url.toString(), url, ftp);
-  }
-
-  public FtpResponse(String orig, URL url, Ftp ftp)
+  public FtpResponse(URL url, CrawlDatum datum, Ftp ftp)
     throws FtpException, IOException {
 
-    this.orig = orig;
+    this.orig = url.toString();
     this.base = url.toString();
     this.ftp = ftp;
 

Modified: 
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
 Wed Dec 28 16:37:13 2005
@@ -28,8 +28,8 @@
 import org.apache.nutch.util.LogFormatter;
 import org.apache.nutch.util.NutchConf;
 
-import org.apache.nutch.db.Page;
-import org.apache.nutch.pagedb.FetchListEntry;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.io.UTF8;
 import org.apache.nutch.protocol.*;
 
 /** An implementation of the Http protocol. */
@@ -123,7 +123,7 @@
       }
 
       if (delays == MAX_DELAYS)
-        throw new RetryLater(url, "Exceeded http.max.delays: retry later.");
+        throw new HttpException("Exceeded http.max.delays: retry later.");
 
       long done = time.longValue();
       long now = System.currentTimeMillis();
@@ -172,31 +172,21 @@
     }
   }
 
-  public ProtocolOutput getProtocolOutput(String urlString) {
-    ProtocolOutput output = null;
+  public ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum) {
+    String urlString = url.toString();
     try {
-      return getProtocolOutput(new FetchListEntry(true,
-            new Page(urlString, 1.0f), new String[0]));
-    } catch (MalformedURLException mue) {
-      return new ProtocolOutput(null, new ProtocolStatus(mue));
-    }
-  }
-  
-  public ProtocolOutput getProtocolOutput(FetchListEntry fle) {
-    String urlString = fle.getUrl().toString();
-    try {
-      URL url = new URL(urlString);
+      URL u = new URL(urlString);
 
       int redirects = 0;
       while (true) {
         
-        if (!RobotRulesParser.isAllowed(url))
-          throw new ResourceGone(url, "Blocked by robots.txt");
+        if (!RobotRulesParser.isAllowed(u))
+          throw new HttpException("Blocked by robots.txt");
         
-        InetAddress addr = blockAddr(url);
+        InetAddress addr = blockAddr(u);
         HttpResponse response;
         try {
-          response = new HttpResponse(urlString, url); // make a request
+          response = new HttpResponse(u, datum); // make a request
         } finally {
           unblockAddr(addr);
         }
@@ -207,14 +197,14 @@
          return new ProtocolOutput(response.toContent());            // return it
           
         } else if (code == 410) {                 // page is gone
-          throw new ResourceGone(url, "Http: " + code);
+          throw new HttpException("Http: " + code);
 
         } else if (code >= 300 && code < 400) {   // handle redirect
           if (redirects == MAX_REDIRECTS)
             throw new HttpException("Too many redirects: " + urlString);
-          url = new URL(url, response.getHeader("Location"));
+          u = new URL(u, response.getHeader("Location"));
           redirects++;                
-          LOG.fine("redirect to " + url); 
+          LOG.fine("redirect to " + u); 
           
         } else {                                  // convert to exception
           throw new HttpError(code);
@@ -298,7 +288,7 @@
       LOG.setLevel(Level.FINE);
     }
 
-    Content content = http.getProtocolOutput(url).getContent();
-    Content content = http.getProtocolOutput(url).getContent();
+    Content content = http.getProtocolOutput(new UTF8(url), new CrawlDatum()).getContent();
 
     System.out.println("Content Type: " + content.getContentType());
     System.out.println("Content Length: " + content.get("Content-Length"));

Modified: 
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
 Wed Dec 28 16:37:13 2005
@@ -31,6 +31,7 @@
 import java.util.Properties;
 import java.util.logging.Level;
 
+import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.protocol.ProtocolException;
@@ -63,14 +64,10 @@
                        headers);
   }
 
-  public HttpResponse(URL url) throws ProtocolException, IOException {
-    this(url.toString(), url);
-  }
-
-  public HttpResponse(String orig, URL url)
+  public HttpResponse(URL url, CrawlDatum datum)
     throws ProtocolException, IOException {
     
-    this.orig = orig;
+    this.orig = url.toString();
     this.base = url.toString();
 
     if (!"http".equals(url.getProtocol()))

Modified: 
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java
 Wed Dec 28 16:37:13 2005
@@ -35,6 +35,7 @@
 
 import org.apache.nutch.util.NutchConf;
 import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.protocol.ProtocolException;
 
 /**
@@ -382,7 +383,7 @@
     if (robotRules == null) {                     // cache miss
       int redirects = 0;
       do {
-        HttpResponse response = new HttpResponse(new URL(url, "/robots.txt"));
+        HttpResponse response = new HttpResponse(new URL(url, "/robots.txt"), new CrawlDatum());
 
         int code = response.getCode();
 

Modified: 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
 Wed Dec 28 16:37:13 2005
@@ -20,13 +20,9 @@
 import org.apache.commons.httpclient.auth.AuthScope;
 import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
 import org.apache.commons.httpclient.protocol.Protocol;
-import org.apache.nutch.db.Page;
-import org.apache.nutch.pagedb.FetchListEntry;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.protocol.ProtocolOutput;
-import org.apache.nutch.protocol.ProtocolStatus;
-import org.apache.nutch.protocol.RetryLater;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.io.UTF8;
+import org.apache.nutch.protocol.*;
 import org.apache.nutch.util.LogFormatter;
 import org.apache.nutch.util.NutchConf;
 
@@ -130,7 +126,7 @@
         }
       }
 
-      if (delays == MAX_DELAYS) throw new RetryLater(url, "Exceeded http.max.delays: retry later.");
+      if (delays == MAX_DELAYS) throw new HttpException("Exceeded http.max.delays: retry later.");
 
       long done = time.longValue();
       long now = System.currentTimeMillis();
@@ -177,31 +173,23 @@
     }
   }
 
-  public ProtocolOutput getProtocolOutput(String urlString) {
+  public ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum) {
+    String urlString = url.toString();
     try {
-      return getProtocolOutput(new FetchListEntry(true, new Page(urlString, 1.0f), new String[0]));
-    } catch (MalformedURLException mue) {
-      return new ProtocolOutput(null, new ProtocolStatus(mue));
-    }
-  }
-
-  public ProtocolOutput getProtocolOutput(FetchListEntry fle) {
-    String urlString = fle.getUrl().toString();
-    try {
-      URL url = new URL(urlString);
+      URL u = new URL(urlString);
 
         try {
-          if (!RobotRulesParser.isAllowed(url))
+          if (!RobotRulesParser.isAllowed(u))
                  return new ProtocolOutput(null, new ProtocolStatus(ProtocolStatus.ROBOTS_DENIED, url));
         } catch (Throwable e) {
           // XXX Maybe bogus: assume this is allowed.
           LOG.fine("Exception checking robot rules for " + url + ": " + e);
         }
 
-        InetAddress addr = blockAddr(url);
+        InetAddress addr = blockAddr(u);
         HttpResponse response;
         try {
-          response = new HttpResponse(url); // make a request
+          response = new HttpResponse(u, datum); // make a request
         } finally {
           unblockAddr(addr);
         }
@@ -220,7 +208,7 @@
           // some broken servers, such as MS IIS, use lowercase header name...
           if (location == null) location = response.getHeader("location");
           if (location == null) location = "";
-          url = new URL(url, location);
+          u = new URL(u, location);
           int protocolStatusCode;
           switch (code) {
             case 300:   // multiple choices, preferred value in Location
@@ -242,21 +230,21 @@
               protocolStatusCode = ProtocolStatus.MOVED;
           }
           // handle this in the higher layer.
-          return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode, url));
+          return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode, u));
         } else if (code == 400) { // bad request, mark as GONE
-          LOG.fine("400 Bad request: " + url);
-          return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, url));
+          LOG.fine("400 Bad request: " + u);
+          return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u));
         } else if (code == 401) { // requires authorization, but no valid auth 
provided.
           LOG.fine("401 Authentication Required");
           return new ProtocolOutput(c, new 
ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authentication required: "
                   + urlString));
         } else if (code == 404) {
-          return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.NOTFOUND, url));
+          return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.NOTFOUND, u));
         } else if (code == 410) { // permanently GONE
-          return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, url));
+          return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u));
         } else {
           return new ProtocolOutput(c, new 
ProtocolStatus(ProtocolStatus.EXCEPTION, "Http code=" + code + ", url="
-                  + url));
+                  + u));
         }
     } catch (Throwable e) {
       e.printStackTrace();
@@ -333,7 +321,7 @@
       LOG.setLevel(Level.FINE);
     }
 
-    ProtocolOutput out = http.getProtocolOutput(url);
+    ProtocolOutput out = http.getProtocolOutput(new UTF8(url), new CrawlDatum());
     Content content = out.getContent();
 
     System.out.println("Status: " + out.getStatus());

Modified: 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
 Wed Dec 28 16:37:13 2005
@@ -51,7 +51,7 @@
         
        try {
                        Collection challenge = null;
-                       if (header instanceof MultiProperties) {
+                       if (header instanceof ContentProperties) {
                                Object o = header.get(AUTH_HEADER);
                                if (o instanceof Collection) {
                                        challenge = (Collection) o;

Modified: 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
 Wed Dec 28 16:37:13 2005
@@ -3,7 +3,9 @@
 
 package org.apache.nutch.protocol.httpclient;
 
+import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
 
 import org.apache.commons.httpclient.Header;
 import org.apache.commons.httpclient.HttpVersion;
@@ -32,7 +34,7 @@
 
   private int code;
 
-  private MultiProperties headers = new MultiProperties();
+  private ContentProperties headers = new ContentProperties();
 
   /**
    * Returns the response code.
@@ -59,11 +61,11 @@
                        headers);
   }
 
-  public HttpResponse(URL url) throws IOException {
-    this(url, false);
+  public HttpResponse(URL url, CrawlDatum datum) throws IOException {
+    this(url, datum, false);
   }
 
-  HttpResponse(URL url, boolean followRedirects) throws IOException {
+  HttpResponse(URL url, CrawlDatum datum, boolean followRedirects) throws IOException {
     this.base = url.toString();
     this.orig = url.toString();
     GetMethod get = new GetMethod(this.orig);

Modified: 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java?rev=359668&r1=359667&r2=359668&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java
 Wed Dec 28 16:37:13 2005
@@ -35,6 +35,7 @@
 
 import org.apache.nutch.util.NutchConf;
 import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.protocol.ProtocolException;
 
 /**
@@ -380,7 +381,7 @@
       LOG.fine("cache miss " + url);
       try {
         HttpResponse response = new HttpResponse(new URL(url, "/robots.txt"),
-                                                 true);
+                                      new CrawlDatum(), true);
 
         if (response.getCode() == 200)               // found rules: parse them
          robotRules = new RobotRulesParser().parseRules(response.getContent());




-------------------------------------------------------
This SF.net email is sponsored by: Splunk Inc. Do you grep through log files
for problems?  Stop!  Download the new AJAX search engine that makes
searching your log files as easy as surfing the  web.  DOWNLOAD SPLUNK!
http://ads.osdn.com/?ad_id=7637&alloc_id=16865&op=click
_______________________________________________
Nutch-cvs mailing list
Nutch-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

Reply via email to