[Nutch-dev] WebDBInjector can now operate directly on the DMOZ .gz file

Shiraz Kanga Sun, 07 Mar 2004 00:00:12 -0800

This is a slightly modified WebDBInjector that does not require the (large) dmoz gz files to be unzipped in order to inject URLs from them into the WebDB. This WebDBInjector looks for a .gz extension and if present then transparently uses JDK's GZIPInputStream to read it.

The new file as well as a patch file are included. I hope it can be checked in by someone with write access to the Nutch CVS.

Thanks

shiraz kanga

/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */


package net.nutch.db;

import java.io.*;
import java.net.*;
import java.util.*;
import java.util.logging.*;
import java.util.zip.GZIPInputStream;
import java.net.MalformedURLException;

import javax.xml.parsers.*;
import org.xml.sax.*;
import org.xml.sax.helpers.*;
import org.apache.xerces.util.XMLChar;

import net.nutch.io.*;
import net.nutch.net.*;
import net.nutch.util.*;
import net.nutch.pagedb.*;
import net.nutch.linkdb.*;
import net.nutch.util.NutchConf;

/*********************************************
 * This class takes either a flat file of URLs
 * or a structured DMOZ RDF dump and adds the
 * URLs as entries into a pagedb.  Input files
 * whose names end in ".gz" are decompressed
 * on the fly.  Useful for bootstrapping the
 * system.
 *
 * @author Mike Cafarella
 * @author Doug Cutting
 * @author Shiraz Kanga
 *********************************************/
public class WebDBInjector {
    /**
     * URL used as the name of the synthetic "Dmoz" page that acts as
     * the source of all descriptive links added during a DMOZ inject.
     */
    private static final String DMOZ_PAGENAME = "http://www.dmoz.org/";

    /**
     * Value of the "db.default.fetch.interval" setting, narrowed to a byte.
     * 30 is passed to NutchConf as the second (fallback) argument.
     */
    private static final byte DEFAULT_INTERVAL =
      (byte)NutchConf.getInt("db.default.fetch.interval", 30);

    /**
     * Score handed to the Page constructor for every injected URL, read
     * from "db.score.injected" (2.0f is passed as the fallback argument).
     */
    private static final float NEW_INJECTED_PAGE_SCORE =
      NutchConf.getFloat("db.score.injected", 2.0f);

    /** Logger used by the injector and by its nested SAX handler. */
    public static final Logger LOG =
      LogFormatter.getLogger("net.nutch.db.WebDBInjector");

    /**
     * Reader wrapper that rewrites characters likely to upset the XML
     * parser, so that slightly damaged input can still be parsed.
     * Two repairs are made, each by substituting the letter 'X':
     * characters rejected by XMLChar.isValid(), and a '&lt;' that directly
     * follows U+FFFD and does not open a closing tag.
     */
    private static class XMLCharFilter extends FilterReader {
      /** Code unit 65533 (U+FFFD); seeing it arms the bracket repair. */
      private static final int REPLACEMENT_CHAR = 0xFFFD;

      /** True when the character read just before this one was U+FFFD. */
      private boolean lastBad = false;

      public XMLCharFilter(Reader reader) {
        super(reader);
      }

      /**
       * Reads one character, applying the repairs described on the class.
       * The bracket repair peeks one character ahead with mark/reset, so
       * the wrapped reader has to support marking.
       */
      public int read() throws IOException {
        final int c = in.read();
        int result = c;

        if (c != -1 && !XMLChar.isValid(c)) {
          // fix invalid characters
          result = 'X';
        } else if (lastBad && c == '<') {
          // fix mis-matched brackets: only "</" is left alone
          in.mark(1);
          final int next = in.read();
          if (next != '/') {
            result = 'X';
          }
          in.reset();
        }

        lastBad = (c == REPLACEMENT_CHAR);
        return result;
      }

      /**
       * Bulk variant of read().  The characters delivered by the wrapped
       * reader are repaired in place.  A '&lt;' in the last delivered slot
       * is left untouched because its successor is not available here.
       */
      public int read(char[] cbuf, int off, int len)
        throws IOException {
        final int n = in.read(cbuf, off, len);
        if (n == -1) {
          return n;
        }

        final int end = off + n;
        for (int pos = off; pos < end; pos++) {
          final char c = cbuf[pos];
          if (!XMLChar.isValid(c)) {
            // fix invalid characters
            cbuf[pos] = 'X';
          } else if (lastBad && c == '<'
                     && pos != end - 1 && cbuf[pos + 1] != '/') {
            // fix mis-matched brackets
            cbuf[pos] = 'X';
          }
          lastBad = (c == REPLACEMENT_CHAR);
        }
        return n;
      }
    }


    /**
     * The RDFProcessor receives tag messages during a parse
     * of RDF XML data.  We build whatever structures we need
     * from these messages.
     *
     * For every ExternalPage element that survives the adult and
     * subset filters, the page is added through the enclosing
     * injector and, optionally, a link carrying the listing's
     * title and description is added from the synthetic Dmoz page.
     */
    class RDFProcessor extends DefaultHandler {
        // URL of the ExternalPage currently being collected (null when
        // none is claimed) and the r:id of the most recent Topic.
        String curURL = null, curSection = null;
        // Flags telling characters() which buffer, if any, to fill.
        boolean titlePending = false, descPending = false, insideAdultSection = false;
        StringBuffer title = new StringBuffer(), desc = new StringBuffer();
        XMLReader reader;
        int subsetDenom;
        int hashSkew;
        boolean includeAdult, includeDmozDesc;
        MD5Hash srcDmozID;
        long srcDmozDomainID;
        // Set by the parser through setDocumentLocator(); may stay null.
        Locator location;

        /**
         * Pass in an XMLReader, plus a flag as to whether we
         * should include adult material.
         *
         * @param reader          the XMLReader driving this handler
         * @param subsetDenom     only URLs whose skewed hash is a multiple
         *                        of this value are injected
         * @param includeAdult    whether pages under "Top/Adult" are kept
         * @param includeDmozDesc whether descriptive links are added
         * @param skew            value XOR-ed into each URL hash; 0 means
         *                        "pick a random skew"
         * @throws IOException if the Dmoz page cannot be added or the
         *                     Dmoz URL cannot be parsed
         */
        public RDFProcessor(XMLReader reader, int subsetDenom, boolean includeAdult,
                            boolean includeDmozDesc, int skew) throws IOException {
            this.reader = reader;
            this.subsetDenom = subsetDenom;
            this.includeAdult = includeAdult;
            this.includeDmozDesc = includeDmozDesc;

            // We create a Page entry for the "Dmoz" page, from
            // which all descriptive links originate.  The name
            // of this page is always the same, stored in
            // DMOZ_PAGENAME.  The MD5 is generated over the page
            // name plus the injector's nextFetch value.  Until this
            // page is deleted, the descriptive links will always
            // be kept.
            //
            // If the DMOZ page is updated with new content, you
            // *could* update these links, if you really wanted to.
            // Just run inject again!  This will replace the old
            // Dmoz Page, because we always keep the same name.
            // That obsolete Page will be deleted, and all its
            // outlinks (the descriptive ones) garbage-collected.
            //
            // Then we just proceed to add the new descriptive
            // links, with the brand-new page's src MD5.
            //
            this.srcDmozID = MD5Hash.digest(DMOZ_PAGENAME + "_" + nextFetch);
            Page dmozPage = new Page(DMOZ_PAGENAME, srcDmozID);
            dmozPage.setNextFetchTime(Long.MAX_VALUE);
            dbWriter.addPageIfNotPresent(dmozPage);

            this.srcDmozDomainID =
                MD5Hash.digest(new URL(DMOZ_PAGENAME).getHost()).halfDigest();

            this.hashSkew = skew != 0 ? skew : new Random().nextInt();
        }

        //
        // Interface ContentHandler
        //

        /**
         * Start of an XML elt
         */
        public void startElement(String namespaceURI, String localName, String qName,
                                 Attributes atts) throws SAXException {
            if ("Topic".equals(qName)) {
                curSection = atts.getValue("r:id");
            } else if ("ExternalPage".equals(qName)) {
                // Porn filter.  curSection is null until a Topic carrying
                // an r:id has been seen; such pages cannot be classified
                // and are let through rather than crashing the parse.
                if ((! includeAdult) && curSection != null
                    && curSection.startsWith("Top/Adult")) {
                    return;
                }

                // An ExternalPage without an "about" attribute has no
                // URL to inject, so there is nothing to claim.
                String url = atts.getValue("about");
                if (url == null) {
                    return;
                }

                // Subset denominator filter.
                // Only emit with a chance of 1/denominator.
                int hashValue = MD5Hash.digest(url).hashCode();
                hashValue = Math.abs(hashValue ^ hashSkew);
                if ((hashValue % subsetDenom) != 0) {
                    return;
                }

                // We actually claim the URL!
                curURL = url;
            } else if (curURL != null && "d:Title".equals(qName)) {
                titlePending = true;
            } else if (curURL != null && "d:Description".equals(qName)) {
                descPending = true;
            }
        }

        /**
         * The contents of an XML elt
         */
        public void characters(char ch[], int start, int length) {
            if (titlePending) {
                title.append(ch, start, length);
            } else if (descPending) {
                desc.append(ch, start, length);
            }
        }

        /**
         * Termination of XML elt
         */
        public void endElement(String namespaceURI, String localName, String qName)
            throws SAXException {
            if (curURL == null) {
                // Nothing is being collected for the enclosing page.
                return;
            }

            if ("ExternalPage".equals(qName)) {
                //
                // Inc the number of pages, insert the page, and
                // possibly print status.
                //
                try {
                    // First, manufacture the Page entry for the
                    // given DMOZ listing.
                    if (addPage(curURL)) {

                        // Second, add a link from the DMOZ page TO the
                        // just-added target Page.  The anchor text should
                        // be the merged Title and Desc that we get from
                        // the DMOZ listing.  For testing reasons, the
                        // caller may choose to disallow this.
                        if (includeDmozDesc) {
                            String fullDesc = title + " " + desc;
                            Link descLink =
                                new Link(srcDmozID, srcDmozDomainID, curURL, fullDesc);
                            dbWriter.addLink(descLink);
                        }
                        pages++;
                    }

                } catch (MalformedURLException e) {
                    LOG.fine("skipping " + curURL + ":" + e);
                } catch (IOException ie) {
                    LOG.severe("problem adding url " + curURL + ": " + ie);
                }
                printStatusBar(2000, 50000);

                //
                // Clear out the link text.  This is what
                // you would use for adding to the linkdb.
                //
                title.setLength(0);
                desc.setLength(0);

                // Null out the URL.
                curURL = null;
            } else if ("d:Title".equals(qName)) {
                titlePending = false;
            } else if ("d:Description".equals(qName)) {
                descPending = false;
            }
        }

        /**
         * When parsing begins
         */
        public void startDocument() {
            LOG.info("Begin parse");
        }

        /**
         * When parsing ends
         */
        public void endDocument() {
            LOG.info("Completed parse.  Added " + pages + " pages.");
        }

        /**
         * From time to time the Parser will set the "current location"
         * by calling this function.  It's useful for emitting locations
         * for error messages.
         */
        public void setDocumentLocator(Locator locator) {
            location = locator;
        }


        //
        // Interface ErrorHandler
        //

        /**
         * Emit the exception message
         */
        public void error(SAXParseException spe) {
            LOG.severe("Error: " + spe.toString() + ": " + spe.getMessage());
            spe.printStackTrace(System.out);
        }

        /**
         * Emit the exception message, with line numbers
         */
        public void fatalError(SAXParseException spe) {
            LOG.severe("Fatal error: " + spe.toString() + ": " + spe.getMessage());
            // The locator is only available if the parser supplied one.
            if (location != null) {
                LOG.severe("Last known line is " + location.getLineNumber() + ", column "
                           + location.getColumnNumber());
            }
            spe.printStackTrace(System.out);
        }

        /**
         * Emit exception warning message
         */
        public void warning(SAXParseException spe) {
            LOG.warning("Warning: " + spe.toString() + ": " + spe.getMessage());
            spe.printStackTrace(System.out);
        }
    }

    /** Destination of every page and link this injector adds. */
    private IWebDBWriter dbWriter;

    /**
     * Builds an injector that adds its pages and links through the
     * supplied writer.
     *
     * @param dbWriter writer that receives the injected entries
     */
    public WebDBInjector(IWebDBWriter dbWriter) {
        this.dbWriter = dbWriter;
    }

    /**
     * Closes the underlying writer, which saves the changes made
     * through this injector.
     *
     * @throws IOException if the writer fails to close
     */
    public void close() throws IOException {
        this.dbWriter.close();
    }

    /**
     * Prints a minimal progress indicator based on the current page
     * count: a dot whenever the count is a multiple of <code>small</code>,
     * and a full status line whenever it is a multiple of <code>big</code>.
     *
     * @param small page-count step at which a "." is written to System.out
     * @param big   page-count step at which printStatus() is invoked
     */
    public void printStatusBar(int small, int big){
        if (0 == (pages % small)) {
            System.out.print(".");
        }
        if (0 != (pages % big)) {
            return;
        }
        printStatus();
    }

    // Wall-clock time (ms) at which this injector instance was created;
    // printStatus() measures elapsed time from it.
    long startTime = System.currentTimeMillis();
    // Number of pages counted as added so far; incremented by the inject
    // code paths and read by the status helpers.
    long pages = 0;
    // Value handed to the Page constructor for each injected URL.  Starts
    // as "now" and is overwritten with the input file's lastModified()
    // by injectURLFile() and injectDmozFile().
    long nextFetch = System.currentTimeMillis();

    /**
     * Utility to present performance stats.  Logs the number of pages
     * added so far together with the average pages-per-second rate
     * since this injector was created.  Nothing is logged while the
     * page count is still zero.
     */
    public void printStatus(){
        if (this.pages == 0) {
            return;
        }

        long elapsed = System.currentTimeMillis() - this.startTime;
        // Guard the division: the elapsed time can be 0 ms when this is
        // called immediately after construction.
        if (elapsed <= 0) {
            elapsed = 1;
        }
        LOG.info("\t" + this.pages + "\t" +
                 (int)((1000 *  pages)/elapsed) + " pages/second\t" );
    }

    /**
     * Iterate through all the items in this flat text file and
     * add them to the db.  The file holds one URL per line; lines
     * are trimmed and blank lines are ignored.  If the file name
     * ends in ".gz" it is decompressed while being read.
     *
     * A URL that is rejected as malformed is logged and skipped so
     * that the remaining lines are still processed.  Any other
     * failure while reading stops the loop and is reported, but is
     * not rethrown.
     *
     * @param urlListFile path of the URL list
     * @throws IOException if the file cannot be opened or closed
     */
    public void injectURLFile(String urlListFile) throws IOException {
        File urlList = new File (urlListFile);
        nextFetch = urlList.lastModified();
        BufferedReader reader = makeReader (urlListFile, "UTF-8");

        try {
            String curStr = null;
            LOG.info("Starting URL processing");
            while ((curStr = reader.readLine()) != null) {
                String url = curStr.trim();
                if (url.length() == 0) {
                    // Nothing to inject on an empty line.
                    continue;
                }
                try {
                    if (addPage(url)) {
                        this.pages++;
                    }
                } catch (MalformedURLException e) {
                    // One bad entry must not abort the whole file.
                    LOG.fine("skipping " + url + ":" + e);
                }
                printStatusBar(2000,50000);
            }
            LOG.info("Added " + pages + " pages");
        } catch (Exception e) {
            LOG.severe("problem injecting " + urlListFile + ": " + e);
            e.printStackTrace(System.out);
        } finally {
            reader.close();
        }
    }

    /**
     * Iterate through all the items in this structured DMOZ file.
     * Add each URL to the web db.  If the file name ends in ".gz"
     * it is decompressed while being read.
     *
     * Any failure during the parse is logged and terminates the JVM
     * with a non-zero exit status.
     *
     * @param dmozFileName    path of the DMOZ RDF dump
     * @param subsetDenom     only URLs whose skewed hash is a multiple
     *                        of this value are injected
     * @param includeAdult    whether pages under "Top/Adult" are kept
     * @param includeDmozDesc whether descriptive links are added
     * @param skew            hash skew; 0 lets the handler pick one
     * @throws IOException if the input cannot be opened or closed
     * @throws SAXException if the XML reader cannot be obtained
     * @throws ParserConfigurationException if no SAX parser can be created
     */
    public void injectDmozFile(String dmozFileName, int subsetDenom,
                               boolean includeAdult, boolean includeDmozDesc, int skew)
        throws IOException, SAXException, ParserConfigurationException {
        File dmozFile = new File (dmozFileName);
        nextFetch = dmozFile.lastModified();

        SAXParserFactory parserFactory = SAXParserFactory.newInstance();
        SAXParser parser = parserFactory.newSAXParser();
        XMLReader reader = parser.getXMLReader();

        // Create our own processor to receive SAX events
        RDFProcessor rp =
          new RDFProcessor(reader, subsetDenom, includeAdult, includeDmozDesc, skew);
        reader.setContentHandler(rp);
        reader.setErrorHandler(rp);
        LOG.info("skew = " + rp.hashSkew);

        //
        // Open filtered text stream.  The XMLCharFilter makes sure that
        // only XML-approved characters reach the parser.  Any
        // non-conforming character is replaced with 'X'.
        //
        XMLCharFilter in = new XMLCharFilter (makeReader (dmozFileName, "UTF-8"));

        try {
            InputSource is = new InputSource(in);
            reader.parse(is);
        } catch (Exception e) {
            LOG.severe(e.toString());
            e.printStackTrace(System.out);
            // A failed parse must not look like success to the caller's shell.
            System.exit(1);
        } finally {
            in.close();
        }
    }

    /**
     * Create a BufferedReader for the file that uses the named character encoding.
     * If the name of the file ends in .gz then use a GZIPInputStream
     *
     * @param fileName    path of the file to open
     * @param charsetName name of the character encoding used to decode it
     * @return a reader over the (possibly decompressed) file contents
     * @throws IOException if the file cannot be opened, is not valid gzip
     *                     data although named ".gz", or the encoding is
     *                     not supported
     */
    protected BufferedReader makeReader (String fileName, String charsetName)
      throws IOException {
      FileInputStream fileStream = new FileInputStream (fileName);
      boolean opened = false;
      try {
        InputStream rawStream = fileStream;
        if (isGzipped (fileName)) {
          // The constructor reads the gzip header and may throw.
          rawStream = new GZIPInputStream (fileStream);
        }
        BufferedReader reader =
          new BufferedReader (new InputStreamReader (rawStream, charsetName));
        opened = true;
        return reader;
      } finally {
        if (!opened) {
          // Do not leak the file handle when wrapping the stream failed.
          try {
            fileStream.close ();
          } catch (IOException ignored) {
            // The original failure is the one worth reporting.
          }
        }
      }
    }

    /**
     * Tells whether a file name carries the ".gz" suffix, ignoring
     * upper/lower case.  Only the name is examined, never the contents.
     *
     * @param fileName the file name to inspect
     * @return true if the lower-cased name ends with ".gz"
     */
    protected static boolean isGzipped (String fileName) {
      final String lowered = fileName.toLowerCase ();
      return lowered.endsWith (".gz");
    }

    /**
     * Runs the URL through the configured URL filter and, if the filter
     * returns a non-null URL, adds a page for it unless one is already
     * present.
     *
     * @param url the candidate URL
     * @return true if the filter accepted the URL and the page was handed
     *         to the writer, false if the filter returned null
     * @throws IOException if the page cannot be created or written
     */
    private boolean addPage(String url) throws IOException {
      final String accepted = URLFilterFactory.getFilter().filter(url);
      if (accepted == null) {
        return false;
      }
      dbWriter.addPageIfNotPresent(
        new Page(accepted, NEW_INJECTED_PAGE_SCORE, nextFetch));
      return true;
    }

    /**
     * Command-line access.  User may add URLs via a flat text file
     * or the structured DMOZ file.  By default, we ignore Adult
     * material (as categorized by DMOZ).
     */
    public static void main(String argv[]) throws Exception {
      if (argv.length < 3) {
        System.out.println("Usage: WebDBInjector <db_dir> (-urlfile <url_file> | 
-dmozfile <dmoz_file>) [-subset <subsetDenominator>] [-includeAdultMaterial] [-skew 
skew] [-noDmozDesc]");
        return;
      }

      //
      // Parse the command line, figure out what kind of
      // URL file we need to load
      //
      int subsetDenom = 1;
      int skew = 0;
      String dbDir = null, command = null, loadfile = null;
      boolean includeAdult = false, includeDmozDesc = true;

      for (int i = 0; i < argv.length; i++) {
        if ("-urlfile".equals(argv[i]) ||
            "-dmozfile".equals(argv[i])) {
          command = argv[i];
          loadfile = argv[i+1];
          i++;
        } else if ("-includeAdultMaterial".equals(argv[i])) {
          includeAdult = true;
        } else if ("-noDmozDesc".equals(argv[i])) {
          includeDmozDesc = false;
        } else if ("-subset".equals(argv[i])) {
          subsetDenom = Integer.parseInt(argv[i+1]);
          i++;
        } else if ("-skew".equals(argv[i])) {
          skew = Integer.parseInt(argv[i+1]);
          i++;
        } else {
          dbDir = argv[i];
        }
      }

      //
      // Create the webdbWriter, the injector, and then inject the
      // right kind of URL file.
      //
      IWebDBWriter writer = new WebDBWriter(new File(dbDir));
      WebDBInjector injector = new WebDBInjector(writer);
      if ("-urlfile".equals(command)) {
        injector.injectURLFile(loadfile);
      } else if ("-dmozfile".equals(command)) {
        injector.injectDmozFile(loadfile, subsetDenom, includeAdult, includeDmozDesc, 
skew);
      } else {
        System.out.println("No command indicated.");
        return;
      }

      injector.close();
    }
}

--- WebDBInjector.java.orig     2004-03-06 23:43:20.602998400 -0800
+++ WebDBInjector.java  2004-03-06 23:53:50.699032000 -0800
@@ -7,6 +7,7 @@
 import java.net.*;
 import java.util.*;
 import java.util.logging.*;
+import java.util.zip.GZIPInputStream;
 import java.net.MalformedURLException;
 
 import javax.xml.parsers.*;
@@ -28,6 +29,7 @@
  *
  * @author Mike Cafarella
  * @author Doug Cutting
+ * @author Shiraz Kanga
  *********************************************/
 public class WebDBInjector {
     private static final String DMOZ_PAGENAME = "http://www.dmoz.org/";;
@@ -346,9 +348,12 @@
      * Iterate through all the items in this flat text file and
      * add them to the db.
      */
-    public void injectURLFile(File urlList) throws IOException {
+    public void injectURLFile(String urlListFile) throws IOException {
+        File urlList = new File (urlListFile);
         nextFetch = urlList.lastModified();
-        BufferedReader reader = new BufferedReader(new FileReader(urlList));
+        //BufferedReader reader = new BufferedReader(new FileReader(urlList));
+        BufferedReader reader = makeReader (urlListFile, "UTF-8");
+
         try {
             String curStr = null;
             LOG.info("Starting URL processing");
@@ -370,7 +375,8 @@
      * Iterate through all the items in this structured DMOZ file.
      * Add each URL to the web db.
      */
-    public void injectDmozFile(File dmozFile, int subsetDenom, boolean includeAdult, 
boolean includeDmozDesc, int skew) throws IOException, SAXException, 
ParserConfigurationException {
+    public void injectDmozFile(String dmozFileName, int subsetDenom, boolean 
includeAdult, boolean includeDmozDesc, int skew) throws IOException, SAXException, 
ParserConfigurationException {
+        File dmozFile = new File (dmozFileName);
         nextFetch = dmozFile.lastModified();
 
         SAXParserFactory parserFactory = SAXParserFactory.newInstance();
@@ -389,7 +395,8 @@
         // only appropriate XML-approved UTF8 characters are received.
         // Any non-conforming characters are silently skipped.
         //
-        XMLCharFilter in = new XMLCharFilter(new BufferedReader(new 
InputStreamReader(new BufferedInputStream(new FileInputStream(dmozFile)), "UTF-8")));
+        XMLCharFilter in = new XMLCharFilter (makeReader (dmozFileName, "UTF-8"));
+
         try {
             InputSource is = new InputSource(in);
             reader.parse(is);
@@ -402,6 +409,31 @@
         }
     }
 
+    /**
+     * Create a BufferedReader for the file that uses the named character encoding.
+     * If the name of the file ends in .gz then use a GZIPInputStream
+     */
+    protected BufferedReader makeReader (String fileName, String charsetName) throws 
IOException {
+      BufferedReader reader;
+
+      FileInputStream fileStream = new FileInputStream (fileName);
+      if (isGzipped (fileName))
+        reader = new BufferedReader (new InputStreamReader (new GZIPInputStream 
(fileStream), charsetName));
+      else
+        reader = new BufferedReader (new InputStreamReader (fileStream, charsetName));
+
+      return reader;
+    }
+
+    /**
+     * Check to see if the name of the file ends in .gz
+     */
+    protected static boolean isGzipped (String fileName) {
+      if (fileName.toLowerCase ().endsWith (".gz"))
+        return true;
+      return false;
+    }
+
     private boolean addPage(String url) throws IOException {
       url = URLFilterFactory.getFilter().filter(url);
       if (url != null) {
@@ -460,9 +492,9 @@
       IWebDBWriter writer = new WebDBWriter(new File(dbDir));
       WebDBInjector injector = new WebDBInjector(writer);
       if ("-urlfile".equals(command)) {
-        injector.injectURLFile(new File(loadfile));
+        injector.injectURLFile(loadfile);
       } else if ("-dmozfile".equals(command)) {
-        injector.injectDmozFile(new File(loadfile), subsetDenom, includeAdult, 
includeDmozDesc, skew);
+        injector.injectDmozFile(loadfile, subsetDenom, includeAdult, includeDmozDesc, 
skew);
       } else {
         System.out.println("No command indicated.");
         return;

[Nutch-dev] WebDBInjector can now operate directly on the DMOZ .gz file

Reply via email to