Hi nutch community,

The attached patch allows you to define a topic, so that only that topic is parsed from the dmoz file.
Only the defined topic and all of its subtopics will be injected.


So you can use:

bin/nutch db -dmozfile openDirectoryUrls.rdf.u8 -topic Top/Arts/People/M/Moore,_Michael


Hope someone will find it useful.

Best,
Stefan


Index: WebDBInjector.java
===================================================================
RCS file: /cvsroot/nutch/nutch/src/java/net/nutch/db/WebDBInjector.java,v
retrieving revision 1.35
diff -u -r1.35 WebDBInjector.java
--- WebDBInjector.java  5 May 2004 18:23:08 -0000       1.35
+++ WebDBInjector.java  27 May 2004 21:04:14 -0000
@@ -17,8 +17,7 @@
 import net.nutch.io.*;
 import net.nutch.net.*;
 import net.nutch.util.*;
-import net.nutch.pagedb.*;
-import net.nutch.linkdb.*;
+
 import net.nutch.util.NutchConf;
 
 /*********************************************
@@ -105,16 +104,18 @@
         MD5Hash srcDmozID;
         long srcDmozDomainID;
         Locator location;
+        String topicPath;
 
         /**
          * Pass in an XMLReader, plus a flag as to whether we 
          * should include adult material.
          */
-        public RDFProcessor(XMLReader reader, int subsetDenom, boolean includeAdult, 
boolean includeDmozDesc, int skew) throws IOException {
+        public RDFProcessor(XMLReader reader, int subsetDenom, boolean includeAdult, 
boolean includeDmozDesc, int skew, String topicPath) throws IOException {
             this.reader = reader;
             this.subsetDenom = subsetDenom;
             this.includeAdult = includeAdult;
             this.includeDmozDesc = includeDmozDesc;
+            this.topicPath = topicPath;
 
             // We create a Page entry for the "Dmoz" page, from
             // which all descriptive links originate.  The name
@@ -158,9 +159,14 @@
                 if ((! includeAdult) && curSection.startsWith("Top/Adult")) {
                     return;
                 }
+                // Topic Filter
+                if ((topicPath!=null) && !curSection.startsWith(topicPath)){
+                    return;
+                }
 
                 // Subset denominator filter.  
                 // Only emit with a chance of 1/denominator.
+
                 String url = atts.getValue("about");
                 int hashValue = MD5Hash.digest(url).hashCode();
                 hashValue = Math.abs(hashValue ^ hashSkew);
@@ -370,7 +376,7 @@
      * Iterate through all the items in this structured DMOZ file.
      * Add each URL to the web db.
      */
-    public void injectDmozFile(File dmozFile, int subsetDenom, boolean includeAdult, 
boolean includeDmozDesc, int skew) throws IOException, SAXException, 
ParserConfigurationException {
+    public void injectDmozFile(File dmozFile, int subsetDenom, boolean includeAdult, 
boolean includeDmozDesc, int skew, String topicPath) throws IOException, SAXException, 
ParserConfigurationException {
         nextFetch = dmozFile.lastModified();
 
         SAXParserFactory parserFactory = SAXParserFactory.newInstance();
@@ -379,7 +385,7 @@
 
         // Create our own processor to receive SAX events
         RDFProcessor rp =
-          new RDFProcessor(reader, subsetDenom, includeAdult, includeDmozDesc, skew);
+          new RDFProcessor(reader, subsetDenom, includeAdult, includeDmozDesc, skew, 
topicPath);
         reader.setContentHandler(rp);
         reader.setErrorHandler(rp);
         LOG.info("skew = " + rp.hashSkew);
@@ -419,7 +425,7 @@
      */
     public static void main(String argv[]) throws Exception {
       if (argv.length < 3) {
-        System.out.println("Usage: WebDBInjector <db_dir> (-urlfile <url_file> | 
-dmozfile <dmoz_file>) [-subset <subsetDenominator>] [-includeAdultMaterial] [-skew 
skew] [-noDmozDesc]");
+        System.out.println("Usage: WebDBInjector <db_dir> (-urlfile <url_file> | 
-dmozfile <dmoz_file>) [-subset <subsetDenominator>] [-includeAdultMaterial] [-skew 
skew] [-noDmozDesc] [-topic <topicPath>]");
         return;
       }
 
@@ -431,6 +437,7 @@
       int skew = 0;
       String dbDir = null, command = null, loadfile = null;
       boolean includeAdult = false, includeDmozDesc = true;
+      String topicPath = null;
 
       for (int i = 0; i < argv.length; i++) {
         if ("-urlfile".equals(argv[i]) || 
@@ -448,6 +455,9 @@
         } else if ("-skew".equals(argv[i])) {
           skew = Integer.parseInt(argv[i+1]);
           i++;
+        }else if ("-topic".equals(argv[i])) {
+          topicPath = argv[i+1];
+          i++;
         } else {
           dbDir = argv[i];
         }
@@ -462,7 +472,7 @@
       if ("-urlfile".equals(command)) {
         injector.injectURLFile(new File(loadfile));
       } else if ("-dmozfile".equals(command)) {
-        injector.injectDmozFile(new File(loadfile), subsetDenom, includeAdult, 
includeDmozDesc, skew);
+        injector.injectDmozFile(new File(loadfile), subsetDenom, includeAdult, 
includeDmozDesc, skew, topicPath);
       } else {
         System.out.println("No command indicated.");
         return;

---------------------------------------------------------------
open technology: http://www.media-style.com
open source: http://www.weta-group.net
open discussion: http://www.text-mining.org

Reply via email to