Hi nutch community,
the attached patch allows you to define a topic, which will then be the only one parsed from the dmoz file.
Only the defined topic and all of its subtopics will be injected.
So you can use:
bin/nutch db -dmozfile openDirectoryUrls.rdf.u8 -topic Top/Arts/People/M/Moore,_Michael
Hope someone will find it useful.
Best, Stefan
Index: WebDBInjector.java
===================================================================
RCS file: /cvsroot/nutch/nutch/src/java/net/nutch/db/WebDBInjector.java,v
retrieving revision 1.35
diff -u -r1.35 WebDBInjector.java
--- WebDBInjector.java 5 May 2004 18:23:08 -0000 1.35
+++ WebDBInjector.java 27 May 2004 21:04:14 -0000
@@ -17,8 +17,7 @@
import net.nutch.io.*;
import net.nutch.net.*;
import net.nutch.util.*;
-import net.nutch.pagedb.*;
-import net.nutch.linkdb.*;
+
import net.nutch.util.NutchConf;
/*********************************************
@@ -105,16 +104,18 @@
MD5Hash srcDmozID;
long srcDmozDomainID;
Locator location;
+ String topicPath;
/**
* Pass in an XMLReader, plus a flag as to whether we
* should include adult material.
*/
- public RDFProcessor(XMLReader reader, int subsetDenom, boolean includeAdult,
boolean includeDmozDesc, int skew) throws IOException {
+ public RDFProcessor(XMLReader reader, int subsetDenom, boolean includeAdult,
boolean includeDmozDesc, int skew, String topicPath) throws IOException {
this.reader = reader;
this.subsetDenom = subsetDenom;
this.includeAdult = includeAdult;
this.includeDmozDesc = includeDmozDesc;
+ this.topicPath = topicPath;
// We create a Page entry for the "Dmoz" page, from
// which all descriptive links originate. The name
@@ -158,9 +159,14 @@
if ((! includeAdult) && curSection.startsWith("Top/Adult")) {
return;
}
+ // Topic Filter
+ if ((topicPath!=null) && !curSection.startsWith(topicPath)){
+ return;
+ }
// Subset denominator filter.
// Only emit with a chance of 1/denominator.
+
String url = atts.getValue("about");
int hashValue = MD5Hash.digest(url).hashCode();
hashValue = Math.abs(hashValue ^ hashSkew);
@@ -370,7 +376,7 @@
* Iterate through all the items in this structured DMOZ file.
* Add each URL to the web db.
*/
- public void injectDmozFile(File dmozFile, int subsetDenom, boolean includeAdult,
boolean includeDmozDesc, int skew) throws IOException, SAXException,
ParserConfigurationException {
+ public void injectDmozFile(File dmozFile, int subsetDenom, boolean includeAdult,
boolean includeDmozDesc, int skew, String topicPath) throws IOException, SAXException,
ParserConfigurationException {
nextFetch = dmozFile.lastModified();
SAXParserFactory parserFactory = SAXParserFactory.newInstance();
@@ -379,7 +385,7 @@
// Create our own processor to receive SAX events
RDFProcessor rp =
- new RDFProcessor(reader, subsetDenom, includeAdult, includeDmozDesc, skew);
+ new RDFProcessor(reader, subsetDenom, includeAdult, includeDmozDesc, skew,
topicPath);
reader.setContentHandler(rp);
reader.setErrorHandler(rp);
LOG.info("skew = " + rp.hashSkew);
@@ -419,7 +425,7 @@
*/
public static void main(String argv[]) throws Exception {
if (argv.length < 3) {
- System.out.println("Usage: WebDBInjector <db_dir> (-urlfile <url_file> |
-dmozfile <dmoz_file>) [-subset <subsetDenominator>] [-includeAdultMaterial] [-skew
skew] [-noDmozDesc]");
+ System.out.println("Usage: WebDBInjector <db_dir> (-urlfile <url_file> |
-dmozfile <dmoz_file>) [-subset <subsetDenominator>] [-includeAdultMaterial] [-skew
skew] [-noDmozDesc] [-topic <topicPath>]");
return;
}
@@ -431,6 +437,7 @@
int skew = 0;
String dbDir = null, command = null, loadfile = null;
boolean includeAdult = false, includeDmozDesc = true;
+ String topicPath = null;
for (int i = 0; i < argv.length; i++) {
if ("-urlfile".equals(argv[i]) ||
@@ -448,6 +455,9 @@
} else if ("-skew".equals(argv[i])) {
skew = Integer.parseInt(argv[i+1]);
i++;
+ }else if ("-topic".equals(argv[i])) {
+ topicPath = argv[i+1];
+ i++;
} else {
dbDir = argv[i];
}
@@ -462,7 +472,7 @@
if ("-urlfile".equals(command)) {
injector.injectURLFile(new File(loadfile));
} else if ("-dmozfile".equals(command)) {
- injector.injectDmozFile(new File(loadfile), subsetDenom, includeAdult,
includeDmozDesc, skew);
+ injector.injectDmozFile(new File(loadfile), subsetDenom, includeAdult,
includeDmozDesc, skew, topicPath);
} else {
System.out.println("No command indicated.");
return;
--------------------------------------------------------------- open technology: http://www.media-style.com open source: http://www.weta-group.net open discussion: http://www.text-mining.org
