Author: fenglu
Date: Sat Aug 24 15:21:20 2013
New Revision: 1517147
URL: http://svn.apache.org/r1517147
Log:
NUTCH-1619 Write the DMOZ description and title of each URL to the db as page
metadata when DmozParser is run with the -snippet argument.
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/java/org/apache/nutch/tools/DmozParser.java
Modified: nutch/branches/2.x/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1517147&r1=1517146&r2=1517147&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Sat Aug 24 15:21:20 2013
@@ -2,6 +2,8 @@ Nutch Change Log
Current Development
+* NUTCH-1619 Writes Dmoz Description and Title information to db with snippet
argument ( Yasin Kılınç via feng)
+
* NUTCH-1631 Display Document Count Added To Solr Server (Furkan KAMACI via
lewismc)
* NUTCH-1629 Injector skips empty lines in seed files (kaveh minooie via
jnioche)
Modified: nutch/branches/2.x/src/java/org/apache/nutch/tools/DmozParser.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/tools/DmozParser.java?rev=1517147&r1=1517146&r2=1517147&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/tools/DmozParser.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/tools/DmozParser.java Sat Aug
24 15:21:20 2013
@@ -18,6 +18,7 @@
package org.apache.nutch.tools;
import java.io.*;
+import java.nio.ByteBuffer;
import java.util.*;
import java.util.regex.*;
@@ -29,10 +30,15 @@ import org.apache.xerces.util.XMLChar;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import org.apache.avro.util.Utf8;
+import org.apache.gora.store.DataStore;
import org.apache.hadoop.io.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.storage.StorageUtils;
+import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.TableUtil;
/** Utility that converts DMOZ RDF into a flat file of URLs to be injected. */
@@ -40,7 +46,8 @@ public class DmozParser {
public static final Logger LOG = LoggerFactory.getLogger(DmozParser.class);
long pages = 0;
-
+ private static DataStore<String, WebPage> store = null;
+
/**
* This filter fixes characters that might offend our parser.
* This lets us be tolerant of errors that might appear in the input XML.
@@ -104,18 +111,20 @@ public class DmozParser {
int subsetDenom;
int hashSkew;
boolean includeAdult;
+ boolean snippet;
Locator location;
/**
* Pass in an XMLReader, plus a flag as to whether we
* should include adult material.
*/
- public RDFProcessor(XMLReader reader, int subsetDenom, boolean
includeAdult, int skew, Pattern topicPattern) throws IOException {
+ public RDFProcessor(XMLReader reader, int subsetDenom, boolean
includeAdult, int skew, Pattern topicPattern, boolean snippet) throws
IOException {
this.reader = reader;
this.subsetDenom = subsetDenom;
this.includeAdult = includeAdult;
this.topicPattern = topicPattern;
-
+ this.snippet = snippet;
+
this.hashSkew = skew != 0 ? skew : new Random().nextInt();
}
@@ -179,20 +188,44 @@ public class DmozParser {
// Inc the number of pages, insert the page, and
// possibly print status.
//
- System.out.println(curURL);
- pages++;
-
- //
- // Clear out the link text. This is what
- // you would use for adding to the linkdb.
- //
- if (title.length() > 0) {
- title.delete(0, title.length());
- }
- if (desc.length() > 0) {
- desc.delete(0, desc.length());
+ if(snippet){
+ try {
+ String reversedUrl = TableUtil.reverseUrl(curURL);
+ WebPage row = store.get(reversedUrl);
+
+ if(row!=null){
+ if (desc.length() > 0) {
+ row.putToMetadata(new Utf8("_dmoz_desc_"),
ByteBuffer.wrap(desc.toString().getBytes()));
+ desc.delete(0, desc.length());
+ }
+ if (title.length() > 0) {
+ row.putToMetadata(new Utf8("_dmoz_title_"),
ByteBuffer.wrap(title.toString().getBytes()));
+ title.delete(0, title.length());
+ }
+ store.put(reversedUrl, row);
+ store.flush();
+ }
+
+ } catch (IOException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ } else {
+ System.out.println(curURL);
+
+ //
+ // Clear out the link text. This is what
+ // you would use for adding to the linkdb.
+ //
+ if (desc.length() > 0) {
+ desc.delete(0, desc.length());
+ }
+ if (title.length() > 0) {
+ title.delete(0, title.length());
+ }
}
-
+ pages++;
+
// Null out the URL.
curURL = null;
} else if ("d:Title".equals(qName)) {
@@ -215,6 +248,11 @@ public class DmozParser {
*/
public void endDocument() {
LOG.info("Completed parse. Found " + pages + " pages.");
+ try {
+ store.close();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
}
/**
@@ -268,7 +306,8 @@ public class DmozParser {
public void parseDmozFile(File dmozFile, int subsetDenom,
boolean includeAdult,
int skew,
- Pattern topicPattern)
+ Pattern topicPattern,
+ boolean snippet)
throws IOException, SAXException, ParserConfigurationException {
@@ -279,7 +318,7 @@ public class DmozParser {
// Create our own processor to receive SAX events
RDFProcessor rp =
new RDFProcessor(reader, subsetDenom, includeAdult,
- skew, topicPattern);
+ skew, topicPattern, snippet);
reader.setContentHandler(rp);
reader.setErrorHandler(rp);
LOG.info("skew = " + rp.hashSkew);
@@ -331,7 +370,7 @@ public class DmozParser {
*/
public static void main(String argv[]) throws Exception {
if (argv.length < 1) {
- System.err.println("Usage: DmozParser <dmoz_file> [-subset
<subsetDenominator>] [-includeAdultMaterial] [-skew skew] [-topicFile <topic
list file>] [-topic <topic> [-topic <topic> [...]]]");
+ System.err.println("Usage: DmozParser <dmoz_file> [-subset
<subsetDenominator>] [-includeAdultMaterial] [-skew skew] [-snippet]
[-topicFile <topic list file>] [-topic <topic> [-topic <topic> [...]]]");
return;
}
@@ -343,10 +382,12 @@ public class DmozParser {
int skew = 0;
String dmozFile = argv[0];
boolean includeAdult = false;
+ boolean snippet = false;
Pattern topicPattern = null;
Vector<String> topics = new Vector<String>();
Configuration conf = NutchConfiguration.create();
+ store = StorageUtils.createWebStore(conf,String.class, WebPage.class);
FileSystem fs = FileSystem.get(conf);
try {
for (int i = 1; i < argv.length; i++) {
@@ -364,6 +405,8 @@ public class DmozParser {
} else if ("-skew".equals(argv[i])) {
skew = Integer.parseInt(argv[i+1]);
i++;
+ }else if ("-snippet".equals(argv[i])) {
+ snippet = true;
}
}
@@ -383,7 +426,7 @@ public class DmozParser {
}
parser.parseDmozFile(new File(dmozFile), subsetDenom,
- includeAdult, skew, topicPattern);
+ includeAdult, skew, topicPattern, snippet);
} finally {
fs.close();