Author: jnioche
Date: Fri May 15 14:03:52 2015
New Revision: 1679567
URL: http://svn.apache.org/r1679567
Log:
NUTCH-2006 IndexingFiltersChecker to take custom metadata as input (jnioche)
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1679567&r1=1679566&r2=1679567&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri May 15 14:03:52 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.11-SNAPSHOT
+* NUTCH-2006 IndexingFiltersChecker to take custom metadata as input (jnioche)
+
* NUTCH-2008 IndexerMapReduce to use single instance of NutchIndexAction for
deletions (snagel)
* NUTCH-1998 Add support for user-defined file extension to
CommonCrawlDataDumper (totaro via mattmann)
Modified:
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1679567&r1=1679566&r2=1679567&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Fri May 15 14:03:52 2015
@@ -17,6 +17,8 @@
package org.apache.nutch.indexer;
+import java.util.HashMap;
+import java.util.Iterator;
import java.util.List;
import java.util.Map;
@@ -39,6 +41,7 @@ import org.apache.nutch.protocol.Content
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.StringUtil;
import org.apache.nutch.util.URLUtil;
@@ -69,16 +72,29 @@ public class IndexingFiltersChecker exte
String url = null;
boolean dumpText = false;
- String usage = "Usage: IndexingFiltersChecker [-dumpText] <url>";
+ String usage = "Usage: IndexingFiltersChecker [-dumpText] [-md key=value] <url>";
if (args.length == 0) {
System.err.println(usage);
return -1;
}
+ // used to simulate the metadata propagated from injection
+ HashMap<String, String> metadata = new HashMap<String, String>();
+
for (int i = 0; i < args.length; i++) {
if (args[i].equals("-dumpText")) {
dumpText = true;
+ } else if (args[i].equals("-md")) {
+ String k = null, v = null;
+ String nextOne = args[++i];
+ int firstEquals = nextOne.indexOf("=");
+ if (firstEquals != -1) {
+ k = nextOne.substring(0, firstEquals);
+ v = nextOne.substring(firstEquals + 1);
+ } else
+ k = nextOne;
+ metadata.put(k, v);
} else if (i != args.length - 1) {
System.err.println(usage);
System.exit(-1);
@@ -87,17 +103,25 @@ public class IndexingFiltersChecker exte
}
}
- if (LOG.isInfoEnabled()) {
- LOG.info("fetching: " + url);
+ LOG.info("fetching: " + url);
+
+ CrawlDatum datum = new CrawlDatum();
+
+ Iterator<String> iter = metadata.keySet().iterator();
+ while (iter.hasNext()) {
+ String key = iter.next();
+ String value = metadata.get(key);
+ if (value == null)
+ value = "";
+ datum.getMetaData().put(new Text(key), new Text(value));
}
- IndexingFilters indexers = new IndexingFilters(conf);
+ IndexingFilters indexers = new IndexingFilters(getConf());
- ProtocolFactory factory = new ProtocolFactory(conf);
+ ProtocolFactory factory = new ProtocolFactory(getConf());
Protocol protocol = factory.getProtocol(url);
- CrawlDatum datum = new CrawlDatum();
-
- ProtocolOutput output = protocol.getProtocolOutput(new Text(url), datum);
+ Text turl = new Text(url);
+ ProtocolOutput output = protocol.getProtocolOutput(turl, datum);
if (!output.getStatus().isSuccess()) {
System.out.println("Fetch failed with protocol status: "
@@ -126,12 +150,18 @@ public class IndexingFiltersChecker exte
LOG.warn("Content is truncated, parse may fail!");
}
- if (LOG.isInfoEnabled()) {
- LOG.info("parsing: " + url);
- LOG.info("contentType: " + contentType);
+ ScoringFilters scfilters = new ScoringFilters(getConf());
+ // call the scoring filters
+ try {
+ scfilters.passScoreBeforeParsing(turl, datum, content);
+ } catch (Exception e) {
+ LOG.warn("Couldn't pass score, url {} ({})", url, e);
}
- ParseResult parseResult = new ParseUtil(conf).parse(content);
+ LOG.info("parsing: {}", url);
+ LOG.info("contentType: {}", contentType);
+
+ ParseResult parseResult = new ParseUtil(getConf()).parse(content);
NutchDocument doc = new NutchDocument();
doc.add("id", url);
@@ -150,13 +180,20 @@ public class IndexingFiltersChecker exte
return -1;
}
- byte[] signature = SignatureFactory.getSignature(conf).calculate(content,
+ byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content,
parse);
parse.getData().getContentMeta()
.set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
String digest = parse.getData().getContentMeta().get(Nutch.SIGNATURE_KEY);
doc.add("digest", digest);
+ // call the scoring filters
+ try {
+ scfilters.passScoreAfterParsing(turl, content, parseResult.get(turl));
+ } catch (Exception e) {
+ LOG.warn("Couldn't pass score, url {} ({})", turl, e);
+ }
+
try {
doc = indexers.filter(doc, parse, urlText, datum, inlinks);
} catch (IndexingException e) {
@@ -179,7 +216,7 @@ public class IndexingFiltersChecker exte
}
}
- if (conf.getBoolean("doIndex", false) && doc != null) {
+ if (getConf().getBoolean("doIndex", false) && doc != null) {
IndexWriters writers = new IndexWriters(getConf());
writers.open(new JobConf(getConf()), "IndexingFilterChecker");
writers.write(doc);
@@ -194,15 +231,4 @@ public class IndexingFiltersChecker exte
new IndexingFiltersChecker(), args);
System.exit(res);
}
-
- Configuration conf;
-
- public Configuration getConf() {
- return conf;
- }
-
- @Override
- public void setConf(Configuration arg0) {
- conf = arg0;
- }
}