Author: jnioche
Date: Fri May 15 14:03:52 2015
New Revision: 1679567

URL: http://svn.apache.org/r1679567
Log:
NUTCH-2006 IndexingFiltersChecker to take custom metadata as input (jnioche)

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1679567&r1=1679566&r2=1679567&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri May 15 14:03:52 2015
@@ -2,6 +2,8 @@ Nutch Change Log
  
 Nutch Current Development 1.11-SNAPSHOT
 
+* NUTCH-2006 IndexingFiltersChecker to take custom metadata as input (jnioche)
+
 * NUTCH-2008 IndexerMapReduce to use single instance of NutchIndexAction for deletions (snagel)
 
 * NUTCH-1998 Add support for user-defined file extension to CommonCrawlDataDumper (totaro via mattmann)

Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1679567&r1=1679566&r2=1679567&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Fri May 15 14:03:52 2015
@@ -17,6 +17,8 @@
 
 package org.apache.nutch.indexer;
 
+import java.util.HashMap;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 
@@ -39,6 +41,7 @@ import org.apache.nutch.protocol.Content
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolFactory;
 import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.scoring.ScoringFilters;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.StringUtil;
 import org.apache.nutch.util.URLUtil;
@@ -69,16 +72,29 @@ public class IndexingFiltersChecker exte
     String url = null;
     boolean dumpText = false;
 
-    String usage = "Usage: IndexingFiltersChecker [-dumpText] <url>";
+    String usage = "Usage: IndexingFiltersChecker [-dumpText] [-md key=value] <url>";
 
     if (args.length == 0) {
       System.err.println(usage);
       return -1;
     }
 
+    // used to simulate the metadata propagated from injection
+    HashMap<String, String> metadata = new HashMap<String, String>();
+
     for (int i = 0; i < args.length; i++) {
       if (args[i].equals("-dumpText")) {
         dumpText = true;
+      } else if (args[i].equals("-md")) {
+        String k = null, v = null;
+        String nextOne = args[++i];
+        int firstEquals = nextOne.indexOf("=");
+        if (firstEquals != -1) {
+          k = nextOne.substring(0, firstEquals);
+          v = nextOne.substring(firstEquals + 1);
+        } else
+          k = nextOne;
+        metadata.put(k, v);
       } else if (i != args.length - 1) {
         System.err.println(usage);
         System.exit(-1);
@@ -87,17 +103,25 @@ public class IndexingFiltersChecker exte
       }
     }
 
-    if (LOG.isInfoEnabled()) {
-      LOG.info("fetching: " + url);
+    LOG.info("fetching: " + url);
+
+    CrawlDatum datum = new CrawlDatum();
+
+    Iterator<String> iter = metadata.keySet().iterator();
+    while (iter.hasNext()) {
+      String key = iter.next();
+      String value = metadata.get(key);
+      if (value == null)
+        value = "";
+      datum.getMetaData().put(new Text(key), new Text(value));
     }
 
-    IndexingFilters indexers = new IndexingFilters(conf);
+    IndexingFilters indexers = new IndexingFilters(getConf());
 
-    ProtocolFactory factory = new ProtocolFactory(conf);
+    ProtocolFactory factory = new ProtocolFactory(getConf());
     Protocol protocol = factory.getProtocol(url);
-    CrawlDatum datum = new CrawlDatum();
-
-    ProtocolOutput output = protocol.getProtocolOutput(new Text(url), datum);
+    Text turl = new Text(url);
+    ProtocolOutput output = protocol.getProtocolOutput(turl, datum);
 
     if (!output.getStatus().isSuccess()) {
       System.out.println("Fetch failed with protocol status: "
@@ -126,12 +150,18 @@ public class IndexingFiltersChecker exte
       LOG.warn("Content is truncated, parse may fail!");
     }
 
-    if (LOG.isInfoEnabled()) {
-      LOG.info("parsing: " + url);
-      LOG.info("contentType: " + contentType);
+    ScoringFilters scfilters = new ScoringFilters(getConf());
+    // call the scoring filters
+    try {
+      scfilters.passScoreBeforeParsing(turl, datum, content);
+    } catch (Exception e) {
+      LOG.warn("Couldn't pass score, url {} ({})", url, e);
     }
 
-    ParseResult parseResult = new ParseUtil(conf).parse(content);
+    LOG.info("parsing: {}", url);
+    LOG.info("contentType: {}", contentType);
+
+    ParseResult parseResult = new ParseUtil(getConf()).parse(content);
 
     NutchDocument doc = new NutchDocument();
     doc.add("id", url);
@@ -150,13 +180,20 @@ public class IndexingFiltersChecker exte
       return -1;
     }
 
-    byte[] signature = SignatureFactory.getSignature(conf).calculate(content,
+    byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content,
         parse);
     parse.getData().getContentMeta()
         .set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
     String digest = parse.getData().getContentMeta().get(Nutch.SIGNATURE_KEY);
     doc.add("digest", digest);
 
+    // call the scoring filters
+    try {
+      scfilters.passScoreAfterParsing(turl, content, parseResult.get(turl));
+    } catch (Exception e) {
+      LOG.warn("Couldn't pass score, url {} ({})", turl, e);
+    }
+
     try {
       doc = indexers.filter(doc, parse, urlText, datum, inlinks);
     } catch (IndexingException e) {
@@ -179,7 +216,7 @@ public class IndexingFiltersChecker exte
       }
     }
 
-    if (conf.getBoolean("doIndex", false) && doc != null) {
+    if (getConf().getBoolean("doIndex", false) && doc != null) {
       IndexWriters writers = new IndexWriters(getConf());
       writers.open(new JobConf(getConf()), "IndexingFilterChecker");
       writers.write(doc);
@@ -194,15 +231,4 @@ public class IndexingFiltersChecker exte
         new IndexingFiltersChecker(), args);
     System.exit(res);
   }
-
-  Configuration conf;
-
-  public Configuration getConf() {
-    return conf;
-  }
-
-  @Override
-  public void setConf(Configuration arg0) {
-    conf = arg0;
-  }
 }


Reply via email to