Author: jnioche Date: Fri May 15 14:03:52 2015 New Revision: 1679567 URL: http://svn.apache.org/r1679567 Log: NUTCH-2006 IndexingFiltersChecker to take custom metadata as input (jnioche)
Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1679567&r1=1679566&r2=1679567&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri May 15 14:03:52 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.11-SNAPSHOT +* NUTCH-2006 IndexingFiltersChecker to take custom metadata as input (jnioche) + * NUTCH-2008 IndexerMapReduce to use single instance of NutchIndexAction for deletions (snagel) * NUTCH-1998 Add support for user-defined file extension to CommonCrawlDataDumper (totaro via mattmann) Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1679567&r1=1679566&r2=1679567&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original) +++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Fri May 15 14:03:52 2015 @@ -17,6 +17,8 @@ package org.apache.nutch.indexer; +import java.util.HashMap; +import java.util.Iterator; import java.util.List; import java.util.Map; @@ -39,6 +41,7 @@ import org.apache.nutch.protocol.Content import org.apache.nutch.protocol.Protocol; import org.apache.nutch.protocol.ProtocolFactory; import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.scoring.ScoringFilters; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.StringUtil; import org.apache.nutch.util.URLUtil; @@ -69,16 +72,29 @@ public class IndexingFiltersChecker exte String url = null; boolean dumpText = false; - String usage = "Usage: IndexingFiltersChecker [-dumpText] <url>"; + String usage = "Usage: IndexingFiltersChecker [-dumpText] [-md key=value] <url>"; if (args.length == 0) { System.err.println(usage); return -1; } + // used to simulate the metadata propagated from injection + HashMap<String, String> metadata = new HashMap<String, String>(); + for (int i = 0; i < args.length; i++) { if (args[i].equals("-dumpText")) { dumpText = true; + } else if (args[i].equals("-md")) { + String k = null, v = null; + String nextOne = args[++i]; + int firstEquals = nextOne.indexOf("="); + if (firstEquals != -1) { + k = nextOne.substring(0, firstEquals); + v = nextOne.substring(firstEquals + 1); + } else + k = nextOne; + metadata.put(k, v); } else if (i != args.length - 1) { System.err.println(usage); System.exit(-1); @@ -87,17 +103,25 @@ public class IndexingFiltersChecker exte } } - if (LOG.isInfoEnabled()) { - LOG.info("fetching: " + url); + LOG.info("fetching: " + url); + + CrawlDatum datum = new CrawlDatum(); + + Iterator<String> iter = metadata.keySet().iterator(); + while (iter.hasNext()) { + String key = iter.next(); + String value = metadata.get(key); + if (value == null) + value = ""; + datum.getMetaData().put(new Text(key), new Text(value)); } - IndexingFilters indexers = new IndexingFilters(conf); + IndexingFilters indexers = new IndexingFilters(getConf()); - ProtocolFactory factory = new ProtocolFactory(conf); + ProtocolFactory factory = new ProtocolFactory(getConf()); Protocol protocol = factory.getProtocol(url); - CrawlDatum datum = new CrawlDatum(); - - ProtocolOutput output = protocol.getProtocolOutput(new Text(url), datum); + Text turl = new Text(url); + ProtocolOutput output = protocol.getProtocolOutput(turl, datum); if (!output.getStatus().isSuccess()) { System.out.println("Fetch failed with protocol status: " @@ -126,12 +150,18 @@ public class IndexingFiltersChecker exte LOG.warn("Content is truncated, parse may fail!"); } - if (LOG.isInfoEnabled()) { - LOG.info("parsing: " + url); - LOG.info("contentType: " + contentType); + ScoringFilters scfilters = new ScoringFilters(getConf()); + // call the scoring filters + try { + scfilters.passScoreBeforeParsing(turl, datum, content); + } catch (Exception e) { + LOG.warn("Couldn't pass score, url {} ({})", url, e); } - ParseResult parseResult = new ParseUtil(conf).parse(content); + LOG.info("parsing: {}", url); + LOG.info("contentType: {}", contentType); + + ParseResult parseResult = new ParseUtil(getConf()).parse(content); NutchDocument doc = new NutchDocument(); doc.add("id", url); @@ -150,13 +180,20 @@ public class IndexingFiltersChecker exte return -1; } - byte[] signature = SignatureFactory.getSignature(conf).calculate(content, + byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parse); parse.getData().getContentMeta() .set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature)); String digest = parse.getData().getContentMeta().get(Nutch.SIGNATURE_KEY); doc.add("digest", digest); + // call the scoring filters + try { + scfilters.passScoreAfterParsing(turl, content, parseResult.get(turl)); + } catch (Exception e) { + LOG.warn("Couldn't pass score, url {} ({})", turl, e); + } + try { doc = indexers.filter(doc, parse, urlText, datum, inlinks); } catch (IndexingException e) { @@ -179,7 +216,7 @@ public class IndexingFiltersChecker exte } } - if (conf.getBoolean("doIndex", false) && doc != null) { + if (getConf().getBoolean("doIndex", false) && doc != null) { IndexWriters writers = new IndexWriters(getConf()); writers.open(new JobConf(getConf()), "IndexingFilterChecker"); writers.write(doc); @@ -194,15 +231,4 @@ public class IndexingFiltersChecker exte new IndexingFiltersChecker(), args); System.exit(res); } - - Configuration conf; - - public Configuration getConf() { - return conf; - } - - @Override - public void setConf(Configuration arg0) { - conf = arg0; - } }