Author: markus
Date: Mon Jul 11 10:44:56 2011
New Revision: 1145117
URL: http://svn.apache.org/viewvc?rev=1145117&view=rev
Log:
NUTCH-783 IndexingFiltersChecker utility added
Added:
nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
Modified:
nutch/branches/branch-1.4/CHANGES.txt
Modified: nutch/branches/branch-1.4/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/branch-1.4/CHANGES.txt?rev=1145117&r1=1145116&r2=1145117&view=diff
==============================================================================
--- nutch/branches/branch-1.4/CHANGES.txt (original)
+++ nutch/branches/branch-1.4/CHANGES.txt Mon Jul 11 10:44:56 2011
@@ -2,6 +2,8 @@ Nutch Change Log
Release 1.4 - Current development
+* NUTCH-783 IndexingFiltersChecker utility (jnioche via markus)
+
* NUTCH-1030 WebgraphDB program requires manually added directories (markus)
* NUTCH-1011 Normalize duplicate slashes in URL's (markus)
Added:
nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL:
http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1145117&view=auto
==============================================================================
---
nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
(added)
+++
nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
Mon Jul 11 10:44:56 2011
@@ -0,0 +1,127 @@
+package org.apache.nutch.indexer;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilters;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ * Reads and parses a URL and runs the indexing filters on it. Displays the
+ * fields obtained and the first 100 characters of their values.
+ *
+ * Tested with e.g.
+ * ./nutch org.apache.nutch.indexer.IndexingFiltersChecker http://www.lemonde.fr
+ *
+ * @author Julien Nioche
+ **/
+public class IndexingFiltersChecker extends Configured implements Tool {
+
+  public static final Log LOG =
+      LogFactory.getLog(IndexingFiltersChecker.class);
+
+  /** Maximum number of characters printed for each field value. */
+  private static final int MAX_VALUE_LENGTH = 100;
+
+  /** Configuration stored by {@link #setConf(Configuration)}. */
+  Configuration conf;
+
+  public IndexingFiltersChecker() {
+  }
+
+  /**
+   * Fetches the given URL, parses the content, applies the indexing filters
+   * and prints every resulting field (value truncated to
+   * {@value #MAX_VALUE_LENGTH} characters) to standard output.
+   *
+   * @param args command line arguments; exactly one is expected, the URL
+   * @return 0 when the fields were printed, when no content could be fetched
+   *         or when the document was discarded by a filter; -1 on a usage
+   *         error, when the content type is unknown or when there is no
+   *         parse for the URL
+   * @throws Exception if fetching or parsing fails
+   */
+  public int run(String[] args) throws Exception {
+    String usage = "Usage: IndexingFiltersChecker <url>";
+
+    if (args.length != 1) {
+      System.err.println(usage);
+      // Return instead of calling System.exit(): main() already exits with
+      // the returned code, and a Tool must not kill the JVM of its caller.
+      return -1;
+    }
+
+    String url = args[0];
+
+    if (LOG.isInfoEnabled()) {
+      LOG.info("fetching: " + url);
+    }
+
+    IndexingFilters indexers = new IndexingFilters(conf);
+
+    ProtocolFactory factory = new ProtocolFactory(conf);
+    Protocol protocol = factory.getProtocol(url);
+    CrawlDatum datum = new CrawlDatum();
+    Text urlText = new Text(url);
+
+    Content content = protocol.getProtocolOutput(urlText, datum)
+        .getContent();
+
+    if (content == null) {
+      System.out.println("No content for " + url);
+      return 0;
+    }
+
+    String contentType = content.getContentType();
+
+    if (contentType == null) {
+      // Previously this failed silently; tell the user why we give up.
+      System.err.println("No content type for " + url);
+      return -1;
+    }
+
+    if (LOG.isInfoEnabled()) {
+      LOG.info("parsing: " + url);
+      LOG.info("contentType: " + contentType);
+    }
+
+    ParseResult parseResult = new ParseUtil(conf).parse(content);
+    Parse parse = (parseResult == null) ? null : parseResult.get(urlText);
+
+    if (parse == null) {
+      // Without a parse the filters have nothing to work on.
+      System.err.println("No parse available for " + url);
+      return -1;
+    }
+
+    NutchDocument doc = new NutchDocument();
+    Inlinks inlinks = null;
+
+    try {
+      // Keep the returned document: a filter may replace it, or return
+      // null to signal that the document must not be indexed.
+      doc = indexers.filter(doc, parse, urlText, datum, inlinks);
+    } catch (IndexingException e) {
+      // Best effort: report the failure and still print what was collected.
+      LOG.error("Indexing filters failed for " + url, e);
+    }
+
+    if (doc == null) {
+      System.out.println("Document discarded by indexing filter");
+      return 0;
+    }
+
+    for (String fname : doc.getFieldNames()) {
+      List<Object> values = Arrays.asList(doc.getFieldValue(fname));
+      for (Object value : values) {
+        if (value == null) {
+          // A field without a value would otherwise fail on toString().
+          continue;
+        }
+        String str = value.toString();
+        int minText = Math.min(MAX_VALUE_LENGTH, str.length());
+        System.out.println(fname + " :\t" + str.substring(0, minText));
+      }
+    }
+    return 0;
+  }
+
+  /**
+   * Command line entry point; runs the checker through ToolRunner and exits
+   * with the code returned by {@link #run(String[])}.
+   */
+  public static void main(String[] args) throws Exception {
+    final int res = ToolRunner.run(NutchConfiguration.create(),
+        new IndexingFiltersChecker(), args);
+    System.exit(res);
+  }
+
+  /** @return the configuration last passed to {@link #setConf} */
+  @Override
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /** Stores the configuration used to build the protocol, parser and filters. */
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+}