IndexingFiltersChecker.java

lewismc Tue, 26 Mar 2013 11:55:14 -0700

Author: lewismc
Date: Tue Mar 26 18:54:49 2013
New Revision: 1461267

URL: http://svn.apache.org/r1461267
Log:
NUTCH-1038 Port IndexingFiltersChecker to 2.0


Added:
    
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/bin/nutch

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1461267&r1=1461266&r2=1461267&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Tue Mar 26 18:54:49 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 2.2 - Current Development
 
+* NUTCH-1038 Port IndexingFiltersChecker to 2.0 (snagel via lewismc)
+
 * NUTCH-1532 Replace 'segment' mapping field with batchId (Feng +via lewismc)
 
 * NUTCH-1533 Implement getPrevModifiedTime(), setPrevModifiedTime(), 
getBatchId() and setBatchId() accessors in o.a.n.storage.WebPage (Feng via 
lewismc)

Modified: nutch/branches/2.x/src/bin/nutch
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/bin/nutch?rev=1461267&r1=1461266&r2=1461267&view=diff
==============================================================================
--- nutch/branches/2.x/src/bin/nutch (original)
+++ nutch/branches/2.x/src/bin/nutch Tue Mar 26 18:54:49 2013
@@ -61,6 +61,7 @@ if [ $# = 0 ]; then
   echo " solrindex     run the solr indexer on parsed batches"
   echo " solrdedup     remove duplicates from solr"
   echo " parsechecker   check the parser for a given url"
+  echo " indexchecker   check the indexing filters for a given url"
   echo " plugin        load a plugin and run one of its classes main()"
   echo " nutchserver    run a (local) Nutch server on a user defined port"
   echo " junit                 runs the given JUnit test"
@@ -210,6 +211,8 @@ elif [ "$COMMAND" = "solrdedup" ] ; then
 CLASS=org.apache.nutch.indexer.solr.SolrDeleteDuplicates
 elif [ "$COMMAND" = "parsechecker" ] ; then
   CLASS=org.apache.nutch.parse.ParserChecker
+elif [ "$COMMAND" = "indexchecker" ] ; then
+  CLASS=org.apache.nutch.indexer.IndexingFiltersChecker
 elif [ "$COMMAND" = "plugin" ] ; then
 CLASS=org.apache.nutch.plugin.PluginRepository
 elif [ "$COMMAND" = "nutchserver" ] ; then

Added: 
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1461267&view=auto
==============================================================================
--- 
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
 (added)
+++ 
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
 Tue Mar 26 18:54:49 2013
@@ -0,0 +1,160 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ 
+package org.apache.nutch.indexer;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.CrawlStatus;
+import org.apache.nutch.parse.ParseStatusUtils;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatusCodes;
+import org.apache.nutch.protocol.ProtocolStatusUtils;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.NutchConfiguration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Command-line tool that fetches a single URL, parses it and runs the
+ * indexing filters on the result. Every field of the resulting document is
+ * printed to standard output, one line per value, truncated to the first
+ * 100 characters of the value.
+ *
+ * Tested with e.g.
+ * ./nutch org.apache.nutch.indexer.IndexingFiltersChecker http://www.lemonde.fr
+ *
+ * @author Julien Nioche
+ */
+public class IndexingFiltersChecker extends Configured implements Tool {
+
+  public static final Logger LOG =
+      LoggerFactory.getLogger(IndexingFiltersChecker.class);
+
+  /** Maximum number of characters printed for each field value. */
+  private static final int MAX_VALUE_LENGTH = 100;
+
+  /**
+   * Configuration handed to the indexing filters, the protocol factory and
+   * the parser; assigned through {@link #setConf(Configuration)}.
+   */
+  Configuration conf;
+
+  public IndexingFiltersChecker() {
+  }
+
+  /**
+   * Fetches, parses and indexes the URL given as the single argument and
+   * prints the fields produced by the indexing filters.
+   *
+   * @param args exactly one element, the URL to check
+   * @return 0 if the fields were printed, if the fetch returned no content
+   *         or if a filter discarded the document; -1 on a usage error, a
+   *         failed fetch, a missing content type, a failed parse or an
+   *         {@link IndexingException}
+   * @throws Exception any other failure raised while fetching, parsing or
+   *         filtering
+   */
+  @Override
+  public int run(String[] args) throws Exception {
+    if (args.length != 1) {
+      System.err.println("Usage: IndexingFiltersChecker <url>");
+      return -1;
+    }
+
+    String url = args[0];
+    LOG.info("fetching: {}", url);
+
+    IndexingFilters indexers = new IndexingFilters(conf);
+
+    ProtocolFactory factory = new ProtocolFactory(conf);
+    Protocol protocol = factory.getProtocol(url);
+
+    WebPage page = new WebPage();
+    page.setBaseUrl(new Utf8(url));
+    ProtocolOutput protocolOutput = protocol.getProtocolOutput(url, page);
+    page.setProtocolStatus(protocolOutput.getStatus());
+    if (protocolOutput.getStatus().getCode() != ProtocolStatusCodes.SUCCESS) {
+      System.out.println("Fetch failed with protocol status: "
+          + ProtocolStatusUtils.getName(protocolOutput.getStatus().getCode())
+          + ": " + ProtocolStatusUtils.getMessage(protocolOutput.getStatus()));
+      return -1;
+    }
+    page.setStatus(CrawlStatus.STATUS_FETCHED);
+    page.setFetchTime(System.currentTimeMillis());
+
+    Content content = protocolOutput.getContent();
+    if (content == null) {
+      System.out.println("No content for " + url);
+      return 0;
+    }
+    page.setContent(ByteBuffer.wrap(content.getContent()));
+
+    String contentType = content.getContentType();
+    if (contentType == null) {
+      // Previously this path returned -1 without telling the user why.
+      System.err.println("No content type for " + url);
+      return -1;
+    }
+    page.setContentType(new Utf8(contentType));
+
+    LOG.info("parsing: {}", url);
+    LOG.info("contentType: {}", contentType);
+
+    new ParseUtil(conf).process(url, page);
+    if (!ParseStatusUtils.isSuccess(page.getParseStatus())) {
+      System.err.println("Problem with parse - check log");
+      return -1;
+    }
+
+    NutchDocument doc;
+    try {
+      doc = indexers.filter(new NutchDocument(), url, page);
+    } catch (IndexingException e) {
+      // Do not fall through: the document would still be the empty one
+      // passed to the filters and the tool would wrongly report success.
+      LOG.error("Indexing filters failed for " + url, e);
+      System.err.println("Problem with indexing filters - check log");
+      return -1;
+    }
+
+    if (doc == null) {
+      System.out.println("Document discarded by indexing filter");
+      return 0;
+    }
+
+    printFields(doc);
+    return 0;
+  }
+
+  /**
+   * Prints each value of each field as {@code name :<TAB>value}, keeping at
+   * most {@link #MAX_VALUE_LENGTH} characters of the value.
+   */
+  private static void printFields(NutchDocument doc) {
+    for (String fname : doc.getFieldNames()) {
+      List<String> values = doc.getFieldValues(fname);
+      if (values == null) {
+        continue;
+      }
+      for (String value : values) {
+        int end = Math.min(MAX_VALUE_LENGTH, value.length());
+        System.out.println(fname + " :\t" + value.substring(0, end));
+      }
+    }
+  }
+
+  /**
+   * Runs the checker through {@link ToolRunner} with a configuration from
+   * {@link NutchConfiguration#create()} and exits with the value returned
+   * by {@link #run(String[])}.
+   */
+  public static void main(String[] args) throws Exception {
+    final int res = ToolRunner.run(NutchConfiguration.create(),
+        new IndexingFiltersChecker(), args);
+    System.exit(res);
+  }
+
+  /** Returns the configuration last passed to {@link #setConf}. */
+  @Override
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /** Stores the configuration used by {@link #run(String[])}. */
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+}

svn commit: r1461267 - in /nutch/branches/2.x: CHANGES.txt src/bin/nutch src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java

Reply via email to