Author: lewismc
Date: Tue Mar 26 18:54:49 2013
New Revision: 1461267
URL: http://svn.apache.org/r1461267
Log:
NUTCH-1038 Port IndexingFiltersChecker to 2.0
Added:
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/bin/nutch
Modified: nutch/branches/2.x/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1461267&r1=1461266&r2=1461267&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Tue Mar 26 18:54:49 2013
@@ -2,6 +2,8 @@ Nutch Change Log
Release 2.2 - Current Development
+* NUTCH-1038 Port IndexingFiltersChecker to 2.0 (snagel via lewismc)
+
* NUTCH-1532 Replace 'segment' mapping field with batchId (Feng via lewismc)
* NUTCH-1533 Implement getPrevModifiedTime(), setPrevModifiedTime(),
getBatchId() and setBatchId() accessors in o.a.n.storage.WebPage (Feng via
lewismc)
Modified: nutch/branches/2.x/src/bin/nutch
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/bin/nutch?rev=1461267&r1=1461266&r2=1461267&view=diff
==============================================================================
--- nutch/branches/2.x/src/bin/nutch (original)
+++ nutch/branches/2.x/src/bin/nutch Tue Mar 26 18:54:49 2013
@@ -61,6 +61,7 @@ if [ $# = 0 ]; then
echo " solrindex run the solr indexer on parsed batches"
echo " solrdedup remove duplicates from solr"
echo " parsechecker check the parser for a given url"
+ echo " indexchecker check the indexing filters for a given url"
echo " plugin load a plugin and run one of its classes main()"
echo " nutchserver run a (local) Nutch server on a user defined port"
echo " junit runs the given JUnit test"
@@ -210,6 +211,8 @@ elif [ "$COMMAND" = "solrdedup" ] ; then
CLASS=org.apache.nutch.indexer.solr.SolrDeleteDuplicates
elif [ "$COMMAND" = "parsechecker" ] ; then
CLASS=org.apache.nutch.parse.ParserChecker
+elif [ "$COMMAND" = "indexchecker" ] ; then
+ CLASS=org.apache.nutch.indexer.IndexingFiltersChecker
elif [ "$COMMAND" = "plugin" ] ; then
CLASS=org.apache.nutch.plugin.PluginRepository
elif [ "$COMMAND" = "nutchserver" ] ; then
Added:
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1461267&view=auto
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (added)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Tue Mar 26 18:54:49 2013
@@ -0,0 +1,160 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.CrawlStatus;
+import org.apache.nutch.parse.ParseStatusUtils;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatusCodes;
+import org.apache.nutch.protocol.ProtocolStatusUtils;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.NutchConfiguration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Fetches and parses a single URL and runs the configured indexing filters on it.
+ * Prints each field of the resulting document with the first 100 characters of each value.
+ *
+ * Tested with e.g.
+ * ./nutch org.apache.nutch.indexer.IndexingFiltersChecker http://www.lemonde.fr
+ * @author Julien Nioche
+ */
+public class IndexingFiltersChecker extends Configured implements Tool {
+
+  public static final Logger LOG = LoggerFactory.getLogger(IndexingFiltersChecker.class);
+
+  /** Maximum number of characters printed for each field value. */
+  private static final int MAX_VALUE_LENGTH = 100;
+
+  /** Configuration passed to the protocol factory, parser and indexing filters. */
+  Configuration conf;
+
+  public IndexingFiltersChecker() {}
+
+  /**
+   * Fetches, parses and filters the URL given as the only argument and prints
+   * the fields of the resulting document to standard output.
+   *
+   * @param args exactly one element: the URL to check
+   * @return 0 if fields were printed, no content was fetched or a filter discarded
+   *         the document; -1 on bad usage or a fetch, content type, parse or filter error
+   * @throws Exception if a step fails with an exception not handled here
+   */
+  @Override
+  public int run(String[] args) throws Exception {
+    if (args.length != 1) {
+      System.err.println("Usage: IndexingFiltersChecker <url>");
+      return -1;
+    }
+    String url = args[0];
+    LOG.info("fetching: {}", url);
+
+    IndexingFilters indexers = new IndexingFilters(conf);
+    Protocol protocol = new ProtocolFactory(conf).getProtocol(url);
+    WebPage page = new WebPage();
+    page.setBaseUrl(new Utf8(url));
+    ProtocolOutput protocolOutput = protocol.getProtocolOutput(url, page);
+    page.setProtocolStatus(protocolOutput.getStatus());
+    if (protocolOutput.getStatus().getCode() != ProtocolStatusCodes.SUCCESS) {
+      System.out.println("Fetch failed with protocol status: "
+          + ProtocolStatusUtils.getName(protocolOutput.getStatus().getCode())
+          + ": " + ProtocolStatusUtils.getMessage(protocolOutput.getStatus()));
+      return -1;
+    }
+    page.setStatus(CrawlStatus.STATUS_FETCHED);
+    page.setFetchTime(System.currentTimeMillis());
+
+    Content content = protocolOutput.getContent();
+    if (content == null) {
+      System.out.println("No content for " + url);
+      return 0;
+    }
+    page.setContent(ByteBuffer.wrap(content.getContent()));
+    String contentType = content.getContentType();
+    if (contentType == null) {
+      // Say why the check stops instead of failing silently.
+      System.err.println("No content type for " + url);
+      return -1;
+    }
+    page.setContentType(new Utf8(contentType));
+    LOG.info("parsing: {}", url);
+    LOG.info("contentType: {}", contentType);
+
+    new ParseUtil(conf).process(url, page);
+    if (!ParseStatusUtils.isSuccess(page.getParseStatus())) {
+      System.err.println("Problem with parse - check log");
+      return -1;
+    }
+
+    NutchDocument doc;
+    try {
+      doc = indexers.filter(new NutchDocument(), url, page);
+    } catch (IndexingException e) {
+      // A filter error must not end up as a silent, "successful" empty run.
+      LOG.error("Indexing filters failed for " + url, e);
+      System.err.println("Problem with indexing filters - check log");
+      return -1;
+    }
+    if (doc == null) {
+      System.out.println("Document discarded by indexing filter");
+      return 0;
+    }
+
+    for (String fname : doc.getFieldNames()) {
+      List<String> values = doc.getFieldValues(fname);
+      if (values != null) {
+        for (String value : values) {
+          // String.valueOf keeps a null value from aborting the listing.
+          String str = String.valueOf(value);
+          int end = Math.min(MAX_VALUE_LENGTH, str.length());
+          System.out.println(fname + " :\t" + str.substring(0, end));
+        }
+      }
+    }
+    return 0;
+  }
+
+  public static void main(String[] args) throws Exception {
+    final int res = ToolRunner.run(NutchConfiguration.create(),
+        new IndexingFiltersChecker(), args);
+    System.exit(res);
+  }
+
+  @Override
+  public Configuration getConf() {
+    return conf;
+  }
+
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+}