Author: jorgelbg
Date: Fri Apr 24 01:53:17 2015
New Revision: 1675743
URL: http://svn.apache.org/r1675743
Log:
NUTCH-1985 Adding a main() method to the MimeTypeIndexingFilter
Modified:
nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java
Modified:
nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java?rev=1675743&r1=1675742&r2=1675743&view=diff
==============================================================================
---
nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java
(original)
+++
nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java
Fri Apr 24 01:53:17 2015
@@ -20,8 +20,18 @@ package org.apache.nutch.indexer.filter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.UnrecognizedOptionException;
+
// Nutch imports
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
@@ -32,16 +42,25 @@ import org.apache.nutch.indexer.Indexing
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.parse.Parse;
import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+
+import org.apache.nutch.metadata.Metadata;
+
import org.apache.nutch.util.MimeUtil;
+import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.PrefixStringMatcher;
import org.apache.nutch.util.TrieStringMatcher;
import org.apache.tika.Tika;
import java.io.BufferedReader;
import java.io.IOException;
+import java.io.InputStreamReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
@@ -96,7 +115,7 @@ public class MimeTypeIndexingFilter impl
LOG.info(String.format("[%s] %s", contentType, url));
}
- if (null != trie) {
+ if (trie != null) {
if (trie.shortestMatch(contentType) == null) {
// no match, but
if (acceptMode) {
@@ -183,9 +202,72 @@ public class MimeTypeIndexingFilter impl
public Configuration getConf() {
return this.conf;
}
- /*
- * ------------------------------ * </implementation:Configurable> *
- * ------------------------------
- */
-}
+  /**
+   * Command-line entry point for trying a rules file against MIME types.
+   * <p>
+   * Parses the {@code -rules <file>} option, configures a
+   * {@link MimeTypeIndexingFilter} with that file, then reads content types
+   * from standard input, one per line, until end of input or the first empty
+   * line. Each content type is echoed back prefixed with {@code "+ "} when the
+   * filter returns a document and {@code "- "} when it returns {@code null}.
+   *
+   * @param args
+   *          command-line arguments; {@code -rules <file>} is required,
+   *          {@code -h} / {@code --help} prints the usage message
+   * @throws IOException
+   *           if reading from standard input fails
+   * @throws IndexingException
+   *           if the filter fails while processing a document
+   */
+  public static void main(String[] args) throws IOException, IndexingException {
+    Option helpOpt = new Option("h", "help", false, "show this help message");
+    Option rulesOpt = OptionBuilder.withArgName("file").hasArg()
+        .withDescription(
+            "Rules file to be used in the tests relative to the conf directory")
+        .isRequired().create("rules");
+
+    Options options = new Options();
+    options.addOption(helpOpt).addOption(rulesOpt);
+
+    CommandLineParser parser = new GnuParser();
+    HelpFormatter formatter = new HelpFormatter();
+    String rulesFile;
+
+    try {
+      CommandLine line = parser.parse(options, args);
+
+      if (line.hasOption("help") || !line.hasOption("rules")) {
+        formatter.printHelp(
+            "org.apache.nutch.indexer.filter.MimeTypeIndexingFilter", options,
+            true);
+        return;
+      }
+
+      rulesFile = line.getOptionValue("rules");
+    } catch (org.apache.commons.cli.ParseException e) {
+      // Because "rules" is a required option, the parser rejects "-h" on its
+      // own with a MissingOptionException before the help check above can
+      // run. Treat every parse failure (unknown option, missing option,
+      // missing argument) as a request for the usage message.
+      formatter.printHelp(
+          "org.apache.nutch.indexer.filter.MimeTypeIndexingFilter", options,
+          true);
+      return;
+    } catch (Exception e) {
+      LOG.error(StringUtils.stringifyException(e));
+      e.printStackTrace();
+      return;
+    }
+
+    MimeTypeIndexingFilter filter = new MimeTypeIndexingFilter();
+    Configuration conf = NutchConfiguration.create();
+    conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, rulesFile);
+    filter.setConf(conf);
+
+    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    String line;
+
+    // An empty line ends the session, as does end of input.
+    while ((line = in.readLine()) != null && !line.isEmpty()) {
+      // Build a minimal parse result whose only meaningful field is the
+      // content type under test; the remaining values are placeholders.
+      Metadata metadata = new Metadata();
+      metadata.set(Response.CONTENT_TYPE, line);
+      ParseImpl parse = new ParseImpl("text",
+          new ParseData(new ParseStatus(), "title", new Outlink[0], metadata));
+
+      NutchDocument doc = filter.filter(new NutchDocument(), parse,
+          new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
+
+      if (doc != null) {
+        System.out.print("+ ");
+        System.out.println(line);
+      } else {
+        System.out.print("- ");
+        System.out.println(line);
+      }
+    }
+  }
+}